#### Topic Modelling

Collect randomly as many as news content from different fields for training. (thru any form, web scrape, pdf, newspaper etc.)

#### Use case

Collect randomly another set of content for test.

Build topic modelling to understand the topics in the test data.

In [1]:
# !pip install PyPDF2
import PyPDF2

import warnings
warnings.filterwarnings('ignore')

In [2]:
def readpdf(file_name):
    pdf = open(file_name, 'rb')
    read_pdf = PyPDF2.PdfFileReader(pdf)
    page_count = read_pdf.getNumPages()
    text=''
    for page in range(1, page_count):
        msg = read_pdf.getPage(page).extractText()
        text = text + str(msg.encode('utf-8'))
    return text

In [3]:
train_text = readpdf('AJES_article_1_135_EarningPaper.pdf')
print('Train Text :: ', len(train_text))
test_text = readpdf('03_systematicapproachtotraining.pdf')
print('Test Text :: ', len(test_text))

Train Text ::  44006
Test Text ::  11512


In [4]:
train_text

'b\'Academic Journal of Economic Studies \\n \\nV\\nol.\\n \\n3\\n, \\nNo.\\n \\n4\\n,\\n \\nDecember \\n2017\\n,\\n \\npp. \\n9\\n6\\n\\n1\\n04\\n \\nISSN 2393\\n-\\n4913, ISSN\\n \\nOn\\n-\\nline 2457\\n-\\n5836\\n \\n9\\n6\\n \\nEmployee Learning Theories \\na\\nnd Their Organizational Applications\\n \\n \\nAbdussalaam Iyanda Ismail\\n1\\n, \\nAbdul\\n-\\nHalim Abdul\\n-\\nMajid\\n2\\n, \\nHammed Oluwaseyi\\n \\nMusibau\\n3\\n \\n \\n \\n1,2\\nSchool of Business M\\nanagement, College of Business, \\nUniversiti Utara Malaysia, Sintok, 06010 Kedah, Malaysia\\n \\n3\\nFaculty of Economics and Management of Science, \\nUniversiti Sultan Zainal Abidin, Kuala Terengannu, Malaysia\\n,\\n \\n3\\nE\\n-\\nmail: \\nmhameed.transinex@gmail.com\\n \\n(\\nCorresponding author\\n)\\n \\n \\n \\nAbstract\\n \\nEmpirical evidence identifies that organizational success hinges on employees with the required knowledge, skills, and abilit\\nies and \\nt\\n\\norganization adopts. Given this, this work 

In [5]:
import numpy as np
import pandas as pd

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import sent_tokenize

In [6]:
stemmer = SnowballStemmer("english")
def lemma_stem(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

In [7]:
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token)>3:
            result.append(lemma_stem(token))
    return result

In [8]:
def processed_text(text):
    final_text = []
    sent = sent_tokenize(text)
    for i in sent:
        final_text.append(preprocess(i))

    print('Length of Processed Document ::', len(final_text))
    return final_text

In [9]:
final_text = processed_text(train_text)

Length of Processed Document :: 333


In [10]:
dictionary = gensim.corpora.Dictionary(final_text)

count=0
for k,v in dictionary.iteritems():
    print (k, v)
    count +=1
    if count >20:
        break

0 academ
1 econom
2 journal
3 ndecemb
4 studi
5 abdul
6 abidin
7 abilit
8 adopt
9 applic
10 author
11 busi
12 colleg
13 employe
14 evid
15 gmail
16 hing
17 identifi
18 ismail
19 issn
20 iyanda


In [11]:
#remove rare and repeatative words
dictionary.filter_extremes(no_below=15,no_above=0.1,keep_n=100000)
print(dictionary)

Dictionary(15 unique tokens: ['employe', 'organiz', 'behavior', 'organ', 'enhanc']...)


In [12]:
bow_corpus = [dictionary.doc2bow(doc) for doc in final_text]
bow_corpus[5]

[(0, 1), (1, 2), (2, 2), (4, 1), (5, 2)]

In [13]:
document_num = 10
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], dictionary[bow_doc_x[i][0]], bow_doc_x[i][1]))

Word 1 ("organiz") appears 1 time.
Word 2 ("behavior") appears 1 time.
Word 4 ("enhanc") appears 2 time.
Word 6 ("develop") appears 1 time.
Word 8 ("reinforc") appears 1 time.


In [16]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, num_topics = 10, id2word = dictionary, passes = 10, workers = 2)

for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.872*"develop" + 0.058*"human" + 0.013*"process" + 0.007*"enhanc" + 0.006*"organiz" + 0.005*"inform" + 0.005*"nthe" + 0.004*"organ" + 0.004*"individu" + 0.004*"reinforc"


Topic: 1 
Words: 0.916*"reinforc" + 0.011*"perform" + 0.010*"goal" + 0.007*"employe" + 0.006*"enhanc" + 0.006*"process" + 0.005*"motiv" + 0.005*"develop" + 0.005*"inform" + 0.005*"human"


Topic: 2 
Words: 0.472*"employe" + 0.276*"individu" + 0.135*"perform" + 0.093*"enhanc" + 0.004*"goal" + 0.003*"motiv" + 0.002*"inform" + 0.002*"develop" + 0.002*"organ" + 0.002*"behavior"


Topic: 3 
Words: 0.468*"organiz" + 0.141*"organ" + 0.138*"employe" + 0.126*"perform" + 0.064*"behavior" + 0.036*"nthe" + 0.007*"motiv" + 0.004*"human" + 0.003*"reinforc" + 0.002*"process"


Topic: 4 
Words: 0.569*"process" + 0.357*"inform" + 0.044*"nthe" + 0.005*"organ" + 0.003*"behavior" + 0.003*"human" + 0.002*"reinforc" + 0.002*"employe" + 0.002*"enhanc" + 0.002*"perform"


Topic: 5 
Words: 0.860*"goal" + 0.043*"individu" + 

In [18]:
bow_vector = dictionary.doc2bow(preprocess(test_text))

for index, score in sorted(lda_model[bow_vector]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 30)))

Score: 0.10757310688495636	 Topic: 0.872*"develop" + 0.058*"human" + 0.013*"process" + 0.007*"enhanc" + 0.006*"organiz" + 0.005*"inform" + 0.005*"nthe" + 0.004*"organ" + 0.004*"individu" + 0.004*"reinforc" + 0.004*"behavior" + 0.004*"goal" + 0.004*"perform" + 0.004*"employe" + 0.004*"motiv"
Score: 0.19678042829036713	 Topic: 0.472*"employe" + 0.276*"individu" + 0.135*"perform" + 0.093*"enhanc" + 0.004*"goal" + 0.003*"motiv" + 0.002*"inform" + 0.002*"develop" + 0.002*"organ" + 0.002*"behavior" + 0.002*"organiz" + 0.002*"reinforc" + 0.002*"nthe" + 0.002*"process" + 0.002*"human"
Score: 0.11338149756193161	 Topic: 0.569*"process" + 0.357*"inform" + 0.044*"nthe" + 0.005*"organ" + 0.003*"behavior" + 0.003*"human" + 0.002*"reinforc" + 0.002*"employe" + 0.002*"enhanc" + 0.002*"perform" + 0.002*"develop" + 0.002*"goal" + 0.002*"organiz" + 0.002*"motiv" + 0.002*"individu"
Score: 0.20373786985874176	 Topic: 0.512*"organ" + 0.396*"nthe" + 0.027*"enhanc" + 0.019*"human" + 0.015*"employe" + 0.005*"