In [1]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np
# Seed NumPy's global random number generator so repeated runs start from the same state.
np.random.seed(400)

In [2]:
# Read the headlines dataset from 'news.csv' (path is relative to the working directory).
data=pd.read_csv('news.csv')

In [3]:
# Show the first three rows to check which columns were loaded.
data.head(3)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit


In [4]:
# Keep only the headline column. The explicit .copy() makes `text` an independent
# DataFrame, so adding a column to it afterwards does not write into a selection
# of `data` (which pandas reports as SettingWithCopyWarning).
text = data[['headline_text']].copy()

In [5]:
# Expose the row index as an explicit 'index' column so a headline can be looked
# up by number. `assign` builds a new frame instead of writing into `text` in
# place, which avoids pandas' SettingWithCopyWarning when `text` is a column
# selection of another frame. Re-running this cell gives the same result.
text = text.assign(index=text.index)
documents = text
print(documents.head(5))

                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


In [6]:
# Confirm the number of rows and columns in `documents`.
documents.shape

(10000, 2)

In [7]:
# Quick check of the WordNet lemmatizer on a single word (no `pos` argument given).
WordNetLemmatizer().lemmatize('runs')

'run'

In [8]:
# English Snowball stemmer; defined at module level because the preprocessing
# helpers in a later cell use it.
stemmer = SnowballStemmer("english")

# Sample inflected and derived words used to illustrate verb lemmatization.
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization',
    'sensational', 'traditional', 'reference', 'colonizer', 'plotted',
]

# Lemmatize every sample word as a verb (pos='v') with one lemmatizer instance.
verb_lemmatizer = WordNetLemmatizer()
singles = [verb_lemmatizer.lemmatize(word, pos='v') for word in original_words]

# Side-by-side table of each word and its lemma.
pd.DataFrame({'Original Words': original_words, 'Lemma': singles})

Unnamed: 0,Original Words,Lemma
0,caresses,caress
1,flies,fly
2,dies,die
3,mules,mules
4,denied,deny
5,died,die
6,agreed,agree
7,owned,own
8,humbled,humble
9,sized,size


In [9]:
# Shared lemmatizer instance so a new one is not constructed for every token.
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (``pos='v'``) and the result is
    then passed through the module-level ``stemmer``.

    Args:
        text: The token to normalize.

    Returns:
        Whatever ``stemmer.stem`` returns for the lemmatized token.
    """
    return stemmer.stem(_LEMMATIZER.lemmatize(text, pos='v'))
def preprocess(text):
    """Tokenize a document and return its normalized tokens.

    Tokens produced by ``gensim.utils.simple_preprocess`` are dropped when they
    are gensim stopwords or have three or fewer characters; every remaining
    token is passed through ``lemmatize_stemming``.

    Args:
        text: The raw document string.

    Returns:
        A list with one normalized entry per kept token, in document order.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in gensim.parsing.preprocessing.STOPWORDS
    ]

In [10]:
# Pick the headline whose 'index' value is 1: first column of the first matching row.
matching_rows = documents[documents['index'] == 1]
doc_sample = matching_rows.values[0][0]

# Show the raw whitespace-split words next to the preprocessed tokens.
print('original document: ')
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['act', 'fire', 'witnesses', 'must', 'be', 'aware', 'of', 'defamation']


 tokenized and lemmatized document: 
['wit', 'awar', 'defam']


In [11]:
# Run the preprocessing pipeline over every headline, keeping document order.
processed_docs = [preprocess(doc) for doc in documents['headline_text']]

In [12]:
# One token list is produced per headline, so this should equal the row count of `documents`.
len(processed_docs)

10000

In [13]:
# Inspect the token lists of the first two headlines.
print(processed_docs[:2])

[['decid', 'communiti', 'broadcast', 'licenc'], ['wit', 'awar', 'defam']]


In [14]:
# Bag of words on the data set: build the id <-> token vocabulary from the
# preprocessed headlines.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)
print(dictionary)

Dictionary(6518 unique tokens: ['broadcast', 'communiti', 'decid', 'licenc', 'awar']...)


In [15]:
# Peek at the first 21 (id, token) pairs of the dictionary.
count = 0
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 20:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit
11 aust
12 rise
13 staff
14 strike
15 affect
16 australian
17 travel
18 ambiti
19 jump
20 olsson


In [16]:
# Prune the vocabulary in place using gensim's document-frequency thresholds
# (no_below / no_above) and size cap (keep_n), then show what is left.
dictionary.filter_extremes(
    no_below=15,
    no_above=0.1,
    keep_n=100000,
)
print(dictionary)

Dictionary(723 unique tokens: ['communiti', 'decid', 'wit', 'call', 'protect']...)


In [17]:
# Convert every preprocessed headline into its bag-of-words form, then display
# the entry for the second headline.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[1]

[(2, 1)]

In [18]:
# Spell out the bag-of-words entries of one document as readable text.
document_num = 2
bow_doc_x = bow_corpus[document_num]

for token_id, token_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     token_count))

Word 3 ("call") appears 1 time.
Word 4 ("protect") appears 1 time.
Word 5 ("summit") appears 1 time.


In [19]:
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same
# corpus to obtain the TF-IDF weighted documents.
tfidf = models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [20]:
from pprint import pprint

# Pretty-print only the first TF-IDF document; print nothing if the corpus is empty.
first_tfidf_doc = next(iter(corpus_tfidf), None)
if first_tfidf_doc is not None:
    pprint(first_tfidf_doc)

[(0, 0.6441604901095495), (1, 0.7648903601051755)]


In [21]:
# Train a 10-topic LDA model on the bag-of-words corpus.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=10,
    passes=2,
    workers=2,
)


In [22]:
# List every topic of the bag-of-words model together with its weighted terms.
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

Topic: 0 
Words: 0.024*"hospit" + 0.020*"forc" + 0.017*"troop" + 0.016*"plan" + 0.015*"iraqi" + 0.013*"power" + 0.013*"arrest" + 0.012*"industri" + 0.011*"polic" + 0.010*"face"
Topic: 1 
Words: 0.086*"iraq" + 0.020*"say" + 0.018*"crash" + 0.017*"report" + 0.016*"saddam" + 0.013*"king" + 0.013*"iraqi" + 0.013*"season" + 0.013*"govt" + 0.012*"council"
Topic: 2 
Words: 0.023*"polic" + 0.020*"meet" + 0.014*"play" + 0.014*"critic" + 0.013*"miss" + 0.012*"continu" + 0.012*"injuri" + 0.011*"drug" + 0.010*"action" + 0.009*"consid"
Topic: 3 
Words: 0.041*"baghdad" + 0.041*"kill" + 0.037*"sar" + 0.021*"case" + 0.017*"troop" + 0.014*"australian" + 0.014*"injur" + 0.013*"report" + 0.012*"year" + 0.011*"airport"
Topic: 4 
Words: 0.032*"iraqi" + 0.031*"protest" + 0.024*"anti" + 0.019*"claim" + 0.018*"iraq" + 0.017*"dead" + 0.015*"suicid" + 0.014*"attack" + 0.014*"test" + 0.012*"jail"
Topic: 5 
Words: 0.019*"charg" + 0.019*"start" + 0.018*"water" + 0.018*"polic" + 0.015*"face" + 0.012*"court" + 0.012

In [23]:
# Train a second 10-topic LDA model, this time on the TF-IDF weighted corpus.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=10,
    passes=2,
    workers=4,
)

In [24]:
# List every topic of the TF-IDF model together with its weighted terms.
for topic_id, topic_terms in lda_model_tfidf.print_topics(num_topics=-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))

Topic: 0 Word: 0.012*"charg" + 0.012*"despit" + 0.011*"woman" + 0.009*"land" + 0.009*"death" + 0.008*"govt" + 0.008*"storm" + 0.008*"rais" + 0.008*"week" + 0.008*"fight"
Topic: 1 Word: 0.018*"protest" + 0.017*"anti" + 0.014*"murder" + 0.012*"hop" + 0.012*"baghdad" + 0.011*"report" + 0.009*"king" + 0.009*"critic" + 0.009*"plan" + 0.009*"iraq"
Topic: 2 Word: 0.016*"death" + 0.012*"council" + 0.012*"deni" + 0.012*"hous" + 0.012*"forc" + 0.011*"hold" + 0.011*"polic" + 0.011*"releas" + 0.011*"shark" + 0.011*"track"
Topic: 3 Word: 0.038*"iraq" + 0.017*"warn" + 0.012*"protest" + 0.011*"port" + 0.010*"urg" + 0.010*"prepar" + 0.010*"restrict" + 0.009*"player" + 0.009*"water" + 0.009*"bush"
Topic: 4 Word: 0.017*"boost" + 0.013*"doctor" + 0.011*"studi" + 0.010*"trade" + 0.010*"report" + 0.009*"expect" + 0.009*"island" + 0.009*"damag" + 0.009*"australia" + 0.009*"drop"
Topic: 5 Word: 0.028*"baghdad" + 0.017*"kill" + 0.014*"plan" + 0.012*"face" + 0.011*"tour" + 0.011*"test" + 0.010*"concern" + 0.00

In [25]:
# Text that was not part of the training headlines, used to try out the model.
unseen_document = """It is my understanding that all True-Type fonts in Windows are loaded in
prior to starting Windows - this makes getting into Windows quite slow if you
have hundreds of them as I do.  First off, am I correct in this thinking -
secondly, if that is the case - can you get Windows to ignore them on boot and
maybe make something like a PIF file to load them only when you enter the
applications that need fonts?  Any ideas?"""


# Score the unseen text against the bag-of-words LDA model and report the
# topics from highest to lowest score, showing the top 5 terms of each.
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))


Score: 0.38157644867897034	 Topic: 0.036*"plan" + 0.021*"lead" + 0.019*"support" + 0.019*"iraq" + 0.015*"take"
Score: 0.3157622218132019	 Topic: 0.041*"baghdad" + 0.041*"kill" + 0.037*"sar" + 0.021*"case" + 0.017*"troop"
Score: 0.2248576283454895	 Topic: 0.019*"charg" + 0.019*"start" + 0.018*"water" + 0.018*"polic" + 0.015*"face"
Score: 0.01111773680895567	 Topic: 0.028*"govt" + 0.020*"warn" + 0.018*"minist" + 0.017*"court" + 0.016*"death"
Score: 0.011116727255284786	 Topic: 0.031*"govt" + 0.026*"urg" + 0.026*"charg" + 0.016*"group" + 0.014*"missil"
Score: 0.011115304194390774	 Topic: 0.032*"iraqi" + 0.031*"protest" + 0.024*"anti" + 0.019*"claim" + 0.018*"iraq"
Score: 0.011114493012428284	 Topic: 0.086*"iraq" + 0.020*"say" + 0.018*"crash" + 0.017*"report" + 0.016*"saddam"
Score: 0.011113994754850864	 Topic: 0.024*"hospit" + 0.020*"forc" + 0.017*"troop" + 0.016*"plan" + 0.015*"iraqi"
Score: 0.011113286018371582	 Topic: 0.028*"world" + 0.027*"fund" + 0.025*"say" + 0.020*"open" + 0.015*"e