In [2]:
import os
import pandas as pd
import nltk

from nltk.stem import *

#import the packages needed to conduct topic modeling analysis
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [3]:
# Directory that holds the assignment data, located under the user's HOME directory.
pwd: str = f"{os.environ['HOME']}/work/assignment/assignment-8"

In [4]:
# Load the headlines CSV from the working directory into a DataFrame.
data = pd.read_csv(f'{pwd}/abcnews-date-text.csv')

In [5]:
# Keep only the 'headline_text' column. The double brackets select a list of
# columns, so the result is a one-column DataFrame rather than a Series.
data_text = data[['headline_text']]

In [6]:
data_text

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers
...,...
271877,tractor towing may have sparked blaze
271878,truck crash hampers pacific highway traffic
271879,turning pitch gives australia hope clarke
271880,two die in weekend accidents


In [7]:
# Add an 'index' column holding each row's index label, so that a document
# can later be looked up by that value.
# assign() returns a new frame instead of writing a column into the frame that
# was sliced out of `data`; that in-place write is what raised pandas'
# SettingWithCopyWarning. Re-running this cell gives the same result.
data_text = data_text.assign(index=data_text.index)
documents = data_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [33]:
# Report the frame's dimensions, then preview its first five rows.
print(documents.shape, documents.head(), sep='\n')

(271882, 2)
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


In [9]:
# Fetch the WordNet corpus through NLTK's downloader; the notebook later lemmatizes with WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
type(documents)

pandas.core.frame.DataFrame

In [11]:
# Pull the headline text of the document whose 'index' value is 4310.
doc_sample = documents.loc[documents['index'] == 4310, 'headline_text'].iloc[0]

In [12]:
doc_sample

'rain helps dampen bushfires'

In [13]:
documents[documents['index'] == 4310]

Unnamed: 0,headline_text,index
4310,rain helps dampen bushfires,4310


In [14]:
#We will perform the following steps:
 #Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
 #Words that have 3 or fewer characters are removed (only tokens longer than 3 characters are kept).
 #All stopwords are removed.
 #Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
 #Words are stemmed — words are reduced to their root form.

In [15]:
# Single Porter stemmer instance; lemmatize_stemming() calls its stem() method.
stemmer = PorterStemmer()

In [16]:
#Helper functions that lemmatize, stem and pre-process the data set. You may look up each Python method to learn what its parameters mean.

def lemmatize_stemming(text):
    """Return the stem of the verb lemma of a single token.

    The token is first lemmatized as a verb (pos='v') with WordNetLemmatizer;
    the resulting lemma is then stemmed with the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return its filtered, lemmatized-and-stemmed tokens.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only when
    it is longer than three characters and is not in gensim's STOPWORDS; every
    kept token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

In [17]:
# processed_docs[:10]

In [18]:
# Discard rows whose 'headline_text' value is missing.
documents = documents[documents['headline_text'].notna()]

In [19]:
# Preview the sample document before and after pre-processing.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


In [20]:
# Run preprocess() over every headline and keep the token lists as 'processed_docs';
# then show the first ten results.
processed_docs = documents['headline_text'].map(preprocess)
processed_docs.head(10)

0               [decid, commun, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [21]:
# Build a gensim Dictionary from the tokenized headlines in 'processed_docs'.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)

In [22]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f3966f83810>

In [23]:
# Prune the dictionary in place. Drop tokens that appear in
#  - fewer than 15 documents (no_below is an absolute count), or
#  - more than half of all documents (no_above is a fraction of the corpus, not a count).
# Of the tokens that remain, keep at most the 100000 most frequent ones (keep_n).

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [24]:
# Convert every tokenized headline into its bag-of-words form: a list of
# (token id, count) pairs. Then show the entry for the sample document.
# Note: bow_corpus is a plain list, so 4310 here is a position, not a row label.

bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[4310]

[(71, 1), (107, 1), (460, 1), (3490, 1)]

In [25]:
# List every token id of the sample document together with its word and count.
bow_doc_4310 = bow_corpus[4310]
for token_id, count in bow_doc_4310:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], count))

Word 71 ("bushfir") appears 1 time.
Word 107 ("help") appears 1 time.
Word 460 ("rain") appears 1 time.
Word 3490 ("dampen") appears 1 time.


In [26]:
# Train an LDA topic model on the bag-of-words corpus:
# 10 topics, 2 passes over the corpus, 2 worker processes.
# NOTE(review): no random_state is passed here; the only seed visible in this
# notebook is np.random.seed(2018) — confirm that is enough for repeatable topics.

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [27]:
# Print every topic together with its top words and their relative weights.

for topic_id, terms in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, terms))

Topic: 0 
Words: 0.020*"council" + 0.020*"hospit" + 0.018*"concern" + 0.017*"opposit" + 0.016*"plan" + 0.015*"work" + 0.013*"govt" + 0.012*"union" + 0.012*"sale" + 0.012*"worker"
Topic: 1 
Words: 0.069*"polic" + 0.030*"crash" + 0.028*"death" + 0.023*"investig" + 0.023*"jail" + 0.016*"probe" + 0.014*"die" + 0.014*"driver" + 0.013*"victim" + 0.012*"inquiri"
Topic: 2 
Words: 0.040*"warn" + 0.025*"miss" + 0.016*"fear" + 0.016*"search" + 0.014*"israel" + 0.013*"hop" + 0.011*"high" + 0.011*"issu" + 0.011*"cancer" + 0.010*"hit"
Topic: 3 
Words: 0.026*"water" + 0.015*"doctor" + 0.015*"strike" + 0.014*"expect" + 0.014*"liber" + 0.014*"close" + 0.012*"lebanon" + 0.012*"nation" + 0.012*"west" + 0.011*"rain"
Topic: 4 
Words: 0.042*"kill" + 0.023*"reject" + 0.023*"push" + 0.017*"farmer" + 0.017*"drought" + 0.014*"look" + 0.013*"blaze" + 0.011*"dead" + 0.011*"fuel" + 0.011*"injur"
Topic: 5 
Words: 0.037*"govt" + 0.035*"urg" + 0.030*"plan" + 0.023*"fund" + 0.018*"council" + 0.016*"group" + 0.015*"ris

In [28]:
lda_model[bow_corpus[4310]]

[(0, 0.020005198),
 (1, 0.28801462),
 (2, 0.020003535),
 (3, 0.55195034),
 (4, 0.020003535),
 (5, 0.02000861),
 (6, 0.020003535),
 (7, 0.02000354),
 (8, 0.020003535),
 (9, 0.020003535)]

In [29]:
processed_docs[4310]

['rain', 'help', 'dampen', 'bushfir']

In [30]:
# Show the topic mixture of document 4310, highest score first, with each topic's top 10 words.
for topic_id, weight in sorted(lda_model[bow_corpus[4310]], key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(weight, lda_model.print_topic(topic_id, 10)))


Score: 0.5517880916595459	 
Topic: 0.026*"water" + 0.015*"doctor" + 0.015*"strike" + 0.014*"expect" + 0.014*"liber" + 0.014*"close" + 0.012*"lebanon" + 0.012*"nation" + 0.012*"west" + 0.011*"rain"

Score: 0.2881767153739929	 
Topic: 0.069*"polic" + 0.030*"crash" + 0.028*"death" + 0.023*"investig" + 0.023*"jail" + 0.016*"probe" + 0.014*"die" + 0.014*"driver" + 0.013*"victim" + 0.012*"inquiri"

Score: 0.02000872604548931	 
Topic: 0.037*"govt" + 0.035*"urg" + 0.030*"plan" + 0.023*"fund" + 0.018*"council" + 0.016*"group" + 0.015*"rise" + 0.014*"health" + 0.013*"boost" + 0.012*"seek"

Score: 0.020005198195576668	 
Topic: 0.020*"council" + 0.020*"hospit" + 0.018*"concern" + 0.017*"opposit" + 0.016*"plan" + 0.015*"work" + 0.013*"govt" + 0.012*"union" + 0.012*"sale" + 0.012*"worker"

Score: 0.020003542304039	 
Topic: 0.040*"charg" + 0.035*"face" + 0.034*"court" + 0.023*"accus" + 0.021*"drug" + 0.014*"trial" + 0.014*"murder" + 0.013*"attack" + 0.013*"terror" + 0.013*"arrest"

Score: 0.02000353

In [31]:
# Score a headline that is not part of the corpus against the trained topics,
# highest score first, showing each topic's top 5 words.

unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for topic_id, weight in sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True):
    print("Score: {}\t Topic: {}".format(weight, lda_model.print_topic(topic_id, 5)))

Score: 0.3369967043399811	 Topic: 0.037*"govt" + 0.035*"urg" + 0.030*"plan" + 0.023*"fund" + 0.018*"council"
Score: 0.1965998113155365	 Topic: 0.021*"closer" + 0.019*"elect" + 0.015*"market" + 0.014*"nuclear" + 0.013*"chief"
Score: 0.18326738476753235	 Topic: 0.069*"polic" + 0.030*"crash" + 0.028*"death" + 0.023*"investig" + 0.023*"jail"
Score: 0.18303442001342773	 Topic: 0.040*"charg" + 0.035*"face" + 0.034*"court" + 0.023*"accus" + 0.021*"drug"
Score: 0.01668868213891983	 Topic: 0.018*"talk" + 0.018*"defend" + 0.016*"hold" + 0.016*"iraq" + 0.015*"break"
Score: 0.016683712601661682	 Topic: 0.022*"test" + 0.018*"protest" + 0.018*"open" + 0.018*"world" + 0.014*"australia"
Score: 0.016683179885149002	 Topic: 0.020*"council" + 0.020*"hospit" + 0.018*"concern" + 0.017*"opposit" + 0.016*"plan"
Score: 0.016682641580700874	 Topic: 0.040*"warn" + 0.025*"miss" + 0.016*"fear" + 0.016*"search" + 0.014*"israel"
Score: 0.01668216660618782	 Topic: 0.042*"kill" + 0.023*"reject" + 0.023*"push" + 0.017