In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%cd /content/drive/My Drive/ColabData

/content/drive/My Drive/ColabData


In [3]:
import pandas as pd
data = pd.read_csv('abcnews-date-text.csv')

  data = pd.read_csv('abcnews-date-text.csv')


In [4]:
#take a look at the content of the 'headline_text' column
data_text = data[['headline_text']]

In [5]:
data_text

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers
...,...
1041788,who was alphabay founder alexandre cazes
1041789,wolfe brothers berry farming country music in ...
1041790,wollongong refugee ramps up liberian charity work
1041791,women choosing diy ivf sperm inseminsation to ...


In [6]:
#add a column to data_text for the row index
data_text['index'] = data_text.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index'] = data_text.index


In [7]:
len(data_text)

1041793

In [8]:
data_sample = data_text[data_text['index'] == 4310]
data_sample

Unnamed: 0,headline_text,index
4310,rain helps dampen bushfires,4310


In [39]:
#take a look at the content of a document with Index# 4310; data_sample is a dataframe. [0][0] is to extract the value of the first row and the first column
doc_sample = data_sample.values[0][0]

In [40]:
doc_sample

'rain helps dampen bushfires'

In [11]:
#We will perform the following steps:
 #Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
 #Words that have fewer than 3 characters are removed.
 #All stopwords are removed.
 #Words are lemmatized — verbs in third person are changed to first person and verbs in past and future tenses are changed into present.
 #Words are stemmed — words are reduced to their root form.


#import relvant packages for conduct topic modeling analysis
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np


In [17]:
from nltk.stem import*
stemmer = PorterStemmer()

In [12]:
len(gensim.parsing.preprocessing.STOPWORDS)

337

In [13]:
gensim.parsing.preprocessing.STOPWORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [14]:
#a function to perform lemmatize and stem preprocessing steps on the data set. Without contextual information the lemmatiser’s not able to distinguish between nouns and verbs. In this case, "pos = 'v'" means treat the words as verbs.

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [15]:
#remove na values from the column 'headline_text'
data_text = data_text.dropna(subset=['headline_text'])

In [18]:
#select a document to preview after preprocessing
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


In [19]:
#preprocess'headline_text', save the results as 'processed_docs'
processed_docs = data_text['headline_text'].map(preprocess)
processed_docs[:10]

0               [decid, commun, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [20]:
#create a dictionary from ‘processed_docs’ containing the number of times a word appears in the document set
dictionary = gensim.corpora.Dictionary(processed_docs)

In [21]:
#Filter out tokens that appear in
 #less than 15 documents (absolute number) or
 #more than 0.5 documents (fraction of total corpus size, not absolute number).
 #after the above two steps, keep only the first 100000 most frequent tokens.

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [22]:
#For each document we create a dictionary reporting how many
#words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(76, 1), (112, 1), (483, 1), (4005, 1)]

In [23]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], dictionary[bow_doc_4310[i][0]], bow_doc_4310[i][1]))

Word 76 ("bushfir") appears 1 time.
Word 112 ("help") appears 1 time.
Word 483 ("rain") appears 1 time.
Word 4005 ("dampen") appears 1 time.


In [24]:
#run LDA using bag of words
#Because LDA uses randomness in both training and inference steps, setting a random state is very important to control the randomness to make answers repeatable.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2,random_state=2023)

In [25]:
#for each topic, we will explore the words occuring in that topic and its relative weight.

for idx, topic in lda_model.print_topics(-1): # print all the topics. (-1) means all the topics.
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.028*"second" + 0.022*"hour" + 0.018*"adelaid" + 0.018*"trump" + 0.018*"canberra" + 0.013*"australia" + 0.012*"work" + 0.012*"weather" + 0.011*"lose" + 0.010*"brisban"
Topic: 1 
Words: 0.022*"crash" + 0.020*"final" + 0.018*"die" + 0.017*"tasmanian" + 0.014*"world" + 0.013*"leav" + 0.013*"women" + 0.012*"beat" + 0.012*"life" + 0.012*"victoria"
Topic: 2 
Words: 0.055*"say" + 0.019*"report" + 0.015*"kill" + 0.015*"attack" + 0.012*"hous" + 0.011*"minist" + 0.011*"releas" + 0.010*"prison" + 0.009*"australian" + 0.009*"northern"
Topic: 3 
Words: 0.039*"australia" + 0.030*"rural" + 0.026*"queensland" + 0.026*"south" + 0.019*"china" + 0.019*"north" + 0.016*"west" + 0.012*"miss" + 0.012*"flood" + 0.012*"talk"
Topic: 4 
Words: 0.053*"polic" + 0.027*"charg" + 0.019*"sydney" + 0.018*"death" + 0.018*"perth" + 0.017*"murder" + 0.016*"jail" + 0.016*"woman" + 0.015*"drug" + 0.013*"arrest"
Topic: 5 
Words: 0.025*"countri" + 0.015*"health" + 0.014*"council" + 0.013*"hospit" + 0.013*"sc

In [26]:
lda_model[bow_corpus[4310]]

[(0, 0.020004794),
 (1, 0.020004192),
 (2, 0.020004192),
 (3, 0.5379566),
 (4, 0.020005325),
 (5, 0.3020047),
 (6, 0.020005483),
 (7, 0.020004192),
 (8, 0.020006312),
 (9, 0.020004217)]

In [27]:
#check the topic distribution for the Document# 4310.
#"tup:-1tup[1]" is a parameter for the key lambda for the sorted function. It means sorting by the reverse order of the topic socre.
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10))) #for each topic, print the top 10 words in that topic.


Score: 0.5379974246025085	 
Topic: 0.039*"australia" + 0.030*"rural" + 0.026*"queensland" + 0.026*"south" + 0.019*"china" + 0.019*"north" + 0.016*"west" + 0.012*"miss" + 0.012*"flood" + 0.012*"talk"

Score: 0.3019638657569885	 
Topic: 0.025*"countri" + 0.015*"health" + 0.014*"council" + 0.013*"hospit" + 0.013*"school" + 0.012*"fund" + 0.012*"servic" + 0.012*"chang" + 0.011*"trial" + 0.011*"worker"

Score: 0.020006312057375908	 
Topic: 0.038*"govern" + 0.029*"interview" + 0.023*"home" + 0.022*"news" + 0.018*"tasmania" + 0.016*"island" + 0.015*"leagu" + 0.012*"bank" + 0.011*"darwin" + 0.010*"john"

Score: 0.02000548131763935	 
Topic: 0.025*"market" + 0.025*"nation" + 0.025*"australian" + 0.020*"busi" + 0.018*"live" + 0.015*"share" + 0.014*"elect" + 0.014*"open" + 0.013*"return" + 0.012*"park"

Score: 0.020005322992801666	 
Topic: 0.053*"polic" + 0.027*"charg" + 0.019*"sydney" + 0.018*"death" + 0.018*"perth" + 0.017*"murder" + 0.016*"jail" + 0.016*"woman" + 0.015*"drug" + 0.013*"arrest"


In [28]:
#check the topic coverage for a new document (not in the corpus)

unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5))) #for each topic, print the top 5 words in that topic.

Score: 0.2960953116416931	 Topic: 0.039*"australia" + 0.030*"rural" + 0.026*"queensland" + 0.026*"south" + 0.019*"china"
Score: 0.22962439060211182	 Topic: 0.055*"say" + 0.019*"report" + 0.015*"kill" + 0.015*"attack" + 0.012*"hous"
Score: 0.19090256094932556	 Topic: 0.038*"govern" + 0.029*"interview" + 0.023*"home" + 0.022*"news" + 0.018*"tasmania"
Score: 0.18329013884067535	 Topic: 0.053*"polic" + 0.027*"charg" + 0.019*"sydney" + 0.018*"death" + 0.018*"perth"
Score: 0.016681775450706482	 Topic: 0.028*"second" + 0.022*"hour" + 0.018*"adelaid" + 0.018*"trump" + 0.018*"canberra"
Score: 0.016681678593158722	 Topic: 0.025*"market" + 0.025*"nation" + 0.025*"australian" + 0.020*"busi" + 0.018*"live"
Score: 0.01668151654303074	 Topic: 0.025*"countri" + 0.015*"health" + 0.014*"council" + 0.013*"hospit" + 0.013*"school"
Score: 0.01668146438896656	 Topic: 0.022*"melbourn" + 0.018*"plan" + 0.015*"water" + 0.013*"coast" + 0.013*"concern"
Score: 0.01668059080839157	 Topic: 0.039*"court" + 0.027*"fa