In [None]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import os
import re
from matplotlib import colors

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True) #add flag force_remount = True if needed

In [None]:
!cp -rf "/content/drive/MyDrive/LDA/News.zip" .
!cp -rf "/content/drive/MyDrive/LDA/Headline.zip" .
!unzip -q News.zip
!unzip -q Headline.zip

In [None]:
#df = pd.read_json('News_Category_Dataset_v3.json', orient = 'records', lines = True)
#df[['headline', 'category', 'short_description']].head()
df = pd.read_csv('abcnews-date-text.csv')
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

stopwordset = set(stopwords.words())
# for named entity recognition (NER)
from nltk import ne_chunk

In [None]:
def clean_text(headline):
  le=WordNetLemmatizer()
  word_tokens=word_tokenize(headline)
  tokens=[le.lemmatize(w) for w in word_tokens if w not in stopwordset]
  cleaned_text=" ".join(tokens)
  return cleaned_text

In [None]:
df['head_clean']=df['headline_text'].apply(clean_text)
#df['desc_clean']=df['short_description'].apply(clean_text)
#df = df.drop(['authors', 'date'], axis = 1)

In [None]:
df.sample(10)

Unnamed: 0,publish_date,headline_text,head_clean
50467,20031022,solomons pm stranded en route to meeting with ...,solomon pm stranded route meeting bush
966445,20150721,robert xie met undercover officer in custody c...,robert xie undercover officer custody court
120206,20041008,sub saharan child mortality worse than 1994,saharan child mortality worse 1994
638617,20110912,rothwell migration act amendments,rothwell migration act amendment
292591,20070319,jolies adoption not fast tracked,jolies adoption fast tracked
534321,20100428,dhoni says world t20 wide open,dhoni world t20 wide open
1130698,20180614,does the world cup make countries more aggress...,world cup make country aggressive football
661970,20111223,cross border war over weed,cross border weed
1132750,20180701,child care subsidy changes what you need to know,child subsidy change
424265,20081115,former premier files report on labor nt election,former premier file report labor nt election


In [None]:
df.shape


In [None]:
#df.drop(['headline', 'short_description'], axis=1, inplace=True)
#df['clean_combined'] = df['head_clean'] + ' ' + df['desc_clean']
df.drop(['headline_text'], axis=1, inplace=True)
df.head()

Unnamed: 0,publish_date,head_clean
0,20030219,aba decides community broadcasting licence
1,20030219,act fire witness aware defamation
2,20030219,g call infrastructure protection summit
3,20030219,air nz staff aust strike pay rise
4,20030219,air nz strike affect australian traveller


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
vect = TfidfVectorizer(stop_words=list(stopwordset), max_features=5000)

#vect_text=vect.fit_transform(df['clean_combined'])
vect_text=vect.fit_transform(df['head_clean'])
idf = vect.idf_

In [None]:
dd=dict(zip(vect.get_feature_names_out(), idf))
l=sorted(dd, key=(dd).get)
# print(l)
print(l[0],l[-1])
print(dd['trump'])
print(dd['beer'])  # police is most common and underworld is least common among the news headlines.

police underworld
6.358598448961041
8.643750587985473


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

lda_model=LatentDirichletAllocation(n_components=10,learning_method='online',random_state=42,max_iter=10) 
lda_top=lda_model.fit_transform(vect_text)
print(lda_top.shape) 
print(lda_top)

(1244184, 10)
[[0.0415823  0.0415823  0.0415823  ... 0.0415823  0.0415823  0.62575927]
 [0.03140408 0.03140408 0.03140408 ... 0.03140408 0.62191844 0.03140408]
 [0.03362727 0.03362474 0.03362474 ... 0.03362474 0.03363289 0.21721026]
 ...
 [0.03363973 0.03363973 0.03363973 ... 0.16718878 0.03363973 0.35369275]
 [0.24114387 0.03373106 0.03373106 ... 0.03373106 0.03373106 0.03373106]
 [0.29551169 0.03107981 0.15337297 ... 0.03107981 0.03107981 0.16075523]]


In [None]:
print("Document 2: ")
for i, prob in enumerate(lda_top[2]):
  print("Topic ",i,": ", prob)

Document 2: 
Topic  0 :  0.03362726973483273
Topic  1 :  0.033624744107486025
Topic  2 :  0.03362474410733749
Topic  3 :  0.033624744107327055
Topic  4 :  0.033624744107382684
Topic  5 :  0.5137811196804476
Topic  6 :  0.03362474410752134
Topic  7 :  0.033624744107230924
Topic  8 :  0.033632885933390415
Topic  9 :  0.2172102600070437


In [None]:
vocab = vect.get_feature_names_out()

for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:20]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
woman covid world court 19 change country house charge life cup australia labor north funding talk afl murder guilty drum 

Topic 1: 
australian queensland home show worker former student green rate abuse png river bank vote star support town morrison pacific begin 

Topic 2: 
case child adelaide trial donald dead make sex federal protest victorian assault review arrested darwin season port drought andrew perth 

Topic 3: 
coronavirus trump coast found tasmania test final accused gold return tasmanian bushfire search leader inquiry court fight body appeal commission 

Topic 4: 
police interview death hit record time west ban driver find group royal tax station run officer fatal trade aussie art 

Topic 5: 

Topic 6: 
plan china victoria national open farmer killed work industry farm fall park jailed council climate local regional young vaccine hill 

Topic 7: 
year wa south win news attack family canberra health business minister nt budget live fear top centre resident speaks

In [None]:
results_sample = df.sample(20)
topics = []

for idx in results_sample.index:
  topicstr = ''
  for i, prob in enumerate(lda_top[idx]):
    if prob >= 0.2:
      topicstr += ('Topic ' + str(i) + ": " + str(prob)[:5] + " ")
  
  topics.append(topicstr)

results_sample['topics'] = topics
results_sample

Unnamed: 0,publish_date,head_clean,topics
498019,20091031,lift ban hiv positive traveller,
530986,20100412,former olympian head commonwealth game bid,Topic 1: 0.260 Topic 5: 0.244
575371,20101112,church sex abuse victim urged militant,Topic 0: 0.265
555448,20100808,eagle fill perry void kite,Topic 0: 0.267 Topic 8: 0.253 Topic 9: 0.221
745419,20121202,dead hit run,Topic 2: 0.248 Topic 4: 0.457
600202,20110311,ambulance attacked alice spring,Topic 4: 0.531 Topic 9: 0.201
585029,20101230,top rainfall expected ease,Topic 7: 0.540
1196639,20200425,former cop hunting cannabis army veteran ptsd,Topic 1: 0.245 Topic 4: 0.271
992778,20151114,grandstand weekender november 14,Topic 1: 0.454 Topic 9: 0.252
255836,20060815,china condemns japanese pm visit shrine,Topic 3: 0.302 Topic 6: 0.310
