## Load required libraries

In [1]:
# !pip install bertopic
# import numpy as np

In [13]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import nltk
from bertopic import BERTopic
import torch
import re
import altair as alt
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')
# from sklearn.datasets import fetch_20newsgroups

## Required Utilities 

### Pre-processing Utilities 

In [14]:
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
# nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', \
                   'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need',  \
                   'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    lst_text = text.split()
    ## choose only alphbetical words and filter words less than 3 chars
    lst_text = [token.lower() for token in lst_text if token.isalpha() and len(token) >=3]      
#     text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
#     lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
    
    lst_text = [token for token in lst_text if len(token) >=3]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

def lemmatization(texts, allowed_postags=allowed_postags):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return(texts_out)

## Load Data

In [15]:
# Read articles for 2004 for test purpose
# basepath = '/mnt/d/Amit/data-science/MADS/SIADS696/'
# filepath = 'datasets/raw_data/'
# filepath = 'scrapy/missing_articles/missing_articles/'
basepath = ''
filepath = ''
filename = 'articles_2004.jl.gz'
filepath_name = basepath + filepath + filename
filepath_name

'articles_2004.jl.gz'

In [53]:
articles_df = pd.read_json(filepath_name, lines=True)
articles_df = articles_df.dropna()
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53394 entries, 0 to 54985
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        53394 non-null  int64         
 1   date      53394 non-null  datetime64[ns]
 2   url       53394 non-null  object        
 3   title     53394 non-null  object        
 4   category  53394 non-null  object        
 5   article   53394 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 2.9+ MB


### Pre-processing Text

In [54]:
%%time
articles_df['clean_text'] = articles_df['article'].apply(lambda text : 
                        utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=stop_words))

articles_df.head()

CPU times: user 40.3 s, sys: 52 ms, total: 40.3 s
Wall time: 40.4 s


Unnamed: 0,id,date,url,title,category,article,clean_text
0,1,2004-01-01,https://timesofindia.indiatimes.com/business/i...,economy-breaks-8-pc-barrier,"[business, india-business]",NEW DELHI: The feel-good factor got a boost on...,new factor got boost last day indian economy b...
1,2,2004-01-01,https://timesofindia.indiatimes.com/world/paki...,jaish-leader-missing-after-attacks,"[world, pakistan]",ISLAMABAD: Some activists of the banned milita...,activist banned militant outfit arrested conne...
2,3,2004-01-01,https://timesofindia.indiatimes.com/world/us/1...,1m-for-jackson-interview-to-cbs,"[world, us]","&lt;div class=""section1""&gt;&lt;div class=""Nor...",michael jackson struck deal cbs paid effect ad...
3,4,2004-01-01,https://timesofindia.indiatimes.com/india/sars...,sars-screening-takes-off-at-igi,"[timesofindia.indiatimes.com, india]",NEW DELHI: With the SARS virus raising its hea...,new sars virus raising head airport authority ...
4,5,2004-01-01,https://timesofindia.indiatimes.com/india/indi...,india-to-test-fire-agni-iii-soon,"[timesofindia.indiatimes.com, india]",NEW DELHI: The nuclear-capable â€˜Agni-IIIâ€™ ...,new capable hitting strategic target deep insi...


### Run process for whole 2004 articles

### Create Model

In [22]:
# from bertopic import BERTopic

In [55]:
# choosing number of articles to process for stub testing
# n = len(articles_df)
n = 20000
sample_df = articles_df.sample(n).reset_index(drop=True)
texts = list(sample_df['clean_text'].values)

In [56]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53394 entries, 0 to 54985
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          53394 non-null  int64         
 1   date        53394 non-null  datetime64[ns]
 2   url         53394 non-null  object        
 3   title       53394 non-null  object        
 4   category    53394 non-null  object        
 5   article     53394 non-null  object        
 6   clean_text  53394 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 3.3+ MB


In [57]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [58]:
%%time
topic_model = BERTopic(language='english', calculate_probabilities=True,
                           verbose=True)
topics, probs = topic_model.fit_transform(texts)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

2022-10-01 18:29:50,954 - BERTopic - Transformed documents to Embeddings
2022-10-01 18:30:11,962 - BERTopic - Reduced dimensionality
2022-10-01 18:30:58,090 - BERTopic - Clustered reduced embeddings


CPU times: user 20min 26s, sys: 5min 54s, total: 26min 21s
Wall time: 7min 31s


### Get Document - Topic(dominant) mapping

In [59]:
def dom_top_prob(doc_indx, topic_id):
    prob = topic_model.probabilities_[doc_indx][topic_id]
    return(prob)

In [44]:
# sample_df.head()

In [60]:
sample_df['topic_id'] = topic_model.topics_
sample_df['topic_prob'] = sample_df.apply(lambda x : dom_top_prob(x.name, x.topic_id), axis=1)
sample_df.head()

Unnamed: 0,id,date,url,title,category,article,clean_text,topic_id,topic_prob
0,6759,2004-02-05,https://timesofindia.indiatimes.com/india/indi...,india-hails-portugal-courts-decision,"[timesofindia.indiatimes.com, india]","NEW DELHI: Hailing as ""positive"" a Portuguese ...",new hailing portuguese court ruling extraditio...,-1,0.001593
1,6574,2004-02-04,https://timesofindia.indiatimes.com/city/chand...,chandigarh-to-host-first-health-mela-from-feb-8,"[city, chandigarh]",CHANDIGARH: With an aim to generate awareness ...,aim generate awareness various first ever heat...,1,1.0
2,49367,2004-11-06,https://timesofindia.indiatimes.com/india/3-bl...,3-blasts-in-hyderabad-cops-still-clueless,"[timesofindia.indiatimes.com, india]",HYDERABAD: Three blasts in quick succession in...,three blast quick succession recent one anothe...,161,0.186546
3,11643,2004-03-02,https://timesofindia.indiatimes.com/city/kolka...,hush-up-charge-hounds-hospital,"[city, kolkata]",KOLKATA: Tapan Bera required immediate medical...,tapan bera required immediate medical got bed ...,-1,0.004343
4,20880,2004-04-22,https://timesofindia.indiatimes.com/city/hyder...,snapping-cable-tv-line-brings-voters-to-booth,"[city, hyderabad]","&lt;div class=""section1""&gt;&lt;div class=""Nor...",polling finding name list biggest trouble cont...,0,0.0335


### Get Dominant Topic - Doc mapping

In [61]:
num_topics = len(topic_model.get_topic_info())-1
topic_list = list(range(num_topics))
top_term_list = []
rep_docs_list = []
for topic_id in topic_list:    
    top_term_prob_lst = topic_model.get_topic(topic=topic_id)
    top_terms = [term for term,prob in top_term_prob_lst]
    top_term_list.append(top_terms)
    doc = topic_model.get_representative_docs(topic_id)[0]
    rep_docs_list.append(doc)
    

dom_topic_doc_df = pd.DataFrame({'topic_id': topic_list, 'top_terms':top_term_list, 'rep_docs': rep_docs_list})
dom_topic_doc_df.head()

Unnamed: 0,topic_id,top_terms,rep_docs
0,0,"[polling, voter, booth, election, voting, elec...",new jammu kashmir migrant given another thumb ...
1,1,"[patient, hospital, medical, doctor, health, h...",birth healthy siamese twin indapur pune distri...
2,2,"[power, electricity, supply, consumer, energy,...",new government strike balance political concer...
3,3,"[oil, gas, price, crude, petrol, petroleum, di...",new left anger oil lpg price hike heightened i...
4,4,"[cricket, var, match, team, bowler, sport, pla...",dinesh mongia struck fine india beat england t...


### Topic Distribution over documents

In [62]:
freq = topic_model.get_topic_info()
freq.head()

Unnamed: 0,Topic,Count,Name
0,-1,9536,-1_said_party_government_police
1,0,295,0_polling_voter_booth_election
2,1,260,1_patient_hospital_medical_doctor
3,2,254,2_power_electricity_supply_consumer
4,3,217,3_oil_gas_price_crude


In [63]:
base_chart = alt.Chart(freq.iloc[:30, ])
bar = base_chart.mark_bar().encode(
        x=alt.X('Name:N', axis=alt.Axis(title='Topic ID')),
        y=alt.Y('Count:Q', axis=alt.Axis(title='Number of Articles')))

(bar
).properties(width=500, height=300, title='Topics vs Number of Article Distribution'
).configure_title(fontSize=25
).configure_axis(grid=False, domain=False, 
                 labelFontSize=15,titleFontSize=20)

## Visualize Topic Distribution

In [64]:
hierarchical_topics = topic_model.hierarchical_topics(texts)

100%|██████████| 244/244 [00:01<00:00, 189.97it/s]


## Visualize Topic Distribution

In [65]:
topic_model.visualize_topics(top_n_topics=10)

In [66]:
topic_model.visualize_topics()

In [67]:
# sample_df['date'] = pd.to_datetime(sample_df['date'])
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          20000 non-null  int64         
 1   date        20000 non-null  datetime64[ns]
 2   url         20000 non-null  object        
 3   title       20000 non-null  object        
 4   category    20000 non-null  object        
 5   article     20000 non-null  object        
 6   clean_text  20000 non-null  object        
 7   topic_id    20000 non-null  int64         
 8   topic_prob  20000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 1.4+ MB


In [68]:
# timestamps = sample_df.date.values
topics_over_time_df = topic_model.topics_over_time(texts, sample_df.date.values)
# topics_over_time_grp = topics_over_time_df.groupby(['Timestamp', 'Topic']).agg('sum').reset_index()
# topics_over_time_grp = topics_over_time_grp.set_index('Timestamp')
# topics_over_time_grp

365it [00:45,  7.97it/s]


Unnamed: 0_level_0,Topic,Frequency
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-01-01,-1,11
2004-01-01,0,1
2004-01-01,1,2
2004-01-01,3,1
2004-01-01,4,1
...,...,...
2004-12-31,108,1
2004-12-31,112,1
2004-12-31,115,1
2004-12-31,151,1


In [70]:
# topics_over_time_grp.plot()

In [71]:
topic_model.visualize_topics_over_time(topics_over_time_df)

In [72]:
topic_model.visualize_distribution(probs[0], min_probability=0.005)

In [None]:
# tree = topic_model.get_topic_tree(hierarchical_topics)
# print(tree)

In [73]:
topic_model.visualize_hierarchy(top_n_topics=30)

In [74]:
topic_model.visualize_barchart(top_n_topics=30)

In [9]:
topic_model.visualize_heatmap(n_clusters=10, width=800, height=800)

### Topic Search by Keywords

In [54]:
# search topics
similar_topics, similarity = topic_model.find_topics('motor', top_n=5)
topic_model.get_topic(similar_topics[0])

[('car', 0.04541835921508529),
 ('motor', 0.02321142640111458),
 ('sale', 0.02054813993038629),
 ('vehicle', 0.018276088171410576),
 ('auto', 0.01774435411074634),
 ('unit', 0.01603674715456513),
 ('maruti', 0.0153669638648084),
 ('company', 0.013096985967558837),
 ('ford', 0.012939116536774502),
 ('motorcycle', 0.012742715102418487)]

In [55]:
similar_topics, similarity = topic_model.find_topics('corruption', top_n=5)
topic_model.get_topic(similar_topics[0])

[('arrested', 0.036307485926661),
 ('bank', 0.025440271197026982),
 ('allegedly', 0.024366256501534993),
 ('accused', 0.024227593101344046),
 ('shanbag', 0.023472899269771606),
 ('offence', 0.021848662902442917),
 ('loan', 0.02179601334013547),
 ('account', 0.021490049579557493),
 ('cheque', 0.02113153133928538),
 ('dinesh', 0.020837052848201334)]

In [56]:
similar_topics, similarity = topic_model.find_topics('scam', top_n=5)
topic_model.get_topic(similar_topics[0])

[('telgi', 0.08640443590695918),
 ('stamp', 0.08122845068586275),
 ('fake', 0.04349985387039188),
 ('scam', 0.03346040761696168),
 ('karim', 0.02956596909633233),
 ('abdul', 0.025648184707635524),
 ('cbi', 0.025208622782878668),
 ('paper', 0.023499662523173585),
 ('sit', 0.022483116352722415),
 ('arrested', 0.016749128778346525)]

In [57]:
similar_topics, similarity = topic_model.find_topics('gandhi', top_n=5)
topic_model.get_topic(similar_topics[0])

[('gandhi', 0.0349715321983996),
 ('rahul', 0.02662917185366613),
 ('sonia', 0.02630658540089818),
 ('priyanka', 0.017315526700670658),
 ('varun', 0.015407617665147148),
 ('rajiv', 0.01167420208233158),
 ('amethi', 0.01081046743162488),
 ('mother', 0.010426863917141737),
 ('citizenship', 0.01011494127382778),
 ('indian', 0.009727251969114573)]

### Save the topic model for future use

In [58]:
# torch.save(topic_model, '/home/amitjha/bertopic_model_20k')
# sample_df.to_csv('articles_2004_topic_20k_df.csv')

In [40]:
# load the saved model
topic_model = torch.load('/home/amitjha/bertopic_model_20k')
sample_df = pd.read_csv('articles_2004_topic_20k_df.csv')

In [5]:
similar_topics, similarity = topic_model.find_topics('corruption', top_n=5)
topic_model.get_topic(similar_topics[0])

[('arrested', 0.036307485926661),
 ('bank', 0.025440271197026982),
 ('allegedly', 0.024366256501534993),
 ('accused', 0.024227593101344046),
 ('shanbag', 0.023472899269771606),
 ('offence', 0.021848662902442917),
 ('loan', 0.02179601334013547),
 ('account', 0.021490049579557493),
 ('cheque', 0.02113153133928538),
 ('dinesh', 0.020837052848201334)]

In [66]:
# sample_df.info()