## Load required libraries

In [53]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from collections import Counter
import nltk
from bertopic import BERTopic
import torch
import re
import altair as alt
from nltk.corpus import stopwords
import os
from IPython.display import display, HTML
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import warnings
warnings.filterwarnings('ignore')
seed=696

#### Define Paths to be Used

In [54]:
# --- Input data ---------------------------------------------------------
# Directory that holds the raw article files, relative to the notebook.
basepath, filepath = '../', 'datasets/'
path = f'{basepath}{filepath}'

# --- Saved artefacts ----------------------------------------------------
# Folder plus file names of the sampled training frame and the fitted model.
model_path = '../model_extract/ModelTrainedOn3pcData/'
train_file, model = 'train_df_3pc.csv', 'bertopic_model_3pc'

### Choose Run Type - enter `Y` to load the saved model; otherwise the model is retrained

In [55]:
# Run-type switch: 'Y' loads the saved model and training frame, any other
# answer retrains from scratch. The answer is stripped and upper-cased so that
# 'y' or ' Y ' no longer trigger an accidental (and expensive) retraining.
model_trained = input("Use the saved model? [Y = load, anything else = retrain]: ").strip().upper()

Y


## Required Utilities 

### Text Pre-processing Utilities 

In [56]:
# Part-of-speech tags kept by the spaCy-based `lemmatization` helper.
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
# nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# NLTK's English stop words followed by extra words this notebook treats as noise.
stop_words = stopwords.words('english') + [
    'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_',
    'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try',
    'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily',
    'lot', 'lack', 'make', 'want', 'seem', 'run', 'need',
    'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come',
]

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Turn one raw document into a space-separated string of cleaned tokens.

    Pipeline, in order:
      1. split on whitespace;
      2. keep tokens that are purely alphabetic and at least 3 characters long,
         lower-cased;
      3. drop tokens found in ``lst_stopwords`` (if given);
      4. optionally Porter-stem, then optionally WordNet-lemmatise;
      5. drop tokens that ended up shorter than 3 characters.

    NOTE(review): because of ``str.isalpha`` in step 2, any token that touches
    punctuation or digits (e.g. ``"DELHI:"``, ``"high-level"``) is discarded
    entirely rather than stripped. This is kept as is so that the text stays
    consistent with whatever an already-trained model saw.

    Parameters
    ----------
    text : str
        Raw document text.
    flg_stemm : bool, default False
        Apply ``nltk`` Porter stemming.
    flg_lemm : bool, default True
        Apply ``nltk`` WordNet lemmatisation.
    lst_stopwords : iterable of str or None
        Words to remove; ``None`` disables stop-word removal.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" when nothing survives).
    """
    tokens = [tok.lower() for tok in text.split() if tok.isalpha() and len(tok) >= 3]

    if lst_stopwords is not None:
        # A set gives constant-time membership tests instead of scanning the
        # whole stop-word list once per token.
        blocked = set(lst_stopwords)
        tokens = [tok for tok in tokens if tok not in blocked]

    if flg_stemm:
        stem = nltk.stem.porter.PorterStemmer().stem
        tokens = [stem(tok) for tok in tokens]

    if flg_lemm:
        lemmatize = nltk.stem.wordnet.WordNetLemmatizer().lemmatize
        tokens = [lemmatize(tok) for tok in tokens]

    # Stemming / lemmatisation can shorten a word below the 3-character floor.
    tokens = [tok for tok in tokens if len(tok) >= 3]

    return " ".join(tokens)

def lemmatization(texts, allowed_postags=allowed_postags, nlp=None):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of iterable of str
        Documents, each given as a sequence of tokens; tokens are joined with
        spaces before being passed to the pipeline.
    allowed_postags : collection of str
        ``token.pos_`` values to keep.
    nlp : callable or None
        Pipeline that maps a string to an iterable of tokens exposing
        ``lemma_`` and ``pos_`` (presumably a spaCy ``Language`` object, going
        by the commented-out ``spacy.load`` line in this notebook). When
        omitted, a notebook-level ``nlp`` is used if one exists.

    Returns
    -------
    list of list of str
        One list of lemmas per input document.

    Raises
    ------
    NameError
        If a document has to be processed and no pipeline is available.
        (Previously this surfaced as a bare ``name 'nlp' is not defined``.)
    """
    pipeline = nlp if nlp is not None else globals().get('nlp')
    texts_out = []
    for sent in texts:
        if pipeline is None:
            raise NameError(
                "lemmatization needs a pipeline: pass nlp=... or define a "
                "notebook-level `nlp` (e.g. spacy.load('en_core_web_sm'))")
        doc = pipeline(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

### Train/Test Split Utility

In [57]:
# divides the articles into train/test by sampling on a per-file, per-month basis
def split_data(path, sample_size, random_state=696, max_files=2):
    """Sample a training set month by month; keep the remaining rows for inference.

    Every selected file in ``path`` is read as JSON-lines, rows containing
    missing values are dropped, and for each calendar month ``sample_size``
    percent of that month's rows are drawn at random. Rows of a file whose
    ``id`` was not drawn form that file's test portion.

    Parameters
    ----------
    path : str
        Directory containing the article files.
    sample_size : int or float
        Percentage (0-100) of each month's articles to put into the training set.
    random_state : int, default 696
        Seed passed to ``DataFrame.sample``.
    max_files : int or None, default 2
        How many files (in sorted name order) to process. The default keeps the
        previously hard-coded limit of two files; pass ``None`` to use them all.

    Returns
    -------
    (train_df, test_df) : tuple of DataFrame
        ``train_df`` holds the columns ``id``, ``date`` and ``article`` of the
        sampled rows; ``test_df`` holds all columns of the rows not sampled.
    """
    columns = ['id', 'date', 'article']

    def _empty_frame():
        # Seed frame so the expected columns exist even when nothing is read.
        return pd.DataFrame({name: [] for name in columns})

    # list the files in the directory
    files = sorted(os.listdir(path))
    if max_files is not None:
        files = files[:max_files]

    train_parts = [_empty_frame()]
    test_parts = [_empty_frame()]
    for file in files:
        # os.path.join also works when `path` has no trailing separator.
        file_path = os.path.join(path, file)
        print(file_path)
        articles_df = pd.read_json(file_path, lines=True)
        articles_df = articles_df.dropna()
        articles_df['date'] = pd.to_datetime(articles_df['date'])
        articles_df['month'] = articles_df['date'].dt.month

        # Collect the monthly samples and concatenate once per file.
        month_parts = [_empty_frame()]
        for month in range(1, 13):
            month_df = articles_df.loc[articles_df['month'] == month]
            size = round(len(month_df) * sample_size / 100)
            month_df = month_df.sample(size, random_state=random_state)
            month_parts.append(month_df.loc[:, columns])
        yr_sample_df = pd.concat(month_parts, ignore_index=False)

        # choose remaining docs for topic inference
        yr_test_df = articles_df.loc[~articles_df['id'].isin(yr_sample_df['id'])]
        print(len(yr_sample_df), len(yr_test_df), len(articles_df))

        train_parts.append(yr_sample_df)
        test_parts.append(yr_test_df)

    train_df = pd.concat(train_parts, ignore_index=True).dropna()
    test_df = pd.concat(test_parts, ignore_index=False)
    return train_df, test_df


### Model Exploration Utilities

In [88]:
def topic_top_term(model):
    """Build a lookup table with one row per topic known to ``model``.

    ``model`` must expose ``get_topics()`` returning a mapping of topic id to
    a sequence of ``(term, score)`` pairs.

    Returns a DataFrame with the columns ``topic_id`` and ``top_terms``, where
    ``top_terms`` is ``"<topic_id>_<terms joined by single spaces>"``.
    """
    rows = []
    for topic_id, term_prob in model.get_topics().items():
        joined_terms = ' '.join(term for term, _ in term_prob)
        rows.append((topic_id, '_'.join([str(topic_id), joined_terms])))
    return pd.DataFrame(rows, columns=['topic_id', 'top_terms'])

def get_top_terms(topic_id, model=None):
    """Return the top terms of one topic as a single space-separated string.

    Parameters
    ----------
    topic_id : int
        Topic to look up.
    model : object or None
        Topic model exposing ``get_topic(topic_id)``. When omitted, the
        notebook-level ``topic_model`` is used, as before.

    Returns
    -------
    str
        The topic's terms joined by spaces, or ``""`` when the model has no
        such topic.
    """
    if model is None:
        model = topic_model  # notebook-level fitted / loaded model

    term_probs = model.get_topic(topic_id)
    # BERTopic's get_topic returns False for an unknown topic id; iterating
    # over that would raise a TypeError.
    if not term_probs:
        return ''
    return ' '.join(term for term, _ in term_probs)


def topic_inference(basepath, model_path, model, data_path=None, prob_threshold=0.05):
    """Assign topics to the articles of every ``.jl`` file and write one CSV per file.

    For each input file the articles are cleaned with ``utils_preprocess_text``,
    passed through ``model.transform``, and a CSV with the columns ``id``,
    ``date``, ``topic_id``, ``top_topics`` and ``top_probs`` is written into
    ``model_path``. The CSV is named after the first 13 characters of the input
    file name.

    Parameters
    ----------
    basepath : str
        Not used. Kept so existing calls keep working; the original
        implementation also ignored it and read from the notebook-level ``path``.
    model_path : str
        Directory the result CSVs are written to.
    model : object
        Fitted topic model; ``transform(texts)`` must return
        ``(topics, probs)`` with ``probs`` holding one row of topic
        probabilities per document.
    data_path : str or None
        Directory containing the input files. Defaults to the notebook-level
        ``path``, which is what was used implicitly before.
    prob_threshold : float, default 0.05
        A document is assigned its most probable topic only when that
        probability reaches this value; otherwise it gets topic ``-1``.

    Returns
    -------
    None
        Results are only written to disk.
    """
    if data_path is None:
        data_path = path  # notebook-level data directory

    files = sorted(name for name in os.listdir(data_path) if ".jl" in name)
    for file in tqdm(files):
        yr_df = pd.read_json(os.path.join(data_path, file), lines=True)
        yr_df = yr_df.dropna().reset_index(drop=True)
        if yr_df.empty:
            # Nothing to transform; unpacking the per-row results below would fail.
            print('no usable articles in file:', file)
            continue

        yr_df['clean_text'] = yr_df['article'].apply(
            lambda text: utils_preprocess_text(text, flg_stemm=False, flg_lemm=True,
                                               lst_stopwords=stop_words))
        texts = list(yr_df['clean_text'].values)
        _, probs = model.transform(texts)
        probs = np.asarray(probs)

        # Most probable topic per document, or -1 when it is not probable enough.
        best_topic = probs.argmax(axis=1)
        yr_df['topic_id'] = np.where(probs.max(axis=1) >= prob_threshold, best_topic, -1)
        print('topic inference done for file:', file)

        # Rows were re-indexed 0..n-1 above, so the position addresses `probs`.
        top5 = [get_top_5_topics(row, probs) for row in range(len(yr_df))]
        yr_df['top_topics'] = [top_topics for top_topics, _ in top5]
        yr_df['top_probs'] = [top_probs for _, top_probs in top5]

        columns = ['id', 'date', 'topic_id', 'top_topics', 'top_probs']
        yr_df = yr_df.loc[:, columns]
        yr_df['id'] = yr_df['id'].astype('int32')
        yr_df['topic_id'] = yr_df['topic_id'].astype('int')
        filename = file[:13] + '.csv'
        yr_df.to_csv(os.path.join(model_path, filename))
        print('processing done for file:', file)
        
def get_top_5_topics(idx, probs):
    """Return the (up to) five most probable topics of one document.

    Parameters
    ----------
    idx : int
        Row of ``probs`` to inspect.
    probs : array-like
        ``probs[idx]`` is the sequence of topic probabilities of that document.

    Returns
    -------
    (top_topics, top_probs) : tuple of list
        Topic indices ordered by decreasing probability, and the matching
        probabilities rounded to 6 decimals. Both lists hold plain Python
        numbers so they print cleanly when written to CSV. Fewer than five
        entries are returned when the row has fewer than five topics.
    """
    row = np.asarray(probs[idx])
    k = min(5, row.size)  # argpartition raises if asked for more than exist
    if k == 0:
        return [], []

    # Pick the k largest entries, then order just those by probability.
    candidates = np.argpartition(row, -k)[-k:]
    ranked = candidates[np.argsort(row[candidates])[::-1]]

    top_topics = ranked.tolist()
    top_probs = [round(float(prob), 6) for prob in row[ranked]]
    return top_topics, top_probs

## Load Data

In [58]:
# split the news articles in train/test
# this train dataset would be used to train the BERTopic model from which topic inference can be done
# Only runs when retraining (model_trained != 'Y'); samples 3 percent of each month.
# NOTE(review): the CSV written here has no clean_text / topic_id / topic_prob /
# top_terms columns yet, but the "saved model" cell reads this same file name
# with usecols listing them - confirm the file is rewritten after training.
if  model_trained != 'Y':
    train_df, test_df = split_data(path, 3)

    filepath = model_path + train_file
    train_df.to_csv(filepath)

### Pre-processing Text

In [59]:
# %%time
# Retraining path only: add a clean_text column holding the cleaned version of
# each article (no stemming, lemmatisation on, notebook stop words removed).
if  model_trained != 'Y':
    train_df['clean_text'] = train_df['article'].apply(lambda text : 
                            utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=stop_words))

    # NOTE(review): inside the if-block this head() is not the cell's last
    # expression, so nothing is displayed.
    train_df.head()

In [60]:
# Saved-model path: reload the training frame, including the columns that
# were derived after training, from model_path + train_file.
if model_trained == 'Y':
    filepath = model_path + train_file
    train_df = pd.read_csv(filepath, usecols=['id','date','article','clean_text','topic_id','topic_prob','top_terms'])
    # NOTE(review): inside the if-block this head() displays nothing.
    train_df.head()

### Train model for 3% articles

### Create Model

In [61]:
# Cleaned documents as a plain list; this is the input to fit_transform below.
# (All rows of train_df are used - no sub-sampling happens here.)
texts = list(train_df['clean_text'].values)

In [63]:
%%time
# Retraining path only: fit BERTopic on the cleaned documents, asking for
# per-topic probabilities and nr_topics=100.
# NOTE(review): the `seed` defined in the import cell is not passed anywhere
# here, so repeated fits are presumably not reproducible - confirm.
if model_trained != 'Y':
    topic_model = BERTopic(language='english', calculate_probabilities=True,
                               verbose=True, nr_topics=100)
    topics, probs = topic_model.fit_transform(texts)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.2 µs


### Load Trained Model for Future Runs

In [64]:
# Saved-model path: restore the fitted model and take the per-document topic
# assignments and probabilities stored on it.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load files
# from a trusted source. Newer PyTorch releases default to weights_only=True,
# which may refuse a pickled model object; verify against the installed version.
if model_trained == 'Y':
    topic_model = torch.load(model_path + model)
    topics = topic_model.topics_
    probs = topic_model.probabilities_

In [89]:
# get topic id and corresponding top topic terms
top_term_df = topic_top_term(topic_model)
top_term_df

Unnamed: 0,topic_id,top_terms
0,-1,-1_said police government state one people min...
1,0,0_student school education teacher class colle...
2,1,1_police suicide body murder found family arre...
3,2,2_bjp party candidate congress leader seat ele...
4,3,3_railway train passenger station metro rail e...
...,...,...
96,95,95_drug medicine pharma generic chemist pharma...
97,96,96_cattle cow animal bull slaughter stray meat...
98,97,97_militant killed grenade security terrorist ...
99,98,98_currency atm note bank fake cash counterfei...


### Get Document - Topic(dominant) mapping

In [66]:
# create document and top topic term mapping
# A freshly trained run has no 'topic_id' column yet (reading it raised a
# KeyError), so take the assignments from the model; a saved-model run keeps
# the column that was loaded from the CSV.
if 'topic_id' not in train_df.columns:
    train_df['topic_id'] = topics
topics = train_df['topic_id'].values

# Probability of the assigned topic for each document; rows of `probs` are
# addressed by position. Topic -1 (presumably the outlier topic) must not be
# used as a column index: probs[row][-1] silently returns the LAST topic's
# probability. Those documents get NaN instead.
doc_topic_ids = train_df['topic_id'].to_numpy().astype(int)
doc_probs = np.asarray(probs)
assigned_rows = np.flatnonzero(doc_topic_ids >= 0)
topic_prob = np.full(len(train_df), np.nan)
topic_prob[assigned_rows] = doc_probs[assigned_rows, doc_topic_ids[assigned_rows]]
train_df['topic_prob'] = topic_prob
train_df['top_terms'] = train_df['topic_id'].apply(get_top_terms)

train_df.head(3)

Unnamed: 0,id,date,article,clean_text,topic_id,topic_prob,top_terms
0,4379.0,2004-01-24,NEW DELHI: A high-level BJP meeting here on Fr...,new bjp meeting friday gave official seal appr...,2,0.181298,bjp party candidate congress leader seat elect...
1,1792.0,2004-01-12,PUNE: Sugar factories controlled by deputy chi...,sugar factory controlled deputy chief minister...,21,0.046529,farmer crop sugar agriculture sugarcane paddy ...
2,749.0,2004-01-06,"KOLKATA/HOWRAH: In a bid to cover up the mess,...",bid cover state government monday set inquiry ...,25,0.037723,land housing estate developer property flat re...


### Topic-Term Matrix

In [67]:
# Topic count taken as the number of rows of get_topic_info() minus one.
# NOTE(review): the "- 1" presumably discounts the -1 outlier row; it would
# under-count if the model has no such row - confirm.
num_topics = len(topic_model.get_topic_info())-1
topic_list = list(range(num_topics))
num_topics

100

In [68]:
def get_topic_term_matrix(model, df):
    """Tabulate, for every non-outlier topic, its top terms and one representative document.

    Parameters
    ----------
    model : object
        Topic model exposing ``get_topics()``, ``get_topic(topic_id)`` and
        ``get_representative_docs(topic_id)``.
    df : DataFrame
        Not used and left untouched. Kept for backward compatibility: the
        original narrowed it with ``df[df.clean_text == doc]`` on every
        iteration (emptying it after the first topic) and never read the result.

    Returns
    -------
    DataFrame
        Columns ``topic_id``, ``top_terms`` (list of terms) and ``rep_docs``
        (the first representative document of the topic).
    """
    # Take the ids from the model itself instead of assuming they are exactly
    # 0..len(get_topic_info())-2, which silently drops the last topic when the
    # model has no -1 outlier row.
    topic_list = sorted(topic_id for topic_id in model.get_topics() if topic_id != -1)

    top_term_list = []
    rep_docs_list = []
    for topic_id in topic_list:
        top_term_list.append([term for term, _ in model.get_topic(topic_id)])
        rep_docs_list.append(model.get_representative_docs(topic_id)[0])

    return pd.DataFrame({'topic_id': topic_list,
                         'top_terms': top_term_list,
                         'rep_docs': rep_docs_list})

# Build the topic/term/representative-document table, save it to the working
# directory, and display the representative document of the row at position 11.
dom_topic_doc_df = get_topic_term_matrix(topic_model, train_df)
dom_topic_doc_df.to_csv('dom_topic_doc_df_whole.csv')
dom_topic_doc_df.iloc[11, :].rep_docs
# display(HTML(dom_topic_doc_df.to_html()))

'maker gujarati film sawaal question approached film certification appellate tribunal central board film certification denied certificate rajesh director gujarati said approached fcat july cbfc denied fight film freedom hope justice gohil june suggested cut questioned motive choosing emotive raised concern resemblance patel quota activist hardik pahlaj board denied certificate film stating film prove threat national security said cbfc suggested cut included removal word every board said content asked remove reference said edited film way cbfc wanted length film reduced minute current injustice gujarati film board hurt gohil'

### Topic Distribution over documents

In [69]:
# Best topic per document, or -1 when even the best probability is below the threshold.
prob_threshold = 0.01
new_topics = [
    np.argmax(doc_prob) if max(doc_prob) >= prob_threshold else -1
    for doc_prob in probs
]

# Number of documents per assigned topic, ordered by topic id.
topic_dist_df = (
    pd.DataFrame(list(Counter(topics).items()), columns=['topic_id', 'Count'])
    .sort_values(by='topic_id')
)

# Attach the "<id>_<terms>" label of every topic to its document count.
freq = (
    topic_dist_df
    .merge(top_term_df, on='topic_id', how='inner')
    .sort_values(by='topic_id')
)
freq.head()

Unnamed: 0,topic_id,Count,top_terms
0,-1,3977,-1_said police government state one people min...
1,0,3668,0_student school education teacher class colle...
2,1,5135,1_police suicide body murder found family arre...
3,2,5411,2_bjp party candidate congress leader seat ele...
4,3,1112,3_railway train passenger station metro rail e...


In [70]:
# Bar chart of document counts for rows 1..29 of `freq` (row 0 is skipped),
# bars labelled with the topic's "<id>_<terms>" string and sorted by count.
# NOTE(review): the x-axis title says 'Topic ID' although the encoded field is
# top_terms, and titleFontSize=5 is far smaller than the other font sizes.
base_chart = alt.Chart(freq.iloc[1:30, ])
bar = base_chart.mark_bar().encode(
        x=alt.X('top_terms:N',sort='-y', axis=alt.Axis(title='Topic ID')),
        y=alt.Y('Count:Q', axis=alt.Axis(title='Number of Articles')))

(bar
).properties(width=500, height=300, title='Topics vs Number of Article Distribution'
).configure_title(fontSize=25
).configure_axis(grid=False, domain=False, 
                 labelFontSize=15,titleFontSize=5)

## Visualize Topic Distribution

In [71]:
# hierarchical_topics = topic_model.hierarchical_topics(texts)

## Visualize Topic Distribution

In [72]:
# Topic visualisation restricted to 10 topics via top_n_topics.
topic_model.visualize_topics(top_n_topics=10)

In [73]:
# Same visualisation without a topic limit.
topic_model.visualize_topics()

### Visualize docs

In [74]:
# Only the datetime conversion below is active; the topics-over-time
# computation in the commented lines is currently disabled.
# timestamps = sample_df.date.values
train_df['date'] = pd.to_datetime(train_df['date'])
# topics_over_time_df1 = topic_model.topics_over_time(texts, train_df.date.values, nr_bins=20)
# topics_over_time_grp = topics_over_time_df.groupby(['Timestamp', 'Topic']).agg('sum').reset_index()
# topics_over_time_grp = topics_over_time_grp.set_index('Timestamp')
# topics_over_time_grp

In [75]:
# topic_model.visualize_topics_over_time(topics_over_time_df, top_n_topics=10)

In [76]:
# Topic probability distribution of the document at position 2 of `probs`,
# with min_probability=0.005.
topic_model.visualize_distribution(probs[2], min_probability=0.005)

In [77]:
# tree = topic_model.get_topic_tree(hierarchical_topics)
# print(tree)

In [78]:
# Hierarchy view limited to 50 topics via top_n_topics.
topic_model.visualize_hierarchy(top_n_topics=50)

In [79]:
# Bar charts limited to 30 topics via top_n_topics.
topic_model.visualize_barchart(top_n_topics=30)

In [80]:
# Heatmap rendered at 800x800 with n_clusters=10.
topic_model.visualize_heatmap(n_clusters=10, width=800, height=800)

### Topic Search by Keywords

In [81]:
# search topics
# Ask the model for the 5 topics closest to the keyword 'motor' and show the
# terms of the first one returned.
similar_topics, similarity = topic_model.find_topics('motor', top_n=5)
topic_model.get_topic(similar_topics[0])

[('tata', 0.04521627565117509),
 ('car', 0.04186881754287438),
 ('company', 0.031300275705048335),
 ('sale', 0.026205593019733166),
 ('unit', 0.023558328475499553),
 ('steel', 0.020340474994880165),
 ('motor', 0.019895529334124126),
 ('market', 0.01980425403925737),
 ('vehicle', 0.017546050785513974),
 ('maruti', 0.016655999674796335)]

In [82]:
# Same keyword search for 'corruption'; shows the terms of the first topic returned.
similar_topics, similarity = topic_model.find_topics('corruption', top_n=5)
topic_model.get_topic(similar_topics[0])

[('cbi', 0.05620954619884173),
 ('bribe', 0.05444167180987892),
 ('acb', 0.0405245293188542),
 ('accepting', 0.029880981439713288),
 ('accused', 0.024220544944546365),
 ('bureau', 0.02015571643153948),
 ('court', 0.019738911125192226),
 ('arrested', 0.019024020349022043),
 ('case', 0.01806385685928585),
 ('caught', 0.017553793546657148)]

In [83]:
# Same keyword search for 'scam'; shows the terms of the first topic returned.
similar_topics, similarity = topic_model.find_topics('scam', top_n=5)
topic_model.get_topic(similar_topics[0])

[('currency', 0.08373534531652686),
 ('atm', 0.08319298615489658),
 ('note', 0.0691676578297129),
 ('bank', 0.059323629880753244),
 ('fake', 0.0437623812128182),
 ('cash', 0.03879626764482015),
 ('counterfeit', 0.03111924327907874),
 ('denomination', 0.02118457243845715),
 ('coin', 0.02093649069952777),
 ('branch', 0.01872912906314579)]

In [84]:
# Same keyword search for 'gandhi'; shows the terms of the first topic returned.
similar_topics, similarity = topic_model.find_topics('gandhi', top_n=5)
topic_model.get_topic(similar_topics[0])

[('congress', 0.052220964696091804),
 ('rahul', 0.03712823404357182),
 ('gandhi', 0.03363534228761747),
 ('modi', 0.027232265110904444),
 ('party', 0.026454293023728283),
 ('sonia', 0.021688830920645948),
 ('leader', 0.020473956032085086),
 ('bjp', 0.017811474072415137),
 ('prime', 0.015245430666849375),
 ('minister', 0.015166130823978084)]

### Save the topic model for future use

In [85]:
# Retraining path only: persist the model and the enriched training frame to the
# location the "saved model" cells read from (model_path + model and
# model_path + train_file). They were previously written to a hard-coded home
# directory and to the working directory, so a later run with 'Y' could not
# find what this run produced.
if model_trained != 'Y':
    os.makedirs(model_path, exist_ok=True)
    torch.save(topic_model, model_path + model)
    train_df.to_csv(model_path + train_file)

### Topic Inference

In [87]:
%%time
articles_df = topic_inference(basepath,model_path, topic_model)

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 6.91 µs
