## Load required libraries

In [1]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
import re
import gensim, spacy
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis 
import altair as alt
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
pyLDAvis.enable_notebook()

## Required Utilities 

### Pre-processing Utilities 

In [2]:
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', \
                   'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need',  \
                   'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    lst_text = text.split()
    ## choose only alphbetical words and filter words less than 3 chars
    lst_text = [token.lower() for token in lst_text if token.isalpha() and len(token) >=3]      
#     text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
#     lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
    
    lst_text = [token for token in lst_text if len(token) >=3]
            
    ## back to string from list
#     text = " ".join(lst_text)
    return lst_text


def ngram_processing(texts):
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[texts], threshold=100)

    # Fastest way to get a sentence clubbed as trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)   
    
    bigram_words = [bigram_mod[doc] for doc in texts]
    trigram_words = [trigram_mod[bigram_mod[doc]] for doc in bigram_words]
    return(trigram_words)


def lemmatization(texts, allowed_postags=allowed_postags):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return(texts_out)

### Model Creation Utilities 

In [3]:
def model_creation(corpus, id2word, num_topics, workers=None):
    if not workers:
        lda = LdaModel(corpus, id2word = id2word, num_topics=num_topics,
                              random_state=696, update_every=1,
                              chunksize=10, passes=10, 
                              alpha='symmetric', iterations=100,
                              per_word_topics=True)
    else:
        lda = LdaMulticore(corpus, id2word = id2word, workers = workers, num_topics=num_topics,
                              random_state=696,
                              chunksize=10, passes=10, 
                              alpha='symmetric', iterations=100,
                              per_word_topics=True)
        
    
    return(lda)

def compute_coherence_values(id2word, corpus, texts, workers, limit, start=2, step=2):
    coherence_values = []
    perplexity_values = []
    model_list = []
    num_topics_list = []
    
    for num_topics in tqdm(range(start, limit, step)):
        lda_model = model_creation(corpus, id2word, num_topics, workers)
        perplexity_score = lda_model.log_perplexity(corpus)

        coherence_model = CoherenceModel(model=lda_model, texts=texts,
                                dictionary=id2word,coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        model_list.append(lda_model)
        coherence_values.append(coherence_score)
        perplexity_values.append(perplexity_score)
        num_topics_list.append(num_topics)
    return(num_topics_list, model_list, coherence_values, perplexity_values)
        

## Load Data

In [4]:
# Read articles for 2004 for test purpose
basepath = '/mnt/d/Amit/data-science/MADS/SIADS696/'
filepath = 'datasets/raw_data/'
# filepath = 'scrapy/missing_articles/missing_articles/'
filename = 'articles_2004.jl'
filepath_name = basepath + filepath + filename
filepath_name

'/mnt/d/Amit/data-science/MADS/SIADS696/datasets/raw_data/articles_2004.jl'

In [9]:
# articles_df = pd.read_json(filepath_name, lines=True)
# articles_df = articles_df.dropna()
articles_df.head()

Unnamed: 0,id,date,url,title,category,article,clean_text
0,1,2004-01-01,https://timesofindia.indiatimes.com/business/i...,economy-breaks-8-pc-barrier,"[business, india-business]",NEW DELHI: The feel-good factor got a boost on...,"[new, factor, got, boost, last, day, indian, e..."
1,2,2004-01-01,https://timesofindia.indiatimes.com/world/paki...,jaish-leader-missing-after-attacks,"[world, pakistan]",ISLAMABAD: Some activists of the banned milita...,"[activist, banned, militant, outfit, arrested,..."
2,3,2004-01-01,https://timesofindia.indiatimes.com/world/us/1...,1m-for-jackson-interview-to-cbs,"[world, us]","&lt;div class=""section1""&gt;&lt;div class=""Nor...","[michael, jackson, struck, deal, cbs, paid, ef..."
3,4,2004-01-01,https://timesofindia.indiatimes.com/india/sars...,sars-screening-takes-off-at-igi,"[timesofindia.indiatimes.com, india]",NEW DELHI: With the SARS virus raising its hea...,"[new, sars, virus, raising, head, airport, aut..."
4,5,2004-01-01,https://timesofindia.indiatimes.com/india/indi...,india-to-test-fire-agni-iii-soon,"[timesofindia.indiatimes.com, india]",NEW DELHI: The nuclear-capable â€˜Agni-IIIâ€™ ...,"[new, capable, hitting, strategic, target, dee..."


In [183]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54962 entries, 0 to 56740
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          54962 non-null  int64         
 1   date        54962 non-null  datetime64[ns]
 2   url         54962 non-null  object        
 3   title       54962 non-null  object        
 4   category    54962 non-null  object        
 5   article     54962 non-null  object        
 6   clean_text  54962 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 3.4+ MB


### Pre-processing Text

In [6]:
%%time
articles_df['clean_text'] = articles_df['article'].apply(lambda text : 
                        utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=stop_words))

articles_df.head()

CPU times: user 56.9 s, sys: 4.44 s, total: 1min 1s
Wall time: 1min 1s


Unnamed: 0,id,date,url,title,category,article,clean_text
0,1,2004-01-01,https://timesofindia.indiatimes.com/business/i...,economy-breaks-8-pc-barrier,"[business, india-business]",NEW DELHI: The feel-good factor got a boost on...,"[new, factor, got, boost, last, day, indian, e..."
1,2,2004-01-01,https://timesofindia.indiatimes.com/world/paki...,jaish-leader-missing-after-attacks,"[world, pakistan]",ISLAMABAD: Some activists of the banned milita...,"[activist, banned, militant, outfit, arrested,..."
2,3,2004-01-01,https://timesofindia.indiatimes.com/world/us/1...,1m-for-jackson-interview-to-cbs,"[world, us]","&lt;div class=""section1""&gt;&lt;div class=""Nor...","[michael, jackson, struck, deal, cbs, paid, ef..."
3,4,2004-01-01,https://timesofindia.indiatimes.com/india/sars...,sars-screening-takes-off-at-igi,"[timesofindia.indiatimes.com, india]",NEW DELHI: With the SARS virus raising its hea...,"[new, sars, virus, raising, head, airport, aut..."
4,5,2004-01-01,https://timesofindia.indiatimes.com/india/indi...,india-to-test-fire-agni-iii-soon,"[timesofindia.indiatimes.com, india]",NEW DELHI: The nuclear-capable â€˜Agni-IIIâ€™ ...,"[new, capable, hitting, strategic, target, dee..."


In [207]:
sample_df = sample_df.reset_index(drop=True)

In [184]:
# choosing number of articles to process for stub testing
n = 10000
sample_df = articles_df.sample(n).reset_index(drop=True)
texts = list(sample_df['clean_text'].values)
text_grams = ngram_processing(texts)
data_lemmatized = lemmatization(text_grams, allowed_postags)

In [209]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54962 entries, 0 to 56740
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          54962 non-null  int64         
 1   date        54962 non-null  datetime64[ns]
 2   url         54962 non-null  object        
 3   title       54962 non-null  object        
 4   category    54962 non-null  object        
 5   article     54962 non-null  object        
 6   clean_text  54962 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 3.4+ MB


In [186]:
%%time
# model creation
texts = data_lemmatized
id2word = Dictionary(texts)
corpus = [id2word.doc2bow(doc) for doc in texts]

CPU times: user 1.65 s, sys: 52.5 ms, total: 1.71 s
Wall time: 1.74 s


In [32]:
# sample_df = sample_df.reset_index(drop=True)
# sample_df

In [187]:
%%time
# time estimation with multicore LDA
# model creation
num_topics_list, model_list, coherence_vals, perplexity_vals = compute_coherence_values(id2word, corpus,    \
                                                                texts, workers=4, limit=25, start=2, step=3)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [40:08<00:00, 301.02s/it]

CPU times: user 23min, sys: 7min 1s, total: 30min 2s
Wall time: 40min 8s





In [210]:
model_df = pd.DataFrame(list(zip(num_topics_list, coherence_vals, perplexity_vals)), 
                        columns = ['num_topics', 'coherence_score', 'perplexity_score']) 

In [211]:
model_df

Unnamed: 0,num_topics,coherence_score,perplexity_score
0,2,0.373991,-8.497963
1,5,0.467941,-8.532438
2,8,0.475157,-8.744053
3,11,0.435155,-9.722371
4,14,0.439952,-12.013036
5,17,0.438637,-12.82637
6,20,0.411783,-14.536259
7,23,0.385832,-14.572049


## Visualize Topic Distribution

In [190]:
base_chart = alt.Chart(model_df)
line = base_chart.mark_line().encode(
            x=alt.X('num_topics:N', axis=alt.Axis(title='Number of Topics')),
            y=alt.Y('coherence_score', axis=alt.Axis(title='Coherence Score')),
            tooltip = ['num_topics', 'coherence_score']
)

point = base_chart.mark_point(color='orange', size=50).encode(
            x=alt.X('num_topics:N', axis=alt.Axis(title='Number of Topics')),
            y=alt.Y('coherence_score', axis=alt.Axis(title='Coherence Score')),
            tooltip = ['num_topics', 'coherence_score']
)    


(line + point
).properties(width=500, height=300, title='Number of Topics vs Coherence Score'
).configure_title(fontSize=25
).configure_axis(grid=False, domain=False, 
                 labelFontSize=15,titleFontSize=20)

In [191]:
# numbe of topics 8
optimal_model_cond = np.where(model_df['coherence_score'] == max(model_df['coherence_score'].values))
optimal_model = model_list[optimal_model_cond[0][0]]

#### Get Top - Topic terms

In [265]:
def topic_term_mapping(model):
    num_topics = len(model.show_topics())
    topic_term_df = pd.DataFrame({'topic_id': [], 'top_terms': []})
    topic_terms = [None] * num_topics
    for topic_id in range(num_topics):
        topic_terms[topic_id] = ''
        for term, prob in optimal_model.show_topic(topic_id):
                    topic_terms[topic_id] += ' ' + term 
                
    topic_term_df = pd.DataFrame(zip(*[list(range(num_topics)), topic_terms]), columns=['topic_id', 'top_topic_terms'])
    return(topic_term_df)
    
topic_term_df = topic_term_mapping(optimal_model)   
topic_term_df

Unnamed: 0,topic_id,top_topic_terms
0,0,say people time make india indian first work ...
1,1,police say case court arrest accuse singh rep...
2,2,student school college university education t...
3,3,water road bus traffic area city transport re...
4,4,party congress minister bjp say leader chief ...
5,5,company new crore per_cent indian india say p...
6,6,say government state official new minister ch...
7,7,police say hospital medical doctor station go...


### Get Document - Topic(dominant) mapping

In [216]:
def doc_topic_tagging(indx):
    text = corpus[indx]
    topic_term_list = []
    topic_dic = dict(sorted(optimal_model.get_document_topics(text)))
    dom_topic_pair = sorted(topic_dic.items(), key = lambda item: item[1], reverse=True)[0]
    dom_topic = dom_topic_pair[0]
    for term, prob in optimal_model.show_topic(dom_topic):
        topic_term_list.append(term)
    dom_topic_perc = dom_topic_pair[1]    
    return (topic_term_list, dom_topic, dom_topic_perc)

In [217]:
sample_df['topic_top_terms'], sample_df['dom_topic_id'] ,     \
                    sample_df['dom_topic_perc']  = zip(*sample_df.index.to_series().apply(doc_topic_tagging))

In [218]:
sample_df.head()

Unnamed: 0,id,date,url,title,category,article,clean_text,topic_top_terms,dom_topic_id,dom_topic_perc
0,16836,2004-03-31,https://timesofindia.indiatimes.com/city/chand...,himachal-bjp-chief-assails-virbhadra,"[city, chandigarh]","&lt;div class=""section1""&gt;&lt;div class=""Nor...","[state, bjp, chief, suresh, bharadwaj, wednesd...","(party, congress, minister, bjp, say, leader, ...",4,0.953001
1,1189,2004-01-08,https://timesofindia.indiatimes.com/city/chand...,huda-officials-bail-plea-rejected,"[city, chandigarh]",PANCHKULA: The additional districts and sessio...,"[additional, district, session, judge, bhangoo...","(say, government, state, official, new, minist...",6,0.753893
2,16699,2004-03-31,https://timesofindia.indiatimes.com/india/play...,play-it-again-sam-cong-gets-tech-hand,"[timesofindia.indiatimes.com, india]","NEW DELHI: Technology guru Sam Pitroda, who ha...","[new, technology, guru, sam, ushered, telecom,...","(say, people, time, make, india, indian, first...",0,0.519498
3,32539,2004-07-06,https://timesofindia.indiatimes.com/india/no-f...,no-fare-hike-as-lalu-express-chugs-on,"[timesofindia.indiatimes.com, india]","&lt;div class=""section1""&gt;&lt;div class=""Nor...","[populist, budget, aimed, common, railway, min...","(say, government, state, official, new, minist...",6,0.573038
4,41995,2004-09-10,https://timesofindia.indiatimes.com/world/us/w...,what-india-needs-to-learn-from-us,"[world, us]","&lt;div class=""section1""&gt;&lt;div class=""Nor...","[elaborate, exhaustive, probe, worst, ever, te...","(say, people, time, make, india, indian, first...",0,0.829942


### Topic Distribution over documents

In [267]:
topic_doc_count_df = sample_df.groupby('dom_topic_id').agg('count').reset_index().loc[:, ['dom_topic_id', 'article']]
topic_doc_count_df

Unnamed: 0,dom_topic_id,article
0,0,2539
1,1,1189
2,2,337
3,3,177
4,4,1631
5,5,1314
6,6,2245
7,7,568


In [274]:
base_chart = alt.Chart(topic_doc_count_df)
bar = base_chart.mark_bar().encode(
        x=alt.X('dom_topic_id:N', axis=alt.Axis(title='Topic ID')),
        y=alt.Y('article:Q', axis=alt.Axis(title='Number of Articles')))

(bar
).properties(width=500, height=300, title='Topics vs Number of Article Distribution'
).configure_title(fontSize=25
).configure_axis(grid=False, domain=False, 
                 labelFontSize=15,titleFontSize=20)

### Get Topic - Doc(representative)

In [219]:
sample_df.groupby('dom_topic_id').agg('max').reset_index()

Unnamed: 0,dom_topic_id,id,date,url,title,category,article,clean_text,topic_top_terms,dom_topic_perc
0,0,55146,2004-12-31,https://timesofindia.indiatimes.com/world/us/w...,zoo-loses-lone-asiatic-lion,"[world, us]",â€˜â€˜We are forging an enlarged Europe and lo...,"[zahira, key, witness, best, bakery, shocked, ...","(say, people, time, make, india, indian, first...",0.981135
1,1,55060,2004-12-31,https://timesofindia.indiatimes.com/world/us/u...,zahiras-hostile-brother-pulled-up-by-judge,"[world, us]",doweshowbellyad=0; UNITED NATIONS: Ousted Iraq...,"[zahira, sheikh, really, staying, mumbai, freq...","(police, say, case, court, arrest, accuse, sin...",0.989309
2,2,54568,2004-12-27,https://timesofindia.indiatimes.com/world/us/t...,yukta-moves-hc-against-eviction,"[world, us]",doweshowbellyad=0; After two consecutive weeks...,"[youngest, question, fired, senior, physician,...","(student, school, college, university, educati...",0.919675
3,3,55013,2004-12-30,https://timesofindia.indiatimes.com/world/us/t...,your-vespa-can-phut-phut-to-holland,"[world, us]",VELI: Untreated industrial effluent is causing...,"[wintry, condition, prevailed, mid, higher, hi...","(water, road, bus, traffic, area, city, transp...",0.993854
4,4,55062,2004-12-31,https://timesofindia.indiatimes.com/world/us/p...,zira-to-float-party,"[world, us]",doweshowbellyad=0; NEW DELHI: With the Congres...,"[yet, recover, mortal, blow, received, lok, sa...","(party, congress, minister, bjp, say, leader, ...",0.976335
5,5,55138,2004-12-31,https://timesofindia.indiatimes.com/world/us/s...,zing-80-hits-the-market,"[world, us]",xNEW DELHI: Operating choppers and small aircr...,"[zensar, technology, ltd, entered, new, joint,...","(company, new, crore, per_cent, indian, india,...",0.974216
6,6,55091,2004-12-31,https://timesofindia.indiatimes.com/world/us/u...,zip-to-lahore-in-rs-254,"[world, us]",doweshowbellyad=0; PATNA: Patna DM-cum-returni...,"[zipping, overtaking, wrong, star, succumb, ca...","(say, government, state, official, new, minist...",0.987646
7,7,55065,2004-12-31,https://timesofindia.indiatimes.com/world/rest...,youths-stop-car-rob-3-of-rs-44-lakh,"[world, rest-of-world]",doweshowbellyad=0; Winners of the 49th Manikch...,"[young, couple, found, murdered, around, noon,...","(police, say, hospital, medical, doctor, stati...",0.985862


In [270]:
# Number of Topics = 8
gensimvis.prepare(optimal_model, corpus, id2word)

In [272]:
# Number of Topics = 14
gensimvis.prepare(model_list[4], corpus, id2word)

In [None]:
# Number of Topics = 11
gensimvis.prepare(model_list[3], corpus, id2word)

In [31]:
# Number of Topics = 14
gensimvis.prepare(model_list[4], corpus, id2word)

  from imp import reload
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)
  from imp import reload
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= 

  from imp import reload
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)
  from imp import reload
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= 

  from imp import reload
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)
