## Load required libraries

In [52]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
import re
import gensim, spacy
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis 
import altair as alt
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
pyLDAvis.enable_notebook()

## Required Utilities 

### Pre-processing Utilities 

In [54]:
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', \
                   'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need',  \
                   'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    lst_text = text.split()
    ## choose only alphbetical words and filter words less than 3 chars
#     lst_text = [token.lower() for token in lst_text if token.isalpha() and len(token) >=3]      
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
#     lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
    
    lst_text = [token for token in lst_text if len(token) >=3]
            
    ## back to string from list
#     text = " ".join(lst_text)
    return lst_text


def ngram_processing(texts):
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[texts], threshold=100)

    # Fastest way to get a sentence clubbed as trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)   
    
    bigram_words = [bigram_mod[doc] for doc in texts]
    trigram_words = [trigram_mod[bigram_mod[doc]] for doc in bigram_words]
    return(trigram_words)


def lemmatization(texts, allowed_postags=allowed_postags):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return(texts_out)

### Model Creation Utilities 

In [15]:
def model_creation(corpus, id2word, num_topics, workers=None):
    if not workers:
        lda = LdaModel(corpus, id2word = id2word, num_topics=num_topics,
                              random_state=696, update_every=1,
                              chunksize=10, passes=10, 
                              alpha='symmetric', iterations=100,
                              per_word_topics=True)
    else:
        lda = LdaMulticore(corpus, id2word = id2word, workers = workers, num_topics=num_topics,
                              random_state=696,
                              chunksize=10, passes=10, 
                              alpha='symmetric', iterations=100,
                              per_word_topics=True)
        
    
    return(lda)

def compute_coherence_values(id2word, corpus, texts, workers, limit, start=2, step=2):
    coherence_values = []
    perplexity_values = []
    model_list = []
    num_topics_list = []
    
    for num_topics in tqdm(range(start, limit, step)):
        lda_model = model_creation(corpus, id2word, num_topics, workers)
        perplexity_score = lda_model.log_perplexity(corpus)

        coherence_model = CoherenceModel(model=lda_model, texts=texts,
                                dictionary=id2word,coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        model_list.append(lda_model)
        coherence_values.append(coherence_score)
        perplexity_values.append(perplexity_score)
        num_topics_list.append(num_topics)
    return(num_topics_list, model_list, coherence_values, perplexity_values)
        

## Load Data

In [5]:
# Read articles for 2004 for test purpose
basepath = '/mnt/d/Amit/data-science/MADS/SIADS696/'

filepath = 'datasets/raw_data/gzip_data/'
filename = 'articles_2004.jl.gz'
filepath = basepath + filepath + filename
filepath

'/mnt/d/Amit/data-science/MADS/SIADS696/datasets/raw_data/gzip_data/articles_2004.jl.gz'

In [7]:
articles_df = pd.read_json(filepath, lines=True)
articles_df = articles_df.dropna()
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53394 entries, 0 to 54985
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        53394 non-null  int64         
 1   date      53394 non-null  datetime64[ns]
 2   url       53394 non-null  object        
 3   title     53394 non-null  object        
 4   category  53394 non-null  object        
 5   article   53394 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 2.9+ MB


### Pre-processing Text

In [32]:
%%time
articles_df['clean_text_test'] = articles_df['article'].apply(lambda text : 
                        utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=stop_words))

articles_df.head()

CPU times: user 51.4 s, sys: 0 ns, total: 51.4 s
Wall time: 51.4 s


Unnamed: 0,id,date,url,title,category,article,clean_text,clean_text_test
0,1,2004-01-01,https://timesofindia.indiatimes.com/business/i...,economy-breaks-8-pc-barrier,"[business, india-business]",NEW DELHI: The feel-good factor got a boost on...,"[new, factor, got, boost, last, day, indian, e...","[NEW, DELHI:, The, feel-good, factor, got, boo..."
1,2,2004-01-01,https://timesofindia.indiatimes.com/world/paki...,jaish-leader-missing-after-attacks,"[world, pakistan]",ISLAMABAD: Some activists of the banned milita...,"[activist, banned, militant, outfit, arrested,...","[ISLAMABAD:, Some, activist, banned, militant,..."
2,3,2004-01-01,https://timesofindia.indiatimes.com/world/us/1...,1m-for-jackson-interview-to-cbs,"[world, us]","&lt;div class=""section1""&gt;&lt;div class=""Nor...","[michael, jackson, struck, deal, cbs, paid, ef...","[&lt;div, class=""section1""&gt;&lt;div, class=""..."
3,4,2004-01-01,https://timesofindia.indiatimes.com/india/sars...,sars-screening-takes-off-at-igi,"[timesofindia.indiatimes.com, india]",NEW DELHI: With the SARS virus raising its hea...,"[new, sars, virus, raising, head, airport, aut...","[NEW, DELHI:, With, SARS, virus, raising, head..."
4,5,2004-01-01,https://timesofindia.indiatimes.com/india/indi...,india-to-test-fire-agni-iii-soon,"[timesofindia.indiatimes.com, india]",NEW DELHI: The nuclear-capable â€˜Agni-IIIâ€™ ...,"[new, capable, hitting, strategic, target, dee...","[NEW, DELHI:, The, nuclear-capable, â€˜Agni-II..."


In [8]:
%%time
articles_df['clean_text'] = articles_df['article'].apply(lambda text : 
                        utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=stop_words))

articles_df.head()

CPU times: user 36.5 s, sys: 0 ns, total: 36.5 s
Wall time: 36.5 s


Unnamed: 0,id,date,url,title,category,article,clean_text
0,1,2004-01-01,https://timesofindia.indiatimes.com/business/i...,economy-breaks-8-pc-barrier,"[business, india-business]",NEW DELHI: The feel-good factor got a boost on...,"[new, factor, got, boost, last, day, indian, e..."
1,2,2004-01-01,https://timesofindia.indiatimes.com/world/paki...,jaish-leader-missing-after-attacks,"[world, pakistan]",ISLAMABAD: Some activists of the banned milita...,"[activist, banned, militant, outfit, arrested,..."
2,3,2004-01-01,https://timesofindia.indiatimes.com/world/us/1...,1m-for-jackson-interview-to-cbs,"[world, us]","&lt;div class=""section1""&gt;&lt;div class=""Nor...","[michael, jackson, struck, deal, cbs, paid, ef..."
3,4,2004-01-01,https://timesofindia.indiatimes.com/india/sars...,sars-screening-takes-off-at-igi,"[timesofindia.indiatimes.com, india]",NEW DELHI: With the SARS virus raising its hea...,"[new, sars, virus, raising, head, airport, aut..."
4,5,2004-01-01,https://timesofindia.indiatimes.com/india/indi...,india-to-test-fire-agni-iii-soon,"[timesofindia.indiatimes.com, india]",NEW DELHI: The nuclear-capable â€˜Agni-IIIâ€™ ...,"[new, capable, hitting, strategic, target, dee..."


In [16]:
# choosing number of articles to process for stub testing
n = 10000
texts = list(articles_df['clean_text'][:n].values)
text_grams = ngram_processing(texts)
data_lemmatized = lemmatization(text_grams, allowed_postags)

In [17]:
%%time
# model creation
texts = data_lemmatized
id2word = Dictionary(texts)
corpus = [id2word.doc2bow(doc) for doc in texts]

CPU times: user 1.31 s, sys: 29.6 ms, total: 1.34 s
Wall time: 1.34 s


In [196]:
%%time
# model creation
# time estimation with LDA single core
# num_topics_list, model_list, coherence_vals, perplexity_vals = compute_coherence_values(id2word, corpus,   \
#                                                                 texts,workers=None, limit=30, start=2, step=3)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs


In [19]:
%%time
# time estimation with multicore LDA
# model creation
num_topics_list, model_list, coherence_vals, perplexity_vals = compute_coherence_values(id2word, corpus,    \
                                                                texts, workers=5, limit=30, start=2, step=3)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [28:46<00:00, 172.64s/it]

CPU times: user 23min 33s, sys: 9min 48s, total: 33min 22s
Wall time: 28min 46s





In [20]:
model_df = pd.DataFrame(list(zip(num_topics_list, coherence_vals, perplexity_vals)), 
                        columns = ['num_topics', 'coherence_score', 'perplexity_score']) 

In [21]:
model_df

Unnamed: 0,num_topics,coherence_score,perplexity_score
0,2,0.329787,-8.483017
1,5,0.428724,-8.439057
2,8,0.416579,-8.693746
3,11,0.392127,-9.561194
4,14,0.380911,-11.831976
5,17,0.372935,-12.910598
6,20,0.378009,-13.672018
7,23,0.36277,-14.264419
8,26,0.367707,-15.00456
9,29,0.384859,-15.466451


## Visualize Topic Distribution

In [22]:
base_chart = alt.Chart(model_df)
line = base_chart.mark_line().encode(
            x=alt.X('num_topics:N', axis=alt.Axis(title='Number of Topics')),
            y=alt.Y('coherence_score', axis=alt.Axis(title='Coherence Score')),
            tooltip = ['num_topics', 'coherence_score']
)

point = base_chart.mark_point(color='orange', size=50).encode(
            x=alt.X('num_topics:N', axis=alt.Axis(title='Number of Topics')),
            y=alt.Y('coherence_score', axis=alt.Axis(title='Coherence Score')),
            tooltip = ['num_topics', 'coherence_score']
)    


(line + point
).properties(width=500, height=300, title='Number of Topics vs Coherence Score'
).configure_title(fontSize=25
).configure_axis(grid=False, domain=False, 
                 labelFontSize=15,titleFontSize=20)

In [27]:

optimal_model_cond = np.where(model_df['coherence_score'] == max(model_df['coherence_score'].values))
optimal_model = model_list[optimal_model_cond[0][0]]

In [28]:
# Number of Topics = 5
gensimvis.prepare(optimal_model, corpus, id2word)

In [29]:
# Number of Topics = 8
gensimvis.prepare(model_list[2], corpus, id2word)

In [55]:
# Number of Topics = 11
gensimvis.prepare(model_list[3], corpus, id2word)

  from imp import reload
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)
  from imp import reload
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= 

  from imp import reload
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)
  from imp import reload
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= 

  from imp import reload
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(__version__) >= LooseVersion("1.17.0"):
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)
