In [1]:
!pip install gensim == 4.1.0 alive_progress == 2.3.0 pyLDAvis == 3.3.1

Collecting gensim==4.1.0
  Downloading gensim-4.1.0-cp39-cp39-win_amd64.whl (24.0 MB)
     ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
     ---------------------------------------- 0.2/24.0 MB 3.3 MB/s eta 0:00:08
      --------------------------------------- 0.3/24.0 MB 3.4 MB/s eta 0:00:07
      --------------------------------------- 0.5/24.0 MB 3.3 MB/s eta 0:00:08
     - -------------------------------------- 0.6/24.0 MB 3.3 MB/s eta 0:00:08
     - -------------------------------------- 0.7/24.0 MB 3.3 MB/s eta 0:00:07
     - -------------------------------------- 0.9/24.0 MB 3.3 MB/s eta 0:00:07
     - -------------------------------------- 1.0/24.0 MB 3.5 MB/s eta 0:00:07
     - -------------------------------------- 1.0/24.0 MB 3.5 MB/s eta 0:00:07
     - -------------------------------------- 1.0/24.0 MB 3.5 MB/s eta 0:00:07
     - -------------------------------------- 1.0/24.0 MB 3.5 MB/s eta 0:00:07
     - -------------------------------------- 1.0/24

In [2]:
import string
from pprint import pprint

import gensim.corpora as corpora
import nltk
import pandas as pd
import pyLDAvis
import spacy
from alive_progress import alive_bar
from gensim.models import LdaMulticore, CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pyLDAvis import gensim_models as gensimvis

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
pyLDAvis.enable_notebook()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\camer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\camer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
df = pd.read_csv("dataset.csv", sep="~")
df.head()

Unnamed: 0,text
0,At the 2001 Italian general election the Green...
1,For the 2009 European Parliament election in I...
2,Sitting as a Liberal Party of Canada Member of...
3,The MRE took part to the consolidation of The ...
4,They include the records of the Federal Secret...


In [7]:
df['lowercase_text'] = df['text'].map(lambda token: token.lower())
df.head()

Unnamed: 0,text,lowercase_text
0,At the 2001 Italian general election the Green...,at the 2001 italian general election the green...
1,For the 2009 European Parliament election in I...,for the 2009 european parliament election in i...
2,Sitting as a Liberal Party of Canada Member of...,sitting as a liberal party of canada member of...
3,The MRE took part to the consolidation of The ...,the mre took part to the consolidation of the ...
4,They include the records of the Federal Secret...,they include the records of the federal secret...


In [8]:
punc_translation_table = str.maketrans('', '', string.punctuation)
tokenised_docs = [word_tokenize(doc.translate(punc_translation_table)) for doc in df["lowercase_text"]]
print(tokenised_docs[0][:30])

['at', 'the', '2001', 'italian', 'general', 'election', 'the', 'greens', 'formed', 'a', 'joint', 'list', 'with', 'the', 'italian', 'democratic', 'socialists', 'sdi', 'the', 'sunflower']


In [9]:
stops = set(stopwords.words('english'))
stops = stops.union({
    'said', 'would', 'could', 'told', 'also', 'mr', 'use', 'new', 'way'
})
tokenised_docs_no_stops = [[word for word in doc if word not in stops] for doc in tokenised_docs]
tokenised_docs_no_stops[0]

['2001',
 'italian',
 'general',
 'election',
 'greens',
 'formed',
 'joint',
 'list',
 'italian',
 'democratic',
 'socialists',
 'sdi',
 'sunflower']

In [13]:
lemmatized_docs = []
model = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
for doc in tokenised_docs_no_stops:
    # lemmatized_docs.append([token.lemma_ for token in model(" ".join(doc)) if token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV']])
    lemmatized_docs.append([token.lemma_ for token in model(" ".join(doc))])
lemmatized_docs[0]

ModuleNotFoundError: No module named 'torch.cuda'

In [None]:
index_to_word = corpora.Dictionary(lemmatized_docs)
corpus = [index_to_word.doc2bow(text) for text in lemmatized_docs]
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 4), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1)]


In [None]:
lda_model = LdaMulticore(corpus=corpus,
                         id2word=index_to_word,
                         num_topics=10,
                         random_state=123,
                         chunksize=40,
                         passes=10)

pprint(lda_model.print_topics())

[(0,
  '0.013*"war" + 0.013*"irish" + 0.011*"conference" + 0.011*"european" + '
  '0.010*"fellow" + 0.009*"election" + 0.009*"general" + 0.009*"ireland" + '
  '0.008*"leader" + 0.008*"independence"'),
 (1,
  '0.044*"election" + 0.040*"united" + 0.037*"states" + 0.035*"presidential" + '
  '0.033*"party" + 0.017*"democratic" + 0.016*"senate" + 0.012*"campaign" + '
  '0.011*"college" + 0.008*"two"'),
 (2,
  '0.061*"election" + 0.050*"party" + 0.034*"general" + 0.027*"united" + '
  '0.025*"kingdom" + 0.014*"democratic" + 0.011*"social" + 0.010*"seat" + '
  '0.008*"national" + 0.008*"movement"'),
 (3,
  '0.015*"major" + 0.013*"thomas" + 0.012*"general" + 0.010*"south" + '
  '0.009*"county" + 0.009*"majority" + 0.008*"five" + 0.008*"lord" + '
  '0.007*"home" + 0.007*"serve"'),
 (4,
  '0.016*"summit" + 0.012*"minister" + 0.012*"lose" + 0.010*"government" + '
  '0.009*"head" + 0.008*"election" + 0.008*"celac" + 0.008*"association" + '
  '0.007*"force" + 0.007*"president"'),
 (5,
  '0.026*"part

In [None]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmatized_docs, dictionary=index_to_word,
                                     coherence='c_uci')
print('Coherence Score: ', coherence_model_lda.get_coherence())

Coherence Score:  -11.462413449193646


In [None]:
def get_coherence_val(num_topics, alpha_val, beta_val):
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=index_to_word,
                             num_topics=num_topics,
                             random_state=123,
                             chunksize=40,
                             passes=10,
                             alpha=alpha_val,
                             eta=beta_val)

    return CoherenceModel(model=lda_model, texts=lemmatized_docs, dictionary=index_to_word,
                          coherence='c_uci').get_coherence()

In [None]:
num_topics_range = range(10, 11)
alpha_param_range = [0.01, 0.1, 1, 10, "symmetric", "asymmetric"]
beta_param_range = [0.01, 0.1, 1, 10, "symmetric", "auto"]
results = {'Topics': [],
           'Alpha': [],
           'Beta': [],
           'Coherence': []
           }

with alive_bar(len(num_topics_range) * len(alpha_param_range) * len(beta_param_range), force_tty=True) as bar:
    for n in num_topics_range:
        for a in alpha_param_range:
            for b in beta_param_range:
                coherence_val = get_coherence_val(n, a, b)
                results['Topics'].append(n)
                results['Alpha'].append(a)
                results['Beta'].append(b)
                results['Coherence'].append(coherence_val)
                bar()

pd.DataFrame(results).to_csv('lda_tuning_results.csv', index=False)
max_coherence_index = results['Coherence'].index(min(results['Coherence']))
num_of_topics, alpha_val, beta_val = results['Topics'][max_coherence_index], results['Alpha'][max_coherence_index], \
results['Beta'][max_coherence_index]

|████████████████████████████████████████| 36/36 [100%] in 1:35.8 (0.38/s)      


In [None]:
lda_model = LdaMulticore(corpus=corpus,
                         id2word=index_to_word,
                         num_topics=num_of_topics,
                         random_state=123,
                         chunksize=40,
                         passes=10,
                         alpha="asymmetric",
                         eta="auto")
pprint(lda_model.print_topics())

[(0,
  '0.011*"university" + 0.009*"war" + 0.008*"party" + 0.008*"president" + '
  '0.008*"irish" + 0.007*"leader" + 0.007*"council" + 0.006*"foreign" + '
  '0.006*"minister" + 0.006*"conference"'),
 (1,
  '0.056*"election" + 0.047*"presidential" + 0.041*"united" + 0.040*"party" + '
  '0.039*"states" + 0.019*"democratic" + 0.014*"candidate" + 0.014*"senate" + '
  '0.011*"campaign" + 0.011*"support"'),
 (2,
  '0.049*"election" + 0.036*"party" + 0.034*"general" + 0.022*"united" + '
  '0.021*"kingdom" + 0.012*"state" + 0.011*"democratic" + 0.010*"summit" + '
  '0.009*"seat" + 0.009*"national"'),
 (3,
  '0.017*"major" + 0.013*"thomas" + 0.012*"south" + 0.011*"include" + '
  '0.010*"around" + 0.010*"lord" + 0.008*"county" + 0.007*"turkish" + '
  '0.007*"samuel" + 0.007*"2014"'),
 (4,
  '0.011*"lose" + 0.009*"force" + 0.008*"federal" + 0.007*"we" + '
  '0.007*"governors" + 0.007*"association" + 0.006*"country" + 0.006*"later" + '
  '0.006*"senator" + 0.006*"defeat"'),
 (5,
  '0.027*"party" +

In [None]:
graph = gensimvis.prepare(lda_model, corpus, index_to_word, mds='mmds')
graph

  default_term_info = default_term_info.sort_values(
