# Topic modeling with Python and gensim

Be sure to install the following libraries with pip3 or pipenv (if you're using a virtual environment, as suggested) in the directory in which you plan to create your jupyter notebook to run the topic modeling code:
<ul>
    <li>gensim</li>
    <li>numpy</li>
    <li>pyLDAvis</li>
</ul>

Have all the documents for which you want to create a topic model in the same folder, and know the path to that folder. You'll need to specify this path in the first step below.

In [2]:
#Before executing the following code, ensure that all documents are encoded at UTF-8.

#Set random seed for reproduction of code. 
import numpy as np
np.random.seed(42)

#Change working directory. 
import os
os.chdir("/Users/asg/Code/nlp/Ott-Alg") 
all_files = os.listdir("/Users/asg/Code/nlp/Ott-Alg")
all_files.sort()
all_files.pop(0)

#print(all_files)
len(all_files)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/asg/Code/nlp/Ott-Alg'

In [None]:
# from nltk.corpus import stopwords

# stop_words = stopwords.words('french')

# print(stop_words)

In [None]:
import nltk
from nltk import tokenize
from nltk.tokenize import word_tokenize 
import string

stopwords = open("/Users/asg/Code/nlp/fr-stop-oa.txt").read()

#Read each file into the working directory. Remove stop words, non-alphabetical terms, and punctuation. Append each processed file into the docs list. 
docs = []

for file in all_files:
    with open(file,'r') as f:
        text = f.read()        
        lines_list = tokenize.word_tokenize(text)
        filtered = [w for w in lines_list if w not in stopwords and w.isalpha()]       
        table = str.maketrans('', '', string.punctuation)
        translated = [w.translate(table) for w in filtered]
        stripped = list(filter(None, translated))
        lower = [w.lower() for w in stripped]
        docs.append(lower)

docs[:1]

[['création',
  'beylecs',
  'administration',
  'politique',
  'beys',
  'province',
  'au',
  'cours',
  'campagnes',
  'fils',
  'kheireddine',
  'aire',
  'face',
  'difficultés',
  'internes',
  'génèrent',
  'considérablement',
  'action',
  'il',
  'constata',
  'perdait',
  'transmission',
  'ordres',
  'garnisons',
  'coordination',
  'lente',
  'souvent',
  'imparfaite',
  'raison',
  'rivalités',
  'intrigues',
  'sein',
  'garnisons',
  'contributions',
  'destinées',
  'trésor',
  'public',
  'parvenaient',
  'moment',
  'voulu',
  'parvenaient',
  'au',
  'cours',
  'opérations',
  'ouest',
  'décida',
  'réunir',
  'pouvoirs',
  'détenus',
  'kaïds',
  'indépendants',
  'mains',
  'homme',
  'octroya',
  'dignité',
  'bey',
  'responsable',
  'afin',
  'gouverneur',
  'puisse',
  'sanctionner',
  'faute',
  'négligence',
  'exécution',
  'directives',
  'conféra',
  'pouvoirs',
  'civils',
  'militaires',
  'cette',
  'organisation',
  'mise',
  'épreuve',
  'permit',
  

In [None]:
#The two main inputs to the LDA topic model are the dictionary and the corpus, 
#so we begin by creating the dictionary.

from gensim import corpora, models, similarities

#Create dictionary of words. 
dictionary = corpora.Dictionary(docs)

#View dictionary and see how many unique tokens it contains.
print(dictionary)
print(len(dictionary))

Dictionary(13945 unique tokens: ['a', 'aboutirent', 'accords', 'accédé', 'action']...)
13945


In [None]:
#In the next step, we will create the corpus for the LDA model.

#Create Term-Document Frequency, which also converts tokenized documents to vectors 
corpus = [dictionary.doc2bow(text) for text in docs]

#View
print(corpus[:1])

#Gensim creates a unique ID for each word in the document, and the resulting printout in the View step above,
#shows the mapping: (word_id, word_frequency), so (0,1) means that word ID 0 occurs once in the first document.

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 3), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 2), (50, 1), (51, 1), (52, 1), (53, 1), (54, 2), (55, 2), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 4), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 3), (71, 1), (72, 3), (73, 1), (74, 2), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 4), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 2)

In [None]:
#If you want to see a word with a given ID, pass the ID as a key to the dictionary:
dictionary[0]

'a'

In [None]:
#Human-readable format of corpus (term-frequency)
[[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('a', 1),
  ('aboutirent', 1),
  ('accords', 1),
  ('accédé', 1),
  ('action', 1),
  ('administration', 1),
  ('afin', 1),
  ('agents', 1),
  ('ahmed', 1),
  ('aire', 1),
  ('algérie', 1),
  ('ali', 1),
  ('années', 1),
  ('anticipé', 1),
  ('arab', 2),
  ('au', 2),
  ('auprès', 1),
  ('avancement', 1),
  ('avantages', 1),
  ('avons', 1),
  ('ben', 1),
  ('bey', 3),
  ('beylec', 1),
  ('beylecs', 1),
  ('beys', 2),
  ('blessés', 1),
  ('bonne', 1),
  ('bou', 2),
  ('bénéfice', 1),
  ('campagnes', 1),
  ('cette', 1),
  ('chargea', 1),
  ('charges', 1),
  ('chef', 1),
  ('civils', 1),
  ('combattre', 1),
  ('conféra', 1),
  ('connut', 1),
  ('consentit', 1),
  ('considérablement', 1),
  ('constata', 1),
  ('contributions', 1),
  ('coordination', 1),
  ('cours', 2),
  ('création', 1),
  ('cédant', 1),
  ('dans', 1),
  ('demeurait', 1),
  ('destinées', 1),
  ('difficultés', 2),
  ('dignité', 1),
  ('directives', 1),
  ('dissidentes', 1),
  ('dizaine', 1),
  ('douaouda', 2),
  ('décida', 

## Identify a reasonable number of topics to try based on coherence scores

In [None]:
#compute coherence scores first to determine which number(s) of topics produces the most coherent results.

from gensim.models.coherencemodel import CoherenceModel
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary, random_state = 100, passes = 25, chunksize = 100)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=docs, start=2, limit=15, step=1)

In [None]:
#Plot the graph of coherence scores across different number of topics (from 2 to 15).
import matplotlib.pyplot as plt
limit=15; start=2; step=1
x = range(start, limit, step)
plt.scatter(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

NameError: name 'coherence_values' is not defined

### A 11-topic model seems to generate the best coherence scores, followed by 14 topics and then 9 topics.

In [None]:
#Parameter tuning will lead to different models. For this model, we use our corpus, dictionary, 
#set a random state of 100 (akin to setting a seed for reproduction purposes), 7 topics 
#(optimal number according to coherence scores), 25 passes (number of times the corpus is passed 
#through for training), and chunk size of 100 (number of documents to be used in each training chunk).

#See: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/. 

# How to optimize the interval, as in mallet (http://mallet.cs.umass.edu/topics.php)?
# --optimize-interval [NUMBER] This option turns on hyperparameter optimization, 
#which allows the model to better fit the data by allowing some topics to be more prominent than others. 
#Optimization every 10-20 iterations is reasonable. 

lda_model = models.LdaMulticore(corpus=corpus,
                         id2word=dictionary,
                         random_state=100,
                         num_topics=7,
                         passes=25,
                         chunksize=100,
                         )
lda_model.show_topics()

[(0,
  '0.006*"bey" + 0.003*"il" + 0.002*"ahmed" + 0.002*"fit" + 0.002*"province" + 0.002*"constantine" + 0.002*"zmala" + 0.002*"tribu" + 0.002*"turcs" + 0.002*"alger"'),
 (1,
  '0.014*"bey" + 0.010*"alger" + 0.010*"constantine" + 0.007*"il" + 0.006*"le" + 0.006*"tunis" + 0.006*"dey" + 0.005*"troupes" + 0.004*"pacha" + 0.004*"les"'),
 (2,
  '0.011*"bey" + 0.008*"ben" + 0.008*"le" + 0.008*"il" + 0.007*"constantine" + 0.006*"alger" + 0.005*"fit" + 0.005*"les" + 0.004*"ville" + 0.004*"mais"'),
 (3,
  '0.014*"bey" + 0.008*"il" + 0.006*"constantine" + 0.006*"le" + 0.005*"alger" + 0.004*"fit" + 0.004*"les" + 0.004*"ville" + 0.004*"faire" + 0.004*"pacha"'),
 (4,
  '0.014*"bey" + 0.007*"il" + 0.007*"ben" + 0.006*"le" + 0.005*"tunis" + 0.005*"constantine" + 0.005*"fit" + 0.005*"alger" + 0.004*"les" + 0.003*"qacentina"'),
 (5,
  '0.010*"bey" + 0.005*"constantine" + 0.005*"le" + 0.004*"tchakeur" + 0.004*"les" + 0.004*"il" + 0.003*"faire" + 0.003*"alger" + 0.003*"furent" + 0.003*"si"'),
 (6,
  '0.

In [None]:
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

ModuleNotFoundError: No module named 'numpy.core._multiarray_umath'

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### For other visualization options for topic models, see the sample code that ingests MALLET output and the following articles:

<ul>
    <li>Selva Prabhakaran, <a href="https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/">"Topic Modeling Visualization - How to present the results of LDA models?"</a>, <em>Machine Learning Plus</em></li>
    <li>Selva Prabhakaran, <a href="https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/">"LDA in Python - How to grid search best topic models?"</a>, <em>Machine Learning Plus.</em> This article is particularly helpful when you're ready to explore the results of the topic model and the relationships between the topics and documents. It also provides guidance on ways to assess the the model's performance.</li>
</ul>