Latent Dirichlet Allocation Tutorial at:
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [1]:
# Import libraries
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim for topic modeling functions
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# libraries to tokenize, clean up and calculate word counts
import nltk
from nltk.corpus import words
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')

wordlist = nltk.corpus.words.words()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
wordlist = [stemmer.stem(lemmatizer.lemmatize(word)) for word in wordlist]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kazeem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\kazeem\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kazeem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Import Dataset
df = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/py38/GTxM/data/MasterTokens_10to260SupTw_Not_GTr.csv')
#df = pd.read_csv('D:/KOPro/PhD/TechDelivery/SourceCode/py38/GTxM/data/GroundTruthTokens.csv', encoding = "ISO-8859-1")

<b>Available corpus fields:</b><br>
smrHashtags<br>smrMentions<br>smrNER<br>smrNouns<br>smrVerbs<br>smrAdverbs<br>smrAdjectives<br>

In [4]:
len(df['RecDoc'])

1512

In [5]:
df.head()

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,155,1057291398880391170,Could fireworks be restricted at Scottish homes?,This is the effect fireworks can have on a dog...,45.0,banfireworks fireworks,neilmackay gamesshed didriksoderlind bbcradios...,juli timmi uk last night daili bob marley ub j...,effect firework dog juli hors goat lot firewor...,stand comfort reli built hear held purchas des...,outsid long care fairli ahead exactli seemingl...,gener licens wide gener last daili big loud so...,This is the effect fireworks can have on a dog...,This is the effect fireworks can have on a dog...
1,258,1124056098925944832,Sonic movie: New trailer shows redesigned hedg...,Thank you for the support. And the criticism. ...,18.0,sonicmovie gottafixfast,fowltown,paramount sega sonic hollywood sonic jeff,support critic messag design chang paramount s...,happen care watch show listen handl learn wait...,fulli total definit actual,loud clear happi commit hard massiv awesom gla...,Thank you for the support. And the criticism. ...,Thank you for the support. And the criticism. ...
2,285,1135851552495865857,"'I Don't Know Prince Andrew,' Trump Says. Phot...","????????On Day 2 of the #USStateVisit, The Duk...",115.0,trump2020 usstatevisit realtalk liarinchief pe...,10downingstreet teram323tere fox5atlanta thedu...,duke york donald trump st jame palac uk us tru...,duke york prime minist presid donald trump st ...,clarifi surpris jump hear rememb mean swear fi...,usual besid cours anymor though enough clearli...,person dumber truth proud profession polit hug...,"????????On Day 2 of the #USStateVisit, The Duk...",????????On Day 2 of the The Duke of York Prime...
3,334,1151389038781390848,"Naga Munchetty, BBC News Anchor, Has Reprimand...","""I've been told as a woman of colour to 'go ho...",260.0,racist britains istandwithnaga trump2020 faken...,bbcworld foxnew washingtonpost jam99percent sp...,trump truth welldon speak faeifa fiffaeifa fif...,woman colour experi reaction comment presid tr...,share discu suppress speak experi listen trump...,home probabl freeli everytim regularli total s...,faeifaei anti trump fals sad bad vile pervert ...,"""I've been told as a woman of colour to 'go ho...","""I've been told as a woman of colour to 'go ho..."
4,374,1165822705037217792,Cars Are Death Machines. Self-Driving Tech Won...,"Please RT if you, or someone you know, has bee...",17.0,,aarieff realdonaldtrump ttmitch,yard hoboken nj washington yanke stadium long ...,car experi mobil panel daughter car yard aspha...,hit hit thrown end broken land save pass place...,badli recent straight right nearli nearli slow...,littl upcom catastroph oncom danger high small...,"Please RT if you, or someone you know, has bee...","Please RT if you, or someone you know, has bee..."


In [6]:
len(df)

1512

In [7]:
ExpName = "CGTNounAdv"
# df2 = df['smrNER'].fillna(value='') + df['smrAdverbs'].fillna(value='')
# df2 = df['smrNouns'].fillna(value='') + df['smrAdjectives'].fillna(value='')
# df2 = df['smrNouns'].fillna(value='') + df['smrAdverbs'].fillna(value='') + df['smrAdjectives'].fillna(value='')
df2 = df['smrNouns'].fillna(value='') + df['smrAdverbs'].fillna(value='')
# df2 = df['smrMentions'].fillna(value='') + df['smrNouns'].fillna(value='') + df['smrAdverbs'].fillna(value='')
# df2 = df['smrHashtags'].fillna(value='') + df['smrNouns'].fillna(value='') + df['smrNER'].fillna(value='')
# df2 = df['smrHashtags'].fillna(value='') + df['smrNouns'].fillna(value='') + df['smrNER'].fillna(value='')
# df2 = df['smrHashtags'].fillna(value='') + df['smrMentions'].fillna(value='') + df['smrNER'].fillna(value='')
# df2 = df['smrHashtags'].fillna(value='') + df['smrMentions'].fillna(value='') + df['smrNouns'].fillna(value='')
# df2 = df['smrHashtags'].fillna(value='') + df['smrNER'].fillna(value='')
# df2 = df['smrHashtags'].fillna(value='') + df['smrNouns'].fillna(value='') + df['smrVerbs'].fillna(value='')
# df2 = df['smrHashtags'].fillna(value='') + df['smrNER'].fillna(value='')
# df2 = df['smrMentions'].fillna(value='') + df['smrNouns'].fillna(value='') + df['smrVerbs'].fillna(value='')
# df2 = df['smrMentions'].fillna(value='') + df['smrNER'].fillna(value='')
# df2 = df['smrNouns'].fillna(value='') + df['smrVerbs'].fillna(value='')
# df2 = df['smrMentions'].fillna(value='') + df['smrNouns'].fillna(value='') 
# df2.index = df["RecID"]
data = df2.str.split()
data_words = data.values.tolist()
print('Token list created successfully.')

Token list created successfully.


In [8]:
#data_words[0]
#data.iloc[1]
#data.index

df_docs = df['RecID']

In [9]:
id2word = corpora.Dictionary(data_words)
print('Word dictionary created successfully.')

Word dictionary created successfully.


In [10]:
# Term Document Frequency
texts = data_words
corpus = [id2word.doc2bow(text) for text in texts]
print('Term-Doc-Frequency created successfully.')

Term-Doc-Frequency created successfully.


In [11]:
len(corpus)

1512

In [12]:
# Build LDA model
print('Building LDA model...')
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           #num_topics=5,     
                                           num_topics=20,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Print the Keyword in the 20 topics
print('LDA model created successfully.')
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

Building LDA model...
LDA model created successfully.
[(0,
  '0.043*"famili" + 0.023*"love" + 0.022*"fy" + 0.021*"gy" + 0.019*"servic" + '
  '0.018*"friend" + 0.018*"retir" + 0.018*"life" + 0.015*"god" + '
  '0.014*"heart"'),
 (1,
  '0.042*"peopl" + 0.030*"countri" + 0.015*"hate" + 0.014*"africa" + '
  '0.012*"child" + 0.012*"jew" + 0.011*"christian" + 0.011*"jesu" + '
  '0.011*"israel" + 0.011*"canada"'),
 (2,
  '0.022*"peopl" + 0.022*"space" + 0.013*"forc" + 0.010*"flight" + '
  '0.008*"money" + 0.008*"fire" + 0.007*"uniform" + 0.007*"air" + '
  '0.007*"dongcot" + 0.007*"world"'),
 (3,
  '0.000*"plsfnofyefy" + 0.000*"outland" + 0.000*"saucepan" + 0.000*"rua" + '
  '0.000*"puffin" + 0.000*"primev" + 0.000*"pressi" + 0.000*"shetland" + '
  '0.000*"perez" + 0.000*"murdersfno"'),
 (4,
  '0.176*"anc" + 0.092*"anoan" + 0.073*"anian" + 0.057*"ancan" + 0.056*"anaan" '
  '+ 0.052*"anuan" + 0.044*"ane" + 0.039*"anean" + 0.034*"anpsan" + '
  '0.028*"annan"'),
 (5,
  '0.026*"eu" + 0.022*"uk" + 0

In [13]:
print('Generating performance scores for '+ExpName)
# Compute Perplexity
perplex_lda = lda_model.log_perplexity(corpus)
print('\nPerplexity: {:.2f}'.format(perplex_lda))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: {:.2f}'.format(coherence_lda))


Generating performance scores for CGTNounAdv

Perplexity: -10.84

Coherence Score: 0.54


<b>Performance scores:</b><br>
0. GTrNounAdv<br>
Perplexity: -8.00<br>
Coherence Score: 0.51<br>
14. NounAdj<br>
Perplexity: -10.66<br>
Coherence Score: 0.50<br>
13. NERAdverb<br>
Perplexity: -12.22<br>
Coherence Score: 0.57<br>
12. NounAdverbAdj<br>
Perplexity: -10.60<br>
Coherence Score: 0.52<br>
11. NounAdverb<br>
Perplexity: -10.84<br>
Coherence Score: 0.54<br>
10. MentNounAdverb<br>
......Perplexity:  -11.335431089830974<br>
......Coherence Score:  0.5172917836343668<br>
9. Hashtags+Nouns+NER<br>
......Perplexity:  -11.389471096607<br>
......Coherence Score:  0.5580963360366651<br>
8. Hashtags+Mentions+NER<br>
......Perplexity:  -15.337374736555555<br>
......Coherence Score:  0.5550375566450458<br>
7. Hashtags+Mentions+Nouns<br>
......Perplexity:  -11.66981087109629<br>
......Coherence Score:  0.5480562627150661<br>
6. Hashtags+NER<br>
......Perplexity:  -15.117791431322871<br>
......Coherence Score:  0.4584529220486597<br>
5. Hashtags+Nouns+Verbs<br>
......Perplexity:  -10.764463686408684<br>
......Coherence Score:  0.48532559600503405<br>
4. Mentions+Nouns+Verbs<br>
......Perplexity:  -10.9279676838438<br>
......Coherence Score:  0.43582162664144775<br>
3. Mentions+NER<br>
......Perplexity:  -15.548485846134131<br>
......Coherence Score:  0.4594905858493948<br>
2. Nouns+Verbs<br>
......Perplexity:  -10.495273547798948<br>
......Coherence Score:  0.42574474436248205<br>
1. Mentions+Nouns<br>
......Perplexity:  -11.390487661937238<br>
......Coherence Score:  0.48191581854479937<br>

<b>pyLDAvis Visualizations</b><br>
https://stackoverflow.com/questions/50923430/what-does-the-parameter-mds-mean-in-the-pyldavis-sklearn-prepare-function<br>
pcoa:Principal Coordinate Analysis(aka Classical Multidimensional Scaling)<br>
mmds:Metric Multidimensional Scaling<br>
tsne:t-distributed Stochastic Neighbor Embedding<br>

In [18]:
#-- use sort_topics=False -- https://stackoverflow.com/questions/59322409/pyldavis-visualisation-does-not-align-with-generated-topics
vis2 = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='tsne', sort_topics=False)
pyLDAvis.save_html(vis2, 'lda_tsne_'+ExpName+'_sortFalse_v2.html')

  default_term_info = default_term_info.sort_values(


In [19]:
vis2 = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='tsne', sort_topics=True)
pyLDAvis.save_html(vis2, 'lda_tsne_'+ExpName+'_sortTrue_v2.html')

  default_term_info = default_term_info.sort_values(


In [16]:
#vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='mmds', sort_topics=False)
#pyLDAvis.save_html(vis, 'lda.html')

## Generate Document-Topic lists

In [17]:
doc_topic = lda_model.get_document_topics(corpus, minimum_probability=0.1)

In [57]:
doc_topic[0]

[(7, 0.13442786), (14, 0.5890485)]

In [13]:
# Alternative approach: for loop comprehension
#topics = [ [entry[0] for entry in doc] for doc in doc_topic ]
#topics[0]
#scores = [ [entry[1] for entry in doc] for doc in doc_topic ]
#scores[0]

In [20]:
#Alternative approach: get dominant topics based on scores
#df_doc_topic = pd.DataFrame(columns=['RecID', 'TopicID', 'TopicScore'])
#max(df_doc_topic['TopicScore'][1])
#for i in range(len(df_docs)):
#    max_idx = np.argmax(df_doc_topic['TopicScore'][i])
#    print(df_docs[i], df_doc_topic['TopicID'][i][max_idx], df_doc_topic['TopicScore'][i][max_idx])
#    df_doc_topic_dominant.loc[i] = [df_docs[i].astype(str), df_doc_topic['TopicID'][i][max_idx], df_doc_topic['TopicScore'][i][max_idx]]
#    if i>5:
#        break;

In [21]:
i = 0
docs=[]
topics=[]
scores=[]
for doc in doc_topic:
    doc_id = df_docs.iloc[i]
    i = i+1
    #if i>5: break
    for topic_id, score in doc:
        #print(doc_id, topic_id, score)
        scores.append(score)
        topics.append(topic_id)
        docs.append(doc_id)

In [22]:
# The results below are the old values based on this code (which stopped working):
df_doc_topic = pd.DataFrame({'RecID': docs, 'TopicID': topics, 'TopicScore': scores})

In [23]:
df_doc_topic

Unnamed: 0,RecID,TopicID,TopicScore
0,1057291398880391170,2,0.546937
1,1057291398880391170,5,0.138392
2,1124056098925944832,0,0.484139
3,1124056098925944832,2,0.171029
4,1124056098925944832,9,0.158561
...,...,...,...
4247,1223365339494453248,5,0.554148
4248,1223365339494453248,18,0.244555
4249,222818213392678912,5,0.728429
4250,222818213392678912,9,0.200785


In [24]:
df_doc_topic.groupby(by='TopicID').count()

Unnamed: 0_level_0,RecID,TopicScore
TopicID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,461,461
1,161,161
2,828,828
4,45,45
5,453,453
6,324,324
7,4,4
9,862,862
10,22,22
12,1,1


In [25]:
df_doc_topic.to_csv('lda_doc_topic_all_'+ExpName+'_v2.csv')

In [26]:
df_doc_topic_nodup = df_doc_topic.sort_values(['TopicScore'], ascending=(False)).drop_duplicates(['RecID'])

In [27]:
len(df_doc_topic_nodup)

1512

In [28]:
df_doc_topic_nodup.to_csv('lda_doc_dominant_topic_'+ExpName+'_v2.csv')

In [29]:
topics_ndarray = df_doc_topic_nodup.TopicID.unique()
topic_list =sorted(topics_ndarray)

In [30]:
listTopicID = list(set(df_doc_topic_nodup['TopicID'].tolist())) # get the unique list of Topic IDs

In [31]:
listTopicID

[0, 1, 2, 4, 5, 6, 9, 10, 12, 14, 15, 17, 18, 19]

## Select top 20%, upto 20 items max of each Topic

In [32]:
df_doc_topic_top20pcent = pd.DataFrame(columns=['RecID', 'TopicID', 'TopicScore'])
i=0
for i in range(len(listTopicID)):
    df_temp = df_doc_topic_nodup[(df_doc_topic_nodup.TopicID == listTopicID[i])]
    topic_items = len(df_temp)
    #if len(df_temp) > 9:
    topic20pc_items = round(len(df_temp)/5)
    if topic20pc_items > 20: #max 20 items
        topic20pc_items = 20
    df_temp = df_temp.head(topic20pc_items)
    df_doc_topic_top20pcent = pd.concat([df_doc_topic_top20pcent, df_temp])
    print(i, topic_items, topic20pc_items)

0 186 20
1 20 4
2 310 20
3 25 5
4 198 20
5 84 17
6 243 20
7 3 1
8 1 0
9 31 6
10 85 17
11 38 8
12 28 6
13 260 20


In [33]:
df_doc_topic_top20pcent

Unnamed: 0,RecID,TopicID,TopicScore
665,1184501980313636864,0,0.930105
1328,1191372018500980736,0,0.912929
168,1180775281067462658,0,0.902320
2286,1201730618955948032,0,0.899565
189,1181111461306867713,0,0.886614
...,...,...,...
3106,1213913877135712258,19,0.758603
203,1181203256774664192,19,0.752758
4131,1221822472292634625,19,0.751133
2627,1206218767480438784,19,0.749127


In [34]:
df_doc_topic_top20pcent.to_csv('lda_doc_topic_top20pct_'+ExpName+'_v2.csv')

In [35]:
df_doc_topic_top20pcent.groupby(by='TopicID').count()

Unnamed: 0_level_0,RecID,TopicScore
TopicID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,20,20
1,4,4
2,20,20
4,5,5
5,20,20
6,17,17
9,20,20
10,1,1
14,6,6
15,17,17


In [36]:
len(df_doc_topic_top20pcent)

164