# LDA Modelling - 3rd Run

This model uses fewer types of words, restricting word types to either:

1. n-grams and nouns only, or
2. n-grams, nouns, and verbs (newly added during deployment)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import string
import nltk
from nltk import WordNetLemmatizer
import spacy
import re
import bbcode
import json

import gensim
from gensim.test.utils import datapath
from gensim import corpora, models, similarities
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS

import pyLDAvis
import pyLDAvis.gensim as p_gensim

import os
import pathlib
%matplotlib inline

In [None]:
final_df = pd.read_csv('./dataframes/final_df.csv',index_col=0)

In [None]:
final_df = final_df[['timestamp_created','review','clean_reviews','2gram_reviews','3gram_reviews']]
final_df.head()

In [None]:
final_df['clean_reviews'] = final_df['clean_reviews'].map(lambda x: ''.join(c for c in x if c=='_' or c not in string.punctuation).split()) #n-grams underscores must be preserved for readability
final_df['2gram_reviews'] = final_df['2gram_reviews'].map(lambda x: ''.join(c for c in x if c=='_' or c not in string.punctuation).split()) 
final_df['3gram_reviews'] = final_df['3gram_reviews'].map(lambda x: ''.join(c for c in x if c=='_' or c not in string.punctuation).split()) 
#Reading in the DF from a CSV turned the list of words in each cell into string, so we have to remove the punctuation and split them again to get lists of terms

In [None]:
final_df.head()

In [None]:
final_df.dtypes

In [None]:
# Text Cleaning Redux

nlp = spacy.load("en_core_web_sm")
parser = bbcode.Parser()

#expand contractions
with open('./en_contractions/contra_dict.txt') as contra_dict:
    cList = json.load(contra_dict)

c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text, c_re=c_re):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text.lower())

#convert numbers to words

num_dict = {'0':'zero',
            '1':'one',
            '2':'two',
            '3':'three',
            #'i':'one',      skipped. Using Roman numeral 'i' will conflict with the pronoun "I", which is not a number
            'ii':'two',
            'iii':'three'   
            }

def num2word(d):
    
    if (len(d) == 1 and d in '0123')  or (d in ['ii','iii']):
        word = num_dict[d]
    
    elif (len(str(d))==1 and str(d) in '0123'):
        word = num_dict(str(d))
    
    else:
        word = d
    
    return word

#define stopwords

en_stopwords = list(set(STOPWORDS))
en_stopwords.extend(['good','better','great','lot','game','like','I','i'])
en_stopwords = [w for w in en_stopwords if w not in ['one','two','three']]    #retain these for making n-grams, then remove afterwards

def remove_stopwords(doc):
    words = [num2word(w) for w in doc if w != '' and w not in en_stopwords]
    return words


#combine cleaning functions into one function
def parse_clean(text):
    parsed_text = parser.strip(text) #remove BBcode notations from text
    
    text = expandContractions(parsed_text) #expand contractions; return all text in lower case
    
    text = re.split(r'\W+',text) #separate words from punctuation (e.g. remove "'s" from "Cao Cao's")
    
    text = [num2word(w) for w in text] #convert single digits to words before word len check, or they will be lost
    
    #All word lengths should be >1 character and <= length of the longest word in the English language. It's common for people spam incoherent letters on the Internet.
    text = [word for word in text if word not in en_stopwords and len(word)>1 and len(word) <= len('pneumonoultramicroscopicsilicovolcanoconiosis')] 
    
    clean_text = [num2word(w) for w in text] #just in case any lone numbers appeared after cleaning
    
    return clean_text


def stop_clean(texts):
    texts = [parse_clean(doc) for doc in texts]
    texts = [remove_stopwords(doc) for doc in texts] #just in case, remove stopwords one more time
    
    return texts

#at this point we will make n-grams, then lemmatise using spacy since it can go by permitted postags

def spacy_lemma(bow,allowed_postags=['NOUN']): #can add any from https://spacy.io/api/annotation#pos-tagging but will be sticking to NOUN, VERB, ADJ
    
    lemma_doc = nlp(" ".join(bow)) 

    lemma_text = [token.text if '_' in token.text else token.lemma_ if token.pos_ in allowed_postags else '' for token in lemma_doc]
    
    return lemma_text

In [26]:
model2_df = final_df[['timestamp_created','review']]
model2_df.head()

Unnamed: 0,timestamp_created,review
0,1562590376,Well for me game still tons of work. i like it...
1,1562581870,I pursued Lu Bu. Now I [b]AM[/b] LU BU.
2,1562578221,Absolutely great game. \nAll the new diplomacy...
3,1562575368,A fine blend of Warhammer I/II: Total War and ...
4,1562568222,Innovative Total Game that has lots of persona...


In [27]:
model2_df['clean_reviews'] = stop_clean(model2_df['review'])
model2_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,timestamp_created,review,clean_reviews
0,1562590376,Well for me game still tons of work. i like it...,"[tons, work, recommend, one, diplomacy, joke, ..."
1,1562581870,I pursued Lu Bu. Now I [b]AM[/b] LU BU.,"[pursued, lu, bu, lu, bu]"
2,1562578221,Absolutely great game. \nAll the new diplomacy...,"[absolutely, new, diplomacy, options, depth, u..."
3,1562575368,A fine blend of Warhammer I/II: Total War and ...,"[fine, blend, warhammer, two, total, war, shog..."
4,1562568222,Innovative Total Game that has lots of persona...,"[innovative, total, lots, personality, brings,..."


In [28]:
#Now that the cleaner text is ready (only or mostly nouns), we create n-grams again

# Credit to https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/ for the n-grams code

# Build the bigram and trigram models
bigram = gensim.models.Phrases(list(model2_df['clean_reviews']), min_count=5, threshold=10) # feed a list of lists of words e.g. [['word1','word2'],['word3','word4'] to get bigrams]
trigram = gensim.models.Phrases(bigram[list(model2_df['clean_reviews'])], threshold=10)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [29]:
model2_df['3gram_reviews'] = make_trigrams(model2_df['clean_reviews'])
model2_df['3grams_nouns'] = model2_df['3gram_reviews'].map(lambda x: spacy_lemma(x))
model2_df['3grams_nouns_verbs'] = model2_df['3gram_reviews'].map(lambda x: spacy_lemma(x,allowed_postags=['NOUN','VERB']))
model2_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,timestamp_created,review,clean_reviews,3gram_reviews,3grams_nouns,3grams_nouns_verbs
0,1562590376,Well for me game still tons of work. i like it...,"[tons, work, recommend, one, diplomacy, joke, ...","[tons, work, recommend, one, diplomacy, joke, ...","[ton, , , , diplomacy, joke, , , military_acce...","[ton, work, recommend, , diplomacy, joke, work..."
1,1562581870,I pursued Lu Bu. Now I [b]AM[/b] LU BU.,"[pursued, lu, bu, lu, bu]","[pursued, lu_bu, lu_bu]","[, lu_bu, lu_bu]","[pursue, lu_bu, lu_bu]"
2,1562578221,Absolutely great game. \nAll the new diplomacy...,"[absolutely, new, diplomacy, options, depth, u...","[absolutely, new, diplomacy_options, depth, un...","[, , diplomacy_options, depth, unit_variety, ,...","[, , diplomacy_options, depth, unit_variety, r..."
3,1562575368,A fine blend of Warhammer I/II: Total War and ...,"[fine, blend, warhammer, two, total, war, shog...","[fine, blend, warhammer_two, total_war, shogun...","[, blend, warhammer_two, total_war, shogun_two...","[, blend, warhammer_two, total_war, shogun_two..."
4,1562568222,Innovative Total Game that has lots of persona...,"[innovative, total, lots, personality, brings,...","[innovative, total, lots, personality, brings,...","[, , lot, personality, , , diplomacy, idea, co...","[, , lot, personality, bring, , diplomacy, ide..."


In [32]:
en_stopwords = list(set(STOPWORDS))
en_stopwords.extend(['good','better','great','lot','game','like','I','i','one','two','three']) #after making n-grams, removing numbers should reduce noise

model2_df['3grams_nouns'] = model2_df['3grams_nouns'].map(lambda x: remove_stopwords(x)) #removes blanks as well
model2_df['3grams_nouns_verbs'] = model2_df['3grams_nouns_verbs'].map(lambda x: remove_stopwords(x)) 

In [33]:
model2_df.head()

Unnamed: 0,timestamp_created,review,clean_reviews,3gram_reviews,3grams_nouns,3grams_nouns_verbs
0,1562590376,Well for me game still tons of work. i like it...,"[tons, work, recommend, one, diplomacy, joke, ...","[tons, work, recommend, one, diplomacy, joke, ...","[ton, diplomacy, joke, military_access, cao_ca...","[ton, work, recommend, diplomacy, joke, work, ..."
1,1562581870,I pursued Lu Bu. Now I [b]AM[/b] LU BU.,"[pursued, lu, bu, lu, bu]","[pursued, lu_bu, lu_bu]","[lu_bu, lu_bu]","[pursue, lu_bu, lu_bu]"
2,1562578221,Absolutely great game. \nAll the new diplomacy...,"[absolutely, new, diplomacy, options, depth, u...","[absolutely, new, diplomacy_options, depth, un...","[diplomacy_options, depth, unit_variety, warha...","[diplomacy_options, depth, unit_variety, reduc..."
3,1562575368,A fine blend of Warhammer I/II: Total War and ...,"[fine, blend, warhammer, two, total, war, shog...","[fine, blend, warhammer_two, total_war, shogun...","[blend, warhammer_two, total_war, shogun_two, ...","[blend, warhammer_two, total_war, shogun_two, ..."
4,1562568222,Innovative Total Game that has lots of persona...,"[innovative, total, lots, personality, brings,...","[innovative, total, lots, personality, brings,...","[personality, diplomacy, idea, combat, bit, fa...","[personality, bring, diplomacy, idea, combat, ..."


# LDA Model - 3grams - Nouns Only

In [41]:
#build dictionary and corpus from 3gram dataset, NOUNS only with filter_extremes()

documents = list(model2_df['3grams_nouns'])
dictionary = gensim.corpora.Dictionary(documents)
dictionary.filter_extremes(no_below = 5,no_above=0.5) #trying with default settings
corpus = [dictionary.doc2bow(word) for word in documents]

In [42]:
# LDA model parameters -- we will go with just 5 topics to keep the model more generalised
num_topics = 5
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [43]:
#NOTE: LDAMultiCore; set workers = n-1 (where n is your number of cores)

%time ldamodel1 = LdaMulticore(corpus, num_topics=num_topics, id2word = dictionary, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3)

# Check resulting topics.
topic_list = ldamodel1.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Wall time: 1min 50s
best_total_war total_war love campaign fun _ hour time play shogun_two total_war_games battle one_best_total_war issue bug
unit battle faction character army diplomacy general campaign total_war time hero thing building way option
history time player thing year tw hour money three_kingdoms way need play people world developer
total_war series three_kingdoms fan diplomacy character battle shogun_two campaign love title one_best date dynasty_warriors entry
army war faction cao_cao vassal liu_bei lu_bu enemy character yuan_shao general china emperor battle man


In [44]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel1.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda1 = CoherenceModel(model=ldamodel1, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_lda1 = coherence_model_lda1.get_coherence()
print('\nCoherence Score: ', coherence_lda1)


Perplexity:  -6.485774337157328

Coherence Score:  0.5353204591401278


In [45]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel1, corpus, dictionary)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [40]:
#save the model

newpath = './models/nouns_only/model1' 
if not os.path.exists(newpath):
    os.makedirs(newpath)
    
ldamodel1.save('./models/nouns_only/model1/model1.model')

#Usable model with good topics! Very similar to topics found in first run and arguably better!

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Preliminary Review

This model looks much more usable than previous ones.

Coherence score is higher as well. I'm starting to see some of the coherent topics I previously identified as well.

We can further refine the nouns-only model by removing more stopwords.

Changes to the approach:

1. Cleaning
    - Removed everything other than Nouns and n-grams
    - Used filter_extremes() with default settings (no_below = 5, no_above = 0.5)
   
2. Modelling
    - Decided to use 5 topics only, because I am only looking for 5 general topics (for potential use in a later model that can generalise across the series and potentially the whole strategy genre... or at least the Total War series)
    
    
Changes to Results:
1. 1.5x the Coherence score of previous models (~0.53 coherence! Previous models had ~ 0.38 coherence even with ~10 topics!)
2. 5 topics that seem very coherent

# Fine-tuning

Attempting to refine the results by removing more terms that may be generating noise

In [178]:
en_stopwords = list(set(STOPWORDS))
en_stopwords.extend(['good','better','great','lot','game','like','I','i','one','two','three','thing','bit','total_war','time','10_10','love','fun','play','hour']) #after making bigrams, removing numbers should reduce noise
model2_df['3grams_nouns_v2'] = model2_df['3grams_nouns'].map(lambda x: remove_stopwords(x))

In [179]:
#build dictionary and corpus from 3gram dataset -- this time with filter_extremes

documents3 = list(model2_df['3grams_nouns_v2'])
dictionary3 = gensim.corpora.Dictionary(documents3)
dictionary3.filter_extremes(no_below = 5,no_above=0.5)
corpus3 = [dictionary3.doc2bow(word) for word in documents3]

In [180]:
# LDA model parameters.
num_topics = 5
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [181]:
#Beware, LDA has some randomness to it if you do not set a random_state. May not produce high-coherence model every time.

%time ldamodel3 = LdaMulticore(corpus3, num_topics=num_topics, id2word = dictionary3, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3,random_state = 14180)

# Check resulting topics.
topic_list = ldamodel3.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Wall time: 1min 27s
series three_kingdoms best_total_war fan shogun_two campaign diplomacy history title battle way gameplay tw total_war_games dynasty_warriors
battle character faction diplomacy general campaign unit army hero romance_mode option three_kingdoms duel player way
campaign total_war_games blood dlc battle developer release steam issue month shogun_two blood_dlc patch date day
war lu_bu cao_cao china liu_bei faction yuan_shao army vassal emperor sun_jian diplomacy campaign friend coalition
unit army faction battle general enemy cavalry building archer commander way campaign mechanic infantry stack


In [187]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel3.log_perplexity(corpus3))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda3 = CoherenceModel(model=ldamodel3, texts=documents3, dictionary=dictionary3, coherence='c_v')
coherence_lda3 = coherence_model_lda3.get_coherence()
print('\nCoherence Score: ', coherence_lda3)


Perplexity:  -6.5997657738837505

Coherence Score:  0.5454689780490602


In [183]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel3, corpus3, dictionary3)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [184]:
#saving as model3 because I will call model2 from a saved file
newpath = './models/nouns_only/model3' 
if not os.path.exists(newpath):
    os.makedirs(newpath)
    
ldamodel3.save('./models/nouns_only/model3/model3.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# Randomness in LDA

Due to the randomness in LDA, topics identified are likely to differ each time it is run.

For consistent results, best to include random_state when training the model. (Will do this for future iterations)

The next cell will load a pre-trained model that was used for the remainder of the project.

It has the topics that are general and coherent, and which best fit the purpose of this project.

### This Model

Topics are reasonably coherent, but more suitable for another purpose.

1. Topic 1 - Game Features/Strategic Gameplay (Multiple topics)
2. Topic 2 - Series Authenticity
3. Topic 3 - Tactical Gameplay
4. Topic 4 - Characters
5. Topic 5 - Downloadable Content and Patches

In [143]:
#set up the saved model

lda_saved = LdaMulticore.load('./models/nouns_only/saved_model/saved.model')

documents_saved = list(model2_df['3grams_nouns_v2'])

dictionary_saved = gensim.corpora.Dictionary.load('./models/nouns_only/saved_model/saved.model.id2word')

#dictionary_saved.filter_extremes(no_below = 5,no_above=0.5) #default settings were used when training this model

corpus_saved = [dictionary_saved.doc2bow(doc) for doc in documents_saved]

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [145]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(lda_saved, corpus_saved, dictionary_saved)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# Topics Identified

The nouns-only iterations have clear topic separation between all topics, and the saved one from a previous iteration has coherent topics.

1. **Content & Authenticity**</br>
    - (Topic 2) Game content and its authenticity to the Total War series
    - Contains comparisons to other titles in the Total War series (e.g. shogun_two --> Shogun 2, one of the most well-received installments in the series)
    - Important because target market is full of hardcore history/ROTK buffs 
    - Total War has also carved a nice niche for itself in the game industry - the only closest competitor is Ultimate General, created by a Total War modder </br> </br>

2. **Strategic Gameplay**</br>
    - (Topic 4) Gameplay on the strategic scale, e.g. movements on the campaign map, city management, diplomacy, negotiations
    - Inclusive of new features e.g. revamped Diplomacy, Romance/Records mode selection, Faction playstyles </br> </br>

3. **Tactical Gameplay**</br>
    - (Topic 1) Gameplay of individual battles, managing generals' equipment and skills, their retinues, individual units, managing individual cities, etc </br> </br>

4. **Characters**</br>
    - (Topic 3) Characters in the game and their behaviours.
    - Game is based on both Records and Romance of the Three Kingdoms, historical records (Chen Shou, ~300 AD) and a novel (Luo Guanzhong, 14th Century AD) respectively. 
    - Strong overlap with Topic 1.
    - Draws comparisons to Koei Tecmo's Dynasty Warriors series based on the same source material because of Romance Mode, where generals are lone units capable of incredible feats in battle. </br> </br>

5. **UI/UX, Performance, Stability**</br>
    - (Topic 5) Bugs, crashes, and fixes are terms relevant to this topic and they often come with games that work on massive scales like Total War.
    - General performance of the game (each player uses different PC specifications)
    - Bugs are always a concern and are virtually unavoidable for games, especially close to release.
    - Examples include bugs affecting random crashes in the middle of the game and [crashes involving Liu Bei's annexation ability when the game was first released](https://steamcommunity.com/app/779340/discussions/0/1642038749328500806/).

The corrections made from previous versions of the model seem to have had a tremendously positive effect on getting coherent topics out of this model.

In [216]:
#save visualisation of LDA model
LDA_fin = pyLDAvis.gensim.prepare(lda_saved, corpus_saved, dictionary_saved)
pyLDAvis.save_html(LDA_fin, './viz/lda_fin.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [203]:
# Trying out nouns and verbs
en_stopwords = list(set(STOPWORDS))
en_stopwords.extend(['good','better','great','lot','game','like','I','i','one','two','three','thing','bit','total_war','time','10_10','love','fun','play','hour']) 
model2_df['3grams_nouns_verbs_v2'] = model2_df['3grams_nouns_verbs'].map(lambda x: remove_stopwords(x))

In [204]:
#build dictionary and corpus from 3gram dataset -- this time with filter_extremes

documents4 = list(model2_df['3grams_nouns_verbs_v2'])
dictionary4 = gensim.corpora.Dictionary(documents4)
dictionary4.filter_extremes(no_below = 5,no_above=0.5)
corpus4 = [dictionary4.doc2bow(word) for word in documents4]

In [205]:
# LDA model parameters.
num_topics = 5
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [206]:
#Beware, LDA has some randomness to it if you do not set a random_state. May not produce high-coherence model every time.

seed = np.random.randint(0,999999)
print("Seed:", seed,"\n")
%time ldamodel4 = LdaMulticore(corpus4, num_topics=num_topics, id2word = dictionary4, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3,random_state = seed)

# Check resulting topics.
topic_list = ldamodel4.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Seed: 341010 

Wall time: 1min 49s
series feel diplomacy three_kingdoms best_total_war campaign fan shogun_two battle think release total_war_games buy title tw
battle character general campaign ca add look three_kingdoms unit duel hero want feel china romance_mode
battle diplomacy turn blood friend delay feel hope work character month support dlc change reason
turn war crash cao_cao lu_bu fix start fight liu_bei army yuan_shao campaign know want faction
unit faction army ai battle diplomacy general character building need use campaign enemy build feel


In [207]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel4.log_perplexity(corpus4))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda4 = CoherenceModel(model=ldamodel4, texts=documents4, dictionary=dictionary4, coherence='c_v')
coherence_lda4 = coherence_model_lda4.get_coherence()
print('\nCoherence Score: ', coherence_lda3)


Perplexity:  -6.802324161975279

Coherence Score:  0.5454689780490602


In [208]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel4, corpus4, dictionary4)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [209]:
# LDA model parameters.
num_topics = 7
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [210]:
#Beware, LDA has some randomness to it if you do not set a random_state. May not produce high-coherence model every time.

seed = np.random.randint(0,999999)
print("Seed:", seed,"\n")
%time ldamodel4 = LdaMulticore(corpus4, num_topics=num_topics, id2word = dictionary4, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3,random_state = seed)

# Check resulting topics.
topic_list = ldamodel4.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Seed: 736409 

Wall time: 1min 58s
feel three_kingdoms best_total_war campaign diplomacy series character release battle work change ca think shogun_two add
series fan battle shogun_two three_kingdoms total_war_games enjoy ca feel title dynasty_warriors romance_mode diplomacy campaign one_best
unit battle faction army character general ai diplomacy campaign feel need building enemy look start
fix crash campaign bug battle tw update issue wait patch release buy review launch franchise
diplomacy lu_bu campaign strategy mechanic battle experience turn friend coalition execute one_turn army want need
buy need look think chinese_history pay dlc win fan blood battle guy s war dlcs
war china cao_cao turn yuan_shao liu_bei vassal army fight want sun_jian kill lu_bu people let


In [211]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel4.log_perplexity(corpus4))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda4 = CoherenceModel(model=ldamodel4, texts=documents4, dictionary=dictionary4, coherence='c_v')
coherence_lda4 = coherence_model_lda4.get_coherence()
print('\nCoherence Score: ', coherence_lda3)


Perplexity:  -6.843546324823157

Coherence Score:  0.5454689780490602


In [212]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel4, corpus4, dictionary4)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [213]:
#saving this model that uses nouns and verbs, although the results are not as good as the saved model
newpath = './models/nouns_verbs/model4' 
if not os.path.exists(newpath):
    os.makedirs(newpath)
    
ldamodel4.save('./models/nouns_verbs/model4/model4.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [214]:
model2_df.to_csv('./dataframes/model2_df.csv')

### References

https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/