# Modelling - 3rd Run

This model uses fewer types of words, restricting word types to just n-grams and nouns.

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import string
import nltk
from nltk import WordNetLemmatizer
import spacy
import re
import bbcode
import json

import gensim
from gensim.test.utils import datapath
from gensim import corpora, models, similarities
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pyLDAvis
import pyLDAvis.gensim as p_gensim

import os
import pathlib
%matplotlib inline

In [11]:
final_df = pd.read_csv('./dataframes/final_df.csv',index_col=0)

In [12]:
final_df = final_df[['review','clean_reviews','2gram_reviews','3gram_reviews']]
final_df.head()

Unnamed: 0,review,clean_reviews,2gram_reviews,3gram_reviews
0,Well for me game still tons of work. i like it...,"['tons', 'work', 'recommend', '1', 'diplomacy'...","['ton', 'work', 'recommend', 'diplomacy', 'jok...","['ton', 'work', 'recommend', 'diplomacy', 'jok..."
1,I pursued Lu Bu. Now I [b]AM[/b] LU BU.,"['pursued', 'lu', 'bu', 'lu', 'bu']","['pursued', 'lu_bu', 'lu_bu']","['pursued', 'lu_bu', 'lu_bu']"
2,Absolutely great game. \nAll the new diplomacy...,"['absolutely', 'great', 'game', 'new', 'diplom...","['absolutely', 'great', 'new', 'diplomacy_opti...","['absolutely', 'great_game', 'new', 'diplomacy..."
3,A fine blend of Warhammer I/II: Total War and ...,"['fine', 'blend', 'warhammer', 'iii', 'total',...","['fine', 'blend', 'warhammer', 'total_war', 's...","['fine', 'blend', 'warhammer', 'total_war', 's..."
4,Innovative Total Game that has lots of persona...,"['innovative', 'total', 'lots', 'personality',...","['innovative', 'total', 'lot', 'personality', ...","['innovative', 'total', 'lot', 'personality', ..."


In [13]:
# Reading the DataFrame back from CSV turned every cell's list of words into a
# plain string, so the punctuation is stripped and the text split again to get
# lists of terms. Underscores are kept so that n-grams stay readable.
def _restore_tokens(cell):
    """Return the terms of one stringified token list, preserving '_' in n-grams."""
    kept = [ch for ch in cell if ch == '_' or ch not in string.punctuation]
    return ''.join(kept).split()

for token_col in ('clean_reviews', '2gram_reviews', '3gram_reviews'):
    final_df[token_col] = final_df[token_col].map(_restore_tokens)

In [14]:
final_df.head()

Unnamed: 0,review,clean_reviews,2gram_reviews,3gram_reviews
0,Well for me game still tons of work. i like it...,"[tons, work, recommend, 1, diplomacy, joke, wo...","[ton, work, recommend, diplomacy, joke, work, ...","[ton, work, recommend, diplomacy, joke, work, ..."
1,I pursued Lu Bu. Now I [b]AM[/b] LU BU.,"[pursued, lu, bu, lu, bu]","[pursued, lu_bu, lu_bu]","[pursued, lu_bu, lu_bu]"
2,Absolutely great game. \nAll the new diplomacy...,"[absolutely, great, game, new, diplomacy, opti...","[absolutely, great, new, diplomacy_options, de...","[absolutely, great_game, new, diplomacy_option..."
3,A fine blend of Warhammer I/II: Total War and ...,"[fine, blend, warhammer, iii, total, war, shog...","[fine, blend, warhammer, total_war, shogun, to...","[fine, blend, warhammer, total_war, shogun, to..."
4,Innovative Total Game that has lots of persona...,"[innovative, total, lots, personality, brings,...","[innovative, total, lot, personality, brings, ...","[innovative, total, lot, personality, brings, ..."


In [15]:
# Text Cleaning Redux

nlp = spacy.load("en_core_web_sm")
parser = bbcode.Parser()

# Load the pre-defined dictionary of English contractions. It is kept in a
# separate file (JSON content despite the .txt suffix) because it is long and
# would clutter the notebook.
with open('./en_contractions/contra_dict.txt') as contra_dict:
    cList = json.load(contra_dict)

# The replacement is looked up with cList[match.group(0)], so every key is
# literal text rather than a pattern: escape it. Regex alternation takes the
# first alternative that matches, so keys are ordered longest-first to stop a
# short contraction from shadowing a longer one that begins with it.
c_re = re.compile('(%s)' % '|'.join(
    re.escape(key) for key in sorted(cList, key=len, reverse=True)))

def expandContractions(text, c_re=c_re):
    """Lower-case ``text`` and expand every contraction found in ``cList``.

    Parameters
    ----------
    text : str
        Raw text to process.
    c_re : compiled regex, optional
        Pattern matching the contraction keys; defaults to the module-level
        ``c_re`` built from ``cList``.

    Returns
    -------
    str
        The lower-cased text with each matched contraction replaced by its
        expansion from ``cList``.
    """
    # NOTE(review): matching happens after lower-casing, so a key containing
    # upper-case characters can never match -- confirm the dictionary is lower-case.
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text.lower())


# Both values are identical for every word of every review, so they are built
# once here instead of once per word inside parse_clean.
_MAX_WORD_LEN = len('pneumonoultramicroscopicsilicovolcanoconiosis')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def parse_clean(text):
    """Turn one raw review into a list of lower-case, punctuation-free tokens.

    Steps, in order:
    1. ``parser.strip`` removes the BBCode notation so the text can be processed.
    2. ``expandContractions`` expands contractions and lower-cases the text.
    3. Whitespace-split words are dropped if they are in ``en_stopwords``, are
       a single character, or are longer than the longest word in the English
       language (people commonly spam incoherent letters on the Internet).
    4. Punctuation is deleted from each surviving word.

    ``en_stopwords`` is a notebook-level global defined in a later cell; it is
    read when the function is called, not when it is defined.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing BBCode.

    Returns
    -------
    list of str
        Cleaned tokens. A token can be the empty string, or a stop word, if it
        consisted of or was attached to punctuation, because punctuation is
        removed after the filtering step; ``remove_stopwords`` deals with those.
    """
    parsed_text = parser.strip(text)

    text = expandContractions(parsed_text)

    words = [word for word in text.split()
             if word not in en_stopwords and 1 < len(word) <= _MAX_WORD_LEN]

    return [word.translate(_PUNCT_TABLE) for word in words]

  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)


In [16]:
# Spelled-out forms of the numerals worth keeping. Only 0-3 are covered because
# most games have at most three instalments (contrary examples exist, e.g.
# Final Fantasy XIV and XV; Dynasty Warriors 9; RotK XIII). For Total War,
# going up to 3 is enough to capture "three kingdoms" in an n-gram.
# The Roman numeral 'i' is deliberately absent: it would clash with the
# pronoun "I", which is not a number.
num_dict = {
    '0': 'zero',
    '1': 'one',
    '2': 'two',
    '3': 'three',
    'ii': 'two',
    'iii': 'three',
}


def num2word(d):
    """Return the word form of a small numeral, or ``d`` unchanged otherwise.

    Single digits 0-3 and the Roman numerals 'ii' / 'iii' are translated via
    ``num_dict``; every other token is passed through as-is.
    """
    is_small_digit = len(d) == 1 and d in '0123'
    if is_small_digit or d in ['ii', 'iii']:
        return num_dict[d]
    return d


# Start from gensim's stop list and add filler words specific to game reviews.
en_stopwords = list(set(STOPWORDS))
en_stopwords.extend(['good','better','great','lot','game','like','I','i'])
# Number words are taken out of the stop list at this stage so that they
# survive cleaning and can still become part of n-grams.
en_stopwords = [w for w in en_stopwords if w not in ['one','two','three']]

def remove_stopwords(docu, stopwords=None):
    """Drop empty tokens and stop words from ``docu``, spelling out small numerals.

    Parameters
    ----------
    docu : iterable of str
        Tokens of one document.
    stopwords : iterable of str, optional
        Stop list to apply. When omitted, the notebook-level ``en_stopwords``
        is used; it is read at call time, so cells that rebind
        ``en_stopwords`` later still take effect.

    Returns
    -------
    list of str
        The surviving tokens, each passed through ``num2word``. The stop-word
        test is applied to the token before ``num2word`` converts it.
    """
    # A set gives constant-time membership tests instead of scanning the list
    # once per token.
    stop = set(en_stopwords if stopwords is None else stopwords)
    words = [num2word(w) for w in docu if w != '' and w not in stop]
    return words

def stop_clean(texts):
    """Clean every raw review in ``texts`` and strip its stop words.

    Each document goes through ``parse_clean`` and then ``remove_stopwords``;
    the result is a list with one token list per input document.
    """
    return [remove_stopwords(parse_clean(doc)) for doc in texts]
# Next step: build the n-grams, then lemmatise with spaCy, which can filter by permitted POS tags.

def spacy_lemma(bow, allowed_postags=('NOUN',)):
    """Lemmatise a token list with spaCy, keeping only n-grams and allowed POS tags.

    The tokens are joined with spaces and run through the ``nlp`` pipeline, so
    the output has one entry per spaCy token.

    Parameters
    ----------
    bow : iterable of str
        Tokens of one document (n-grams are joined with '_').
    allowed_postags : collection of str, optional
        Coarse POS tags (``token.pos_``) whose lemmas are kept. An immutable
        tuple is used as the default so it cannot be altered between calls.

    Returns
    -------
    list of str
        For each spaCy token: its text unchanged if it contains '_', its lemma
        if its POS tag is allowed, otherwise the empty string (a placeholder
        that ``remove_stopwords`` filters out later).
    """
    lemma_doc = nlp(" ".join(bow))

    lemma_text = []
    for token in lemma_doc:
        if '_' in token.text:
            # n-grams are kept verbatim, whatever tag spaCy assigns them
            lemma_text.append(token.text)
        elif token.pos_ in allowed_postags:
            lemma_text.append(token.lemma_)
        else:
            lemma_text.append('')

    return lemma_text

In [17]:
# New frame for this model: start from the raw reviews only; the new sets of
# words are added as columns in the following cells.
model2_df = pd.DataFrame({'review': final_df['review']})
model2_df.head()

Unnamed: 0,review
0,Well for me game still tons of work. i like it...
1,I pursued Lu Bu. Now I [b]AM[/b] LU BU.
2,Absolutely great game. \nAll the new diplomacy...
3,A fine blend of Warhammer I/II: Total War and ...
4,Innovative Total Game that has lots of persona...


In [18]:
model2_df['clean_reviews'] = stop_clean(model2_df['review'])

In [19]:
model2_df.head()

Unnamed: 0,review,clean_reviews
0,Well for me game still tons of work. i like it...,"[tons, work, recommend, one, diplomacy, joke, ..."
1,I pursued Lu Bu. Now I [b]AM[/b] LU BU.,"[pursued, lu, bu, lu, bu]"
2,Absolutely great game. \nAll the new diplomacy...,"[absolutely, new, diplomacy, options, depth, u..."
3,A fine blend of Warhammer I/II: Total War and ...,"[fine, blend, warhammer, three, total, war, sh..."
4,Innovative Total Game that has lots of persona...,"[innovative, total, lots, personality, brings,..."


In [20]:
#Now that the cleaner text is ready (only or mostly nouns), we create n-grams again

# Credit to https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/ for the n-grams code
# Build the bigram and trigram models
# Phrases is fed a list of token lists, e.g. [['word1','word2'],['word3','word4']].
clean_docs = list(model2_df['clean_reviews'])
bigram = gensim.models.Phrases(clean_docs, min_count=5, threshold=10)
# The trigram model is trained on the bigram-merged documents.
trigram = gensim.models.Phrases(bigram[clean_docs], threshold=10)

# Phraser wrappers: a faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    """Apply the bigram model to each document in ``texts``; returns a list."""
    merged = []
    for doc in texts:
        merged.append(bigram_mod[doc])
    return merged

def make_trigrams(texts):
    """Apply the bigram model, then the trigram model, to each document in ``texts``."""
    merged = []
    for doc in texts:
        merged.append(trigram_mod[bigram_mod[doc]])
    return merged

In [21]:
# Merge word pairs first, then keep only n-grams and lemmatised nouns.
model2_df['2gram_reviews'] = [spacy_lemma(doc) for doc in make_bigrams(model2_df['clean_reviews'])]
model2_df.head()

Unnamed: 0,review,clean_reviews,2gram_reviews
0,Well for me game still tons of work. i like it...,"[tons, work, recommend, one, diplomacy, joke, ...","[ton, , , , diplomacy, joke, , , military_acce..."
1,I pursued Lu Bu. Now I [b]AM[/b] LU BU.,"[pursued, lu, bu, lu, bu]","[, lu_bu, lu_bu]"
2,Absolutely great game. \nAll the new diplomacy...,"[absolutely, new, diplomacy, options, depth, u...","[, , diplomacy_options, depth, unit_variety, ,..."
3,A fine blend of Warhammer I/II: Total War and ...,"[fine, blend, warhammer, three, total, war, sh...","[, blend, , , total_war, shogun_two, total_war..."
4,Innovative Total Game that has lots of persona...,"[innovative, total, lots, personality, brings,...","[, , lot, personality, , , diplomacy, idea, co..."


In [22]:
# Same treatment as the bigram column, but with the trigram model applied on top.
model2_df['3gram_reviews'] = [spacy_lemma(doc) for doc in make_trigrams(model2_df['clean_reviews'])]

In [23]:
model2_df.head()

Unnamed: 0,review,clean_reviews,2gram_reviews,3gram_reviews
0,Well for me game still tons of work. i like it...,"[tons, work, recommend, one, diplomacy, joke, ...","[ton, , , , diplomacy, joke, , , military_acce...","[ton, , , , diplomacy, joke, , , military_acce..."
1,I pursued Lu Bu. Now I [b]AM[/b] LU BU.,"[pursued, lu, bu, lu, bu]","[, lu_bu, lu_bu]","[, lu_bu, lu_bu]"
2,Absolutely great game. \nAll the new diplomacy...,"[absolutely, new, diplomacy, options, depth, u...","[, , diplomacy_options, depth, unit_variety, ,...","[, , diplomacy_options, depth, unit_variety, ,..."
3,A fine blend of Warhammer I/II: Total War and ...,"[fine, blend, warhammer, three, total, war, sh...","[, blend, , , total_war, shogun_two, total_war...","[, blend, warhammer_three, total_war, shogun_t..."
4,Innovative Total Game that has lots of persona...,"[innovative, total, lots, personality, brings,...","[, , lot, personality, , , diplomacy, idea, co...","[, , lot, personality, , , diplomacy, idea, co..."


In [24]:
# The n-grams are built, so number words are now added to the stop list:
# removing them after making the n-grams should reduce noise.
en_stopwords = list(set(STOPWORDS)) + ['good','better','great','lot','game','like','I','i','one','two','three']
for ngram_col in ('2gram_reviews', '3gram_reviews'):
    model2_df[ngram_col] = model2_df[ngram_col].map(remove_stopwords)

# Historical note: an earlier failure at this step looked like a .map() problem
# but was caused by spaCy Tokens mixed in with strings in the lists; that was
# fixed in the earlier lemmatising function.

In [25]:
model2_df.head()

Unnamed: 0,review,clean_reviews,2gram_reviews,3gram_reviews
0,Well for me game still tons of work. i like it...,"[tons, work, recommend, one, diplomacy, joke, ...","[ton, diplomacy, joke, military_access, cao_ca...","[ton, diplomacy, joke, military_access, cao_ca..."
1,I pursued Lu Bu. Now I [b]AM[/b] LU BU.,"[pursued, lu, bu, lu, bu]","[lu_bu, lu_bu]","[lu_bu, lu_bu]"
2,Absolutely great game. \nAll the new diplomacy...,"[absolutely, new, diplomacy, options, depth, u...","[diplomacy_options, depth, unit_variety, warha...","[diplomacy_options, depth, unit_variety, warha..."
3,A fine blend of Warhammer I/II: Total War and ...,"[fine, blend, warhammer, three, total, war, sh...","[blend, total_war, shogun_two, total_war, love...","[blend, warhammer_three, total_war, shogun_two..."
4,Innovative Total Game that has lots of persona...,"[innovative, total, lots, personality, brings,...","[personality, diplomacy, idea, combat, bit, ch...","[personality, diplomacy, idea, combat, bit, ch..."


# LDA Model - 3grams - Filter the Bag of Words

In [362]:
# Build the dictionary and bag-of-words corpus from the 3-gram documents,
# this time pruning the vocabulary with filter_extremes.
documents = model2_df['3gram_reviews'].tolist()
dictionary = gensim.corpora.Dictionary(documents)
dictionary.filter_extremes(no_below=5, no_above=0.5)  # trying with default settings
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [363]:
# LDA model parameters.
num_topics = 5
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [364]:
%time ldamodel1 = LdaMulticore(corpus, num_topics=num_topics, id2word = dictionary, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3)

# Check resulting topics.
topic_list = ldamodel1.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Wall time: 2min 56s
total_war series campaign battle best_total_war time three_kingdoms diplomacy total_war_games fan title play fun release hour
total_war love three_kingdoms diplomacy battle fan character history dynasty_warriors shogun story play fun time china
faction character unit battle army diplomacy thing general time campaign building way city vassal bit
cao_cao lu_bu china liu_bei war crash yuan_shao sun_jian patch warlord time friend emperor campaign year
unit army battle general enemy cavalry archer duel campaign hero stack infantry commander arrow time


In [365]:
# Compute the per-word likelihood bound. This is what log_perplexity returns; it
# is not the perplexity itself, and it is evaluated here on the training corpus.
print('\nPerplexity: ', ldamodel1.log_perplexity(corpus))  # gensim derives perplexity as 2**(-bound), so a HIGHER (less negative) bound means a better fit.

# Compute Coherence Score (c_v measure, computed from the tokenised documents; higher is better)
coherence_model_lda1 = CoherenceModel(model=ldamodel1, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_lda1 = coherence_model_lda1.get_coherence()
print('\nCoherence Score: ', coherence_lda1)


Perplexity:  -6.473605724382258

Coherence Score:  0.6018523303110062


In [366]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel1, corpus, dictionary)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [367]:
# Create the output folder relative to the working directory -- the same place
# save() writes to -- instead of a machine-specific absolute path, so the save
# cannot fail because the folder was created somewhere else.
newpath = os.path.join('.', 'models', 'nouns_only', 'model1')
os.makedirs(newpath, exist_ok=True)

ldamodel1.save(os.path.join(newpath, 'model1.model'))

# Usable model with good topics! Very similar to topics found in first run and arguably better!

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# 3rd Run in Summary

Changes to the approach:

1. Cleaning
    - Removed everything other than Nouns and n-grams
    - Used filter_extremes() with default settings (no_below = 5, no_above = 0.5)
   
2. Modelling
    - Decided to use 5 topics only, because I am only looking for 5 general topics (for potential use in a later model that can generalise across the series and potentially the whole strategy genre)
    
    
Changes to Results:
1. The Coherence score is roughly 1.5x to nearly 2x that of previous models (~0.60 here, whereas previous models reached only ~0.38 even with 15 topics!)
2. 5 very usable topics!

# Topics Identified

1. **Series Authenticity**</br>
    - (Topic 1) Game's authenticity compared to other titles in the Total War series 
    - Important because target market is full of hardcore history/ROTK buffs </br> </br>

2. **Strategic Gameplay**</br>
    - (Topic 2) Gameplay on the strategic scale, inclusive of new features e.g. revamped Diplomacy, Romance/Records mode selection, Faction playstyles </br> </br>

3. **Tactical Gameplay**</br>
    - (Topic 4) Gameplay of individual battles, traits of units and managing individual cities </br> </br>

4. **Historical/Cultural Authenticity**</br>
    - (Topic 3) Game's authenticity compared to source material. 
    - Game is based on both Records and Romance of the Three Kingdoms, historical records (Chen Shou, ~300 AD) and a novel (Luo Guanzhong, 14th Century AD) respectively. 
    - Strong overlap with Topic 1.
    - Draws comparisons to Koei Tecmo's Dynasty Warriors series based on the same source material because of Romance Mode, where generals are lone units capable of incredible feats in battle. </br> </br>

5. **Faction/Character Playstyle**</br>
    - (Topic 5) Each faction and character has special abilities. Liu Bei appears a lot because he is written as a virtuous hero in Romance of the Three Kingdoms. Also, his special passive ability is extremely powerful. (-50% upkeep cost for militia units, so theoretically can field 2x the number of armies as any other faction). Lü Bu and Guan Yu were famous peerless warriors of the era as well in Romance of the Three Kingdoms.
    - Some terms such as "crash" and "patch" may have entered this topic due to bugs involving [Liu Bei's annexation ability when the game was first released](https://steamcommunity.com/app/779340/discussions/0/1642038749328500806/).

The corrections made from previous versions of the model seem to have had a tremendously positive effect on getting coherent topics out of this model.

# Fine-tuning

Attempting to refine the results by removing more terms that may be generating noise

In [27]:
# Fine-tuning: on top of the number words, drop generic terms that showed up
# across the topics ('thing', 'bit', 'total_war', 'time') to reduce noise.
en_stopwords = list(set(STOPWORDS)) + ['good','better','great','lot','game','like','I','i','one','two','three','thing','bit','total_war','time']
for source_col, target_col in (('2gram_reviews', '2gram_reviews_v2'),
                               ('3gram_reviews', '3gram_reviews_v2')):
    model2_df[target_col] = model2_df[source_col].map(remove_stopwords)

# Historical note: an earlier failure at this step looked like a .map() problem
# but was caused by spaCy Tokens mixed in with strings in the lists; that was
# fixed in the earlier lemmatising function.

In [522]:
# Build the dictionary and bag-of-words corpus from the fine-tuned 3-gram
# documents, again pruning the vocabulary with filter_extremes.
documents2 = model2_df['3gram_reviews_v2'].tolist()
dictionary2 = gensim.corpora.Dictionary(documents2)
dictionary2.filter_extremes(no_below=5, no_above=0.5)  # default settings
corpus2 = [dictionary2.doc2bow(doc) for doc in documents2]

In [523]:
# LDA model parameters.
num_topics = 5
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [524]:
%time ldamodel2 = LdaMulticore(corpus2, num_topics=num_topics, id2word = dictionary2, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3)

# Check resulting topics.
topic_list = ldamodel2.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Wall time: 1min 36s
three_kingdoms series fan diplomacy battle love best_total_war campaign total_war_games fun character shogun play title gameplay
china war cao_cao lu_bu liu_bei yuan_shao army character man emperor warlord friend history faction sun_jian
faction army diplomacy vassal war player people way city problem campaign option spy three_kingdoms review
unit battle faction army general character diplomacy campaign hero building duel enemy way map combat
campaign hour issue battle bug crash multiplayer review rome fix play patch work month release


In [525]:
# Compute the per-word likelihood bound. This is what log_perplexity returns; it
# is not the perplexity itself, and it is evaluated here on the training corpus.
print('\nPerplexity: ', ldamodel2.log_perplexity(corpus2))  # gensim derives perplexity as 2**(-bound), so a HIGHER (less negative) bound means a better fit.

# Compute Coherence Score (c_v measure, computed from the tokenised documents; higher is better)
coherence_model_lda2 = CoherenceModel(model=ldamodel2, texts=documents2, dictionary=dictionary2, coherence='c_v')
coherence_lda2 = coherence_model_lda2.get_coherence()
print('\nCoherence Score: ', coherence_lda2)


Perplexity:  -6.561133627837467

Coherence Score:  0.5688225451109933


In [526]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel2, corpus2, dictionary2)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [541]:
# Create the output folder relative to the working directory -- the same place
# save() writes to -- instead of a machine-specific absolute path, so the save
# cannot fail because the folder was created somewhere else.
newpath = os.path.join('.', 'models', 'nouns_only', 'model2')
os.makedirs(newpath, exist_ok=True)

ldamodel2.save(os.path.join(newpath, 'model2.model'))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [540]:
model2_df.to_csv('./dataframes/model2_df.csv')

# Topics Identified

2nd version of this iteration is now preferred because of clear topic separation between all topics!

1. **Series Authenticity**</br>
    - (Topic 1) Game's authenticity to the Total War series
    - Contains comparisons to other titles in the Total War series (e.g. shogun_two --> Shogun 2, one of the most well-received installments in the series)
    - Important because target market is full of hardcore history/ROTK buffs 
    - Total War has also carved out a niche for itself in the game industry - its closest competitor is Ultimate General, created by a Total War modder </br> </br>

2. **Strategic Gameplay**</br>
    - (Topic 2) Gameplay on the strategic scale, e.g. movements on the campaign map, city management, diplomacy, negotiations
    - Inclusive of new features e.g. revamped Diplomacy, Romance/Records mode selection, Faction playstyles </br> </br>

3. **Tactical Gameplay**</br>
    - (Topic 4) Gameplay of individual battles, traits of units and managing individual cities </br> </br>

4. **Historical/Cultural Authenticity**</br>
    - (Topic 3) Game's authenticity compared to source material. 
    - Game is based on both Records and Romance of the Three Kingdoms, historical records (Chen Shou, ~300 AD) and a novel (Luo Guanzhong, 14th Century AD) respectively. 
    - Strong overlap with Topic 1.
    - Draws comparisons to Koei Tecmo's Dynasty Warriors series based on the same source material because of Romance Mode, where generals are lone units capable of incredible feats in battle. </br> </br>

5. **Game Stability**</br>
    - (Topic 5) Bugs, crashes, and fixes are terms relevant to this topic and they often come with games that work on massive scales like Total War.
    - Bugs are always a concern and are virtually unavoidable for games, especially close to release.
    - Examples include bugs affecting [Liu Bei's annexation ability when the game was first released](https://steamcommunity.com/app/779340/discussions/0/1642038749328500806/).

The corrections made from previous versions of the model seem to have had a tremendously positive effect on getting coherent topics out of this model.

In [28]:
# Build the dictionary and corpus from the 3-gram dataset for a third run. The
# parameters are unchanged except that num_topics goes up to 6, just to see
# whether that makes a worthwhile difference.
documents3 = model2_df['3gram_reviews_v2'].tolist()
dictionary3 = gensim.corpora.Dictionary(documents3)
dictionary3.filter_extremes(no_below=5, no_above=0.5)  # default settings
corpus3 = [dictionary3.doc2bow(doc) for doc in documents3]

In [29]:
# LDA model parameters.
num_topics = 6
passes = 100
eval_every = None #Evaluation will happen later so no need to evaluate while training

In [30]:
%time ldamodel3 = LdaMulticore(corpus3, num_topics=num_topics, id2word = dictionary3, passes=passes, alpha='asymmetric',eval_every=eval_every,workers=3)

# Check resulting topics.
topic_list = ldamodel3.print_topics(num_topics=num_topics, num_words=15)
for index, i in enumerate(topic_list):
    str1 = str(i[1])
    for c in "0123456789+*\".":
        str1 = str1.replace(c, "")
    str1 = str1.replace("  ", " ")
    print(str1)

Wall time: 2min 9s
three_kingdoms fan series battle character diplomacy campaign love total_war_games fun play romance_mode player tw gameplay
faction battle diplomacy campaign character general army unit mechanic map building fun three_kingdoms duel improvement
best_total_war shogun hour shogun_two mod dlc love year release launch title dlcs battle series work
china lu_bu cao_cao yuan_shao liu_bei history war man warlord sun_jian emperor friend army campaign people
problem play war issue crash bug patch fix review hour steam hope rome city diplomacy
unit army battle character general faction enemy hero campaign way diplomacy cavalry building archer city


In [31]:
# Compute the per-word likelihood bound. This is what log_perplexity returns; it
# is not the perplexity itself, and it is evaluated here on the training corpus.
print('\nPerplexity: ', ldamodel3.log_perplexity(corpus3))  # gensim derives perplexity as 2**(-bound), so a HIGHER (less negative) bound means a better fit.

# Compute Coherence Score (c_v measure, computed from the tokenised documents; higher is better)
coherence_model_lda3 = CoherenceModel(model=ldamodel3, texts=documents3, dictionary=dictionary3, coherence='c_v')
coherence_lda3 = coherence_model_lda3.get_coherence()
print('\nCoherence Score: ', coherence_lda3)


Perplexity:  -6.596486207932475

Coherence Score:  0.528633720775539


In [32]:
pyLDAvis.enable_notebook()
vis = p_gensim.prepare(ldamodel3, corpus3, dictionary3)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [538]:
# Create the output folder relative to the working directory -- the same place
# save() writes to -- instead of a machine-specific absolute path, so the save
# cannot fail because the folder was created somewhere else.
newpath = os.path.join('.', 'models', 'nouns_only', 'model3')
os.makedirs(newpath, exist_ok=True)

ldamodel3.save(os.path.join(newpath, 'model3.model'))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### References

https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/