### Importing Packages 

In [None]:
import pickle

import numpy as np 
import pandas as pd

import re 
import string 

import nltk

from langdetect import detect

from sklearn.feature_extraction.text import CountVectorizer 

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import gensim
from gensim import corpora, models, matutils
from gensim.models import CoherenceModel

from pprint import pprint

import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import os
# NOTE(review): gensim.models.wrappers (and with it LdaMallet) appears to have been
# removed in gensim 4.x -- confirm this notebook is pinned to a gensim 3.x release.
from gensim.models.wrappers import LdaMallet

# Point at the local Mallet installation. Both values are hard-coded Windows paths,
# so they presumably need adjusting on any other machine.
os.environ.update({'MALLET_HOME':r'C:/new_mullet/mallet-2.0.8/'})
# Path to the Mallet executable; passed to every LdaMallet(...) call in this notebook.
mallet_path = 'C:/new_mullet/mallet-2.0.8/bin/mallet' 

## Opening Scraped Data 

In [None]:
# Load the previously saved review data into `df`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising --
# only open pickle files that you produced yourself or otherwise trust.
with open('pokemon.pickle','rb') as read_file:
    df = pickle.load(read_file)

# Data Cleaning 

## Dataset 

In [None]:
# Quick overview of the loaded frame (columns, dtypes, non-null counts).
df.info()  

In [None]:
# Convert the rating column to integers so it can be compared numerically in sentiment().
df['rating'] = df['rating'].astype(int)

In [None]:
### Group reviews by their Metacritic ratings
### 0 to 4 = negative, 5 to 7 = mixed, 8 and above = positive

def sentiment(x):
    """Map a numeric rating to a sentiment label.

    Ratings above 7 are 'positive', ratings below 5 are 'negative',
    and every other rating is 'mixed'.
    """
    if x > 7:
        label = 'positive'
    elif x < 5:
        label = 'negative'
    else:
        label = 'mixed'
    return label

# Label every review with the sentiment bucket that matches its rating.
df['sentiment'] = df['rating'].apply(sentiment)


In [None]:
# Share of reviews in each sentiment bucket (normalize=True gives proportions, not counts).
df['sentiment'].value_counts(normalize = True)

In [None]:
### Some users reviewed both Pokemon Sword and Pokemon Shield.
### Their second review tended to be a copy-paste of the first.
### Show every row whose 'name' has already appeared in an earlier row.

df[df.duplicated('name')]

In [None]:
# Spot-check: list all reviews written by one particular user.
df.review[df['name'] == 'Mack_thge_Sack']

In [None]:
# Replace the current index with a fresh default one; drop=True discards the old index
# instead of keeping it as a column.
df.reset_index(inplace = True, drop = True)

In [None]:
### Remove duplicate users: keep only the first review found for each user name.
### The index is not rebuilt afterwards, so it may contain gaps from here on.
df.drop_duplicates(subset='name', keep = 'first', inplace = True)

In [None]:
### Detect the review language and return NaN if it is not English
def language_detection(x):
    """Return ``x`` unchanged if it is English text, otherwise ``np.nan``.

    Parameters
    ----------
    x : the review text to classify.

    Returns
    -------
    ``x`` itself when langdetect reports 'en'; ``np.nan`` when the text is
    in another language, is not a string, is blank, or has no features
    langdetect can work with (e.g. only digits or punctuation).
    """
    # Blank or non-string values can never be classified; reject them up
    # front instead of letting detect() raise and abort the whole .apply().
    if not isinstance(x, str) or not x.strip():
        return np.nan

    # Imported here so the block stays self-contained; langdetect is already
    # a dependency of this notebook.
    from langdetect import LangDetectException

    try:
        result = detect(x)
    except LangDetectException:
        # langdetect raises when the text contains nothing it can score.
        return np.nan

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return x if result == 'en' else np.nan
    
# Replace every non-English review with NaN (English reviews are left untouched).
df['review'] = df['review'].apply(language_detection)

### Only keep English reviews: drop the rows whose review was set to NaN above.
### subset=['review'] keeps rows that merely have a missing value in another column.
df.dropna(subset=['review'], inplace = True)

### Text Cleaning 

In [None]:
### Build one stop-word list from spaCy's and NLTK's English stop words
nltk_stop_words = nltk.corpus.stopwords.words('english')

stop_words = list(STOP_WORDS)
stop_words.extend(['game','pokemon','pokémon']) ### these are common words that appear in almost all reviews 

# Append each NLTK stop word that is not in the list yet.
already_listed = set(stop_words)
for candidate in nltk_stop_words:
    if candidate not in already_listed:
        already_listed.add(candidate)
        stop_words.append(candidate)


In [None]:
def make_lower(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_words(text, wordlist):
    """Delete every whole-word occurrence of the listed words from ``text``.

    Parameters
    ----------
    text : the string to clean.
    wordlist : iterable of words to remove.

    Returns
    -------
    ``text`` with each listed word that occurs as a whitespace-separated
    token removed at every word boundary. The surrounding whitespace is
    left in place, so removed words leave extra spaces behind.
    """
    # Split once, not once per word: the list can hold thousands of words.
    tokens = set(text.split())
    present = tokens.intersection(wordlist)
    if not present:
        return text

    # Longest alternative first so a longer word wins over one of its prefixes;
    # re.escape stops words with regex metacharacters from being misread as a pattern.
    alternatives = sorted(present, key=lambda w: (-len(w), w))
    pattern = re.compile(r'\b(?:{})\b'.format('|'.join(map(re.escape, alternatives))))
    return pattern.sub('', text)

def remove_digits(text):
    """Return ``text`` with every digit character replaced by a single space."""
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\d', ' ', text)

def remove_punctuation(text):
    """Return ``text`` with punctuation and other symbols replaced by spaces.

    Two passes: first every character in ``string.punctuation`` (which
    includes the underscore), then anything left that is neither a word
    character nor whitespace.
    """
    ascii_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    without_ascii = ascii_punctuation.sub(' ', text)
    cleaned = re.sub(r'[^\w\s]', ' ', without_ascii)
    return cleaned

def strip_extraspace(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character on its own (including a lone tab or
    newline) is left unchanged.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\s\s+', ' ', text)

def replace_word(text, word, replacement):
    """Return ``text`` with every occurrence of ``word`` swapped for ``replacement``.

    This is a plain substring replacement; word boundaries are not checked.
    """
    replaced = text.replace(word, replacement)
    return replaced

def remove_r(text):
    """Return ``text`` with every carriage return replaced by a space."""
    pieces = text.split('\r')
    return ' '.join(pieces)
#df['review'] = df['review'].apply(lambda x:remove_punctuation(x))

def clean_text(text):
    """Run the full cleaning pipeline on one review and return the result.

    Steps, in order: lowercase, replace punctuation and digits with spaces,
    merge two known spelling variants into single words, remove stop words,
    replace carriage returns, and collapse repeated whitespace.
    """
    cleaned = remove_digits(remove_punctuation(make_lower(text)))

    ### correcting inconsistencies in spelling that I noticed
    spelling_fixes = (
        ('game freak', 'gamefreak'),
        ('game play', 'gameplay'),
    )
    for variant, canonical in spelling_fixes:
        cleaned = replace_word(cleaned, variant, canonical)

    cleaned = remove_words(cleaned, stop_words)
    return strip_extraspace(remove_r(cleaned))

In [None]:
# Clean every review in place.
df['review'] = df['review'].apply(clean_text)

In [None]:
# Spot-check one cleaned review. Positional access is used because rows were dropped
# after the last reset_index, so the index label 5 is not guaranteed to exist any more.
df['review'].iloc[5]

In [None]:
### Lemmatising using spaCy
# Load the 'en_core_web_sm' pipeline once; lemmatize_words() calls it for every review.
sp = spacy.load('en_core_web_sm')

def lemmatize_words(text, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Return the lemmas of the words in ``text`` that pass the filters.

    Parameters
    ----------
    text : the (already cleaned) review string.
    allowed_postags : part-of-speech tags to keep; tokens with any other
        tag are dropped. Any container supporting ``in`` works.

    Returns
    -------
    A single string of the kept lemmas separated by spaces. Lemmas equal to
    '-PRON-' (presumably the pronoun placeholder of older spaCy versions)
    and lemmas found in ``stop_words`` are skipped as well.
    """
    kept = []
    for token in sp(text):
        if token.pos_ not in allowed_postags:
            continue
        lemma = token.lemma_
        # A word can turn into a stop word only after lemmatisation, so check again here.
        if lemma == '-PRON-' or lemma in stop_words:
            continue
        kept.append(lemma)
    # Join once instead of growing a string piece by piece inside the loop.
    return ' '.join(kept).lstrip()

In [None]:
# Lemmatise every review, keeping nouns and verbs only.
df['review'] = df['review'].apply(lemmatize_words, allowed_postags=['NOUN', 'VERB'])

In [None]:
### Count how many reviews each word appears in (document frequency),
### then list the words that appear in fewer than 4 reviews.

word_frequency = Counter()

for review in df.review:
    # set() so a word counts once per review, however often it is repeated
    word_frequency.update(set(review.split()))

rare_words = [word for word, n_docs in word_frequency.items() if n_docs < 4]


In [None]:
# Strip the rare words out of every review.
df['review'] = df['review'].apply(remove_words, wordlist=rare_words)

In [None]:
# (word, document count) pairs ordered from the most to the least frequent word.
sorted_word_freq = word_frequency.most_common()

In [None]:
# Display the words ranked by the number of reviews they appear in.
sorted_word_freq 

## Splitting df by sentiment 

In [None]:
# Keep only the reviews labelled 'negative'; the topic model below is built on these.
negative = df[df['sentiment']=='negative']

In [None]:
# Display the negative-review subset.
negative

In [None]:
# Bag-of-words counter over unigrams and bigrams, excluding the combined stop-word list.
vectorizer = CountVectorizer(stop_words=stop_words, ngram_range = (1,2),
                                   token_pattern="\\b[a-z][a-z][a-z]+\\b") ### only tokens made of at least 3 lowercase a-z letters are kept

## Building the Model 

### LDA Groundwork

In [None]:
# Learn the vocabulary and count the terms in one step, then transpose the
# document-by-term matrix so that terms are rows and documents are columns.
doc_word = vectorizer.fit_transform(negative.review).transpose()

In [None]:
# Wrap the sparse term-by-document matrix as a gensim corpus.
corpus = matutils.Sparse2Corpus(doc_word)

In [None]:
# Lookup tables in both directions, taken from the fitted vectorizer's vocabulary
word2id = dict(vectorizer.vocabulary_)
id2word = {token_id: token for token, token_id in vectorizer.vocabulary_.items()}

# Build a gensim Dictionary by hand so it shares the vectorizer's token ids
dictionary = corpora.Dictionary()
dictionary.token2id = word2id
dictionary.id2token = id2word

# Tokenised negative reviews (one list of words per review) for the coherence model
texts = [review.split() for review in negative['review']]

### Using coherence score to determine the number of topics 

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one Mallet LDA model per candidate topic count and score each with c_v coherence

    Parameters:
    ----------
    dictionary : Gensim dictionary, passed to CoherenceModel
    corpus : Gensim corpus every model is trained on
    texts : List of input texts, passed to CoherenceModel
    limit : Upper bound (exclusive) of the topic counts tried
    start : First topic count tried
    step : Increment between successive topic counts

    Returns:
    -------
    model_list : List of trained LdaMallet models, one per topic count
    coherence_values : c_v coherence of each model, in the same order as model_list

    Note: the models are built with the module-level ``mallet_path`` and
    ``id2word``; ``dictionary`` is used for the coherence computation only.
    """
    model_list, coherence_values = [], []
    for topic_count in range(start, limit, step):
        lda = LdaMallet(mallet_path, corpus=corpus, num_topics=topic_count, id2word=id2word, random_seed = 77)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        model_list.append(lda)
        coherence_values.append(scorer.get_coherence())
    return model_list, coherence_values

In [None]:
# Train and score models for 2 to 7 topics (limit=8 is exclusive).
model_list_mlda, coherence_values_mlda = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=texts, start=2, limit=8, step=1)

In [None]:
# Plot the coherence score against the number of topics, show it, and save it as a PNG.
fig = plt.figure(figsize=(8,6))

# Same topic range as the coherence sweep
start, limit, step = 2, 8, 1
x = range(start, limit, step)

axes = fig.gca()
axes.plot(x, coherence_values_mlda)
axes.set_xlabel("Number of Topics")
axes.set_ylabel("Coherence Score")

plt.show()

fig.savefig('CoherenceScore.png', dpi=300)

In [None]:
# Print the coherence score obtained for each number of topics.
start, limit, step = 2, 8, 1
x = range(start, limit, step)
for topic_count, score in zip(x, coherence_values_mlda):
    rounded = round(score, 4)
    print("Num Topics =", topic_count, " has Coherence Value of", rounded)

## Topics

In [None]:
# Train the final Mallet LDA model with 5 topics on the negative-review corpus,
# using the same random seed as the coherence sweep.
ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=5, id2word=id2word, random_seed = 77)

    
# Show the topics in raw (unformatted) form
pprint(ldamallet.show_topics(formatted=False))