### Importing Packages 

In [14]:
import pickle

import numpy as np 
import pandas as pd

import re 
import string 

import nltk
#from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer 

from langdetect import detect

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
# from sklearn.metrics.pairwise import cosine_similarity

import spacy

import gensim
from gensim import corpora, models, similarities, matutils

## Opening Scraped Data

In [15]:
# Load the scraped reviews from the local pickle file into `df`.
# NOTE(review): pickle.load executes arbitrary code from the file; only open
# pickles produced by this project's own scraper.
with open('pokemon.pickle', mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

# Data Cleaning 

## Dataset 

In [16]:
df.info()  

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2903 entries, 0 to 887
Data columns (total 5 columns):
name      2903 non-null object
date      2903 non-null object
rating    2903 non-null object
review    2903 non-null object
game      2903 non-null object
dtypes: object(5)
memory usage: 136.1+ KB


In [17]:
df['rating'] = df['rating'].astype(int)

In [18]:
### Group the reviews by rating.
### Following Metacritic's scale: 0 to 4 = negative, 5 to 7 = mixed, 8 and above = positive.

def sentiment(x):
    """Map a numeric rating to a sentiment label.

    Returns 'positive' for ratings above 7, 'negative' for ratings below 5,
    and 'mixed' for everything else.
    """
    if x > 7:
        return 'positive'
    return 'negative' if x < 5 else 'mixed'

# Label every review with the sentiment bucket of its rating.
df['sentiment'] = df['rating'].apply(sentiment)


In [19]:
df['sentiment'].value_counts(normalize = True)

negative    0.610059
positive    0.306235
mixed       0.083707
Name: sentiment, dtype: float64

In [20]:
### Some users posted reviews for both Pokemon Sword and Pokemon Shield.
### Spot checks showed that these duplicate reviews had the same text.

df[df.duplicated('name')]

Unnamed: 0,name,date,rating,review,game,sentiment
0,Metagrass,"Nov 15, 2019",2,"I have also done a review for Pokemon Sword, b...",shield,negative
1,NintendoGuy64,"Nov 15, 2019",0,"As a lifelong fan of Pokemon games, I was ecst...",shield,negative
4,Otonaburu,"Nov 15, 2019",4,What should have been a giant leap to signific...,shield,negative
7,Gamermangamer,"Nov 15, 2019",1,"Was promised a game for ""long time fans of the...",shield,negative
8,Fumetic,"Nov 15, 2019",3,"As these are largely the same games, I have pa...",shield,negative
...,...,...,...,...,...,...
871,HollyS,"Nov 20, 2019",2,"Very short, bland and low quality Pokemon game...",shield,negative
875,Lawrence7,"Nov 20, 2019",10,For anyone debating whether they will like the...,shield,positive
877,sojasonk,"Nov 20, 2019",1,"Lazy writing, bad graphics, an absolute medioc...",shield,negative
879,KrakenOfPepsi,"Nov 20, 2019",4,Metacritic has a pretty small character limit ...,shield,negative


In [21]:
df.review[df['name'] == 'Mack_thge_Sack']

1313    I'm going to state my points and not my emotio...
887     I'm going to state my points and not my emotio...
Name: review, dtype: object

In [22]:
df.reset_index(inplace = True, drop = True)

In [23]:
### dropping duplicate names 
df.drop_duplicates(subset='name', keep = 'first', inplace = True)

In [24]:
### Detect the review language; return NaN if the review is not in English
def language_detection(x):
    """Return ``x`` unchanged when it is detected as English, otherwise NaN.

    Parameters
    ----------
    x : str
        Raw review text.

    Returns
    -------
    str or float
        The original text if ``detect`` reports 'en'; ``np.nan`` for any other
        language, for non-string or blank input, and for text in which
        ``detect`` cannot find any language features.
    """
    # Function-scope import so the cell does not depend on an extra
    # top-of-notebook import being present.
    from langdetect.lang_detect_exception import LangDetectException

    # Blank or non-string input cannot be classified; treat it as "not English"
    # so one bad row does not abort the whole apply().
    if not isinstance(x, str) or not x.strip():
        return np.nan
    try:
        result = detect(x)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. only digits/emoji).
        return np.nan
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return x if result == 'en' else np.nan
    
# Non-English reviews become NaN here ...
df['review'] = df['review'].apply(language_detection)

### drop reviews that are not in english
# ... and are removed here (any row containing a NaN is dropped).
df.dropna(inplace=True)

### Text Cleaning 

In [25]:
def remove_r(x):
    """Return ``x`` with every carriage-return character removed."""
    pieces = x.split('\r')
    return ''.join(pieces)

# Strip carriage returns from every review.
df['review'] = df['review'].apply(remove_r)

In [26]:
def strip_whitespace(x):
    """Return ``x`` without leading or trailing whitespace."""
    trimmed = x.lstrip().rstrip()
    return trimmed

# Trim surrounding whitespace from every review.
df['review'] = df['review'].apply(strip_whitespace)

In [27]:
def make_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_digits(text):
    """Return ``text`` with every digit character removed.

    The pattern is a raw string: a plain ``'\\d'`` literal is an invalid escape
    sequence that Python flags with a Deprecation/SyntaxWarning.
    """
    return re.sub(r'\d', '', text)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # Character class built from the escaped ASCII punctuation set.
    punctuation_class = re.compile('[%s]' % re.escape(string.punctuation))
    return punctuation_class.sub('', text)

#df['review'] = df['review'].apply(lambda x:remove_punctuation(x))

def clean_text(text):
    """Lower-case ``text``, then strip punctuation, then strip digits."""
    # Order matters only in that it mirrors the original pipeline.
    for step in (make_lower, remove_punctuation, remove_digits):
        text = step(text)
    return text

In [28]:
# Normalise case and remove punctuation/digits in every review.
df['review'] = df['review'].apply(clean_text)

In [29]:
#df['review'] = df['review'].apply(lambda x:remove_stopwords(str.split(x)))

In [30]:
# lemmatizer=WordNetLemmatizer()

# def lemmatize_words(x):
#     lemmed_string = ''
#     for word in x.split():
#         lemmed_string = lemmed_string+' '+lemmatizer.lemmatize(word)  
#     return lemmed_string.lstrip()

sp = spacy.load('en_core_web_sm')

def lemmatize_words(x):
    """Return ``x`` with every token replaced by its spaCy lemma.

    Tokens whose lemma is the placeholder '-PRON-' keep their original text
    (``orth_``). The previous version overwrote the token's lemma in that
    branch but never appended anything, so pronouns were silently dropped
    from the output.

    Parameters
    ----------
    x : str
        Cleaned review text.

    Returns
    -------
    str
        Space-separated lemmas with leading whitespace stripped.
    """
    lemmas = []
    for token in sp(x):
        # Read-only: pick the surface form for pronouns instead of mutating
        # the token's lemma attributes.
        lemmas.append(token.orth_ if token.lemma_ == '-PRON-' else token.lemma_)
    # join() avoids rebuilding the string on every token.
    return ' '.join(lemmas).lstrip()

In [31]:
# Lemmatize every review.
df['review'] = df['review'].apply(lemmatize_words)

In [35]:
# Build the bigram and trigram models - reference: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# Tokenise each review on whitespace; Phrases is trained on lists of tokens.
data = [entry.split() for entry in df.review]

# higher threshold fewer phrases.
bigram = gensim.models.Phrases(data, min_count=5, threshold=5)
# The trigram model is trained on the bigram-transformed corpus.
trigram = gensim.models.Phrases(bigram[data], threshold=5)

# Phraser wrappers around the trained models, used by the helpers below.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(x):
    """Run ``x`` through ``bigram_mod`` and return the tokens as one string.

    Each token is prefixed with a single space, so a non-empty result starts
    with a space.
    """
    tokens = bigram_mod[x.split()]
    return ''.join(' ' + token for token in tokens)

def make_trigrams(x):
    """Run ``x`` through ``bigram_mod`` then ``trigram_mod``; return one string.

    Each token is prefixed with a single space, so a non-empty result starts
    with a space.
    """
    bigrammed = bigram_mod[x.split()]
    tokens = trigram_mod[bigrammed]
    return ''.join(' ' + token for token in tokens)

In [36]:
# make_trigrams already runs bigram_mod before trigram_mod, so it is applied
# on its own. Calling make_bigrams first pushed every review through the
# bigram model twice, producing tokens the trigram model was not trained on.
df['review'] = df['review'].apply(make_trigrams)

In [37]:
# ### create list of rare words by filtering on word counts
# Count every whitespace-separated token across all reviews ...
word_counts = df.review.str.split(expand=True).stack().value_counts()
# ... and keep, in count order, the tokens seen fewer than 10 times.
freq = word_counts[word_counts < 10].index.to_list()

In [38]:
def remove_rare(x, rare_words=None):
    """Return ``x`` with every rare token removed.

    Fixes three defects of the previous version: it returned ``None`` when the
    text contained no rare word, it stopped after the first rare word found,
    and it used substring replacement, which also cut rare words out of the
    middle of longer words.

    Parameters
    ----------
    x : str
        Review text; tokens are separated by whitespace.
    rare_words : iterable of str, optional
        Tokens to drop. Defaults to the notebook-level ``freq`` list. Pass a
        ``set`` to avoid rebuilding the lookup on every call.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if none remain).
    """
    if rare_words is None:
        rare_words = freq
    if not isinstance(rare_words, (set, frozenset)):
        # Set membership keeps the per-token check constant-time.
        rare_words = set(rare_words)
    return ' '.join(token for token in x.split() if token not in rare_words)

# Apply the rare-word filter to every review.
df['review'] = df['review'].apply(remove_rare)

In [None]:
df.to_pickle('dfclean.pickle')

# with open('dfclean.pickle','rb') as read_file:
#      df = pickle.load(read_file)

In [None]:
#freq = pd.DataFrame(df.review.str.split(expand=True).stack().value_counts())

In [None]:
### remove stopwords text, using this method just in case I want to add more stopwords
# NOTE(review): nltk.corpus.stopwords needs the 'stopwords' corpus to be
# downloaded beforehand (nltk.download('stopwords')) — confirm in the environment.
nltk_stop_words = nltk.corpus.stopwords.words('english')

stop_words = list(spacy.lang.en.stop_words.STOP_WORDS)
stop_words.extend(['game','pokemon','pokémon','battle','animation','good','bad'])

# Merge in the NLTK stop words. The previous loop iterated stop_words against
# itself, so it never added anything and nltk_stop_words went unused.
known_stop_words = set(stop_words)
for word in nltk_stop_words:
    if word not in known_stop_words:
        known_stop_words.add(word)
        stop_words.append(word)

# The reviews had punctuation stripped during cleaning, so also register the
# punctuation-free spelling of each stop word (e.g. "don't" -> "dont").
# Iterate over a snapshot: the list is extended inside the loop.
for word in list(stop_words):
    no_punct = remove_punctuation(word)
    if no_punct and no_punct not in known_stop_words:
        known_stop_words.add(no_punct)
        stop_words.append(no_punct)

## Splitting df by sentiment 

In [None]:
# One frame per sentiment label produced by sentiment().
negative = df[df['sentiment'] == 'negative']
mixed = df[df['sentiment'] == 'mixed']
# Was 'postive': the typo matched no rows, so this frame was always empty.
positive = df[df['sentiment'] == 'positive']

In [None]:
negative

In [None]:
# Bag-of-words vectorizer: custom stop-word list, accents folded to ASCII,
# and tokens restricted to runs of three or more lowercase ASCII letters.
vectorizer = CountVectorizer(
    stop_words=stop_words,
    strip_accents='ascii',
    token_pattern=r"\b[a-z][a-z][a-z]+\b",
)

In [None]:
doc_word = vectorizer.fit_transform(negative.review)
doc_word.shape

In [None]:
# Preview the first 10 rows of the document-term matrix. Only those rows are
# densified; converting the whole sparse matrix just to call .head(10) wastes
# memory on a large vocabulary.
pd.DataFrame(doc_word[:10].toarray(), index=negative.review.iloc[:10], columns=vectorizer.get_feature_names())

## LSA 

In [None]:
# Seeded so the decomposition is reproducible across kernel restarts (same
# seed as the LDA model in this notebook); n_components passed by keyword.
lsa = TruncatedSVD(n_components=5, random_state=100)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

In [None]:
# One row label per fitted component. The previous two hard-coded labels did
# not match the 5 components of `lsa`, which makes the DataFrame constructor
# raise a shape-mismatch ValueError.
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ["component_{}".format(i + 1) for i in range(lsa.components_.shape[0])],
             columns = vectorizer.get_feature_names())
topic_word

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every component in ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each entry of ``components_`` must support ``argsort()``.
    feature_names : sequence of str
        Feature name for each column index of a component.
    no_top_words : int
        Number of top features to print per topic.
    topic_names : sequence of str, optional
        Heading for each topic; the topic index is printed instead when this
        is missing/empty or the entry for a topic is falsy.
    """
    for ix, topic in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)
        # Indices ordered from largest to smallest weight, truncated to the top N.
        top_indices = topic.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[i] for i in top_indices))

In [None]:
display_topics(lsa, vectorizer.get_feature_names(), 15)

### NMF 

In [None]:
# Seeded so the factorisation is reproducible across kernel restarts (same
# seed as the LDA model in this notebook); n_components passed by keyword.
nmf_model = NMF(n_components=5, random_state=100)
doc_topic = nmf_model.fit_transform(doc_word)

In [None]:
# One row label per fitted component. The previous two hard-coded labels did
# not match the 5 components of `nmf_model`, which makes the DataFrame
# constructor raise a shape-mismatch ValueError.
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = ["component_{}".format(i + 1) for i in range(nmf_model.components_.shape[0])],
             columns = vectorizer.get_feature_names())
topic_word

In [None]:
display_topics(nmf_model, vectorizer.get_feature_names(), 15)

### LDA 

In [None]:
# Re-fit the vectorizer on the negative reviews and transpose the resulting
# document-term matrix.
# NOTE(review): the transpose is presumably there because Sparse2Corpus, called
# with its defaults in the next cell, reads documents from columns — confirm.
doc_word = vectorizer.fit_transform(negative.review).transpose()

In [None]:
corpus = matutils.Sparse2Corpus(doc_word)

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Invert the vectorizer vocabulary (term -> column index) into index -> term.
id2word = {index: term for term, index in vectorizer.vocabulary_.items()}

In [None]:
#lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=20)

# Train a 3-topic LDA model on the negative-review corpus with a fixed seed.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    passes=20,
    chunksize=50,
    update_every=1,
    alpha='auto',
    per_word_topics=True,
    random_state=100,
)

In [None]:
# The trained model is bound to `lda_model`; `lda` is only assigned in a
# commented-out line, so the old call raised NameError on a fresh kernel.
lda_model.print_topics()

In [None]:
vectorizer.vocabulary_.items()

In [None]:
stop_words