### Importing Packages 

In [11]:
import pickle

import numpy as np 
import pandas as pd

import re 
import string 

import nltk
#from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer 

from langdetect import detect

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
# from sklearn.metrics.pairwise import cosine_similarity

import spacy

import gensim
from gensim import corpora, models, similarities, matutils

## Opening Scraped Data 

In [None]:
# Load the pickled review data into `df`.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by this project.
with open('pokemon.pickle', mode='rb') as pickled_reviews:
    df = pickle.load(pickled_reviews)

# Data Cleaning 

## Dataset 

In [None]:
df.info()  

In [None]:
df['rating'] = df['rating'].astype(int)

In [None]:
### Group the reviews by their ratings.
### Following Metacritic's bands: 0 to 4 = negative, 5 to 7 = mixed, 8 and above = positive.

def sentiment(x):
    """Map a numeric rating to a sentiment label.

    Ratings above 7 are 'positive', ratings below 5 are 'negative', and
    everything else (5 to 7 inclusive) is 'mixed'.
    """
    if x < 5:
        return 'negative'
    return 'positive' if x > 7 else 'mixed'

# label every review with the sentiment band of its rating
df['sentiment'] = df['rating'].apply(sentiment)


In [None]:
df['sentiment'].value_counts(normalize = True)

In [None]:
### Some users posted reviews on both Pokemon Sword and Pokemon Shield.
### Spot checks showed that these reviews were identical.

df[df.duplicated('name')]

In [None]:
df.review[df['name'] == 'Mack_thge_Sack']

In [None]:
df.reset_index(inplace = True, drop = True)

In [None]:
### dropping duplicate names 
df.drop_duplicates(subset='name', keep = 'first', inplace = True)

In [None]:
### detect review language and returns NaN if not english 
def language_detection(x):
    """Return ``x`` unchanged when it is detected as English, otherwise NaN.

    Parameters
    ----------
    x : object
        The review text. Anything that is not a non-blank string (missing
        values, numbers, empty strings) is treated as "not English".

    Returns
    -------
    str or float
        ``x`` itself for English text, ``np.nan`` otherwise, so that the
        non-English rows can later be removed with ``dropna``.
    """
    # Nothing to detect on missing / non-text / blank values.
    if not isinstance(x, str) or not x.strip():
        return np.nan

    # Imported here so this function stays self-contained.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        result = detect(x)
    except LangDetectException:
        # langdetect raises when the text has no usable features
        # (e.g. only digits or punctuation); treat that as "not English".
        return np.nan

    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    return x if result == 'en' else np.nan
    
df['review'] = df['review'].apply(language_detection)

### drop reviews that are not in english
### (restricted to the review column so that a NaN in an unrelated column
### does not silently remove an English review)
df.dropna(subset=['review'], inplace = True)

In [None]:
#df.to_pickle('clean_dataset.pickle')

### Text Cleaning 

In [12]:
# Reload the cleaned dataset from disk into `df`.
with open('clean_dataset.pickle', mode='rb') as pickled_dataset:
    df = pickle.load(pickled_dataset)

In [23]:
### tokenize the text, lowercase, remove punctuation, compile reviews into list of list 

def gensimple_preprocess(text):
    """Tokenize ``text`` with ``gensim.utils.simple_preprocess`` (``deacc=True``).

    Returns the value produced by ``simple_preprocess`` for the given text.
    """
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    return tokens

# replace every review with its preprocessed form, then collect them in one list
df['review'] = df['review'].apply(gensimple_preprocess)
all_reviews = df['review'].to_list()

In [13]:
### list stopwords 
stop_words = nltk.corpus.stopwords.words('english')

#stop_words =  list(spacy.lang.en.stop_words.STOP_WORDS)
#stop_words.extend(['game','pokemon','pokémon', 'play','make'])

# No merge step here: only the NLTK list is in play in this cell, and a loop
# that walks stop_words and appends the words "missing" from stop_words can
# never append anything.

stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
### remove stop words 

# Collect every token of every review that is not a stop word into one flat list.
# A set makes each membership test O(1) instead of a scan of the stop word list.
# NOTE(review): the result is a single flat list, so review boundaries are not
# kept - confirm that this is what the downstream code expects.
stop_word_set = set(stop_words)

reviews_nostopword = [
    word
    for review in all_reviews
    for word in review
    if word not in stop_word_set
]



In [None]:
# Build the bigram and trigram models - reference: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# whitespace-split every review into a list of tokens
data = [entry.split() for entry in df.review]

# bigram detector and its Phraser wrapper
bigram = gensim.models.Phrases(data, min_count=5, threshold=5) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)

# trigram detector, trained on the bigram-transformed corpus, and its Phraser wrapper
trigram = gensim.models.Phrases(bigram[data], threshold=5)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(x):
    """Run the whitespace-split text ``x`` through ``bigram_mod``.

    Returns the resulting tokens as one string in which every token is
    preceded by a single space (a non-empty result starts with a space).
    """
    tokens = bigram_mod[x.split()]
    return ''.join(' ' + token for token in tokens)

def make_trigrams(x):
    """Run the whitespace-split text ``x`` through ``bigram_mod`` then ``trigram_mod``.

    Returns the resulting tokens as one string in which every token is
    preceded by a single space (a non-empty result starts with a space).
    """
    bigrammed = bigram_mod[x.split()]
    tokens = trigram_mod[bigrammed]
    return ''.join(' ' + token for token in tokens)

In [None]:
# def remove_r(text):
#     return text.replace('\r','')

# df['review'] = df['review'].apply(lambda x:remove_r(x))  

In [None]:
# def strip_whitespace(text):
#     return text.strip()

# def make_lower(text):
#     return text.lower()

# def remove_digits(text):
#     return re.sub('\d', '', text)

# def remove_punctuation(text):
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text) 
#     return re.sub(r'[^\w\s]', '', text)

# #df['review'] = df['review'].apply(lambda x:remove_punctuation(x))

# def clean_text(text):
#     text = strip_whitespace(text)
#     text = make_lower(text)
#     text = remove_punctuation(text)
#     text = remove_digits(text)
#     return text

In [None]:
# df['review'] = df['review'].apply(lambda x:clean_text(x))

In [None]:
#df['review'] = df['review'].apply(lambda x:remove_stopwords(str.split(x)))

In [None]:
# lemmatizer=WordNetLemmatizer()

# def lemmatize_words(x):
#     lemmed_string = ''
#     for word in x.split():
#         lemmed_string = lemmed_string+' '+lemmatizer.lemmatize(word)  
#     return lemmed_string.lstrip()

sp = spacy.load('en_core_web_sm')

def lemmatize_words(text, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Return the lemmas of the tokens in ``text`` whose POS tag is allowed.

    Parameters
    ----------
    text : str
        Text to process; it is parsed with the module-level spaCy pipeline ``sp``.
    allowed_postags : sequence of str
        POS tags (compared against ``token.pos_``) of the tokens to keep.
        The default is an immutable tuple so it cannot be modified between calls.

    Returns
    -------
    str
        The kept lemmas separated by single spaces, without leading whitespace.
    """
    lemmas = []
    for token in sp(text):
        if token.pos_ not in allowed_postags:
            continue
        # Tokens whose lemma is the '-PRON-' placeholder are left out of the
        # result; there is no need to modify the token for that.
        if token.lemma_ == '-PRON-':
            continue
        lemmas.append(token.lemma_)
    # join once instead of growing a string token by token
    return ' '.join(lemmas).lstrip()

# def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return texts_out

In [None]:
# keep only the lemmas of nouns and verbs in every review
df['review'] = df['review'].apply(lemmatize_words, allowed_postags=['NOUN', 'VERB'])

In [None]:
# df['review'] = df['review'].apply(lambda x:make_bigrams(x))  
# df['review'] = df['review'].apply(lambda x:make_trigrams(x))  

In [None]:
df.to_pickle('dfclean_b4rare.pickle') ### cleaned text and lemmatised 

In [None]:
with open('dfclean_b4rare.pickle','rb') as read_file:
    df = pickle.load(read_file)

In [None]:
# ### create list of rare words by filtering on word counts
# count every whitespace-separated token across all reviews
word_counts = df.review.str.split(expand=True).stack().value_counts()

# keep the tokens that occur fewer than 11 times, as a plain list
freq = word_counts[word_counts < 11].index.to_list()

In [None]:
freq

In [None]:
def remove_words(text, wordlist):
    """Delete whole-word occurrences of the given words from ``text``.

    Parameters
    ----------
    text : str
        The text to clean.
    wordlist : iterable of str
        Words to remove. Each word is matched literally between word
        boundaries (``\\b``), so ``cat`` is not removed from ``category``.

    Returns
    -------
    str
        ``text`` with the matches replaced by the empty string; the
        whitespace around a removed word is left in place.
    """
    for word in wordlist:
        # cheap substring test first so the regex only runs when it can match
        if word in text:
            # re.escape keeps regex metacharacters in the word ('.', '+', '(' ...)
            # from being interpreted as pattern syntax
            text = re.sub(r'\b{}\b'.format(re.escape(word)), '', text)
    return text

# strip the rare words collected in `freq` from every review
df['review'] = df['review'].apply(remove_words, args=(freq,))

In [None]:
# ### checking high frequency words to add to stopword list 
freq = pd.DataFrame(df.review.str.split(expand=True).stack().value_counts())

In [None]:
pd.set_option('display.max_rows', 1000)
freq.head(30)

In [None]:
df.to_pickle('dfclean.pickle')

# with open('dfclean.pickle','rb') as read_file:
#      df = pickle.load(read_file)

In [None]:
# def replace_word(text,word,replacement):
#     return re.sub(r'\b{}\b'.format(word), replacement, text) 

# df['review'] = df['review'].apply(lambda x:replace_word(x,'game_freak','gamefreak'))


In [None]:
#freq = pd.DataFrame(df.review.str.split(expand=True).stack().value_counts())

In [None]:
### remove stopwords text, using this method just incase I want to add more stopwords 
nltk_stop_words = nltk.corpus.stopwords.words('english')

stop_words =  list(spacy.lang.en.stop_words.STOP_WORDS)
stop_words.extend(['game','pokemon','pokémon', 'play','make'])
# other domain words that were considered as extra stopwords:
# 'animation', 'area', 'battle', 'sword', 'shield', 'review', 'new', 'bad', 'good', 'more'

# Merge the NLTK list into the spaCy-based list. The loop has to walk
# nltk_stop_words: walking stop_words and testing membership in stop_words
# can never add a word.
known_words = set(stop_words)
for word in nltk_stop_words:
    if word not in known_words:
        known_words.add(word)
        stop_words.append(word)

# Also add a punctuation-free variant of every stopword (e.g. "don't" -> "dont").
# The stripping is done inline (drop every character that is not a word
# character or whitespace, plus '_') because no remove_punctuation function is
# defined in this file - it only exists as commented-out code.
# Iterate over a snapshot so the list is not appended to while it is walked.
for word in list(stop_words):
    no_punct = re.sub(r'[^\w\s]|_', '', word)
    if no_punct and no_punct not in known_words:
        known_words.add(no_punct)
        stop_words.append(no_punct)

## Splitting df by sentiment 

In [None]:
# One frame per sentiment label; the labels must match the strings returned
# by sentiment() exactly ('negative', 'mixed', 'positive').
negative = df[df['sentiment'] == 'negative']
mixed = df[df['sentiment'] == 'mixed']
positive = df[df['sentiment'] == 'positive']


In [None]:
negative

In [None]:
# unigram + bigram counts; the token pattern only accepts runs of three or
# more lowercase a-z letters
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=stop_words,
    strip_accents='ascii',
    token_pattern="\\b[a-z][a-z][a-z]+\\b",
)

In [None]:
doc_word = vectorizer.fit_transform(negative.review)
doc_word.shape

In [None]:
pd.DataFrame(doc_word.toarray(), index=negative.review, columns=vectorizer.get_feature_names()).head(10)

## LSA 

In [None]:
# four-component truncated SVD (LSA) fitted on the document-word counts
lsa = TruncatedSVD(n_components=4)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

In [None]:
# One row per LSA component. The labels are generated from the number of
# fitted components so the index always matches the shape of components_
# (a fixed two-label index does not fit a four-component model).
component_labels = ["component_{}".format(i + 1)
                    for i in range(lsa.components_.shape[0])]
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = component_labels,
             columns = vectorizer.get_feature_names())
topic_word

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every topic in ``model``.

    For each row of ``model.components_`` a header line is printed (the
    topic's name from ``topic_names`` when one is given and non-empty,
    otherwise its index), followed by the ``no_top_words`` features with
    the largest weights, joined by ", " in descending order of weight.
    """
    for ix, topic in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)
        # argsort is ascending, so reverse it and keep the leading entries
        top_indices = topic.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[i] for i in top_indices))

In [None]:
display_topics(lsa, vectorizer.get_feature_names(), 15)

### NMF 

In [None]:
# four-component NMF fitted on the document-word counts
nmf_model = NMF(n_components=4)
doc_topic = nmf_model.fit_transform(doc_word)

In [None]:
# One row per NMF component. The labels are generated from the number of
# fitted components so the index always matches the shape of components_
# (a fixed two-label index does not fit a four-component model).
component_labels = ["component_{}".format(i + 1)
                    for i in range(nmf_model.components_.shape[0])]
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = component_labels,
             columns = vectorizer.get_feature_names())
topic_word

In [None]:
display_topics(nmf_model, vectorizer.get_feature_names(), 15)

### LDA 

In [None]:
# refit the vectorizer on the negative reviews, then transpose the count
# matrix before it is handed to gensim
vectorizer.fit(negative.review)
negative_counts = vectorizer.transform(negative.review)
doc_word = negative_counts.transpose()

In [None]:
corpus = matutils.Sparse2Corpus(doc_word)

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [None]:
# invert the vectorizer vocabulary (term -> column index) into index -> term
id2word = {index: term for term, index in vectorizer.vocabulary_.items()}

In [None]:
#lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=20)

# Train a 4-topic gensim LDA model on `corpus`, using `id2word` to map the
# feature ids back to their terms.
# NOTE(review): random_state is presumably fixed to make runs repeatable - confirm.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=77,
                                           update_every=1,
                                           chunksize=100,
                                           passes=50, iterations = 100,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
lda_model.print_topics()

In [None]:
vectorizer.vocabulary_.items()