In [1]:
import pickle
import numpy as np 
import pandas as pd
import re 
import nltk 
import string 
from sklearn.feature_extraction.text import CountVectorizer 

In [2]:
with open('pokemon.pickle','rb') as read_file:
    df = pickle.load(read_file)

### Data Cleaning


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2903 entries, 0 to 887
Data columns (total 5 columns):
name      2903 non-null object
date      2903 non-null object
rating    2903 non-null object
review    2903 non-null object
game      2903 non-null object
dtypes: object(5)
memory usage: 136.1+ KB


In [4]:
df['rating'] = df['rating'].astype(int)

In [5]:
### let's group the reviews by their ratings 
### following metacritics, 0 to 4 = negative, 5 to 7 mixed, 8 and above = positive 

def sentiment(x):
    if x > 7:
        return 'positive'
    if x < 5:
        return 'negative'
    else: return 'mixed'

df['sentiment'] = df['rating'].apply(lambda x:sentiment(x))


In [6]:
df['sentiment'].value_counts(normalize = True)

negative    0.610059
positive    0.306235
mixed       0.083707
Name: sentiment, dtype: float64

In [7]:
### Some users posted on both pokemon sword and shield. 
### Did some checks, these reviews were the same  

df[df.duplicated('name')]

Unnamed: 0,name,date,rating,review,game,sentiment
0,Metagrass,"Nov 15, 2019",2,"I have also done a review for Pokemon Sword, b...",shield,negative
1,NintendoGuy64,"Nov 15, 2019",0,"As a lifelong fan of Pokemon games, I was ecst...",shield,negative
4,Otonaburu,"Nov 15, 2019",4,What should have been a giant leap to signific...,shield,negative
7,Gamermangamer,"Nov 15, 2019",1,"Was promised a game for ""long time fans of the...",shield,negative
8,Fumetic,"Nov 15, 2019",3,"As these are largely the same games, I have pa...",shield,negative
...,...,...,...,...,...,...
871,HollyS,"Nov 20, 2019",2,"Very short, bland and low quality Pokemon game...",shield,negative
875,Lawrence7,"Nov 20, 2019",10,For anyone debating whether they will like the...,shield,positive
877,sojasonk,"Nov 20, 2019",1,"Lazy writing, bad graphics, an absolute medioc...",shield,negative
879,KrakenOfPepsi,"Nov 20, 2019",4,Metacritic has a pretty small character limit ...,shield,negative


In [8]:
df.review[df['name'] == 'Mack_thge_Sack']

1313    I'm going to state my points and not my emotio...
887     I'm going to state my points and not my emotio...
Name: review, dtype: object

In [9]:
df.reset_index(inplace = True, drop = True)

In [10]:
#dropping duplicates 
df.drop_duplicates(subset='name', keep = 'first', inplace = True)

In [11]:
### cleaning the text 

def make_lower(text):
    return text.lower()

def remove_punctuation(text):
    return re.sub('[%s]' % re.escape(string.punctuation), '', text) 

def remove_digits(text):
    return re.sub('\d', '', text)

def clean_text(text):
    text = make_lower(text)
    text = remove_punctuation(text)
    text = remove_digits(text)
    return text

In [12]:
df['review'] = df['review'].apply(lambda x:clean_text(x))

In [13]:
#df['review'] = df['review'].apply(lambda x:remove_stopwords(str.split(x)))

In [None]:
### check for english words 

In [60]:
pip install langdetect

Collecting langdetect
  Downloading https://files.pythonhosted.org/packages/59/59/4bc44158a767a6d66de18c4136c8aa90491d56cc951c10b74dd1e13213c9/langdetect-1.0.7.zip (998kB)
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.7-cp37-none-any.whl size=993467 sha256=fd0ecf5ed857c2fc5443a174c75b89e9f265e0fea8d80522bd9e2db7a619d544
  Stored in directory: C:\Users\adelweiss\AppData\Local\pip\Cache\wheels\ec\0c\a9\1647275e7ef5014e7b83ff30105180e332867d65e7617ddafe
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.7
Note: you may need to restart the kernel to use updated packages.


In [52]:
from nltk.stem import WordNetLemmatizer
#from nltk.tokenize import word_tokenize

lemmatizer=WordNetLemmatizer()

def lemmatize_words(x):
    lemmed_string = ''
    for word in x.split():
        lemmed_string = lemmed_string+' '+lemmatizer.lemmatize(word)  
    return lemmed_string.lstrip()

In [53]:
df['review'] = df['review'].apply(lambda x:lemmatize_words(x))

In [14]:
### remove stopwords text, using this method just incase I want to add more stopwords 
stop_words = nltk.corpus.stopwords.words('english')

# stop_words.extend(['come','order','try','go','get','make','drink','plate','dish','restaurant','place',
#                   'would','really','like','great','service','came','got'])

for word in stop_words:
    no_punct = remove_punctuation(word)
    if no_punct not in stop_words: 
        stop_words.append(no_punct)

In [54]:
df

Unnamed: 0,name,date,rating,review,game,sentiment
0,ChocolateCrane,"Nov 18, 2019",4,let u address the elephant in the room first t...,sword,negative
1,Spaxe,"Nov 15, 2019",5,my copy of sword and shield came in the mail a...,sword,mixed
2,Zerato,"Nov 15, 2019",5,the game is meh not even counting the pokemon ...,sword,mixed
3,firstlovezombie,"Nov 15, 2019",2,i really love pokemon which is why im so criti...,sword,negative
4,FilthyActs88,"Nov 15, 2019",1,pokémon sword and shield is a game of pure med...,sword,negative
...,...,...,...,...,...,...
2897,Cotroneo,"Nov 20, 2019",2,if you consider this game a successor to the s...,shield,negative
2898,gooblaster,"Nov 20, 2019",6,doe a lot of stuff really wellbut what it does...,shield,mixed
2899,Cassichu,"Nov 20, 2019",7,i have been a long time pokemon fan since befo...,shield,mixed
2900,SayoNightclaw,"Nov 21, 2019",8,most of the negative review are people just ma...,shield,positive


In [55]:
count_vectorizer = CountVectorizer(stop_words=stop_words, 
                                   strip_accents = 'ascii', 
                                   token_pattern="\\b[a-z][a-z]+\\b")

count_vectorizer.fit(df.review)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents='ascii', token_pattern='\\b[a-z][a-z]+\\b',
                tokenizer=None, vocabulary=None)

In [56]:
doc_word = count_vectorizer.transform(df.review).transpose()

In [59]:
pd.DataFrame(doc_word.toarray(), count_vectorizer.get_feature_names())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2466,2467,2468,2469,2470,2471,2472,2473,2474,2475
aa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ab,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abaixo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abandona,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zubats,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zudem,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zur,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zwar,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
