# Explore here

In [44]:
from pickle import dump

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#------------ PREPROCESSING -------------
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# ----------- MACHINE LEARNING ----------
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
# ----------- METRICS -------------------
from sklearn.metrics import accuracy_score
#------------ OPTIMIZATION --------------
from sklearn.model_selection import GridSearchCV


In [11]:
# Location of the Play Store reviews CSV used throughout this notebook.
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv'

# Load the CSV into the "raw" frame; later cells derive their data from it.
df_raw = pd.read_csv(DATA_URL)

# Print the column names, non-null counts and dtypes.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [12]:
# Preview 10 randomly chosen rows (no seed is passed, so the rows differ on every run).
df_raw.sample(10)

Unnamed: 0,package_name,review,polarity
673,com.hamrokeyboard,excellent ŕ¤§ŕľŕ¤°ŕľ ŕ¤¸ŕ¤žŕ¤˛ ŕ¤şŕ¤ŕ¤ż ŕ¤...,1
157,com.king.candycrushsaga,for this new update....đ didn't showing fa...,0
476,com.Slack,glitch: not working on galaxy note 5 this is ...,0
844,com.hamropatro,its very helpful,1
236,com.supercell.clashofclans,"new update sux! no more dead bases, no more f...",0
79,com.linkedin.android,the new app looks great and is very easy to ...,0
325,com.viber.voip,on sgh-i727 android 4.1.2 the new platform sh...,0
332,com.viber.voip,cant use not being able to call any contact p...,0
416,com.facebook.orca,not good i had for a while but is not working...,0
374,com.google.android.talk,man i hate google and android went into my me...,0


In [13]:
# The dataset is imbalanced. This must be taken into account when creating the
# training and testing datasets, and when applying the model.
df_raw['polarity'].value_counts()

polarity
0    584
1    307
Name: count, dtype: int64

In [14]:
# Start the processed frame as a copy of the raw data, leaving df_raw as loaded.
df_processed = df_raw.copy()

In [15]:
# PREPROCESSING

# Build df_processed directly from df_raw so that this cell can be re-run safely:
# dropping the column from df_processed itself raises KeyError on a second run,
# because 'package_name' is already gone by then.
#   - package_name is removed because it is unnecessary for the model.
#   - review text is stripped of surrounding whitespace and lower-cased.
df_processed = (
    df_raw
    .drop(columns=['package_name'])
    .assign(review=lambda frame: frame['review'].str.strip().str.lower())
)

df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   review    891 non-null    object
 1   polarity  891 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 14.0+ KB


In [17]:
# Preview 10 randomly chosen rows of the processed frame (unseeded, so rows vary per run).
df_processed.sample(10)

Unnamed: 0,review,polarity
532,"cool app except the notifications freeze, so i...",0
635,awesome app it is good and very useful app it ...,1
248,image search on verizon ellipsis 10 tablet doe...,0
610,good but ui ease of use still declining i've u...,1
569,works great! i wish it supported expandable / ...,1
83,issues setting up. works fine now had issues w...,0
457,wtf it worked for the last 3yrs. then all of a...,0
595,"neat idea, but let-down by no linux support. a...",0
201,help me!!!!!!! my ipad had a software so i los...,0
220,i don't like the new shield system. your base ...,0


In [19]:
# Split the data into training and testing sets.
# X holds the review text, y the polarity label.
X, y = df_processed['review'], df_processed['polarity']

# stratify=y builds both splits proportionally to the class distribution of y;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1530,
)

# Peek at a few training reviews.
X_train.sample(5)

225    1 star new update i am a busy man and don't al...
161    awesomness this game is awesome but tricky wit...
734    too bad but improving the earlier version was ...
239    not responding occasionally sometimes chrome j...
604    best free note ever !!! sometimes evernote pop...
Name: review, dtype: object

In [21]:
# Bag-of-words encoding: each review becomes a vector of word counts, with English
# stop words removed. The vocabulary is learned from the training reviews only and
# then reused, unchanged, to encode the test reviews.
vec_model = CountVectorizer(stop_words="english")

# Read the text from the original Series `X`, selecting rows by the index labels of
# y_train / y_test (same rows, same order as the split). X_train / X_test are rebound
# from text to arrays on the lines below, so reading them here instead would make
# this cell fail when it is run a second time.
# .toarray() densifies the sparse matrices; GaussianNB, fitted later in this
# notebook, does not accept sparse input.
X_train = vec_model.fit_transform(X.loc[y_train.index]).toarray()
X_test = vec_model.transform(X.loc[y_test.index]).toarray()

In [25]:
# Display the vectorized training data (the array produced by CountVectorizer + toarray()).
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [33]:
# Create the model

# MultinomialNB is the natural choice here because the features are discrete,
# non-negative word counts (the output of CountVectorizer).

model_multiNB = MultinomialNB()

# Fit on the training split; as the last expression, the fitted estimator is displayed.
model_multiNB.fit(X_train, y_train)


In [34]:
# Predict the polarity label of every test review and display the predictions.
y_hat_multiNB = model_multiNB.predict(X_test)
y_hat_multiNB

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0])

In [36]:
# Share of test reviews whose predicted polarity matches the true label.
acc_multiNB = accuracy_score(y_test, y_hat_multiNB)
print(f'Accuracy for MultinomialNB: {acc_multiNB:.2f}')

Accuracy for MultinomialNB: 0.83


In [38]:
# Try the other Naive Bayes variants on the same features, starting with GaussianNB.

# fit() returns the estimator itself, so creation and training can be chained.
model_gaussNB = GaussianNB().fit(X_train, y_train)

y_hat_gaussNB = model_gaussNB.predict(X_test)

acc_gaussNB = accuracy_score(y_test, y_hat_gaussNB)
print(f'Accuracy for GaussianNB: {acc_gaussNB:.2f}')

Accuracy for GaussianNB: 0.78


In [39]:
# Same experiment with BernoulliNB.

# fit() returns the estimator itself, so creation and training can be chained.
model_berNB = BernoulliNB().fit(X_train, y_train)

y_hat_berNB = model_berNB.predict(X_test)

acc_berNB = accuracy_score(y_test, y_hat_berNB)
print(f'Accuracy for BernoulliNB: {acc_berNB:.2f}')

Accuracy for BernoulliNB: 0.74


In [41]:
# Compare the three models side by side.

# Light-to-red colormap used to shade the accuracy column.
cm = sns.light_palette('red', as_cmap=True)

# Test-set predictions keyed by model name (dict order defines the row order).
predictions_by_model = {
    'MultinomialNB': y_hat_multiNB,
    'GaussianNB': y_hat_gaussNB,
    'BernoulliNB': y_hat_berNB,
}

model_compare = pd.DataFrame({
    'model_name': list(predictions_by_model),
    'Accuracy': [
        accuracy_score(y_test, predicted)
        for predicted in predictions_by_model.values()
    ],
})

# Render the table with the numeric column shaded by value.
model_compare.style.background_gradient(cmap=cm)

Unnamed: 0,model_name,Accuracy
0,MultinomialNB,0.829596
1,GaussianNB,0.775785
2,BernoulliNB,0.744395


In [None]:
# MultinomialNB was the right choice: it achieves the highest accuracy of the three models.

In [42]:
# List the hyperparameter names MultinomialNB exposes, i.e. what can be tuned.
MultinomialNB().get_params().keys()

dict_keys(['alpha', 'class_prior', 'fit_prior', 'force_alpha'])

In [43]:
# Hyperparameter search space for MultinomialNB.
#   alpha     - candidate values for the model's `alpha` parameter
#   fit_prior - candidate values for the model's `fit_prior` parameter
# NOTE(review): 0.01 is absent from the otherwise power-of-ten alpha grid - confirm
# whether that gap is intentional.
param_grid = dict(
    alpha=[1e-5, 1e-4, 1e-3, 0.1, 1, 10, 100, 1000],
    fit_prior=[True, False],
)

In [None]:
# Exhaustive search over param_grid, scored by accuracy with 10-fold cross-validation.
grid = GridSearchCV(model_multiNB, param_grid, scoring="accuracy", cv=10)

# Constructing the search object does not run it: fit() is what actually trains and
# scores every parameter combination on the training data.
grid.fit(X_train, y_train)

print(f'Best cross-validated accuracy: {grid.best_score_:.2f}')

# Winning hyperparameter combination (displayed as the cell output).
grid.best_params_

In [None]:
# Persist the final model to disk.
# Use the tuned estimator when the grid search has been fitted; otherwise fall back
# to the MultinomialNB model trained earlier. (The name `model`, which this cell
# used before, is never defined in the notebook and raised NameError.)
final_model = getattr(grid, "best_estimator_", model_multiNB)

# The context manager closes the file handle even if pickling fails.
with open("naive_bayes.sav", "wb") as model_file:
    dump(final_model, model_file)