# Explore here

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#------------ PREPROCESSING --------------
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# ----------- MACHINE LEARNING ----------
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
# ----------- METRICS -------------------
from sklearn.metrics import accuracy_score
#------------ OPTIMIZATION --------------
from sklearn.model_selection import GridSearchCV


In [56]:
# Play Store reviews data set: one row per review, labelled with a 0/1 `polarity`.
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv'
df_raw = pd.read_csv(filepath_or_buffer=DATA_URL)

# Structural overview: column names, dtypes and non-null counts.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [57]:
# Peek at 10 random rows; the fixed seed makes the displayed sample reproducible on re-run.
df_raw.sample(10, random_state=1530)

Unnamed: 0,package_name,review,polarity
330,com.viber.voip,i wish there is a mute button for selected pe...,0
157,com.king.candycrushsaga,for this new update....đ didn't showing fa...,0
885,com.rovio.angrybirds,too many ads far more adverts than any other ...,1
53,com.twitter.android,bug in changing notification sound i gave 1 s...,0
267,com.android.chrome,chrome since last update you can't scroll dow...,0
436,com.whatsapp,privacy its nice how it works with the agenda...,0
233,com.supercell.clashofclans,shit guys dont download it. the new update su...,0
808,org.mozilla.firefox,almost good enough i have tried lots of andro...,1
173,com.imangi.templerun2,new theme its a fun game.......its great to s...,1
366,com.google.android.talk,fraud! i did not select this app to install. ...,0


In [58]:
# The dataset is imbalanced (see the class counts below). This must be taken into
# account when creating the training and testing datasets and when applying the model.
df_raw['polarity'].value_counts()

polarity
0    584
1    307
Name: count, dtype: int64

In [59]:
# Work on an independent (deep) copy so that `df_raw` stays untouched.
df_processed = df_raw.copy(deep=True)

In [60]:
# PREPROCESSING
#
# `df_processed` is built from `df_raw` in a single expression rather than being
# reassigned from itself, so the cell is idempotent: re-running it does not fail on
# an already-dropped `package_name` column.
df_processed = (
    df_raw
    # `package_name` is dropped because it is not needed for the analysis.
    .drop(columns=['package_name'])
    # Normalise the text: trim surrounding whitespace and lower-case it.
    .assign(review=lambda frame: frame['review'].str.strip().str.lower())
)

df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   review    891 non-null    object
 1   polarity  891 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 14.0+ KB


In [61]:
# Spot-check 10 processed rows; seeded so the displayed rows are the same on every run.
df_processed.sample(10, random_state=1530)

Unnamed: 0,review,polarity
53,bug in changing notification sound i gave 1 so...,0
27,doesn't work 90% of the time. doesn't update m...,0
579,"was great, broken with marshmallow i lauded ev...",1
572,"are you serious, there is no widget available?",0
137,i love playing candy crush but... why when u g...,0
194,very adventurous game i love it very nice thri...,1
195,nice game.... its simply amazing...but i would...,1
433,"app continuously losses connection, at times i...",0
425,this has made my life hell!! it really sucks m...,0
513,better than groupme đ,1


In [62]:
# Divide the data set into train and test data sets.
X = df_processed['review']
y = df_processed['polarity']

# stratify=y: build both splits with class proportions that follow `y`.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1530)

# Preview a few training reviews; seeded so the preview is reproducible on re-run.
X_train.sample(5, random_state=1530)

344    wtf keeps going offline automatically ,my mess...
816    hamro keyboar needs update n new emojis thnkz ...
847                 good ŕ¤ŕ¤ŕ¤śŕ¤ž ŕ¤°ŕ¤žŕ¤žŕľŕ¤°ŕľ
889    fun works perfectly well. ads aren't as annoyi...
883    great game, but too many ads almost not worth ...
Name: review, dtype: object

In [63]:
# Turn each review into a vector of word counts, ignoring English stop words.
vec_model = CountVectorizer(stop_words="english")

# Vectorise from the untouched text Series `X`, selecting the rows of each split by
# the index labels carried by `y_train` / `y_test`, instead of reading `X_train` /
# `X_test`, which this cell overwrites with arrays. On the first run the selected
# text is exactly the split's text; on later runs the cell still works because it
# never feeds its own output back in.
# The vocabulary is learned from the training rows only; the test rows reuse it.
# `.toarray()` converts the count matrix to a dense NumPy array.
X_train = vec_model.fit_transform(X.loc[y_train.index]).toarray()
X_test = vec_model.transform(X.loc[y_test.index]).toarray()

In [64]:
# Inspect the vectorised training matrix (NumPy abbreviates the display).
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [65]:
# Create the model

# MultinomialNB is the natural first choice here: the features are discrete word counts produced by CountVectorizer, which is the kind of input this model is intended for.

model_multiNB = MultinomialNB()

model_multiNB.fit(X_train, y_train)


In [66]:
# Predict the polarity of the held-out test reviews and display the predictions.
y_hat_multiNB = model_multiNB.predict(X_test)
y_hat_multiNB

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0])

In [67]:
# Fraction of test reviews whose polarity was predicted correctly.
acc_multiNB = accuracy_score(y_test, y_hat_multiNB)
print(f'Accuracy for MultinomialNB: {acc_multiNB:.2f}')

Accuracy for MultinomialNB: 0.83


In [68]:
# Testing with the other models: Gaussian Naive Bayes on the same features.
# `fit` returns the estimator itself, so construction and training are chained.
model_gaussNB = GaussianNB().fit(X_train, y_train)
y_hat_gaussNB = model_gaussNB.predict(X_test)

acc_gaussNB = accuracy_score(y_test, y_hat_gaussNB)
print(f'Accuracy for GaussianNB: {acc_gaussNB:.2f}')

Accuracy for GaussianNB: 0.78


In [69]:
# Bernoulli Naive Bayes on the same features, for comparison.
# `fit` returns the estimator itself, so construction and training are chained.
model_berNB = BernoulliNB().fit(X_train, y_train)
y_hat_berNB = model_berNB.predict(X_test)

acc_berNB = accuracy_score(y_test, y_hat_berNB)
print(f'Accuracy for BernoulliNB: {acc_berNB:.2f}')

Accuracy for BernoulliNB: 0.74


In [70]:
# Compare the 3 models side by side on test-set accuracy.

cm = sns.light_palette('red', as_cmap=True)

# Model label -> its test-set predictions (insertion order defines the row order).
predictions_by_model = {
    'MultinomialNB': y_hat_multiNB,
    'GaussianNB': y_hat_gaussNB,
    'BernoulliNB': y_hat_berNB,
}

model_compare = pd.DataFrame({
    'model_name': list(predictions_by_model),
    'Accuracy': [accuracy_score(y_test, y_hat) for y_hat in predictions_by_model.values()],
})

# Shade the table with the red gradient defined above.
model_compare.style.background_gradient(cmap=cm)

Unnamed: 0,model_name,Accuracy
0,MultinomialNB,0.829596
1,GaussianNB,0.775785
2,BernoulliNB,0.744395


In [71]:
#The model was selected correctly, as it achieves the best accuracy of the three models.

In [72]:
# List the names of the parameters MultinomialNB exposes, to decide what to tune.
MultinomialNB().get_params().keys()

dict_keys(['alpha', 'class_prior', 'fit_prior', 'force_alpha'])

In [73]:
# Search space for the grid search over MultinomialNB:
#   alpha     - candidate values spanning 1e-5 .. 1e3
#   fit_prior - try both settings
# NOTE(review): the alpha grid skips 0.01 between 0.001 and 0.1 - confirm this is intended.
param_grid = dict(
    alpha=[0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
    fit_prior=[True, False],
)

In [74]:
# Exhaustive search over `param_grid`, scored by accuracy with 10-fold cross-validation.
grid = GridSearchCV(
    estimator=model_multiNB,
    param_grid=param_grid,
    scoring="accuracy",
    cv=10,
)
grid

In [75]:
# Run the search using the training data only; the test set is not passed to the search.
grid.fit(X_train, y_train)

In [76]:
# Report the parameter combination selected by the grid search.
best_params = grid.best_params_
print(f"Best hyperparameters: {best_params}")

Best hyperparameters: {'alpha': 1, 'fit_prior': False}


In [77]:
# Build the tuned model directly from the grid-search result instead of hard-coding
# the values, so this cell cannot drift out of sync with what the search selected
# (for the run shown in this notebook that is alpha=1, fit_prior=False).
model_multiNB_hyp = MultinomialNB(**grid.best_params_)

model_multiNB_hyp.fit(X_train, y_train)

In [78]:
# Predict the test-set polarity with the tuned MultinomialNB model.
y_hat_multiNB_hyp = model_multiNB_hyp.predict(X_test)

In [79]:
# Test-set accuracy of the tuned model, shown with 4 decimals.
acc_multiNB_hyp = accuracy_score(y_test, y_hat_multiNB_hyp)
print(f'Accuracy for MultinomialNB optimized: {acc_multiNB_hyp:.4f}')

Accuracy for MultinomialNB optimized: 0.8386


In [80]:

# Final comparison table: the three baseline models plus the tuned MultinomialNB.
cm = sns.light_palette('red', as_cmap=True)

# Model label -> its test-set predictions (insertion order defines the row order).
predictions_by_model = {
    'MultinomialNB': y_hat_multiNB,
    'GaussianNB': y_hat_gaussNB,
    'BernoulliNB': y_hat_berNB,
    'MultinomialNB Optimized': y_hat_multiNB_hyp,
}

model_compare = pd.DataFrame({
    'model_name': list(predictions_by_model),
    'Accuracy': [accuracy_score(y_test, y_hat) for y_hat in predictions_by_model.values()],
})

# Shade the table with the red gradient defined above.
model_compare.style.background_gradient(cmap=cm)

Unnamed: 0,model_name,Accuracy
0,MultinomialNB,0.829596
1,GaussianNB,0.775785
2,BernoulliNB,0.744395
3,MultinomialNB Optimized,0.838565


In [81]:
from pickle import dump

# Persist the tuned model to disk. The `with` block guarantees the file handle is
# flushed and closed even if pickling fails, instead of leaving an open handle behind.
# NOTE: only load this file back with pickle if it comes from a trusted source.
with open("../models/naive_bayes_multiNB_Optimized.sav", "wb") as model_file:
    dump(model_multiNB_hyp, model_file)