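# Naive Bayes project

Classify Play Store app reviews by their binary `polarity` label using a text pipeline (token counts → TF-IDF → multinomial Naive Bayes), then tune it with a randomized hyperparameter search.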

In [26]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Load the Play Store reviews dataset directly from the course repository.

df_raw = pd.read_csv(
    'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews_dataset.csv'
)

In [4]:
# Summarize columns, dtypes and non-null counts to check for missing values.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [5]:
# Peek at a few random rows. The fixed seed keeps the displayed sample stable
# across "Restart & Run All", so the rendered output stays reproducible.
df_raw.sample(5, random_state=25)

Unnamed: 0,package_name,review,polarity
195,com.imangi.templerun2,nice game.... its simply amazing...but i woul...,1
499,com.Slack,perfect! very close to using the desktop cli...,1
298,com.tencent.mm,ugly design. no autoimport from old install. ...,0
76,com.twitter.android,i hated it i am able to log in successfully b...,0
527,com.dropbox.android,"easy to use, safe super easy to use! fantasti...",1


**1. Transform dataframe**

In [6]:
# Work on an independent (deep) copy so the raw frame is never modified.
df_transf = df_raw.copy(deep=True)

In [7]:
# Remove the package_name column; only the review text and its polarity
# are kept for modelling.
df_transf = df_transf.drop(columns=['package_name'])

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0
...,...,...
886,loved it i loooooooooooooovvved it because it...,1
887,all time legendary game the birthday party le...,1
888,ads are way to heavy listen to the bad review...,0
889,fun works perfectly well. ads aren't as annoy...,1


In [10]:
# Normalize the review text to lower case.

review_lower = df_transf['review'].str.lower()
df_transf['review'] = review_lower

In [12]:
# Snapshot the transformed frame (deep copy) as the modelling dataset.
df = df_transf.copy(deep=True)

**2. Split dataframe into train and test sets**

In [13]:
# Features are the review texts; the target is the polarity label.
X, y = df['review'], df['polarity']

# Hold out 25% of the rows for testing (the library default, spelled out here);
# the fixed seed keeps the partition reproducible.
# NOTE(review): the split is not stratified; consider stratify=y if the class
# proportions should be preserved in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=25
)


In [19]:
# Check the class balance of the training labels.
y_train.value_counts()

0    441
1    227
Name: polarity, dtype: int64

**3. Pipeline with two pre-processing steps and one modeling step**

In [21]:
# Three-step text pipeline: token counts -> TF-IDF weighting -> multinomial
# Naive Bayes classifier. The step names ('vect', 'tfidf', 'clf') are the
# prefixes used to address hyperparameters, e.g. 'clf__alpha'.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)
text_clf.fit(X_train, y_train)

**4. Check results**

In [23]:
# Predict on the held-out reviews and report precision, recall and F-score
# averaged over classes, weighted by class support.
y_pred = text_clf.predict(X_test)
precision_recall_fscore_support(
    y_true=y_test,
    y_pred=y_pred,
    average='weighted',
)

(0.7647596051183496, 0.6905829596412556, 0.6084902239038081, None)

In [24]:
# Compare accuracy on the training and test splits for the baseline pipeline.
train_acc = metrics.accuracy_score(y_train, text_clf.predict(X_train))
test_acc = metrics.accuracy_score(y_test, text_clf.predict(X_test))

print('Naive Bayes Train Accuracy = ', train_acc)
print('Naive Bayes Test Accuracy = ', test_acc)

Naive Bayes Train Accuracy =  0.7949101796407185
Naive Bayes Test Accuracy =  0.6905829596412556


**5. Randomized search to select hyperparameters**

In [27]:
# Randomized search over the pipeline's hyperparameters.
# Keys follow the '<step name>__<parameter>' convention of sklearn pipelines.
n_iter_search = 5  # number of parameter settings sampled from the space below
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),        # with / without inverse-document-frequency weighting
    'clf__alpha': (1e-2, 1e-3),             # additive smoothing of the Naive Bayes model
}

# Only 5 of the 8 possible combinations are sampled. Without a seed, a different
# subset is drawn on every run, so best_params_ and the reported accuracies
# could change between runs. random_state makes the search reproducible.
gs_clf = RandomizedSearchCV(
    text_clf,
    parameters,
    n_iter=n_iter_search,
    random_state=25,
)
gs_clf.fit(X_train, y_train)

In [28]:
# Show the hyperparameter combination selected by the search.
gs_clf.best_params_

{'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__alpha': 0.01}

In [30]:
# Compare accuracy on the training and test splits for the tuned model
# (predict on the search object delegates to the refit best estimator).
# NOTE(review): the recorded run shows train accuracy 1.0 against ~0.82 on the
# test split, which suggests the tuned model overfits; worth confirming.
tuned_train_acc = metrics.accuracy_score(y_train, gs_clf.predict(X_train))
tuned_test_acc = metrics.accuracy_score(y_test, gs_clf.predict(X_test))

print('Naive Bayes Train Accuracy (grid random search) = ', tuned_train_acc)
print('Naive Bayes Test Accuracy (grid random search) = ', tuned_test_acc)

Naive Bayes Train Accuracy (grid random search) =  1.0
Naive Bayes Test Accuracy (grid random search) =  0.820627802690583
