In [1]:
import os
import sys
import warnings

import pickle
import pandas as pd
import numpy as np
import re
import string
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score


warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('datasets/amazon.csv')
df.head()

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


In [3]:
df.isnull().sum()

reviewText    0
Positive      0
dtype: int64

In [4]:
df.dtypes

reviewText    object
Positive       int64
dtype: object

In [5]:
df.shape

(20000, 2)

In [6]:
def lower_case(text):
    return text.lower()

In [7]:
df.reviewText = df.reviewText.apply(lower_case)

In [8]:
df.head()

Unnamed: 0,reviewText,Positive
0,this is a one of the best apps acording to a b...,1
1,this is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"this is a silly game and can be frustrating, b...",1
4,this is a terrific game on any pad. hrs of fun...,1


In [9]:
def clean_unique(text):
    uniques = re.compile('[^\w\s]')
    return re.sub(uniques, ' ', text)

In [10]:
df = df.sort_values(by=['reviewText', 'Positive'], ascending=True)
df_sample_text = df['reviewText'][26]
df_sample_text_clean = clean_unique(df_sample_text)
print(df_sample_text)
print(df_sample_text_clean)

this is the game124 to get. those other bad comments r stupid. this app doesn't need wifi.  i like this app a lot its free and who doesn't love angry birds!!!
this is the game124 to get  those other bad comments r stupid  this app doesn t need wifi   i like this app a lot its free and who doesn t love angry birds   


In [11]:
df.reviewText = df.reviewText.apply(clean_unique)

In [12]:
df.reviewText[22]

'this is the best game they could think of i downloaded it and my fav  character is the original pig yay love it'

In [13]:
df_sample_text

"this is the game124 to get. those other bad comments r stupid. this app doesn't need wifi.  i like this app a lot its free and who doesn't love angry birds!!!"

In [14]:
def clean_stop_word(text):
    stop_word = stopwords.words('english')
    filter_without_stopwords = []

    for i in text.split():
        if i not in stop_word:
            filter_without_stopwords.append(i)

    return filter_without_stopwords

In [15]:
df.reviewText = df.reviewText.apply(clean_stop_word)

In [16]:
df.head()

Unnamed: 0,reviewText,Positive
6573,"[calculator, ultimate, outstanding, calculator...",1
17399,"[circlelauncher, clever, app, allows, one, ico...",1
6701,"[comiccat, first, last, comic, book, app, look...",1
14001,"[drawing, pad, exactly, drawing, app, kindle, ...",1
10262,"[farkle, fun, almost, addictive, dice, game, p...",1


In [17]:
df_test = df.copy()

In [18]:
df_smaple_test = df_test.reviewText[26]
df_smaple_test

['game124',
 'get',
 'bad',
 'comments',
 'r',
 'stupid',
 'app',
 'need',
 'wifi',
 'like',
 'app',
 'lot',
 'free',
 'love',
 'angry',
 'birds']

In [19]:
def stemming(text):
    p_stemmer = PorterStemmer()
    stemmed_word = []

    for i in text:
        stemmed_word.append(p_stemmer.stem(i))

    return stemmed_word

In [20]:
df.reviewText = df.reviewText.apply(stemming)

In [21]:
df.head()

Unnamed: 0,reviewText,Positive
6573,"[calcul, ultim, outstand, calcul, app, scienti...",1
17399,"[circlelaunch, clever, app, allow, one, icon, ...",1
6701,"[comiccat, first, last, comic, book, app, look...",1
14001,"[draw, pad, exactli, draw, app, kindl, fire, e...",1
10262,"[farkl, fun, almost, addict, dice, game, play,...",1


In [22]:
def join_back(text):
    joined_word = " ".join(text)
    return joined_word

In [23]:
df.reviewText = df.reviewText.apply(join_back)

In [24]:
df.head()

Unnamed: 0,reviewText,Positive
6573,calcul ultim outstand calcul app scientif func...,1
17399,circlelaunch clever app allow one icon home sc...,1
6701,comiccat first last comic book app look way re...,1
14001,draw pad exactli draw app kindl fire easi use ...,1
10262,farkl fun almost addict dice game play comput ...,1


In [25]:
count_vectorize = CountVectorizer(max_features=4000)
X_feature = count_vectorize.fit_transform(df.reviewText).toarray()

In [26]:
pd.DataFrame(X_feature)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3990,3991,3992,3993,3994,3995,3996,3997,3998,3999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
y = df.Positive

In [28]:
X_feature.shape

(20000, 4000)

In [29]:
x_train, x_test, y_train, y_test = train_test_split(X_feature, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=2)

In [30]:
print(f'shape xFeature : {X_feature.shape}')
print(f'shape x train : {x_train.shape}')
print(f'shape x test : {x_test.shape}')
print(f'shape y train : {y_train.shape}')
print(f'shape y test : {y_test.shape}')

shape xFeature : (20000, 4000)
shape x train : (16000, 4000)
shape x test : (4000, 4000)
shape y train : (16000,)
shape y test : (4000,)


In [31]:
# gaussian_model = GaussianNB()
# bernouli_model = BernoulliNB()
# multinomial_model = MultinomialNB()

In [32]:
class Utils:
    tag_accuracy = "Accuracy Score - "
    test_split_tag = "Test Split"
    train_split_tag = "Train Split"
    matrics_label = "Accuracy Score"
    underline = 60*'-'

In [33]:
class Model:
    gaussian_model = GaussianNB()
    bernouli_model = BernoulliNB()
    multinomial_model = MultinomialNB()

In [34]:
def train_and_test(x, y):
    model = Model()
    model.gaussian_model.fit(x, y)
    model.bernouli_model.fit(x, y)
    model.multinomial_model.fit(x, y)
    return model

In [35]:
trained_model = train_and_test(x_train, y_train)
trained_model

<__main__.Model at 0x2ade469d3d0>

In [36]:
test_model = train_and_test(x_test, y_test)
test_model

<__main__.Model at 0x2ade28fcf50>

In [37]:
model = Model()
ypred_training_gaussian = model.gaussian_model.predict(x_train)
ypred_training_bernouli = model.bernouli_model.predict(x_train)
ypred_training_multinomial = model.multinomial_model.predict(x_train)

In [38]:
model = Model()
ypred_testing_gaussian = model.gaussian_model.predict(x_test)
ypred_testing_bernouli = model.bernouli_model.predict(x_test)
ypred_testing_multinomial = model.multinomial_model.predict(x_test)

In [39]:
utils = Utils()

score_gaussian_train = accuracy_score(y_train, ypred_training_gaussian)
score_bernouli_train = accuracy_score(y_train, ypred_training_bernouli)
score_multinomial_train = accuracy_score(y_train, ypred_training_multinomial)

print(f'{utils.matrics_label}')
print(f'{utils.underline}')
print(f'{utils.tag_accuracy}{utils.train_split_tag} Gaussian NB : {score_gaussian_train}')
print(f'{utils.tag_accuracy}{utils.train_split_tag} Bernouli NB : {score_bernouli_train}')
print(f'{utils.tag_accuracy}{utils.train_split_tag} Multinomial NB : {score_multinomial_train}')


Accuracy Score
------------------------------------------------------------
Accuracy Score - Train Split Gaussian NB : 0.61925
Accuracy Score - Train Split Bernouli NB : 0.864
Accuracy Score - Train Split Multinomial NB : 0.86925


In [40]:
utils = Utils()

score_gaussian_test = accuracy_score(y_test, ypred_testing_gaussian)
score_bernouli_test = accuracy_score(y_test, ypred_testing_bernouli)
score_multinomial_test = accuracy_score(y_test, ypred_testing_multinomial)

print(f'{utils.matrics_label}')
print(f'{utils.underline}')
print(f'{utils.tag_accuracy}{utils.test_split_tag} Gaussian NB : {score_gaussian_test}')
print(f'{utils.tag_accuracy}{utils.test_split_tag} Bernouli NB : {score_bernouli_test}')
print(f'{utils.tag_accuracy}{utils.test_split_tag} Multinomial NB : {score_multinomial_test}')


Accuracy Score
------------------------------------------------------------
Accuracy Score - Test Split Gaussian NB : 0.765
Accuracy Score - Test Split Bernouli NB : 0.92025
Accuracy Score - Test Split Multinomial NB : 0.93125


In [41]:
x_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [42]:
model = Model()
random_text = df.reviewText[26]
random_text

'game124 get bad comment r stupid app need wifi like app lot free love angri bird'

In [43]:
count_vec_random = count_vectorize.transform([random_text]).toarray()
count_vec_random

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [44]:
pd.DataFrame(count_vec_random)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3990,3991,3992,3993,3994,3995,3996,3997,3998,3999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
model = Model()
ypred_random = model.multinomial_model.predict(count_vec_random)[0]
ypred_random

1

# Inference

In [None]:
def inference(random_input):
    
    # Define object for ML Model
    model = Model()

    # lower case conversion
    random_input.lower()

    # clean uniques
    regex_unique = re.compile('[^\w\s]')
    clean_unique = re.sub(regex_unique, " ", random_input).strip()

    # clean space
    clean_space = re.sub(r"\s+", " ", clean_unique).strip()

    # clean stop word
    stop_words = stopwords.words('english')
    filter_without_stopwords = []
    for i in clean_space.split():
        if i not in stop_words:
            filter_without_stopwords.append(i)

    # stemming 
    p_stemmer = PorterStemmer()
    stemmed_words = []
    for j in filter_without_stopwords:
        stemmed_words.append(p_stemmer.stem(j))

    # joining back 
    joined_word = " ".join(stemmed_words)

    # vectorize word
    single_word_vectorize = count_vectorize.transform([joined_word]).toarray()

    # Prediction
    y_pred = model.multinomial_model.predict(single_word_vectorize)[0]

    return y_pred


In [47]:
inference("This is a one of the best apps acording to a bunch of people and I agree it has bombs eggs pigs TNT king pigs and realustic stuff")

1

# Pickle

In [None]:
# Pickle Save
def save_pickle():

    with open ('multinomial_naive_bayes_model.pkl', 'wb') as f:
        pickle.dump(model.multinomial_model, f)

    with open('count_vectorize_vectorizer.pkl', 'wb') as f:
        pickle.dump(count_vectorize, f)

save_pickle()


In [None]:
# Pickle Load
with open("multinomial_naive_bayes_model.pkl", "rb") as f:
    model = pickle.load(f)
    print(model)

with open("count_vectorize_vectorizer.pkl", "rb") as f:
    count_vectorize = pickle.load(f)
    print(count_vectorize)

