## importing libraries

In [1]:
import pandas as pd
import numpy as np

import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score

## fetching the data

In [2]:
df = pd.read_csv('clean_reviews_data_50k.csv')
df.head()

Unnamed: 0,Unnamed: 0.1,Rating,Title,Review,stemmed_content
0,0,4,Raspberries are Magic,Excellent family film for all ages! You can't ...,excel famili film age find mani good movi fami...
1,1,1,Road Warrior defeated in less than 1 year,I bought this battery for my wife's cell phone...,bought batteri wife cell phone last christma l...
2,2,5,"Excellent advice, and very well written",I've read this book cover-to-cover twice as I ...,read book cover cover twice prepar law school ...
3,3,1,"Horrible, Horrible, Horrible!",The dialogue is so bad that you will feel dumb...,dialogu bad feel dumber read book ever write d...
4,4,1,Don't get ripped off,Well...I have only one thing to say about this...,well one thing say soundtrack select whoever c...


## machine learning

#### putting the reviews inside a bag of words

In [3]:
cv = CountVectorizer(encoding='UTF-8')
bow = cv.fit_transform(df['stemmed_content'].values.astype('U')).astype('int8').toarray()

#### splitting an x and a y

In [4]:
x = pd.DataFrame(bow)
y = df['Rating'].astype('int8')

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

#### training the model

In [7]:
nv = MultinomialNB()
nv.fit(x_train.astype('int8'), y_train)

#### measuring metrics

In [8]:
y_pred = nv.predict(x_test)

In [9]:
confusion_matrix(y_test, y_pred)

array([[1422,  578,  220,  113,  127],
       [ 670,  932,  567,  234,  155],
       [ 416,  548,  738,  452,  228],
       [ 217,  286,  580,  746,  685],
       [ 227,  180,  262,  615, 1302]], dtype=int64)

In [10]:
accuracy_score(y_test, y_pred)

0.4112

#### saving the model

In [11]:
with open('model.pkl', 'wb') as fout:
    pickle.dump((cv, nv), fout)