## Table of Contents

<a id='table_of_contents'></a>

1. [Import Libraries](#imports)
2. [Import Preprocessed Data](#import_data)
3. [Bag of Words (BOW)](#bow)
4. [TF-IDF](#tfidf)
5. [Text Preprocessing & cleaning](#preprocessing)<br>
6. [Result Evaluation & Conclusion](#res)<br>

In [61]:
import numpy as np
import pandas as pd
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import (CountVectorizer, TfidfVectorizer)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import (MultinomialNB, GaussianNB)
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [62]:
warnings.filterwarnings('ignore')
tqdm.pandas()

# 2. Import Preprocessed Data <a id='imports_data'></a>
[Back to top](#table_of_contents)

In [63]:
import pickle
file_name = 'preprocessed_food_reviews.pkl'
with open(file_name, 'rb') as file:
    df_food_review = pickle.load(file)

In [64]:
df_food_review.head()

Unnamed: 0,Text,Score,clean_text
0,This product was horrible. The very first can...,0,this product be horrible the very first can ...
1,My cat usually loves Fancy Feast wet food and ...,0,my cat usually love fancy feast wet food and f...
2,"I really love this Pu'erh tea. It is rich, dar...",1,I really love this puerh tea it be rich dark a...
3,"I love smoked salmon, this is great value over...",1,I love smoke salmon this be great value over t...
4,During the time I lived in England I became aw...,1,during the time I live in england I become awa...


# Preparing Data for Model

In [65]:
x = df_food_review['clean_text']
y = df_food_review['Score']

In [66]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [67]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((13139,), (3285,), (13139,), (3285,))

In [68]:
def evaluate_metrics(y_true, y_pred):
    """Create and print classification report"""
    performance = classification_report(
                    y_true, y_pred, 
                    target_names=["Negative Review", "Positive Review"])
    print(performance)

# 3. Bag of words  <a id='bow'></a>
[Back to top](#table_of_contents)

In [69]:
cv = CountVectorizer()
x_train_cv = cv.fit_transform(x_train)
x_test_cv = cv.transform(x_test)

In [70]:
x_train_cv

<13139x27622 sparse matrix of type '<class 'numpy.int64'>'
	with 669331 stored elements in Compressed Sparse Row format>

In [71]:
x_train_cv.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [72]:
cv.get_feature_names_out()

array(['aa', 'aachen', 'aafcoa', ..., 'zuma', 'zupas', 'zwieback'],
      dtype=object)

In [73]:
df_bow = pd.DataFrame(x_train_cv.todense(), columns=cv.get_feature_names_out())
df_bow.head(3)

Unnamed: 0,aa,aachen,aafcoa,aah,aakg,aback,abandon,abbott,abby,abc,...,zout,zucchini,zucchinii,zuchinni,zuchon,zuke,zukeszbone,zuma,zupas,zwieback
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
df_bow.shape

(13139, 27622)

## Fitting BOW to machine learning model 

# Logistic Regression

In [75]:
lr_cv = LogisticRegression()
lr_cv.fit(x_train_cv, y_train)

In [76]:
y_train_pred = lr_cv.predict(x_train_cv)
evaluate_metrics(y_train, y_train_pred)

                 precision    recall  f1-score   support

Negative Review       0.97      0.97      0.97      6596
Positive Review       0.97      0.97      0.97      6543

       accuracy                           0.97     13139
      macro avg       0.97      0.97      0.97     13139
   weighted avg       0.97      0.97      0.97     13139



In [77]:
y_test_pred = lr_cv.predict(x_test_cv)
evaluate_metrics(y_test, y_test_pred)

                 precision    recall  f1-score   support

Negative Review       0.88      0.85      0.87      1616
Positive Review       0.86      0.89      0.87      1669

       accuracy                           0.87      3285
      macro avg       0.87      0.87      0.87      3285
   weighted avg       0.87      0.87      0.87      3285



# Naive Bayes

In [78]:
nb_bow = MultinomialNB()
nb_bow.fit(x_train_cv, y_train)

In [79]:
y_train_pred = nb_bow.predict(x_train_cv)
evaluate_metrics(y_train, y_train_pred)

                 precision    recall  f1-score   support

Negative Review       0.92      0.92      0.92      6596
Positive Review       0.92      0.92      0.92      6543

       accuracy                           0.92     13139
      macro avg       0.92      0.92      0.92     13139
   weighted avg       0.92      0.92      0.92     13139



In [80]:
y_test_pred = nb_bow.predict(x_test_cv)
evaluate_metrics(y_test_pred, y_test)

                 precision    recall  f1-score   support

Negative Review       0.85      0.85      0.85      1613
Positive Review       0.86      0.86      0.86      1672

       accuracy                           0.86      3285
      macro avg       0.86      0.86      0.86      3285
   weighted avg       0.86      0.86      0.86      3285



# Support Vector Machine (SVM)

In [81]:
svc_bow = LinearSVC()
svc_bow.fit(x_train_cv, y_train)

In [82]:
y_train_pred = svc_bow.predict(x_train_cv)
evaluate_metrics(y_train, y_train_pred)

                 precision    recall  f1-score   support

Negative Review       1.00      1.00      1.00      6596
Positive Review       1.00      1.00      1.00      6543

       accuracy                           1.00     13139
      macro avg       1.00      1.00      1.00     13139
   weighted avg       1.00      1.00      1.00     13139



In [83]:
y_test_pred = svc_bow.predict(x_test_cv)
evaluate_metrics(y_test, y_test_pred)

                 precision    recall  f1-score   support

Negative Review       0.86      0.84      0.85      1616
Positive Review       0.84      0.86      0.85      1669

       accuracy                           0.85      3285
      macro avg       0.85      0.85      0.85      3285
   weighted avg       0.85      0.85      0.85      3285



In [84]:
rf_bow = RandomForestClassifier(random_state=42)
rf_bow.fit(x_train_cv, y_train)

In [85]:
y_train_pred = rf_bow.predict(x_train_cv)
evaluate_metrics(y_train, y_train_pred)

                 precision    recall  f1-score   support

Negative Review       1.00      1.00      1.00      6596
Positive Review       1.00      1.00      1.00      6543

       accuracy                           1.00     13139
      macro avg       1.00      1.00      1.00     13139
   weighted avg       1.00      1.00      1.00     13139



In [86]:
y_test_pred = rf_bow.predict(x_test_cv)
evaluate_metrics(y_test, y_test_pred)

                 precision    recall  f1-score   support

Negative Review       0.83      0.87      0.85      1616
Positive Review       0.87      0.82      0.85      1669

       accuracy                           0.85      3285
      macro avg       0.85      0.85      0.85      3285
   weighted avg       0.85      0.85      0.85      3285



### Observations
###  BOW approach with Naive Bayes Classifer gives good prediction with  6% variance

# 4. TF-IDF  <a id='tfidf'></a>
[Back to top](#table_of_contents)