Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# sns.set()

### Restaurant Reviews | Sentiment Analysis

Dataset Source: https://www.kaggle.com/datasets/maher3id/restaurant-reviewstsv

 Importing & Inspecting Data

In [None]:
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are kept as ordinary characters.
reviews = pd.read_csv('datasets/restaurant_reviews.tsv', sep='\t', quoting=3)
reviews

In [None]:
# Peek at one randomly chosen row (no random_state is passed, so the row differs between runs).
reviews.sample()

In [None]:
# Summarize the columns: names, dtypes and non-null counts.
reviews.info()

Exploring Data

In [None]:
# Number of reviews for each value of the 'Liked' label (class balance check).
reviews['Liked'].value_counts()

In [None]:
# Bar chart of the number of reviews per 'Liked' value.
sns.countplot(x = reviews['Liked'])

In [None]:
# Length of each review as given by len(), i.e. every character counts
# (spaces and punctuation included), not only letters.
reviews['Review Letter Count'] = reviews['Review'].map(len)

In [None]:
# Display the frame, which now includes the 'Review Letter Count' column.
reviews

In [None]:
# Length of the longest review.
reviews['Review Letter Count'].max()

In [None]:
# Text of the longest review. idxmax() returns an index *label*, so the row is
# looked up with label-based .loc; positional .iloc only returned the right row
# while the index happened to be the default 0..n-1 range.
reviews.loc[reviews['Review Letter Count'].idxmax(), 'Review']

 Pre-processing Data (NLTK Text Data Cleaning)

In [None]:
import nltk # Natural Language Toolkit

In [None]:
# REMOVE PUNCTUATION
# import string
# string.punctuation

In [None]:
# First review in the dataset.
reviews['Review'][0] # the word 'this' is a stopword

In [None]:
# Download NLTK's stop-word corpus so that nltk.corpus.stopwords can be read.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

In [None]:
# Show the English stop-word list.
print(stopwords.words('english'))

In [None]:
# Take a single review as a worked example for the cleaning steps.
s = reviews['Review'][0]
s

In [None]:
import re
# Replace every character that is not an ASCII letter (digits, punctuation,
# whitespace, ...) with a single space.
s = re.sub('[^a-zA-Z]', ' ', s)
s

In [None]:
# Lower-case the text so that case variants of a word become the same token.
s = s.lower()
s

In [None]:
# Tokenize on whitespace; from here on s is a list of words, not a string.
s = s.split()
s

In [None]:
# Drop stop words from the token list (explicit-loop version).
# The stop-word list is loaded once into a set instead of calling
# stopwords.words('english') again for every word, and set membership avoids a
# linear scan of the list on each test.
english_stopwords = set(stopwords.words('english'))

temp = []
for word in s:
    if word not in english_stopwords:
        temp.append(word)
print(temp)

In [None]:
# Drop stop words from the token list (comprehension version).
# The stop-word set is built once rather than re-created for every word.
english_stopwords = set(stopwords.words('english'))
s = [word for word in s if word not in english_stopwords]
s

In [None]:
# Re-join the remaining tokens into one space-separated string.
s = ' '.join(s)
s

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# PorterStemmer.stem() treats its argument as ONE word. s is a space-separated
# sentence at this point, so stemming it directly would only alter the tail of
# the whole string. Stem each token separately and re-join, which matches what
# the full-corpus loop does per word.
s = ' '.join(ps.stem(word) for word in s.split())
s

In [None]:
# Bring it all together: clean every review with the same pipeline
# (letters only -> lower-case -> whitespace tokens -> drop stop words -> stem)
# and collect the cleaned strings in `corpus`, one entry per review.

corpus = []
pstem = PorterStemmer()

# Hoisted out of the loop: previously stopwords.words('english') was called
# once per word of every review. A set also gives constant-time membership tests.
# NOTE(review): NLTK's English list includes negations such as "not" and "no";
# removing them may discard sentiment signal. Confirm this is intended.
english_stopwords = set(stopwords.words('english'))

# Iterate over the column values directly. reviews['Review'][i] is a label
# lookup, so driving it with range(len(reviews)) only works while the index is
# exactly 0..n-1.
for review_text in reviews['Review']:
    words = re.sub('[^a-zA-Z]', ' ', review_text).lower().split()
    stemmed_words = [pstem.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stemmed_words))

In [None]:
# Inspect the cleaned, stemmed reviews.
corpus

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer; max_features=2000 caps the vocabulary at the
# 2000 most frequent terms in the corpus.
cv = CountVectorizer(max_features=2000)

In [None]:
# Learn the vocabulary from the cleaned reviews and build the dense
# document-term count matrix (one row per review, one column per term).
# NOTE(review): the vocabulary is fitted on ALL reviews before the train/test
# split, so words from the test rows influence the feature set. Consider
# fitting on the training split only.
X = cv.fit_transform(corpus).toarray()
X

In [None]:
# (number of reviews, number of vocabulary terms)
X.shape

In [None]:
# Target labels: the 'Liked' column.
y = reviews['Liked']
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

 Model Training & Building 

In [None]:
from sklearn.naive_bayes import MultinomialNB # Try GaussianNB

In [None]:
# Multinomial Naive Bayes classifier with default hyperparameters.
clf = MultinomialNB()

In [None]:
# Fit the classifier on the training split.
clf.fit(X_train, y_train)

Model Prediction

In [None]:
# Predict labels for the held-out test rows.
y_pred = clf.predict(X_test)
y_pred

Model Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Fraction of test reviews whose label was predicted correctly.
print(accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix: rows are the true labels, columns are the predicted labels.
print(confusion_matrix(y_test, y_pred))

In [None]:
# Per-class precision, recall, F1-score and support.
print(classification_report(y_test, y_pred))

Model Deployment & Production

In [None]:
import joblib

In [None]:
# The classifier only understands feature columns produced by the fitted
# CountVectorizer, so persist the vectorizer as well. Without it, new review
# text cannot be converted into the inputs the model was trained on.
# At inference time the text must also go through the same cleaning steps
# (letters only, lower-case, stop-word removal, stemming) before cv.transform().
joblib.dump(cv, 'reviews_vectorizer.pkl')

# Persist the fitted classifier (kept last so the cell output is unchanged).
joblib.dump(clf, 'reviews.pkl')