In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Read the labelled restaurant reviews into a DataFrame and preview the
# first five rows to sanity-check the parse.
data = pd.read_csv(filepath_or_buffer='balanced_restaurant_reviews.csv')
data.head(n=5)

Unnamed: 0,Review,Sentiment
0,"Love!! We have the kung pao chicken, broccoli ...",1
1,The new location that JH has moved into was a ...,1
2,Delicious food. If you asked me to recommend a...,1
3,I have been looking forward to Joe's Shanghai ...,1
4,I always come here just for the dumplings when...,1


In [7]:
# Preview the last five rows as well, to see the other end of the file.
data.tail(n=5)

Unnamed: 0,Review,Sentiment
4471,The infamous 'soup dumpling' was just ok - not...,0
4472,"overrated dumplings, overrated in general but ...",0
4473,Best soup dumplings in town! Crab with pork s...,0
4474,Disappointed in the food. I ordered one of the...,0
4475,Place is not clean. hardly anyone speaks Engli...,0


In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import nltk
from sklearn.model_selection import train_test_split

In [18]:
# Features are the raw review texts; the target is the 0/1 sentiment label.
X, y = data['Review'], data['Sentiment']

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [21]:
# Let's create a class to preprocess the reviews
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that normalises raw review text.

    Every document is lower-cased and then, depending on the flags below,
    stripped of ASCII punctuation, filtered against NLTK's English stop-word
    list and lemmatized word by word with ``WordNetLemmatizer``. The steps
    always run in that order.

    Parameters
    ----------
    remove_stopwords : bool, default=True
        Drop whitespace-separated tokens found in ``stopwords.words('english')``.
    remove_punctuations : bool, default=True
        Delete every character contained in ``string.punctuation``.
    lemmatize : bool, default=True
        Replace each whitespace-separated token by its WordNet lemma.
    """

    # Compiled once for the class: matches any single ASCII punctuation character.
    _PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')

    def __init__(self, remove_stopwords=True, remove_punctuations=True, lemmatize=True):
        self.remove_stopwords = remove_stopwords
        self.remove_punctuations = remove_punctuations
        self.lemmatize = lemmatize
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Do nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : pandas.Series or iterable of str
            The raw documents.
        y : ignored

        Returns
        -------
        The cleaned documents: the result of ``X.apply`` when ``X`` offers
        ``apply`` (e.g. a pandas Series, index preserved), otherwise a list.
        """
        if hasattr(X, 'apply'):
            return X.apply(self._clean_text)
        # Plain lists / arrays have no ``apply``; clean them element-wise so
        # the fitted pipeline can also predict on e.g. ``['some review']``.
        return [self._clean_text(text) for text in X]

    def _clean_text(self, text):
        """Lower-case one document and run the enabled cleaning steps on it."""
        text = text.lower()
        if self.remove_punctuations:
            text = self._remove_punctuations(text)
        # NOTE(review): punctuation is stripped before this filter, so stop
        # words that contain an apostrophe (e.g. "don't", if present in the
        # NLTK list) can no longer match here -- confirm this is intended.
        if self.remove_stopwords:
            text = self._remove_stopwords(text)
        if self.lemmatize:
            text = self._lemmatize(text)
        return text

    def _remove_punctuations(self, text):
        """Return ``text`` with every ``string.punctuation`` character deleted."""
        return self._PUNCTUATION_RE.sub('', text)

    def _stop_word_set(self):
        """Return the English stop words, loading them on first use only."""
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            # Loading the corpus per document is loop-invariant work; do it
            # once per instance and reuse the set for every later document.
            cached = frozenset(stopwords.words('english'))
            self._stop_words = cached
        return cached

    def _remove_stopwords(self, text):
        """Drop stop-word tokens from ``text`` and re-join the rest with spaces."""
        stop_words = self._stop_word_set()
        return ' '.join(word for word in text.split() if word not in stop_words)

    def _lemmatize(self, text):
        """Lemmatize each whitespace-separated token and re-join with spaces."""
        return ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())

In [40]:
class TextTokenizer(BaseEstimator, TransformerMixin):
    """Pipeline step that turns cleaned documents into TF-IDF features.

    Thin wrapper around a default-configured ``TfidfVectorizer`` stored in
    ``self.tokenizer``.
    """

    def __init__(self):
        self.tokenizer = TfidfVectorizer()

    def fit(self, X, y=None):
        """Fit the wrapped vectorizer on the documents ``X`` and return ``self``."""
        self.tokenizer.fit(X)
        return self

    def transform(self, X, y=None):
        """Return the wrapped vectorizer's TF-IDF matrix for the documents ``X``."""
        return self.tokenizer.transform(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit the vectorizer on ``X`` and return its TF-IDF matrix in one pass.

        The ``fit_transform`` inherited from ``TransformerMixin`` calls ``fit``
        and then ``transform``, analysing the whole corpus twice. Delegating to
        the vectorizer's own ``fit_transform`` yields the same matrix without
        the second pass. ``y`` and ``fit_params`` are accepted for API
        compatibility and ignored.
        """
        return self.tokenizer.fit_transform(X)

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [47]:
# Let's create one pipeline per classifier. They all share the same text
# cleaning and TF-IDF steps and differ only in the final estimator.
def make_text_pipeline(classifier):
    """Return a Pipeline: TextPreprocessor -> TextTokenizer -> ``classifier``.

    Fresh preprocessing steps are created on every call, so the returned
    pipelines do not share any fitted state.
    """
    return Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('tokenizer', TextTokenizer()),
        ('classifier', classifier),
    ])


logistic_pipeline = make_text_pipeline(LogisticRegression())
# Random forests are stochastic; seed this one so a re-run of the notebook
# reproduces the same model and scores (same seed as the train/test split).
rf_pipeline = make_text_pipeline(RandomForestClassifier(random_state=42))
svm_pipeline = make_text_pipeline(SVC())
nb_pipeline = make_text_pipeline(MultinomialNB())



In [49]:
# Fit every pipeline on the training split and report its hold-out scores.
# Each report is labelled with the model name; without it the printed
# blocks cannot be told apart.
for name, pipeline in [
    ('Logistic Regression', logistic_pipeline),
    ('Random Forest', rf_pipeline),
    ('SVM', svm_pipeline),
    ('Multinomial Naive Bayes', nb_pipeline),
]:
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f'=== {name} ===')
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(classification_report(y_test, y_pred))

Accuracy: 0.8013392857142857
              precision    recall  f1-score   support

           0       0.79      0.79      0.79       434
           1       0.81      0.81      0.81       462

    accuracy                           0.80       896
   macro avg       0.80      0.80      0.80       896
weighted avg       0.80      0.80      0.80       896

Accuracy: 0.7779017857142857
              precision    recall  f1-score   support

           0       0.77      0.78      0.77       434
           1       0.79      0.78      0.78       462

    accuracy                           0.78       896
   macro avg       0.78      0.78      0.78       896
weighted avg       0.78      0.78      0.78       896

Accuracy: 0.8125
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       434
           1       0.82      0.82      0.82       462

    accuracy                           0.81       896
   macro avg       0.81      0.81      0.81       896

In [50]:
# Let's save all the fitted pipelines to disk, one joblib file per model.
# NOTE(review): each pipeline contains the notebook-defined TextPreprocessor
# and TextTokenizer classes. Pickle-based formats store classes by reference,
# so those definitions presumably have to be available wherever the files are
# loaded -- confirm before shipping the .pkl files elsewhere.
import joblib
joblib.dump(logistic_pipeline, 'logistic_pipeline.pkl')
joblib.dump(rf_pipeline, 'rf_pipeline.pkl')
joblib.dump(svm_pipeline, 'svm_pipeline.pkl')
joblib.dump(nb_pipeline, 'nb_pipeline.pkl')


['nb_pipeline.pkl']