In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('balanced_restaurant_reviews.csv')
data.head()

Unnamed: 0,Review,Sentiment
0,"Love!! We have the kung pao chicken, broccoli ...",1
1,The new location that JH has moved into was a ...,1
2,Delicious food. If you asked me to recommend a...,1
3,I have been looking forward to Joe's Shanghai ...,1
4,I always come here just for the dumplings when...,1


In [7]:
data.Review[4475]

'Place is not clean. hardly anyone speaks English. They have good steamed dumplings Shanghai style but the rest of the food is below average. Fish ordered came out with a sweet sauce that was not edible. Seafood is below average. Sauces are not very good, but have no taste. Only thing that was above average were the steamed dumplings, very hot, so take your time in eating. Not good decoration, and restrooms are unclean like the flatware.'

In [19]:
data.tail()

Unnamed: 0,Review,Sentiment
4471,The infamous 'soup dumpling' was just ok - not...,0
4472,"overrated dumplings, overrated in general but ...",0
4473,Best soup dumplings in town! Crab with pork s...,0
4474,Disappointed in the food. I ordered one of the...,0
4475,Place is not clean. hardly anyone speaks Engli...,0


In [20]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import nltk
from sklearn.model_selection import train_test_split

In [21]:
X = data['Review']
y = data['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
# lets crteate a class to preprocess the review
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, remove_stopwords=True, remove_punctuations=True, lemmatize=True):
        self.remove_stopwords = remove_stopwords
        self.remove_punctuations = remove_punctuations
        self.lemmatize = lemmatize
        self.lemmatizer = WordNetLemmatizer()
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.apply(self._clean_text)
        return X
    
    def _clean_text(self, text):
        text = text.lower()
        if self.remove_punctuations:
            text = self._remove_punctuations(text)
        if self.remove_stopwords:
            text = self._remove_stopwords(text)
        if self.lemmatize:
            text = self._lemmatize(text)
        return text
    
    def _remove_punctuations(self, text):
        text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
        return text
    
    def _remove_stopwords(self, text):
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])
        return text
    
    def _lemmatize(self, text):
        text = ' '.join([self.lemmatizer.lemmatize(word) for word in text.split()])
        return text

In [40]:
class TextTokenizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.tokenizer = TfidfVectorizer()
        
    def fit(self, X, y=None):
        self.tokenizer.fit(X)
        return self
    
    def transform(self, X, y=None):
        return self.tokenizer.transform(X)

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [47]:
# lets create a pipeline to preprocess the text data
logistic_pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tokenizer', TextTokenizer()),
    ('classifier', LogisticRegression())
])

rf_pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tokenizer', TextTokenizer()),
    ('classifier', RandomForestClassifier())
])

svm_pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tokenizer', TextTokenizer()),
    ('classifier', SVC())
])

nb_pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tokenizer', TextTokenizer()),
    ('classifier', MultinomialNB())
])



In [None]:
for pipeline in [logistic_pipeline, rf_pipeline, svm_pipeline, nb_pipeline]:
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(classification_report(y_test, y_pred))

In [None]:
# Lets save all the models
import joblib
joblib.dump(logistic_pipeline, 'logistic_pipeline.pkl')
joblib.dump(rf_pipeline, 'rf_pipeline.pkl')
joblib.dump(svm_pipeline, 'svm_pipeline.pkl')
joblib.dump(nb_pipeline, 'nb_pipeline.pkl')


In [22]:
# create a LSTM model uiing Tensorflow and Keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

    

In [23]:
# lets create a class to preprocess the text data
class TextPreprocessorLSTM(BaseEstimator, TransformerMixin):
    def __init__(self, max_words=5000, max_len=200):
        self.max_words = max_words
        self.max_len = max_len
        self.tokenizer = Tokenizer(num_words=self.max_words)
        
    def fit(self, X, y=None):
        self.tokenizer.fit_on_texts(X)
        return self
    
    def transform(self, X, y=None):
        X = self.tokenizer.texts_to_sequences(X)
        X = pad_sequences(X, maxlen=self.max_len)
        return X
    
    def inverse_transform(self, X):
        return self.tokenizer.sequences_to_texts(X)
    
    def get_vocabulary_size(self):
        return len(self.tokenizer.word_index) + 1
    

class LSTMModel(BaseEstimator, TransformerMixin):
    def __init__(self, vocab_size, max_len=200, embedding_dim=128):
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.embedding_dim = embedding_dim
        self.model = Sequential()
        self.model.add(Embedding(self.vocab_size, self.embedding_dim, input_length=self.max_len))
        self.model.add(LSTM(128))
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        
    def fit(self, X, y=None):
        self.model.fit(X, y, epochs=5, batch_size=64, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=3)])
        return self
    
    def predict(self, X):
        return self.model.predict(X)
    
    def evaluate(self, X, y):
        return self.model.evaluate(X, y)
    
    def summary(self):
        return self.model.summary()

In [30]:
lstm_pipeline = Pipeline([
    ('preprocessor', TextPreprocessorLSTM()),
    ('model', LSTMModel(vocab_size=100))  # Instantiate the LSTMModel class
])




In [31]:
lstm_pipeline.fit(X_train, y_train)

Epoch 1/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 389ms/step - accuracy: 0.5233 - loss: 0.6890 - val_accuracy: 0.6411 - val_loss: 0.6621
Epoch 2/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 351ms/step - accuracy: 0.6425 - loss: 0.6415 - val_accuracy: 0.6620 - val_loss: 0.6106
Epoch 3/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 348ms/step - accuracy: 0.6788 - loss: 0.6059 - val_accuracy: 0.6746 - val_loss: 0.6047
Epoch 4/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 348ms/step - accuracy: 0.6563 - loss: 0.6321 - val_accuracy: 0.6173 - val_loss: 0.6287
Epoch 5/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 350ms/step - accuracy: 0.6883 - loss: 0.5960 - val_accuracy: 0.6885 - val_loss: 0.5930


In [33]:
y_pred = lstm_pipeline.predict(X_test)

[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step


In [37]:
y_pred = (lstm_pipeline.predict(X_test) > 0.5).astype(int)

[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step


In [38]:
accuracy = accuracy_score(y_test, y_pred)
print('Test accuracy:', accuracy)

Test accuracy: 0.6897321428571429
