In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import string
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [14]:
# load the data
data = pd.read_csv('restaurant_reviews.csv')
data.head()

Unnamed: 0,Review,Rating
0,This place has the best dumplings I've ever ha...,5
1,"Came here on a weekend visit to NY, my friend ...",4
2,The best soup dumplings in NYC! There is no ot...,5
3,I was so disappointed with my visit to Joes. N...,1
4,So much fun! we stayed at Hotel 50\r\nRight n...,5


In [15]:
nlp = spacy.load('en_core_web_sm')
def clean_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)  # Keep only lowercase and uppercase letters
    text = re.sub('\s+', ' ', text)  # Replace multiple whitespace characters with a single space
    text = nlp(text)
    text = [word.lemma_ for word in text if not word.is_stop and not word.is_punct and not word.like_num]
    text = ' '.join(text)
    return text

  text = re.sub('\s+', ' ', text)  # Replace multiple whitespace characters with a single space


In [16]:
# Define a custom tokenizer
def custom_tokenizer(text):
    # Use NLTK's word_tokenize function to tokenize the text
    tokens = word_tokenize(text)
    # Return the tokens
    return tokens


In [17]:
# lets create a new feature 'Length' and 'Sentiment'
data['Length'] = data['Review'].apply(len)
data['Sentiment'] = data['Rating'].apply(lambda x: 'Positive' if x >= 4 else 'Negative')

In [18]:
# X = tfidf_vectorizer.fit_transform(data['Tokenized_Review'].apply(' '.join))
# y = data['Sentiment']

In [19]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
# models = {'LR' : LogisticRegression(), 'RF' : RandomForestClassifier(), 'NB' : MultinomialNB(), 'SVM' : SVC()}
# vectorizers = {'TF-IDF' : TfidfVectorizer()}

# parameters = {'LR' : {'C' : [0.001, 0.01, 0.1, 1, 10, 100]},
#                 'RF' : {'n_estimators' : [50, 100, 200]},
#                 'NB' : {'alpha' : [0.5, 1, 2]},
#                 'SVM' : {'C' : [0.1, 1, 10], 'gamma' : [0.1, 1, 10]}}
# results = []

# for model_name, model in models.items():
#     for vectorizer_name, vectorizer in vectorizers.items():
#         pipeline = GridSearchCV(model, parameters[model_name], cv=5)
#         pipeline.fit(vectorizer.fit_transform(data['Tokenized_Review'].apply(' '.join)), data['Sentiment'])
#         results.append({'model' : model_name, 'vectorizer' : vectorizer_name, 'best_params' : pipeline.best_params_, 'best_score' : pipeline.best_score_})


In [21]:
# results = pd.DataFrame(results)
# results

In [22]:
# # lets train the model
# model = LogisticRegression(C=1)
# model.fit(X_train, y_train)
# # use k-fold cross validation to evaluate the model
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(model, X_train, y_train, cv=5)
# print('Cross-validation scores: {}'.format(scores))
# print('Average cross-validation score: {:.2f}'.format(scores.mean()))

In [23]:
# confusion_matrix(y_test, model.predict(X_test))

In [24]:
# # lets try svm
# model = SVC(C=10, gamma=1)
# model.fit(X_train, y_train)
# # use k-fold cross validation to evaluate the model
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(model, X_train, y_train, cv=5)
# print('Cross-validation scores: {}'.format(scores))
# print('Average cross-validation score: {:.2f}'.format(scores.mean()))

In [25]:
data.Sentiment.value_counts()

Sentiment
Positive    5042
Negative    2238
Name: count, dtype: int64

In [26]:
# lets extract equal number of positive and negative reviews
positive_reviews = data[data['Sentiment'] == 'Positive'].sample(2238)
negative_reviews = data[data['Sentiment'] == 'Negative'].sample(2238)


In [27]:
data = pd.concat([positive_reviews, negative_reviews]).reset_index(drop=True)
data.Sentiment.value_counts()

Sentiment
Positive    2238
Negative    2238
Name: count, dtype: int64

In [28]:
data.head()

Unnamed: 0,Review,Rating,Length,Sentiment
0,This place is ridiculous. Great soup dumpling...,5,177,Positive
1,Stopped in the other day while in Chinatown. W...,4,866,Positive
2,"In general, typical Americanized Chinese food....",4,136,Positive
3,"SOUP DUMPLINGS!! it's just delicious, I've eat...",5,331,Positive
4,This is the kind place you imagine after readi...,5,971,Positive


In [29]:
data['Review'] = data['Review'].apply(clean_text)

In [30]:
data.head()

Unnamed: 0,Review,Rating,Length,Sentiment
0,place ridiculous great soup dumpling great sta...,5,177,Positive
1,stop day chinatown want try eat visit new york...,4,866,Positive
2,general typical americanized chinese food good...,4,136,Positive
3,soup dumpling s delicious ve eat thing soup du...,5,331,Positive
4,kind place imagine read book review visit real...,5,971,Positive


In [31]:
# Convert text data into numerical features using TF-IDF
tfidf = TfidfVectorizer(tokenizer=custom_tokenizer)

In [32]:
X = tfidf.fit_transform(data['Review'])
y = data['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [33]:
# # lets train the model
# models = {'LR' : LogisticRegression(), 'RF' : RandomForestClassifier(), 'NB' : MultinomialNB(), 'SVM' : SVC()}
# vectorizers = {'TF-IDF' : TfidfVectorizer()}

# parameters = {'LR' : {'C' : [0.001, 0.01, 0.1, 0.4]},
#                 'RF' : {'n_estimators' : [50, 100, 200], 'max_depth' : [25, 30, 32, 35, 38]},
#                 'NB' : {'alpha' : [0.5, 1, 2]},
#                 'SVM' : {'C' : [0.1, 1, 10], 'gamma' : [0.1, 1, 2], 'kernel' : ['linear', 'rbf']}}
# results = []


# for model_name, model in models.items():
#     for vectorizer_name, vectorizer in vectorizers.items():
#         pipeline = GridSearchCV(model, parameters[model_name], cv=5)
#         pipeline.fit(vectorizer.fit_transform(data['Review']), data['Sentiment'])
#         results.append({'model' : model_name, 'vectorizer' : vectorizer_name, 'best_params' : pipeline.best_params_, 'best_score' : pipeline.best_score_})

In [34]:
# results = pd.DataFrame(results)
# # lets see the params of random forest
# results

In [35]:
# lets try naive bayes 
model = MultinomialNB(alpha=2)
model.fit(X_train, y_train)
scores = cross_val_score(model, X_train, y_train, cv=5)
print('Cross-validation scores: {}'.format(scores))
print('Average cross-validation score: {:.2f}'.format(scores.mean()))

Cross-validation scores: [0.80167598 0.81703911 0.80586592 0.84636872 0.81843575]
Average cross-validation score: 0.82


In [36]:
confusion_matrix(y_test, model.predict(X_test))

array([[353,  81],
       [ 90, 372]], dtype=int64)

In [37]:
# model = SVC(C=10, gamma=1, kernel='rbf')
# model.fit(X_train, y_train)
# scores = cross_val_score(model, X_train, y_train, cv=5)
# print('Cross-validation scores: {}'.format(scores))
# print('Average cross-validation score: {:.2f}'.format(scores.mean()))

In [38]:
# y_pred = model.predict(X_test)
# accuracy_score(y_test, y_pred)

In [39]:
import pandas as pd
import numpy as np
import spacy
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import streamlit as st
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from imblearn.pipeline import Pipeline


class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, tokenizer=word_tokenize):
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = []
        for text in X:
            text = text.lower()  # Convert to lowercase
            text = re.sub('[^a-zA-Z]', ' ', text)  # Keep only lowercase and uppercase letters
            text = re.sub('\s+', ' ', text)  # Replace multiple whitespace characters with a single space
            text = self.tokenizer(text)  # Tokenize the text
            X_transformed.append(text)
        return X_transformed

class TextVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vectorizer=TfidfVectorizer):
        self.vectorizer = vectorizer

    def fit(self, X, y=None):
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        X_transformed = self.vectorizer.transform(X)
        return X_transformed
    
class SentimentClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, classifier=LogisticRegression):
        self.classifier = classifier

    def fit(self, X, y):
        self.classifier.fit(X, y)
        return self

    def predict(self, X):
        return self.classifier.predict(X)

    def score(self, X, y):
        return self.classifier.score(X, y)
    
class SentimentPipeline:
    def __init__(self, classifier=SentimentClassifier()):
        self.classifier = classifier

    def fit(self, X, y):
        self.classifier.fit(X, y)

    def predict(self, X):
        return self.classifier.predict(X)

    def score(self, X, y):
        return self.classifier.score(X, y)


  text = re.sub('\s+', ' ', text)  # Replace multiple whitespace characters with a single space


In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline(steps=[
    ('text_cleaner', TextPreprocessor),
    ('vectorizer', TextVectorizer),
    ('classifier', SentimentClassifier),
])

In [42]:
# pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

TypeError: TextPreprocessor.transform() missing 1 required positional argument: 'X'

In [None]:
import joblib
joblib.dump(pipeline,'Nlp_pipeline.pkl')

['Nlp_pipeline.pkl']

In [None]:
pipeline = joblib.load('Nlp_pipeline.pkl')