In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report



In [2]:
# Load the dataset
file_path = "Restaurant_Reviews.tsv"
dataset = pd.read_csv(file_path, delimiter='\t', quoting=3)
dataset.head(10)


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [3]:
# Feature extraction
vectorizer = CountVectorizer(max_features=1500)
X = vectorizer.fit_transform(dataset['Review']).toarray()
y = dataset['Liked']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [4]:
from sklearn.linear_model import LogisticRegression

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.77      0.88      0.82        96
           1       0.87      0.76      0.81       104

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.81       200
weighted avg       0.82      0.81      0.81       200



In [9]:
# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(report)


Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.75      0.83      0.79        96
           1       0.83      0.75      0.79       104

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200



In [6]:
from sklearn.svm import SVC

# Train model
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.75      0.85      0.80        96
           1       0.84      0.73      0.78       104

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.80      0.79      0.79       200



In [7]:
from sklearn.ensemble import RandomForestClassifier

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.76
              precision    recall  f1-score   support

           0       0.70      0.89      0.78        96
           1       0.86      0.64      0.74       104

    accuracy                           0.76       200
   macro avg       0.78      0.76      0.76       200
weighted avg       0.78      0.76      0.76       200



In [10]:
from xgboost import XGBClassifier

# Train model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


ModuleNotFoundError: No module named 'xgboost'

In [None]:
#Alternative Machine Learning Models for Sentiment Analysis
#Since Multinomial Naïve Bayes (MultinomialNB) is used, exploring other models can improve accuracy and performance.

#Logistic Regression: A strong baseline for text classification. It is simple, interpretable, and effective for binary sentiment analysis but struggles with highly imbalanced datasets.

#Support Vector Machine (SVM): Works well with high-dimensional text data. It is powerful for small datasets and captures decision boundaries effectively but can be computationally expensive.

#Random Forest: A robust ensemble learning method that handles complex relationships between words well. It is less prone to overfitting compared to Naïve Bayes but may be slower with large feature sets.

#XGBoost: A boosting algorithm designed for structured text data. It provides high accuracy and handles overfitting well but requires tuning for optimal performance.

#Deep Learning Models (LSTM/BERT): These models understand word relationships and contextual meaning in sentiment analysis. They perform exceptionally well but require more data and computational resources.

#Choosing the Right Model
#If you need simplicity and speed, go for Logistic Regression.

#If you want higher accuracy, SVM is a great choice.

#If your dataset is large, XGBoost is optimal.

#If you need interpretability, Random Forest is a good option.

#If you want deep contextual understanding, LSTM/BERT will provide the best results.