In [1]:
!jupyter nbconvert --to script NLTK_Logistic_NaiveBayes_SVC_Regression_Restaurant_Reviews.ipynb


[NbConvertApp] Converting notebook NLTK_Logistic_NaiveBayes_SVC_Regression_Restaurant_Reviews.ipynb to script
[NbConvertApp] Writing 3656 bytes to NLTK_Logistic_NaiveBayes_SVC_Regression_Restaurant_Reviews.py


In [26]:
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import nltk


In [27]:

nltk.download('punkt_tab')    # for tokenization
nltk.download('stopwords')    # for stopwords removal


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

## **Clean and normalize text for ML.**
Makes text more uniform, reduces noise, and simplifies vocabulary for model training.

In [29]:
def preprocess(text):
    text = text.lower()  # Lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    tokens = text.split()

    negation_words = {"not", "no", "nor", "n't"}
    tokens = [word for word in tokens if word not in stop_words or word in negation_words]

    # tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [stemmer.stem(word) for word in tokens]  # Stemming
    return ' '.join(tokens)

In [30]:
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
import csv

csv_path = '/content/drive/My Drive/NLP/Restaurant_Reviews.tsv'

# .tsv to csv
# with open(csv_path, 'r', newline='') as tsvfile, open('/content/drive/My Drive/NLP/Restaurant_Reviews.csv', 'w', newline='') as csvfile:
#     tsv_reader = csv.reader(tsvfile, delimiter='\t')
#     csv_writer = csv.writer(csvfile, delimiter=',')
#     for row in tsv_reader:
#         csv_writer.writerow(row)


In [32]:
# df = pd.DataFrame(data)
df = pd.read_csv("/content/drive/My Drive/NLP/Restaurant_Reviews.tsv", sep='\t')
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [33]:
df['clean_Review'] = df['Review'].apply(preprocess)
df

Unnamed: 0,Review,Liked,clean_Review
0,Wow... Loved this place.,1,wow loved place
1,Crust is not good.,0,crust not good
2,Not tasty and the texture was just nasty.,0,not tasty texture nasty
3,Stopped by during the late May bank holiday of...,1,stopped late may bank holiday rick steve recom...
4,The selection on the menu was great and so wer...,1,selection menu great prices
...,...,...,...
995,I think food should have flavor and texture an...,0,think food flavor texture lacking
996,Appetite instantly gone.,0,appetite instantly gone
997,Overall I was not impressed and would not go b...,0,overall not impressed would not go back
998,"The whole experience was underwhelming, and I ...",0,whole experience underwhelming think well go n...


# **TF-IDF vectorisation**
converts text into numerical vectors

In [34]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_Review'])
y = df['Liked']

In [35]:
y

Unnamed: 0,Liked
0,1
1,0
2,0
3,1
4,1
...,...
995,0
996,0
997,0
998,0


In [36]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5662 stored elements and shape (1000, 1938)>

# **Model Training**

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [38]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, name="Model"):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n====== {name} ======")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    return model


In [39]:
lr_model = train_and_evaluate_model(LogisticRegression(), X_train, y_train, X_test, y_test, name="Logistic Regression")
nb_model = train_and_evaluate_model(MultinomialNB(), X_train, y_train, X_test, y_test, name="Naive Bayes")
svc_model = train_and_evaluate_model(SVC(kernel='linear'), X_train, y_train, X_test, y_test, name="Support Vector Classifier")



Accuracy: 0.8266666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.87      0.84       152
           1       0.85      0.78      0.82       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300


Accuracy: 0.8133333333333334
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.81      0.81       152
           1       0.81      0.82      0.81       148

    accuracy                           0.81       300
   macro avg       0.81      0.81      0.81       300
weighted avg       0.81      0.81      0.81       300


Accuracy: 0.8333333333333334
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.89      0.84       152
           1       0.87      0.78      0.82       148

    accuracy        

In [40]:
def predict_sentiment(text, model):
    clean = preprocess(text)
    vec = vectorizer.transform([clean])
    result = model.predict(vec)[0]
    return "Positive" if result == 1 else "Negative"


In [41]:
test_input = "I am not happy with the service"

for model, name in zip([lr_model, nb_model, svc_model], ["LogisticRegression", "NaiveBayes", "SVC"]):
    print(f"\n{name} Prediction:")
    print(f"Input: '{test_input}'")
    print("Predicted Sentiment:", predict_sentiment(test_input, model))



LogisticRegression Prediction:
Input: 'I am not happy with the service'
Predicted Sentiment: Negative

NaiveBayes Prediction:
Input: 'I am not happy with the service'
Predicted Sentiment: Positive

SVC Prediction:
Input: 'I am not happy with the service'
Predicted Sentiment: Negative
