In [22]:
# !jupyter nbconvert --to script NLTK_Logistic_NaiveBayes_SVC_Regression_Restaurant_Reviews.ipynb


In [2]:
# pip install tensorflow

In [3]:
import re
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import nltk


# FOR LSTM
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


2025-09-15 15:22:46.438751: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# ---- NLTK resources first, then the stopword set and stemmer used by preprocess ----
for nltk_resource in ('punkt', 'stopwords'):   # tokenizers/punkt, corpora/stopwords
    nltk.download(nltk_resource, quiet=True)


stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))



## **Clean and normalize text for ML.**
Makes text more uniform, reduces noise, and simplifies vocabulary for model training.

In [6]:
# Built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN = re.compile(r'\d+')
# Stopwords that flip sentiment and therefore must survive stopword removal.
# ("n't" is not listed: apostrophes are stripped before filtering, so that
# token can never occur.)
_NEGATION_WORDS = frozenset({"not", "no", "nor"})


def preprocess(text):
    """Normalize one review into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, strip ASCII punctuation, strip digit runs,
    split on whitespace, drop stopwords (negation words are kept), stem.

    Args:
        text: The raw review. ``None`` and float NaN (what pandas yields for
            a missing cell) are treated as an empty review; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned text; an empty string when nothing survives.
    """
    # Missing values would otherwise crash on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)   # remove punctuation
    text = _DIGIT_RUN.sub('', text)             # remove numbers

    kept = [
        word for word in text.split()
        if word not in stop_words or word in _NEGATION_WORDS
    ]
    return ' '.join(stemmer.stem(word) for word in kept)

In [9]:
import csv

# Load the tab-separated review dataset (read_table splits on tabs by default) and show it.
df = pd.read_table("./Restaurant_Reviews.tsv")
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [10]:
# Run every raw review through preprocess and store the result next to the original text.
cleaned_reviews = df['Review'].apply(preprocess)
df['clean_Review'] = cleaned_reviews
df

Unnamed: 0,Review,Liked,clean_Review
0,Wow... Loved this place.,1,wow love place
1,Crust is not good.,0,crust not good
2,Not tasty and the texture was just nasty.,0,not tasti textur nasti
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...
4,The selection on the menu was great and so wer...,1,select menu great price
...,...,...,...
995,I think food should have flavor and texture an...,0,think food flavor textur lack
996,Appetite instantly gone.,0,appetit instantli gone
997,Overall I was not impressed and would not go b...,0,overal not impress would not go back
998,"The whole experience was underwhelming, and I ...",0,whole experi underwhelm think well go ninja su...


# **TF-IDF vectorisation**
Converts the cleaned reviews into numerical TF-IDF feature vectors.

In [11]:
# Labels first, then the TF-IDF features built from the cleaned reviews.
y = df['Liked']

# distinct label values, used to map integer class indices back to original labels (useful for multiclass)
label_classes = np.unique(y)

# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split that happens in a later cell, so held-out reviews
# contribute to the vocabulary and IDF weights — consider fitting on the
# training portion only.
clean_texts = df['clean_Review']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(clean_texts)


In [12]:
# Display the label Series (the `Liked` column) as a quick sanity check.
y

0      1
1      0
2      0
3      1
4      1
      ..
995    0
996    0
997    0
998    0
999    0
Name: Liked, Length: 1000, dtype: int64

In [13]:
# Display the TF-IDF matrix returned by vectorizer.fit_transform as a quick sanity check.
X

<1000x1612 sparse matrix of type '<class 'numpy.float64'>'
	with 5645 stored elements in Compressed Sparse Row format>

# **Model Training**

In [14]:
# ---- BiLSTM: tokenizer + sequence prep ----
# Tokenizer and pad_sequences are imported unconditionally with the other
# tensorflow.keras imports, so they are always real callables here and no
# availability fallback is needed.
vocab_size = 20000      # passed to Tokenizer(num_words=...) and to the Embedding input_dim
maxlen = 128            # every sequence is padded/truncated (at the end) to this length
oov_token = "<OOV>"     # token the tokenizer substitutes for out-of-vocabulary words

# Use the cleaned review column created earlier; if that cell was skipped,
# clean the original review column on the fly.
if "clean_Review" in df.columns:
    texts_for_tokenizer = df["clean_Review"].astype(str).tolist()
else:
    texts_for_tokenizer = df["Review"].astype(str).apply(preprocess).tolist()

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(texts_for_tokenizer)
sequences = tokenizer.texts_to_sequences(texts_for_tokenizer)
X_seq_all = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')


# Train-Test Split

In [15]:
# Hold out 30% of the rows; stratify on the label whenever more than one class is present.
stratify_on = y if len(np.unique(y)) > 1 else None
split_options = dict(test_size=0.3, random_state=42, stratify=stratify_on)

# TF-IDF features for the classical models
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# padded token sequences for the BiLSTM (only when they were built)
if X_seq_all is None:
    Xs_train = Xs_test = ys_train = ys_test = None
else:
    Xs_train, Xs_test, ys_train, ys_test = train_test_split(X_seq_all, y, **split_options)


# Training ML models

In [16]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, name="Model"):
    """Fit ``model``, print its held-out metrics, and hand the fitted model back.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        name: Label printed in the report header.

    Returns:
        The same ``model`` object after fitting.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"\n====== {name} ======")
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)
    report = classification_report(y_test, predictions)
    print("Classification Report:\n", report)

    return model


In [17]:
# ---- Classical baselines on the TF-IDF split ----
lr_model = train_and_evaluate_model(LogisticRegression(), X_train, y_train, X_test, y_test, name="Logistic Regression")
nb_model = train_and_evaluate_model(MultinomialNB(), X_train, y_train, X_test, y_test, name="Naive Bayes")
svc_model = train_and_evaluate_model(SVC(kernel='linear'), X_train, y_train, X_test, y_test, name="Support Vector Classifier")


# ---- Train BiLSTM ----
def _build_bilstm(n_classes, embed_dim, lstm_units):
    """Build and compile the BiLSTM classifier.

    The same stack is used for both problem types; only the head differs:
    one sigmoid unit with binary cross-entropy when ``n_classes`` is at most
    2, otherwise an ``n_classes``-way softmax with sparse categorical
    cross-entropy.

    Args:
        n_classes: Number of distinct labels in the training data.
        embed_dim: Size of each embedding vector.
        lstm_units: Units per LSTM direction.

    Returns:
        The compiled (not yet fitted) Keras model.
    """
    multiclass = n_classes > 2
    model = Sequential([
        # `input_length` is deliberately not passed: recent Keras releases
        # deprecate it, and the sequence length is inferred from the data.
        Embedding(input_dim=vocab_size, output_dim=embed_dim),
        Bidirectional(LSTM(lstm_units, return_sequences=False)),
        Dropout(0.4),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(n_classes if multiclass else 1,
              activation='softmax' if multiclass else 'sigmoid'),
    ])
    model.compile(
        loss='sparse_categorical_crossentropy' if multiclass else 'binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model


bilstm_model = None
if Xs_train is None:
    print("No tokenized sequence data (Xs_train is None). Ensure tokenizer and X_seq_all were created.")
else:
    embed_dim = 100
    lstm_units = 128
    batch_size = 64
    epochs = 6

    # Seed Python, NumPy and TensorFlow so weight init, dropout and shuffling
    # repeat across runs (as far as the backend allows).
    tf.keras.utils.set_random_seed(42)

    n_classes = len(np.unique(ys_train))
    bilstm_model = _build_bilstm(n_classes, embed_dim, lstm_units)

    history = bilstm_model.fit(
        Xs_train, ys_train,
        validation_split=0.1,
        epochs=epochs,
        batch_size=batch_size,
        # stop once val_loss has not improved for 2 epochs and keep the best weights
        callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)],
        verbose=1
    )

    # Evaluation: turn model outputs into hard labels.
    probs = np.array(bilstm_model.predict(Xs_test, verbose=0))
    if probs.ndim == 1:
        probs = probs.reshape((-1, 1))
    if probs.shape[1] > 1:
        # softmax head: pick the most probable class
        y_pred_bilstm = probs.argmax(axis=1)
    else:
        # sigmoid head: threshold at 0.5
        y_pred_bilstm = (probs[:, 0] >= 0.5).astype(int)

    print("\n====== BiLSTM ======")
    print("Accuracy:", accuracy_score(ys_test, y_pred_bilstm))
    print("Classification Report:\n", classification_report(ys_test, y_pred_bilstm))



Accuracy: 0.8166666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.85      0.82       150
           1       0.84      0.79      0.81       150

    accuracy                           0.82       300
   macro avg       0.82      0.82      0.82       300
weighted avg       0.82      0.82      0.82       300


Accuracy: 0.7966666666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.79      0.79       150
           1       0.79      0.81      0.80       150

    accuracy                           0.80       300
   macro avg       0.80      0.80      0.80       300
weighted avg       0.80      0.80      0.80       300


Accuracy: 0.8166666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.83      0.82       150
           1       0.82      0.81      0.81       150

    accuracy        

I0000 00:00:1757929968.608454   30886 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1648 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
2025-09-15 15:22:50.994496: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step - accuracy: 0.4937 - loss: 0.6934 - val_accuracy: 0.7143 - val_loss: 0.6900
Epoch 2/6
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.5810 - loss: 0.6877 - val_accuracy: 0.7714 - val_loss: 0.6775
Epoch 3/6
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.7286 - loss: 0.6610 - val_accuracy: 0.6143 - val_loss: 0.6383
Epoch 4/6
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.7794 - loss: 0.5656 - val_accuracy: 0.7143 - val_loss: 0.5183
Epoch 5/6
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8333 - loss: 0.5587 - val_accuracy: 0.5857 - val_loss: 1.0742
Epoch 6/6
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.7349 - loss: 0.6739 - val_accuracy: 0.8143 - val_loss: 0.4267

Accuracy: 0.7266666666666667
Classification Repor

In [18]:
# def predict_sentiment(text, model):
#     clean = preprocess(text)
#     vec = vectorizer.transform([clean])
#     result = model.predict(vec)[0]
#     return "Positive" if result == 1 else "Negative"


def predict_sentiment(text, model, vectorizer=vectorizer, preprocess_flag=True):
    """Predict the sentiment of one piece of text with a fitted classical model.

    Args:
        text: The raw input text.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer used to turn the text into features.
        preprocess_flag: When True the text is passed through ``preprocess``
            first; otherwise it is only converted with ``str()``.

    Returns:
        For a problem with more than two classes, the predicted label exactly
        as the model returns it. Otherwise ``"Positive"`` when the prediction
        equals 1 and ``"Negative"`` for anything else.
    """
    txt = preprocess(text) if preprocess_flag else str(text)
    vec = vectorizer.transform([txt])
    pred = model.predict(vec)[0]

    # Prefer the classes the model itself was fitted on; fall back to the
    # notebook-level label list for models that do not expose `classes_`.
    known_classes = getattr(model, "classes_", None)
    if known_classes is None:
        known_classes = label_classes

    if len(known_classes) > 2:
        # `predict` already yields the label itself, not a positional index,
        # so it must not be used to index into the label list.
        return pred

    # binary mapping (assumes positive label is 1)
    return "Positive" if int(pred) == 1 else "Negative"


def predict_sentiment_bilstm(text, model=bilstm_model, tokenizer=tokenizer, maxlen=maxlen, preprocess_flag=True):
    """Predict the sentiment of one piece of text with the BiLSTM model.

    Args:
        text: The raw input text.
        model: Trained Keras model; ``None`` means it is unavailable.
        tokenizer: Fitted tokenizer; ``None`` means it is unavailable.
        maxlen: Length every sequence is padded/truncated to.
        preprocess_flag: When True the text is passed through ``preprocess``
            first; otherwise it is only converted with ``str()``.

    Returns:
        ``"(bilstm not available)"`` when the model or tokenizer is missing.
        With a single output column: ``"Positive"`` if the score is at least
        0.5, else ``"Negative"``. With several columns: the entry of
        ``label_classes`` at the arg-max index, or that index as an ``int``
        when the lookup fails.
    """
    if model is None or tokenizer is None:
        return "(bilstm not available)"

    cleaned = preprocess(text) if preprocess_flag else str(text)
    padded = pad_sequences(
        tokenizer.texts_to_sequences([cleaned]),
        maxlen=maxlen,
        padding='post',
        truncating='post',
    )

    scores = np.array(model.predict(padded, verbose=0))
    if scores.ndim == 1:
        scores = scores.reshape((-1, 1))

    # Single output column: threshold the score.
    if scores.shape[1] <= 1:
        return "Positive" if scores[0, 0] >= 0.5 else "Negative"

    # Several output columns: take the most probable class index and map it
    # back to the original label when possible.
    class_index = int(scores.argmax(axis=1)[0])
    try:
        return label_classes[class_index]
    except Exception:
        return int(class_index)


In [21]:
test_input = "I am not happy with the service"

# Classical models: run the same sentence through each fitted classifier.
classical_models = {
    "LogisticRegression": lr_model,
    "NaiveBayes": nb_model,
    "SVC": svc_model,
}
for name, model in classical_models.items():
    print(f"\n{name} Prediction:")
    print(f"Input: '{test_input}'")
    print("Predicted Sentiment:", predict_sentiment(test_input, model))


# BiLSTM model
if bilstm_model is None or tokenizer is None:
    print("\nBiLSTM model is not available. Train it first before prediction.")
else:
    print("\nBiLSTM Prediction:")
    print(f"Input: '{test_input}'")
    print("Predicted Sentiment:", predict_sentiment_bilstm(test_input))


# Persist the tokenizer and the BiLSTM model (best effort).
import joblib
import os

out_dir = "model_artifacts"
os.makedirs(out_dir, exist_ok=True)

if tokenizer is not None:
    try:
        joblib.dump(tokenizer, f"{out_dir}/tokenizer.joblib")
    except Exception:
        print("Failed to save tokenizer via joblib.")

if bilstm_model is not None and tf is not None:
    # NOTE(review): newer Keras releases appear to reject a save path without
    # a file extension, in which case only the .h5 fallback below actually
    # runs — confirm which artifact is produced in the target environment.
    try:
        bilstm_model.save(os.path.join(out_dir, "bilstm_model"), overwrite=True, include_optimizer=False)
    except Exception:
        bilstm_model.save(os.path.join(out_dir, "bilstm_model.h5"), overwrite=True)







LogisticRegression Prediction:
Input: 'I am not happy with the service'
Predicted Sentiment: Negative

NaiveBayes Prediction:
Input: 'I am not happy with the service'
Predicted Sentiment: Positive

SVC Prediction:
Input: 'I am not happy with the service'
Predicted Sentiment: Negative

BiLSTM Prediction:
Input: 'I am not happy with the service'
Predicted Sentiment: Negative
