In [None]:
print('Installing gensim...')
!pip install gensim
print('gensim installed successfully.')

Installing gensim...
gensim installed successfully.


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import warnings
warnings.filterwarnings("ignore")

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Word2Vec
from gensim.models import Word2Vec

# Deep Learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [None]:
# 1. Load the review dataset
# ================================
# Absolute location of the CSV file (presumably a mounted Google Drive
# folder in Colab -- adjust when running elsewhere).
DATA_PATH = "/content/drive/MyDrive/DS Internship/data.csv"

# Read the raw reviews into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(5)

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [None]:
# 2. Label Creation
# ================================
# Keep only reviews with a known, non-neutral rating. Rows whose rating is
# missing are dropped too: `NaN != 3` evaluates to True, so they would
# otherwise survive the filter and be labelled negative by the comparison
# below. `.copy()` makes the result an independent frame, so the column
# assignments that follow do not write into a slice of the loaded frame
# (the cause of pandas' SettingWithCopyWarning).
df = df[df["Ratings"].notna() & (df["Ratings"] != 3)].copy()

# Binary target: 1 for a rating of 4 or more, 0 for every other remaining rating.
df["sentiment"] = (df["Ratings"] >= 4).astype(int)

# Model input: review title and review body joined by a single space;
# a missing title or body contributes an empty string.
df["text"] = df["Review Title"].fillna("") + " " + df["Review text"].fillna("")

In [None]:
# 3. Text Preprocessing
# ================================
# Fetch the NLTK corpora needed below: the stop-word lists and WordNet.
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)

# Shared helpers read by preprocess(): a lemmatizer instance and the English
# stop words held in a set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Normalise one review into a space-separated string of lemmatised tokens.

    Steps, in order: lower-case the text; remove URLs; turn every character
    that is not a-z or whitespace into a space; split on whitespace; discard
    tokens found in the module-level ``stop_words``; lemmatise what is left
    with the module-level ``lemmatizer``.

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing cell) is treated as an empty
            review instead of raising.

    Returns:
        The cleaned text, or ``""`` when no token survives.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    # Replace (rather than delete) punctuation and digits so that words
    # separated only by them stay separate: "retailer.i" -> "retailer i",
    # not "retaileri". Contractions are split the same way
    # ("didn't" -> "didn t").
    text = re.sub(r"[^a-z\s]", " ", text)

    # split() without arguments already collapses runs of whitespace and
    # ignores leading/trailing blanks, so one pass filters and lemmatises.
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )

# Clean every review; the result is stored next to the raw text.
df["processed_text"] = df["text"].map(preprocess)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Model input (cleaned review text) and the binary sentiment target.
X, y = df["processed_text"], df["sentiment"]

In [None]:
# 4. Train–test split
# ================================
# Hold out 25% of the reviews, keeping the class ratio of y in both parts
# (stratify) and fixing the shuffle with random_state. Splitting first means
# anything fitted on X_train afterwards never sees the test reviews, which
# avoids data leakage.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [None]:
# 5. Numerical feature extraction (TF-IDF)
# =====================================================
# Unigrams and bigrams, with the vocabulary capped at 7000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=7000)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# 6. MACHINE LEARNING MODELS
# =====================================================
# Candidate classifiers, keyed by the display name used when reporting scores.
# LinearSVC and RandomForestClassifier draw random numbers while fitting, so
# they get a fixed random_state: re-running the notebook then reproduces the
# same scores.
ml_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
}

In [None]:
# Fit each candidate on the training TF-IDF matrix and report its F1-score
# on the held-out test split.
print("\n--- MACHINE LEARNING MODELS (TF-IDF) ---\n")

for name in ml_models:
    model = ml_models[name]
    # fit() returns the fitted estimator, so prediction can be chained.
    preds = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    f1 = f1_score(y_true=y_test, y_pred=preds)
    print(f"{name} F1-score: {f1:.4f}")


--- MACHINE LEARNING MODELS (TF-IDF) ---

Logistic Regression F1-score: 0.9562
Naive Bayes F1-score: 0.9391
Linear SVM F1-score: 0.9613
Random Forest F1-score: 0.9546


In [None]:
# 7. DEEP LEARNING MODEL (LSTM)
# =====================================================
print("\n--- DEEP LEARNING MODEL (LSTM) ---\n")

# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable from run to run.
set_random_seed(42)

# The tokenizer only emits word indices below VOCAB_SIZE, which is why the
# same value is used as the Embedding layer's input_dim.
VOCAB_SIZE = 20000
# Every review is padded or truncated to this many tokens.
MAX_LEN = 200

# The word index is built from the training split only.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

# `input_length` is deliberately not passed to Embedding: Keras 3 deprecates
# the argument, and the sequence length is taken from the padded input.
lstm_model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128),
    LSTM(128),
    Dense(1, activation="sigmoid"),  # probability of the positive class
])

lstm_model.compile(
    optimizer="adam",
    loss="binary_crossentropy"
)

# validation_split holds back 10% of the training data to monitor val_loss.
lstm_model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# predict() returns an array of shape (n_samples, 1); threshold at 0.5 and
# flatten to 1-D so the predictions have the same shape as y_test.
y_pred_lstm = (lstm_model.predict(X_test_pad) > 0.5).astype(int).ravel()
f1_lstm = f1_score(y_test, y_pred_lstm)

print(f"LSTM F1-score: {f1_lstm:.4f}")


--- DEEP LEARNING MODEL (LSTM) ---

Epoch 1/5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 484ms/step - loss: 0.4239 - val_loss: 0.2502
Epoch 2/5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 474ms/step - loss: 0.2258 - val_loss: 0.2051
Epoch 3/5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 466ms/step - loss: 0.1445 - val_loss: 0.2113
Epoch 4/5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 468ms/step - loss: 0.1359 - val_loss: 0.2075
Epoch 5/5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 464ms/step - loss: 0.1026 - val_loss: 0.2120
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 101ms/step
LSTM F1-score: 0.9554


- LinearSVC is found to be the best model because it delivers the highest F1-score on the test set (0.9613), showing a better precision–recall balance for sentiment classification.

- It handles high-dimensional, sparse TF-IDF features efficiently and generalizes better than Naive Bayes and Random Forest.

- Compared to the deep learning model (LSTM), it is faster, less resource-intensive, and more suitable for real-time deployment.