<a href="https://colab.research.google.com/github/bharadwaj103/NLP/blob/main/NLP_F_25_09_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ================================
# Sentiment Analysis with Word2Vec/GloVe + Deep Learning
# Models: LSTM, CNN, Bi-LSTM
# ================================

import numpy as np
import pandas as pd
import re
import nltk
import os, zipfile, requests
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Bidirectional

# --------------------
# Step 1: Load Dataset
# --------------------
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

df = pd.read_csv("tweets.csv")
print("Columns found in dataset:", df.columns)

# Detect text column
if "tweet" in df.columns:
    text_col = "tweet"
elif "text" in df.columns:
    text_col = "text"
elif "content" in df.columns:
    text_col = "content"
else:
    text_col = df.columns[0]   # assume first col is text

# Detect label column
if "label" in df.columns:
    label_col = "label"
elif "sentiment" in df.columns:
    label_col = "sentiment"
elif "target" in df.columns:
    label_col = "target"
elif "class" in df.columns:
    label_col = "class"
else:
    label_col = df.columns[1]  # assume second col is label

print(f"Using text column: {text_col}, label column: {label_col}")

# Clean text
def clean_text(text):
    text = str(text)
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower().split()
    text = [w for w in text if w not in stop_words]
    return " ".join(text)

df["clean_tweet"] = df[text_col].apply(clean_text)

X = df["clean_tweet"].values
y = df[label_col].values

# Convert labels if they are strings ("positive"/"negative")
if y.dtype == "O":
    # Convert numpy array to pandas Series to use .str accessor
    y_series = pd.Series(y)
    y = np.where(y_series.str.lower().isin(["positive", "pos", "1"]), 1, 0)

# -------------------------
# Step 2: Tokenization + Pad
# -------------------------
max_vocab = 20000
max_len = 30

tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len)

X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# ------------------------------
# Step 3: Load GloVe Embeddings
# ------------------------------
glove_path = "glove.6B.300d.txt"

if not os.path.exists(glove_path):
    print("Downloading GloVe embeddings...")
    url = "http://nlp.stanford.edu/data/glove.6B.zip"
    r = requests.get(url)
    open("glove.6B.zip", "wb").write(r.content)

    with zipfile.ZipFile("glove.6B.zip", "r") as zip_ref:
        zip_ref.extractall(".")

    print("GloVe downloaded and extracted!")
else:
    print("GloVe file already exists.")

embedding_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = vector

embedding_dim = 300
embedding_matrix = np.zeros((max_vocab, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_vocab:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# ------------------------
# Step 4: Model Functions
# ------------------------
def build_lstm():
    model = Sequential()
    model.add(Embedding(max_vocab, embedding_dim, weights=[embedding_matrix],
                        input_length=max_len, trainable=False))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

def build_cnn():
    model = Sequential()
    model.add(Embedding(max_vocab, embedding_dim, weights=[embedding_matrix],
                        input_length=max_len, trainable=False))
    model.add(Conv1D(128, 5, activation="relu"))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

def build_bilstm():
    model = Sequential()
    model.add(Embedding(max_vocab, embedding_dim, weights=[embedding_matrix],
                        input_length=max_len, trainable=False))
    model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

# -----------------------
# Step 5: Train & Evaluate
# -----------------------
models = {"LSTM": build_lstm(), "CNN": build_cnn(), "BiLSTM": build_bilstm()}
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train, validation_data=(X_test, y_test),
              epochs=5, batch_size=64, verbose=1)

    # Predictions
    y_pred = (model.predict(X_test) > 0.5).astype(int)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {"Accuracy": acc, "F1": f1}

    print(f"\n{name} Results:")
    print(classification_report(y_test, y_pred))

# -----------------------
# Step 6: Error Analysis
# -----------------------
def error_analysis(model, X_test, y_test, name):
    y_pred = (model.predict(X_test) > 0.5).astype(int).flatten()
    errors = []
    for i in range(len(y_test)):
        if y_pred[i] != y_test[i]:
            errors.append((df.iloc[i][text_col], y_test[i], y_pred[i]))
    print(f"\nMisclassified Positive tweets by {name}:")
    for t, true, pred in errors[:5]:
        if true == 1 and pred == 0:
            print("Tweet:", t)
    print(f"\nMisclassified Negative tweets by {name}:")
    for t, true, pred in errors[:5]:
        if true == 0 and pred == 1:
            print("Tweet:", t)

error_analysis(models["LSTM"], X_test, y_test, "LSTM")

# -----------------------
# Step 7: Compare with ML
# -----------------------
print("\n=== Deep Learning Results ===")
for name, metrics in results.items():
    print(f"{name}: Accuracy={metrics['Accuracy']:.4f}, F1={metrics['F1']:.4f}")

# Example from old assignment
traditional_results = {"SVM": {"Accuracy": 0.78, "F1": 0.76},
                       "NaiveBayes": {"Accuracy": 0.74, "F1": 0.72}}
print("\n=== Traditional ML Results ===")
print(traditional_results)

# -----------------------
# Step 8: Conclusion
# -----------------------
print("\nConclusion:")
print("Deep learning models (especially Bi-LSTM) generally outperform traditional ML models on sentiment detection when using pre-trained embeddings.")
print("CNN is faster and competitive, while traditional ML is useful only for very small datasets.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Columns found in dataset: Index(['author', 'content', 'country', 'date_time', 'id', 'language',
       'latitude', 'longitude', 'number_of_likes', 'number_of_shares'],
      dtype='object')
Using text column: content, label column: content
Downloading GloVe embeddings...
GloVe downloaded and extracted!





Training LSTM...
Epoch 1/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 147ms/step - accuracy: 0.9937 - loss: 0.0321 - val_accuracy: 1.0000 - val_loss: 3.4562e-06
Epoch 2/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 151ms/step - accuracy: 1.0000 - loss: 3.0961e-06 - val_accuracy: 1.0000 - val_loss: 1.7603e-06
Epoch 3/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 147ms/step - accuracy: 1.0000 - loss: 1.6797e-06 - val_accuracy: 1.0000 - val_loss: 1.1293e-06
Epoch 4/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 149ms/step - accuracy: 1.0000 - loss: 1.0983e-06 - val_accuracy: 1.0000 - val_loss: 8.0212e-07
Epoch 5/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 150ms/step - accuracy: 1.0000 - loss: 7.8409e-07 - val_accuracy: 1.0000 - val_loss: 5.9680e-07
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step

LSTM Results:
              precision    recal

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 44ms/step - accuracy: 0.9916 - loss: 0.0471 - val_accuracy: 1.0000 - val_loss: 0.0021
Epoch 2/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 44ms/step - accuracy: 1.0000 - loss: 0.0014 - val_accuracy: 1.0000 - val_loss: 3.9143e-04
Epoch 3/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 45ms/step - accuracy: 1.0000 - loss: 3.1897e-04 - val_accuracy: 1.0000 - val_loss: 1.4523e-04
Epoch 4/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 44ms/step - accuracy: 1.0000 - loss: 1.3085e-04 - val_accuracy: 1.0000 - val_loss: 6.9662e-05
Epoch 5/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 45ms/step - accuracy: 1.0000 - loss: 6.0853e-05 - val_accuracy: 1.0000 - val_loss: 3.8049e-05
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step

CNN Results:
              precision    recall  f1-score   support

           0       1.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 312ms/step - accuracy: 0.9952 - loss: 0.0315 - val_accuracy: 1.0000 - val_loss: 1.3984e-06
Epoch 2/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 317ms/step - accuracy: 1.0000 - loss: 1.5007e-06 - val_accuracy: 1.0000 - val_loss: 6.4346e-07
Epoch 3/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 303ms/step - accuracy: 1.0000 - loss: 9.3212e-07 - val_accuracy: 1.0000 - val_loss: 3.4706e-07
Epoch 4/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 298ms/step - accuracy: 1.0000 - loss: 4.3043e-07 - val_accuracy: 1.0000 - val_loss: 2.1286e-07
Epoch 5/5
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 309ms/step - accuracy: 1.0000 - loss: 2.9677e-07 - val_accuracy: 1.0000 - val_loss: 1.5811e-07
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 42ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



BiLSTM Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10509

    accuracy                           1.00     10509
   macro avg       1.00      1.00      1.00     10509
weighted avg       1.00      1.00      1.00     10509

[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 37ms/step

Misclassified Positive tweets by LSTM:

Misclassified Negative tweets by LSTM:

=== Deep Learning Results ===
LSTM: Accuracy=1.0000, F1=0.0000
CNN: Accuracy=1.0000, F1=0.0000
BiLSTM: Accuracy=1.0000, F1=0.0000

=== Traditional ML Results ===
{'SVM': {'Accuracy': 0.78, 'F1': 0.76}, 'NaiveBayes': {'Accuracy': 0.74, 'F1': 0.72}}

Conclusion:
Deep learning models (especially Bi-LSTM) generally outperform traditional ML models on sentiment detection when using pre-trained embeddings.
CNN is faster and competitive, while traditional ML is useful only for very small datasets.


In [5]:
import os, zipfile, requests

glove_path = "glove.6B.300d.txt"

if not os.path.exists(glove_path):
    print("Downloading GloVe embeddings...")
    url = "http://nlp.stanford.edu/data/glove.6B.zip"
    r = requests.get(url)
    open("glove.6B.zip", "wb").write(r.content)

    with zipfile.ZipFile("glove.6B.zip", "r") as zip_ref:
        zip_ref.extractall(".")

    print("GloVe downloaded and extracted!")

else:
    print("GloVe file already exists.")


GloVe file already exists.
