<a href="https://colab.research.google.com/github/anakhakr/anakhakr/blob/main/DL_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Read both raw sentiment datasets from /content (Reddit first, then Twitter).
reddit_df, twitter_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/Reddit_Data.csv", "/content/Twitter_Data.csv")
)

# Preview the first rows of each frame, in the same order.
for raw_frame in (reddit_df, twitter_df):
    print(raw_frame.head())

                                       clean_comment  category
0   family mormon have never tried explain them t...         1
1  buddhism has very much lot compatible with chr...         1
2  seriously don say thing first all they won get...        -1
3  what you have learned yours and only yours wha...         0
4  for your own benefit you may want read living ...         1
                                          clean_text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0


In [None]:
# Bring both sources onto a shared ('text', 'sentiment') schema.
# `rename` ignores keys that are absent, so this cell is safe to re-run: selecting the
# source column names directly would raise a KeyError on a second run, because
# 'clean_comment' / 'clean_text' / 'category' no longer exist once renamed.
COLUMN_RENAMES = {
    'clean_comment': 'text',   # Reddit text column
    'clean_text': 'text',      # Twitter text column
    'category': 'sentiment',
}
reddit_df = reddit_df.rename(columns=COLUMN_RENAMES)[['text', 'sentiment']]
twitter_df = twitter_df.rename(columns=COLUMN_RENAMES)[['text', 'sentiment']]

# Stack Reddit rows above Twitter rows and give the result a fresh 0..n-1 index.
df = pd.concat([reddit_df, twitter_df], axis=0, ignore_index=True)

df.head()


Unnamed: 0,text,sentiment
0,family mormon have never tried explain them t...,1.0
1,buddhism has very much lot compatible with chr...,1.0
2,seriously don say thing first all they won get...,-1.0
3,what you have learned yours and only yours wha...,0.0
4,for your own benefit you may want read living ...,1.0


In [None]:
# Map raw sentiment scores to contiguous class ids for the classifier:
# -1 → 0 (Negative)
#  0 → 1 (Neutral)
#  1 → 2 (Positive)
SENTIMENT_TO_CLASS = {
    -1.0: 0,
     0.0: 1,
     1.0: 2
}

# Compute the mapping into a temporary first so `df` is only modified after the result
# has been validated. Values that are not keys of the mapping (including NaN) become NaN.
mapped_sentiment = df['sentiment'].map(SENTIMENT_TO_CLASS)
has_valid_label = mapped_sentiment.notna()

# Guard against running this cell twice on the same `df`. The class ids overlap with the
# raw scores (0 and 1 appear in both), so a second pass would silently turn 0 → 1 and
# 1 → 2, map 2 → NaN, and then drop every Positive row instead of failing.
missing_classes = sorted(
    set(SENTIMENT_TO_CLASS.values())
    - set(mapped_sentiment[has_valid_label].astype(int))
)
if missing_classes:
    raise ValueError(
        f"No rows map to class id(s) {missing_classes}; expected all of "
        f"{sorted(SENTIMENT_TO_CLASS.values())}. If this cell was already run, "
        "reload the data and re-run the notebook from the top."
    )

# Remove rows with invalid sentiment (if any) and store the labels as integers.
# `.copy()` makes the filtered frame independent, so the assignment below does not
# write into a slice of the previous frame.
df = df.loc[has_valid_label].copy()
df['sentiment'] = mapped_sentiment[has_valid_label].astype(int)

# Verify
print(df['sentiment'].value_counts())


sentiment
2    68355
1    43787
Name: count, dtype: int64


In [None]:
# Patterns and the punctuation table are built once instead of on every call.
_URL_RE = re.compile(r"http\S+")
_MENTION_RE = re.compile(r"@\w+")
_HASHTAG_RE = re.compile(r"#\w+")
_DIGITS_RE = re.compile(r"\d+")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalise one raw post before tokenisation.

    Steps, in order: lowercase; remove URLs (anything starting with ``http``),
    ``@mentions`` and ``#hashtags``; remove ASCII punctuation; remove digits;
    collapse every run of whitespace to a single space and trim both ends.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or a float NaN) are
            treated as empty text.

    Returns:
        The cleaned string; ``""`` when nothing remains or the input is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = _URL_RE.sub("", text)        # remove URLs
    text = _MENTION_RE.sub("", text)    # remove mentions
    text = _HASHTAG_RE.sub("", text)    # remove hashtags
    text = text.translate(_PUNCT_TABLE)
    text = _DIGITS_RE.sub("", text)     # remove numbers

    # The removals above leave gaps ("a  b"); split/join collapses them and also
    # strips leading and trailing whitespace.
    return " ".join(text.split())

# Drop rows whose text is missing before casting: `astype(str)` would otherwise turn a
# NaN into the literal string "nan", which then survives cleaning as a real token.
df = df.dropna(subset=['text']).copy()
df['text'] = df['text'].astype(str).apply(clean_text)


In [None]:
# Re-assert a string dtype on the text column ahead of tokenisation.
# NOTE(review): the cleaning cell already casts with `.astype(str)`, so this looks
# redundant — confirm before removing.
df['text'] = df['text'].astype(str)


In [None]:
le = LabelEncoder()
encoded_sentiment = le.fit_transform(df['sentiment'])

# LabelEncoder numbers the classes it sees in sorted order, so if any class is absent
# the remaining ones are silently renumbered (e.g. {1, 2} becomes {0, 1}) and the
# mapping documented below no longer holds. Check before overwriting the column so a
# bad state fails loudly instead of training on mislabeled targets.
EXPECTED_CLASSES = [0, 1, 2]
if le.classes_.tolist() != EXPECTED_CLASSES:
    raise ValueError(
        f"Expected sentiment classes {EXPECTED_CLASSES}, found {le.classes_.tolist()}; "
        "re-run the notebook from the data-loading cell."
    )

df['sentiment'] = encoded_sentiment

# Mapping:
# Negative -> 0
# Neutral  -> 1
# Positive -> 2




In [None]:
MAX_WORDS = 20000   # vocabulary cap passed to the tokenizer (kept by word frequency)
MAX_LEN = 100       # every sequence is padded/truncated to this many tokens

y = df['sentiment']

# Choose the train/validation rows first (stratified on the label) so the vocabulary
# below can be learned from training text only.
train_idx, val_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Fit on the training rows only: fitting on the whole corpus would let validation text
# influence which words make it into the vocabulary and how they are ranked.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df['text'].iloc[train_idx])

# Encode and pad every row once, then slice by the positional indices chosen above.
sequences = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(sequences, maxlen=MAX_LEN)

X_train, X_val = X[train_idx], X[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

In [None]:
model = Sequential([
    # Explicit input spec so the model is built immediately and `model.summary()` can
    # report output shapes and parameter counts. It replaces Embedding's
    # `input_length` argument, which Keras 3 deprecates and ignores with a warning.
    tf.keras.Input(shape=(MAX_LEN,)),
    Embedding(input_dim=MAX_WORDS, output_dim=128),
    LSTM(128, return_sequences=False),   # return only the last output of the sequence
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')       # one probability per sentiment class
])

# Sparse loss: targets are integer class ids rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()



In [None]:
# Train for 5 epochs in mini-batches of 64, scoring the validation split each epoch.
fit_kwargs = dict(
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=64,
)
history = model.fit(X_train, y_train, **fit_kwargs)


Epoch 1/5
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 276ms/step - accuracy: 0.8705 - loss: 0.2994 - val_accuracy: 0.9831 - val_loss: 0.0711
Epoch 2/5
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 275ms/step - accuracy: 0.9849 - loss: 0.0593 - val_accuracy: 0.9828 - val_loss: 0.0704
Epoch 3/5
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 274ms/step - accuracy: 0.9877 - loss: 0.0423 - val_accuracy: 0.9830 - val_loss: 0.0747
Epoch 4/5
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 273ms/step - accuracy: 0.9922 - loss: 0.0248 - val_accuracy: 0.9820 - val_loss: 0.0824
Epoch 5/5
[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m439s[0m 271ms/step - accuracy: 0.9952 - loss: 0.0153 - val_accuracy: 0.9827 - val_loss: 0.0927


In [None]:
# Softmax scores for every validation example; the highest-scoring class is the prediction.
val_probabilities = model.predict(X_val)
y_pred = val_probabilities.argmax(axis=1)

# Overall accuracy first, then per-class precision / recall / F1.
val_accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", val_accuracy)
print(classification_report(y_val, y_pred))


[1m701/701[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 56ms/step
Validation Accuracy: 0.9827009674974364
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      8758
           1       0.99      0.98      0.99     13671

    accuracy                           0.98     22429
   macro avg       0.98      0.98      0.98     22429
weighted avg       0.98      0.98      0.98     22429



In [None]:
import os
import pickle

# Persist the trained network as an .h5 file under /content.
model.save("/content/sentiment_lstm_model.h5")

# Pickle the fitted tokenizer alongside it.
with open("/content/tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# List /content to confirm both artifacts were written.
saved_files = os.listdir("/content")
print(saved_files)




['.config', 'sentiment_lstm_model.h5', 'Reddit_Data.csv', 'Twitter_Data.csv', 'tokenizer.pkl', 'sample_data']
