In [2]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Bidirectional, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

2025-06-10 22:13:21.238680: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df = pd.read_csv("training.csv", encoding="latin1", header=None, names=["target", "ids", "date", "flag", "user", "text"])
df.drop(columns=["flag", "date", "ids", "user"], inplace=True)
df["target"].astype(int)
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   text    1600000 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [5]:
df = df.sample(n=50000).reset_index(drop=True)

def mapping(target_num):
    if target_num == 0:
        return "sad"
    else:
        return "happy"
    
df["sentiment"] = df["target"].apply(mapping)

In [6]:
print(df["sentiment"].value_counts())


sentiment
sad      25040
happy    24960
Name: count, dtype: int64


In [None]:
min_count = df['sentiment'].value_counts().min()
print(f"\nMinimum class count: {min_count}")

balanced_data = df.groupby('sentiment').apply(lambda x: x.sample(n=min_count, random_state=42)).reset_index(drop=True)

print("\nClass distribution after balancing:")
print(balanced_data['sentiment'].value_counts())

def clean_tweet(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return text.strip()

print("\nApplying preprocessing to the 'text' column...")
balanced_data['clean_text'] = balanced_data['text'].apply(clean_tweet)

print("\nSample rows from the preprocessed DataFrame:")
print(balanced_data.head())

print("\nSplitting the data into training and testing sets...")
X = balanced_data['clean_text']
y = balanced_data['sentiment']



Minimum class count: 24960

Class distribution after balancing:
sentiment
happy    24960
sad      24960
Name: count, dtype: int64

Applying preprocessing to the 'text' column...

Sample rows from the preprocessed DataFrame:
   target                                               text sentiment  \
0       4               Listening to Prince makes me smile.      happy   
1       4  Morning everyone! I'm at work already. Just an...     happy   
2       4  @Twinklybird Then you'll be bored senseless!!!...     happy   
3       4                                      Good morning      happy   
4       4  Also, Sun eve cinema was nearly empty for Star...     happy   

                                          clean_text  
0                 listening to prince makes me smile  
1  morning everyone im at work already just anoth...  
2  then youll be bored senseless p best of luck o...  
3                                       good morning  
4  also sun eve cinema was nearly empty for star ...  


In [None]:
MAX_NUM_WORDS = 15000
MAX_SEQUENCE_LENGTH = 100 
EMBEDDING_DIM = 100 
BATCH_SIZE = 512
EPOCHS = 30

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print("\nTokenizing and padding text data...")
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(
    X_train_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post'
)
X_test_padded = pad_sequences(
    X_test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post'
)


print("\nBuilding the neural network model...")
model = Sequential([
    Embedding(input_dim=MAX_NUM_WORDS, output_dim=EMBEDDING_DIM),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.6),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train the model with multiple epochs
print("\nTraining the model...")
history = model.fit(
    X_train_padded,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate the model
print("\nEvaluating the model on the test set...")
y_pred = model.predict(X_test_padded)
y_pred_classes = (y_pred > 0.5).astype(int).reshape(-1)

accuracy = accuracy_score(y_test, y_pred_classes)
print(f"\nTest Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))



Tokenizing and padding text data...

Building the neural network model...

Training the model...
Epoch 1/20
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 574ms/step - accuracy: 0.5894 - loss: 0.6604 - val_accuracy: 0.7521 - val_loss: 0.5142
Epoch 2/20
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 567ms/step - accuracy: 0.7847 - loss: 0.4705 - val_accuracy: 0.7661 - val_loss: 0.5007
Epoch 3/20
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 566ms/step - accuracy: 0.8330 - loss: 0.3914 - val_accuracy: 0.7659 - val_loss: 0.5031
Epoch 4/20
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 544ms/step - accuracy: 0.8561 - loss: 0.3457 - val_accuracy: 0.7431 - val_loss: 0.5548

Evaluating the model on the test set...
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step

Test Accuracy: 0.7669

Classification Report:
              precision    recall  f1-score   support

       happy       0.81      0.

In [10]:
print(label_encoder.classes_)


['happy' 'sad']


In [9]:
import pickle

model.save("sentiment_model.h5")
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


