In [26]:


# Install required packages (uncomment if needed)
# !pip install tensorflow scikeras scikit-learn pandas numpy joblib

import re, string, pickle, os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

print('✅ TensorFlow version:', tf.__version__)


✅ TensorFlow version: 2.20.0


In [27]:


# Update path if necessary
DATA_PATH = 'C:/Users/Aleesha/Desktop/abc/labeled_data.csv'   # Or your own path
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
print('📊 Data shape:', df.shape)
df.head()


📊 Data shape: (24783, 7)


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [28]:


def clean_tweet(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # URLs
    text = re.sub(r"@\w+", "", text)                     # Mentions
    text = re.sub(r"#\w+", "", text)                     # Hashtags
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r"\d+", "", text)                      # Numbers
    text = re.sub(r"\s+", " ", text).strip()             # Extra spaces
    return text

df['clean_tweet'] = df['tweet'].astype(str).apply(clean_tweet)
df[['tweet', 'clean_tweet']].head()


Unnamed: 0,tweet,clean_tweet
0,!!! RT @mayasolovely: As a woman you shouldn't...,rt as a woman you shouldnt complain about clea...
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,rt boy dats coldtyga dwn bad for cuffin dat ho...
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt dawg rt you ever fuck a bitch and she start...
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt she look like a tranny
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt the shit you hear about me might be true or...


In [29]:
# 📌 Tokenize & pad


VOCAB_SIZE = 10000
MAX_LEN = 40
OOV_TOKEN = "<OOV>"

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(df['clean_tweet'])

sequences = tokenizer.texts_to_sequences(df['clean_tweet'])
X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

y = df['class'].values  # Assuming 'class' column contains labels 0/1/2
print('X shape:', X.shape, '| y shape:', y.shape)


X shape: (24783, 40) | y shape: (24783,)


In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print('Train:', X_train.shape, 'Test:', X_test.shape)


Train: (19826, 40) Test: (4957, 40)


In [31]:
# 📌 Model builder function

#Build function with tunable args + defaults


def build_model(vocab_size=VOCAB_SIZE, embed_dim=64, lstm_units=64, dropout_rate=0.3):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=MAX_LEN))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(3, activation='softmax'))  # 3 classes
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [34]:
import itertools
from tensorflow.keras.callbacks import EarlyStopping

# Define hyperparameter search space
param_grid = {
    "embed_dim": [32, 64],
    "lstm_units": [64, 128],
    "dropout_rate": [0.3, 0.5],
    "batch_size": [64],
    "epochs": [3, 5],
}

best_acc = 0
best_params = None
best_model = None

for embed_dim, lstm_units, dropout_rate, batch_size, epochs in itertools.product(
    param_grid["embed_dim"],
    param_grid["lstm_units"],
    param_grid["dropout_rate"],
    param_grid["batch_size"],
    param_grid["epochs"]
):
    print(f"\n🧪 Training with embed_dim={embed_dim}, lstm_units={lstm_units}, dropout={dropout_rate}, "
          f"batch={batch_size}, epochs={epochs}")

    model = build_model(vocab_size=VOCAB_SIZE, embed_dim=embed_dim,
                        lstm_units=lstm_units, dropout_rate=dropout_rate)

    history = model.fit(
        X_train, y_train,
        validation_split=0.2,
        batch_size=batch_size,
        epochs=epochs,
        verbose=0,
        callbacks=[EarlyStopping(monitor='val_loss', patience=2)]
    )

    loss, acc = model.evaluate(X_test, y_test, verbose=0)
    print(f"→ Validation Accuracy: {acc:.4f}")

    if acc > best_acc:
        best_acc = acc
        best_params = (embed_dim, lstm_units, dropout_rate, batch_size, epochs)
        best_model = model

print("\n🏆 Best Accuracy:", best_acc)
print("Best Params (embed_dim, lstm_units, dropout, batch, epochs):", best_params)




🧪 Training with embed_dim=32, lstm_units=64, dropout=0.3, batch=64, epochs=3




→ Validation Accuracy: 0.8689

🧪 Training with embed_dim=32, lstm_units=64, dropout=0.3, batch=64, epochs=5
→ Validation Accuracy: 0.8531

🧪 Training with embed_dim=32, lstm_units=64, dropout=0.5, batch=64, epochs=3
→ Validation Accuracy: 0.8263

🧪 Training with embed_dim=32, lstm_units=64, dropout=0.5, batch=64, epochs=5
→ Validation Accuracy: 0.8303

🧪 Training with embed_dim=32, lstm_units=128, dropout=0.3, batch=64, epochs=3
→ Validation Accuracy: 0.8277

🧪 Training with embed_dim=32, lstm_units=128, dropout=0.3, batch=64, epochs=5
→ Validation Accuracy: 0.7743

🧪 Training with embed_dim=32, lstm_units=128, dropout=0.5, batch=64, epochs=3
→ Validation Accuracy: 0.7952

🧪 Training with embed_dim=32, lstm_units=128, dropout=0.5, batch=64, epochs=5
→ Validation Accuracy: 0.8160

🧪 Training with embed_dim=64, lstm_units=64, dropout=0.3, batch=64, epochs=3
→ Validation Accuracy: 0.8338

🧪 Training with embed_dim=64, lstm_units=64, dropout=0.3, batch=64, epochs=5
→ Validation Accuracy: 0

In [35]:
MODEL_PATH = 'best_lstm_model.keras'
TOKENIZER_PATH = 'tokenizer.pkl'

best_model.save(MODEL_PATH)
print(f"✅ Best model saved to {MODEL_PATH}")

import pickle
with open(TOKENIZER_PATH, 'wb') as f:
    pickle.dump(tokenizer, f)
print(f"✅ Tokenizer saved to {TOKENIZER_PATH}")


✅ Best model saved to best_lstm_model.keras
✅ Tokenizer saved to tokenizer.pkl
