# ***1: Import Libraries***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import pickle

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from keras.api.models import Sequential
from keras.api.layers import Dense, LSTM, Embedding, SpatialDropout1D
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.api.preprocessing.sequence import pad_sequences
from keras.api.callbacks import EarlyStopping
from keras.api.models import load_model


In [None]:
# Download NLTK resources
# Only the English stop-word list is read from NLTK data in this notebook
# (via `stopwords.words('english')`); PorterStemmer is rule-based and needs no
# download. Fetching 'all' pulls the entire NLTK data collection for nothing.
nltk.download('stopwords')

# ***Read Dataset***

In [None]:
# Load the annotated tweets; the path is relative to the notebook's working directory.
df = pd.read_csv("train.csv")

# Quick sanity check: overall size, then the first few rows.
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
first_rows = df.head()
print(first_rows)

# ***Preprocessing***

In [None]:
# Derive a binary label from the annotation counts:
# 1 when hate-speech + offensive-language counts exceed the "neither" count,
# otherwise 0 (a tie therefore maps to 0).
flagged_count = df['hate_speech_count'] + df['offensive_language_count']
is_flagged = flagged_count > df['neither_count']
df['label'] = np.where(is_flagged, 1, 0)

print("Dataset shape after adding label:", df.shape)
label_columns = ['hate_speech_count', 'offensive_language_count', 'neither_count', 'label']
print(df[label_columns].head())


# **Exploratory Data Analysis (EDA)**

In [None]:
# 1. Distribution of final labels (class balance of the derived target)
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='label', data=df, ax=ax)
ax.set_title("Distribution of Tweets by Final Label (0: Neither, 1: Hate/Offensive)")
plt.show()

In [None]:
# 2. Peek at the raw text column, or report that it is missing
if 'tweet' not in df.columns:
    print("Column 'tweet' not found. Please adjust the code to use the correct text column.")
else:
    print("Sample tweets:")
    print(df['tweet'].head())


# ***Continue Preprocessing***

In [None]:
# Function to clean text: lowercasing, removing URLs, mentions, hashtags, special characters/punctuation
def clean_text(text):
    """Normalize a raw tweet into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase; remove URLs (any run starting with ``http``),
    ``@handles`` and ``#hashtags``; remove every character that is not
    ``a-z`` or whitespace; collapse whitespace runs and trim both ends.

    Args:
        text: The raw tweet. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, which may be empty.
    """
    # A missing cell would otherwise raise AttributeError on `.lower()`.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)          # remove URLs
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)   # remove usernames/handles
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)   # remove hashtags (tag text included)
    text = re.sub(r'[^a-z\s]', '', text)         # remove punctuation, numbers, special characters
    text = re.sub(r'\s+', ' ', text).strip()     # collapse extra whitespace
    return text



In [None]:
# Shared helpers used by preprocess_text:
# a Porter stemmer instance, and the English stop-word list held as a set so
# that per-token membership checks are cheap.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [None]:
# Preprocessing function:
def preprocess_text(text):
    """Clean a tweet, drop English stop words, and stem the remaining tokens.

    The text is first normalized by ``clean_text`` and split on whitespace;
    tokens found in ``stop_words`` are discarded and the rest are stemmed with
    the module-level Porter stemmer ``ps``.

    Returns:
        str: The surviving stems joined by single spaces.
    """
    kept_stems = [
        ps.stem(token)
        for token in clean_text(text).split()
        if token not in stop_words
    ]
    return " ".join(kept_stems)

In [None]:
# Apply preprocessing and visualize text lengths
if 'tweet' not in df.columns:
    print("Tweet text column not found. Please ensure the dataset has a column for tweet text.")
else:
    df['clean_text'] = df['tweet'].apply(preprocess_text)

    # Number of whitespace-separated tokens left in each cleaned tweet
    df['text_length'] = df['clean_text'].map(lambda cleaned: len(cleaned.split()))

    # Histogram (with KDE overlay) of the cleaned-text lengths
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df['text_length'], bins=30, kde=True, ax=ax)
    ax.set_title("Distribution of Clean Text Lengths")
    ax.set_xlabel("Number of words")
    ax.set_ylabel("Frequency")
    plt.show()

# ***Split The Dataset***

In [None]:
# Data Splitting and Feature Extraction for Traditional ML Models
y = df['label']
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training rows only, so that
# validation/test text does not leak into the features the models are scored on.
# train_test_split picks rows from the sample count, `stratify` and
# `random_state` alone, so using the same arguments as the 70/30 split applied
# to X_tfidf afterwards (test_size=0.3, random_state=42, stratify=y) selects
# exactly the same training rows. Keep the two calls in sync.
tfidf_fit_rows, tfidf_holdout_rows = train_test_split(
    np.arange(len(df)), test_size=0.3, random_state=42, stratify=y
)
tfidf_vectorizer.fit(df['clean_text'].iloc[tfidf_fit_rows])

# Transform every row with the train-fitted vectorizer; X_tfidf remains a dense
# array covering the full dataset, ready to be split.
X_tfidf = tfidf_vectorizer.transform(df['clean_text']).toarray()

In [None]:
# Save the tfidf_vectorizer using pickle
# (pickle is already imported in the notebook's import cell.)
# The fitted vectorizer must be reused at inference time so new text is mapped
# onto the same vocabulary and IDF weights the models were trained with.
with open('tfidf_vectorizer.pkl', 'wb') as handle:
    pickle.dump(tfidf_vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Report the file name that was actually written.
print("Tfidf Vectorizer saved as tfidf_vectorizer.pkl")

In [None]:
# Splitting the dataset into train, validation, and test sets (70% train, 15% val, 15% test)
# Step 1: hold out 30% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_tfidf,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
# Step 2: cut the held-out 30% in half to get validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

In [None]:
# Report the (rows, features) shape of each split.
for shape_label, split_matrix in (
    ("Train shape:", X_train),
    ("Validation shape:", X_val),
    ("Test shape:", X_test),
):
    print(shape_label, split_matrix.shape)

# ***ML Models Training***

In [None]:
# Train Traditional Machine Learning Models
# Randomized estimators get a fixed seed so validation scores are reproducible
# across Restart & Run All. `use_label_encoder` is omitted for XGBoost: the
# argument is deprecated and only triggers a warning in recent releases.
models = {
    'LogisticRegression': LogisticRegression(max_iter=200),
    'LinearSVC': LinearSVC(random_state=42),
    'KNeighbors': KNeighborsClassifier(),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=42)
}

# Fit each model on the training split and score it on the validation split.
# Precision/recall/F1 are weighted by class support; zero_division=0 reports 0
# instead of warning when a class is never predicted.
results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_val, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_val, y_pred, average='weighted', zero_division=0)
    results[model_name] = {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1}
    print(f"{model_name} -- Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")

# One row per model, one column per metric.
print("\nValidation Set Performance Comparison:")
print(pd.DataFrame(results).T)

# ***Prepare the Dataset for NN (DL)***

In [None]:
# Prepare Data for LSTM-based Deep Learning Model
max_features = 5000  # vocabulary size
maxlen = 100         # maximum sequence length

# Choose the 70/30 train/hold-out rows first, so the tokenizer vocabulary can be
# learned from training text only (no validation/test leakage). Splitting row
# positions with the same arguments as before selects the same rows that
# splitting X_seq directly would.
row_positions = np.arange(len(df))
train_rows, temp_rows = train_test_split(
    row_positions, test_size=0.3, random_state=42, stratify=y
)

tokenizer = Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(df['clean_text'].iloc[train_rows])

# Encode every tweet as a sequence of word indices and pad to `maxlen`.
X_seq = tokenizer.texts_to_sequences(df['clean_text'])
X_seq = pad_sequences(X_seq, maxlen=maxlen)

X_train_seq, X_temp_seq = X_seq[train_rows], X_seq[temp_rows]
y_train_seq, y_temp_seq = y.iloc[train_rows], y.iloc[temp_rows]

# Halve the hold-out rows into validation and test sets (15% / 15% overall).
X_val_seq, X_test_seq, y_val_seq, y_test_seq = train_test_split(X_temp_seq, y_temp_seq, test_size=0.5, random_state=42, stratify=y_temp_seq)


# ***DL Model Training***

In [None]:
# Build and Train the LSTM Model
embedding_dim = 128

# Embedding -> spatial dropout -> single LSTM layer -> sigmoid output.
# NOTE(review): `input_length` appears to be deprecated in Keras 3 and only
# produces a warning; it is kept so that `summary()` below can still report
# shapes. Confirm against the installed Keras version.
lstm_model = Sequential()
lstm_model.add(Embedding(max_features, embedding_dim, input_length=maxlen))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(1, activation='sigmoid'))  # Binary classification (0: Neither, 1: Hate/Offensive)

lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

# Early stopping callback to prevent overfitting.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# validation loss; without it, the weights kept (and later saved/evaluated) are
# those of the last epoch run, i.e. up to `patience` epochs past the best one.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3,
                   restore_best_weights=True)

history = lstm_model.fit(X_train_seq, y_train_seq,
                         epochs=10, batch_size=64,
                         validation_data=(X_val_seq, y_val_seq),
                         callbacks=[es])

# ***Saving DL LSTM Model and Tokenizer***

In [None]:
# Save the LSTM model
# NOTE(review): '.h5' is the legacy HDF5 format; newer Keras versions recommend
# the native '.keras' format. The name is kept because the loading cell reads
# 'lstm_model.h5' -- change both together if migrating.
lstm_model.save('lstm_model.h5')
print("LSTM model saved as lstm_model.h5")

# Save the tokenizer using pickle (pickle is already imported in the import cell).
# The same fitted tokenizer is needed at inference time to map words to the
# indices the embedding layer was trained on.
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("Tokenizer saved as tokenizer.pkl")


# ***DL LSTM Evaluation***

In [None]:
# Evaluate the LSTM Model on the held-out test sequences.
# With a single compiled metric, evaluate() yields the loss followed by accuracy.
test_scores = lstm_model.evaluate(X_test_seq, y_test_seq, verbose=0)
loss, accuracy = test_scores
print("LSTM Test Loss:", loss)
print("LSTM Test Accuracy:", accuracy)


In [None]:
# Generate classification report
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
test_probabilities = lstm_model.predict(X_test_seq)
y_pred_seq = (test_probabilities > 0.5).astype("int32")

print("LSTM Classification Report:")
lstm_report = classification_report(y_test_seq, y_pred_seq, zero_division=0)
print(lstm_report)

# ***ML Models Evaluation***

In [None]:
# Evaluate Traditional Models on Test Set and Compare
# For every fitted model: print the per-class report, then draw its confusion matrix.
for model_name, model in models.items():
    print(f"Evaluating {model_name} on Test Set...")
    y_test_pred = model.predict(X_test)
    print(classification_report(y_test, y_test_pred, zero_division=0))

    cm = confusion_matrix(y_test, y_test_pred)
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(f"Confusion Matrix - {model_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

# ***Saving the Best ML Model After Evaluation***

In [None]:
# Save the Best Performing Model for Deployment
# Select the model by its measured weighted F1 on the validation set (stored in
# `results` by the training cell) instead of hard-coding a name, so the saved
# artifact always matches what the current run actually found to be best.
best_model_name = max(results, key=lambda name: results[name]['f1'])
best_model = models[best_model_name]
print("Selected best model:", best_model_name)

with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
print("Best model saved as best_model.pkl.")


# ***Deployment for both ML and DL***

In [None]:
# Simple Deployment Example
def predict_tweet(text, model, tfidf_vectorizer, tokenizer=None, use_lstm=False):
    """Classify a single raw tweet as 0 (neither) or 1 (hate/offensive).

    The text is run through ``preprocess_text`` and then scored either by a
    TF-IDF based classifier or by a sequence model.

    Args:
        text: Raw tweet text.
        model: The fitted model to score with. When ``use_lstm`` is True it is
            called on padded index sequences and its output is thresholded at
            0.5; otherwise it is called on the dense TF-IDF vector.
        tfidf_vectorizer: Fitted vectorizer, used only when ``use_lstm`` is False.
        tokenizer: Fitted tokenizer, required when ``use_lstm`` is True.
        use_lstm: Selects the sequence-model path instead of the TF-IDF path.

    Returns:
        int: The predicted label, 0 or 1.

    Raises:
        ValueError: If ``use_lstm`` is True and no tokenizer is supplied.
    """
    text_clean = preprocess_text(text)

    if use_lstm:
        if tokenizer is None:
            raise ValueError("tokenizer is required when use_lstm=True")
        seq = tokenizer.texts_to_sequences([text_clean])
        # Pad to the notebook-level `maxlen` used when the sequences were built.
        seq = pad_sequences(seq, maxlen=maxlen)
        # Score with the model that was passed in, not the notebook-global
        # `lstm_model`, so the caller controls which network is used.
        pred = model.predict(seq)[0][0]
        return 1 if pred > 0.5 else 0

    vec = tfidf_vectorizer.transform([text_clean]).toarray()
    # Cast so both branches return a plain Python int.
    return int(model.predict(vec)[0])


# ***Load the ML and DL Models***

In [None]:
# Reload the pickled classifier written by the saving cell.
# (pickle is already imported in the notebook's import cell.)
# Caution: unpickling executes arbitrary code, so only load files produced by
# this notebook or another trusted source.
with open('best_model.pkl', 'rb') as f:
    best_ml_model = pickle.load(f)

print("Model loaded successfully into best_ml_model")

In [None]:
# `load_model` and `pickle` are already imported in the notebook's import cell.

# Load LSTM model (rebinds `lstm_model` to the copy read back from disk)
lstm_model = load_model('lstm_model.h5')

# Load tokenizer
# Caution: unpickling executes arbitrary code, so only load trusted files.
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

# ***Test ML***

In [None]:
# Traditional ML
# Use `best_ml_model`, the classifier reloaded from best_model.pkl, so this
# cell actually exercises the saved artifact rather than the in-memory model.
sample_tweet = "What a beautiful day, everything Sucks!"
prediction = predict_tweet(sample_tweet, best_ml_model, tfidf_vectorizer)
print("Prediction for sample tweet (0: neither, 1: hate/offensive):", prediction)

# ***Test LSTM***

In [None]:
# LSTM
sample_tweet = "What a beautiful day, everything sucks!"
prediction_lstm = predict_tweet(sample_tweet, lstm_model, tfidf_vectorizer, tokenizer=tokenizer, use_lstm=True)
# Print the LSTM result, not `prediction` left over from the traditional-ML cell.
print("Prediction for sample tweet (0: neither, 1: hate/offensive):", prediction_lstm)