In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import joblib

In [3]:
# Preprocess function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # remove links
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # remove punctuation
    text = re.sub(r"\d+", "", text)  # remove numbers
    text = re.sub(r"\s+", " ", text).strip()  # remove extra whitespace
    return text

In [5]:
# Load data
train_df = pd.read_csv('HT_Eng_3_train.csv')
test_df = pd.read_csv('HT_Eng_3_test_without_labels.csv')

# Preprocess
train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

# Shuffle and split
train_df = shuffle(train_df, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'], train_df['category'], test_size=0.2, random_state=42
)

In [7]:
# Vectorize text
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(test_df['text'])

### Bidirectional LSTM with Class Weights

In [25]:
# Step 1: Encode labels and compute class weights
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train_enc),
                                     y=y_train_enc)
class_weight_dict = dict(enumerate(class_weights))

In [27]:
# Step 2: Tokenization and Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df['text']), maxlen=max_len)


In [29]:
# Step 3: Build BiLSTM Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [35]:
# Step 4: Train the Model
history = model.fit(
    X_train_seq, y_train_enc,
    validation_data=(X_val_seq, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict
)

Epoch 1/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.9531 - loss: 0.1146 - val_accuracy: 0.8863 - val_loss: 0.3616
Epoch 2/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.9688 - loss: 0.0740 - val_accuracy: 0.8736 - val_loss: 0.3944
Epoch 3/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.9887 - loss: 0.0356 - val_accuracy: 0.9005 - val_loss: 0.3921
Epoch 4/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - accuracy: 0.9870 - loss: 0.0271 - val_accuracy: 0.8926 - val_loss: 0.3703
Epoch 5/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - accuracy: 0.9942 - loss: 0.0232 - val_accuracy: 0.9005 - val_loss: 0.3950
Epoch 6/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.9907 - loss: 0.0184 - val_accuracy: 0.9100 - val_loss: 0.3915
Epoch 7/20
[1m80/80[0m [32m━━━━

In [37]:
# Step 5: Evaluate and Predict
from sklearn.metrics import classification_report

# Predict and decode
y_val_pred = model.predict(X_val_seq)
y_val_labels = np.argmax(y_val_pred, axis=1)
print("Classification Report for BiLSTM:\n")
print(classification_report(y_val_enc, y_val_labels, target_names=le.classes_))

# Predict on test
y_test_pred = model.predict(X_test_seq)
test_df['category'] = le.inverse_transform(np.argmax(y_test_pred, axis=1))

# Save to CSV
test_df.to_csv("test_predictions_bilstm.csv", index=False)

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Classification Report for BiLSTM:

                        precision    recall  f1-score   support

            Homophobia       0.26      0.41      0.32        34
Non-anti-LGBT+ content       0.97      0.93      0.95       599
           Transphobia       0.00      0.00      0.00         0

              accuracy                           0.90       633
             macro avg       0.41      0.45      0.42       633
          weighted avg       0.93      0.90      0.91       633

[1m24/31[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 7ms/step

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


### Text CNN

In [40]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Tokenize and pad
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df['text']), maxlen=max_len)

# Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_enc), y=y_train_enc)
class_weight_dict = dict(enumerate(class_weights))

In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Conv1D(128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [44]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_seq, y_train_enc,
    validation_data=(X_val_seq, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)

Epoch 1/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5890 - loss: 1.3140 - val_accuracy: 0.9273 - val_loss: 0.7803
Epoch 2/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6553 - loss: 1.1828 - val_accuracy: 0.7125 - val_loss: 0.8801
Epoch 3/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7295 - loss: 0.9462 - val_accuracy: 0.8278 - val_loss: 0.6978
Epoch 4/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8660 - loss: 0.5490 - val_accuracy: 0.9131 - val_loss: 0.4435
Epoch 5/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9242 - loss: 0.7425 - val_accuracy: 0.9084 - val_loss: 0.2992
Epoch 6/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9465 - loss: 0.3973 - val_accuracy: 0.8926 - val_loss: 0.2865
Epoch 7/20
[1m80/80[0m [32m━━━━

In [46]:
from sklearn.metrics import classification_report

# Evaluate on validation set
val_preds = model.predict(X_val_seq)
val_preds_labels = np.argmax(val_preds, axis=1)

print("Text CNN Classification Report:\n")
print(classification_report(y_val_enc, val_preds_labels, target_names=le.classes_))

# Predict on test set
test_preds = model.predict(X_test_seq)
test_df['category'] = le.inverse_transform(np.argmax(test_preds, axis=1))

# Save CSV
test_df.to_csv("test_predictions_text_cnn.csv", index=False)

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Text CNN Classification Report:

                        precision    recall  f1-score   support

            Homophobia       0.25      0.38      0.30        34
Non-anti-LGBT+ content       0.96      0.92      0.94       599
           Transphobia       0.00      0.00      0.00         0

              accuracy                           0.89       633
             macro avg       0.40      0.44      0.41       633
          weighted avg       0.92      0.89      0.91       633

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### GRU-Based Model (with Dropout + Class Weights)

In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    GRU(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [51]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_seq, y_train_enc,
    validation_data=(X_val_seq, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)

Epoch 1/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.4715 - loss: 1.0807 - val_accuracy: 0.8404 - val_loss: 0.7187
Epoch 2/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.7404 - loss: 0.9745 - val_accuracy: 0.8310 - val_loss: 0.5555
Epoch 3/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.8745 - loss: 0.7344 - val_accuracy: 0.7773 - val_loss: 0.4595
Epoch 4/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.9080 - loss: 0.2897 - val_accuracy: 0.8499 - val_loss: 0.3685
Epoch 5/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.9509 - loss: 0.0837 - val_accuracy: 0.8863 - val_loss: 0.3574
Epoch 6/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.9791 - loss: 0.0652 - val_accuracy: 0.8610 - val_loss: 0.4850
Epoch 7/20
[1m80/80[0m [32m━━━━

In [53]:
from sklearn.metrics import classification_report

# Evaluate on validation set
val_preds = model.predict(X_val_seq)
val_preds_labels = np.argmax(val_preds, axis=1)

print("GRU Model Classification Report:\n")
print(classification_report(y_val_enc, val_preds_labels, target_names=le.classes_))

# Predict on test set
test_preds = model.predict(X_test_seq)
test_df['category'] = le.inverse_transform(np.argmax(test_preds, axis=1))

# Save to CSV
test_df.to_csv("test_predictions_gru.csv", index=False)

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
GRU Model Classification Report:

                        precision    recall  f1-score   support

            Homophobia       0.25      0.47      0.33        34
Non-anti-LGBT+ content       0.97      0.91      0.94       599
           Transphobia       0.00      0.00      0.00         0

              accuracy                           0.89       633
             macro avg       0.41      0.46      0.42       633
          weighted avg       0.93      0.89      0.91       633

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### MLP (Multi-Layer Perceptron) on Averaged Word Embeddings

In [56]:
# Compute averaged embeddings for each sequence
def average_embedding(sequences, embedding_matrix):
    avg_embeddings = []
    for seq in sequences:
        vecs = [embedding_matrix[idx] for idx in seq if idx != 0 and idx < len(embedding_matrix)]
        if vecs:
            avg_embeddings.append(np.mean(vecs, axis=0))
        else:
            avg_embeddings.append(np.zeros(embedding_matrix.shape[1]))
    return np.array(avg_embeddings)

# Build embedding matrix (random for now; replace with GloVe if available)
embedding_dim = 128
embedding_matrix = np.random.uniform(-0.05, 0.05, (max_words, embedding_dim))

X_train_avg = average_embedding(X_train_seq, embedding_matrix)
X_val_avg = average_embedding(X_val_seq, embedding_matrix)
X_test_avg = average_embedding(X_test_seq, embedding_matrix)

In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([
    Dense(128, input_shape=(embedding_dim,), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [60]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_avg, y_train_enc,
    validation_data=(X_val_avg, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)

Epoch 1/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8366 - loss: 1.2218 - val_accuracy: 0.9463 - val_loss: 0.9730
Epoch 2/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 774us/step - accuracy: 0.8637 - loss: 1.0169 - val_accuracy: 0.9463 - val_loss: 0.9296
Epoch 3/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 787us/step - accuracy: 0.8238 - loss: 1.3694 - val_accuracy: 0.9463 - val_loss: 0.8620
Epoch 4/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 746us/step - accuracy: 0.8990 - loss: 1.1688 - val_accuracy: 0.9463 - val_loss: 0.8581
Epoch 5/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 702us/step - accuracy: 0.8806 - loss: 1.2049 - val_accuracy: 0.9226 - val_loss: 0.8549
Epoch 6/20
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 723us/step - accuracy: 0.8697 - loss: 0.9592 - val_accuracy: 0.8499 - val_loss: 0.8526
Epoch 7/20
[1m80/80[0m [32m

In [62]:
from sklearn.metrics import classification_report

# Validation report
val_preds = model.predict(X_val_avg)
val_preds_labels = np.argmax(val_preds, axis=1)
print("MLP on Averaged Embeddings Report:\n")
print(classification_report(y_val_enc, val_preds_labels, target_names=le.classes_))

# Test prediction
test_preds = model.predict(X_test_avg)
test_df['category'] = le.inverse_transform(np.argmax(test_preds, axis=1))

# Save CSV
test_df.to_csv("test_predictions_mlp_avg.csv", index=False)

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 946us/step
MLP on Averaged Embeddings Report:

                        precision    recall  f1-score   support

            Homophobia       0.11      0.18      0.13        34
Non-anti-LGBT+ content       0.95      0.89      0.92       599
           Transphobia       0.00      0.00      0.00         0

              accuracy                           0.85       633
             macro avg       0.35      0.35      0.35       633
          weighted avg       0.90      0.85      0.88       633

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 432us/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
