In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import joblib

In [3]:
# Preprocess function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # remove links
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # remove punctuation
    text = re.sub(r"\d+", "", text)  # remove numbers
    text = re.sub(r"\s+", " ", text).strip()  # remove extra whitespace
    return text

In [4]:
# Load data
train_df = pd.read_csv('/Users/arunaa/codalab/Homophobia-Transphobia/Task-1/Tamil/HT_Tam_3_train.csv')
test_df = pd.read_csv('/Users/arunaa/codalab/Homophobia-Transphobia/Task-1/Tamil/HT_Tam_3_test_without_labels.csv')

# Preprocess
train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

# Shuffle and split
train_df = shuffle(train_df, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'], train_df['category'], test_size=0.2, random_state=42
)

In [5]:
# Vectorize text
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(test_df['text'])

### Bidirectional LSTM with Class Weights

In [7]:
# Step 1: Encode labels and compute class weights
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train_enc),
                                     y=y_train_enc)
class_weight_dict = dict(enumerate(class_weights))

In [8]:
# Step 2: Tokenization and Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df['text']), maxlen=max_len)


In [9]:
# Step 3: Build BiLSTM Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [10]:
# Step 4: Train the Model
history = model.fit(
    X_train_seq, y_train_enc,
    validation_data=(X_val_seq, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict
)

Epoch 1/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.4499 - loss: 1.0931 - val_accuracy: 0.3114 - val_loss: 1.0905
Epoch 2/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - accuracy: 0.5311 - loss: 0.9879 - val_accuracy: 0.4972 - val_loss: 1.0624
Epoch 3/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step - accuracy: 0.8068 - loss: 0.6011 - val_accuracy: 0.7917 - val_loss: 0.5992
Epoch 4/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - accuracy: 0.9581 - loss: 0.1683 - val_accuracy: 0.8668 - val_loss: 0.4366
Epoch 5/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.9963 - loss: 0.0510 - val_accuracy: 0.8499 - val_loss: 0.4881
Epoch 6/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.9921 - loss: 0.0431 - val_accuracy: 0.8537 - val_loss: 0.6242
Epoch 7/20
[1m67/67[0m [32m━━━━

In [11]:
# Step 5: Evaluate and Predict
from sklearn.metrics import classification_report

# Predict and decode
y_val_pred = model.predict(X_val_seq)
y_val_labels = np.argmax(y_val_pred, axis=1)
print("Classification Report for BiLSTM:\n")
print(classification_report(y_val_enc, y_val_labels, target_names=le.classes_))

# Predict on test
y_test_pred = model.predict(X_test_seq)
test_df['category'] = le.inverse_transform(np.argmax(y_test_pred, axis=1))

# Save to CSV
test_df.to_csv("test_predictions_bilstm.csv", index=False)

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Classification Report for BiLSTM:

                        precision    recall  f1-score   support

            Homophobia       0.64      0.82      0.72        87
Non-anti-LGBT+ content       0.96      0.87      0.91       416
           Transphobia       0.48      0.70      0.57        30

              accuracy                           0.85       533
             macro avg       0.69      0.80      0.73       533
          weighted avg       0.88      0.85      0.86       533

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


### Text CNN

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Tokenize and pad
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df['text']), maxlen=max_len)

# Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_enc), y=y_train_enc)
class_weight_dict = dict(enumerate(class_weights))

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Conv1D(128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [15]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_seq, y_train_enc,
    validation_data=(X_val_seq, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)

Epoch 1/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.5832 - loss: 1.0519 - val_accuracy: 0.2589 - val_loss: 1.1087
Epoch 2/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.4144 - loss: 1.1037 - val_accuracy: 0.7542 - val_loss: 0.9047
Epoch 3/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8229 - loss: 0.7818 - val_accuracy: 0.7505 - val_loss: 0.7049
Epoch 4/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9594 - loss: 0.2851 - val_accuracy: 0.8068 - val_loss: 0.4999
Epoch 5/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9835 - loss: 0.0812 - val_accuracy: 0.8537 - val_loss: 0.3646
Epoch 6/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9941 - loss: 0.0328 - val_accuracy: 0.8368 - val_loss: 0.4381
Epoch 7/20
[1m67/67[0m [32m━━━━

In [16]:
from sklearn.metrics import classification_report

# Evaluate on validation set
val_preds = model.predict(X_val_seq)
val_preds_labels = np.argmax(val_preds, axis=1)

print("Text CNN Classification Report:\n")
print(classification_report(y_val_enc, val_preds_labels, target_names=le.classes_))

# Predict on test set
test_preds = model.predict(X_test_seq)
test_df['category'] = le.inverse_transform(np.argmax(test_preds, axis=1))

# Save CSV
test_df.to_csv("test_predictions_text_cnn.csv", index=False)

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Text CNN Classification Report:

                        precision    recall  f1-score   support

            Homophobia       0.63      0.83      0.71        87
Non-anti-LGBT+ content       0.96      0.87      0.91       416
           Transphobia       0.55      0.77      0.64        30

              accuracy                           0.85       533
             macro avg       0.71      0.82      0.75       533
          weighted avg       0.88      0.85      0.86       533

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


### GRU-Based Model (with Dropout + Class Weights)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    GRU(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [19]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_seq, y_train_enc,
    validation_data=(X_val_seq, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)

Epoch 1/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.4012 - loss: 1.0919 - val_accuracy: 0.6660 - val_loss: 1.0703
Epoch 2/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.5484 - loss: 1.0245 - val_accuracy: 0.8218 - val_loss: 0.7615
Epoch 3/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.8412 - loss: 0.5285 - val_accuracy: 0.7186 - val_loss: 0.9471
Epoch 4/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9718 - loss: 0.1052 - val_accuracy: 0.7917 - val_loss: 0.7227
Epoch 5/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.9925 - loss: 0.0315 - val_accuracy: 0.8368 - val_loss: 0.7056
Epoch 6/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9981 - loss: 0.0102 - val_accuracy: 0.8762 - val_loss: 0.5246
Epoch 7/20
[1m67/67[0m [32m━━━━

In [20]:
from sklearn.metrics import classification_report

# Evaluate on validation set
val_preds = model.predict(X_val_seq)
val_preds_labels = np.argmax(val_preds, axis=1)

print("GRU Model Classification Report:\n")
print(classification_report(y_val_enc, val_preds_labels, target_names=le.classes_))

# Predict on test set
test_preds = model.predict(X_test_seq)
test_df['category'] = le.inverse_transform(np.argmax(test_preds, axis=1))

# Save to CSV
test_df.to_csv("test_predictions_gru.csv", index=False)

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
GRU Model Classification Report:

                        precision    recall  f1-score   support

            Homophobia       0.69      0.85      0.76        87
Non-anti-LGBT+ content       0.96      0.89      0.93       416
           Transphobia       0.54      0.70      0.61        30

              accuracy                           0.88       533
             macro avg       0.73      0.81      0.77       533
          weighted avg       0.89      0.88      0.88       533

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


### MLP (Multi-Layer Perceptron) on Averaged Word Embeddings

In [22]:
# Compute averaged embeddings for each sequence
def average_embedding(sequences, embedding_matrix):
    avg_embeddings = []
    for seq in sequences:
        vecs = [embedding_matrix[idx] for idx in seq if idx != 0 and idx < len(embedding_matrix)]
        if vecs:
            avg_embeddings.append(np.mean(vecs, axis=0))
        else:
            avg_embeddings.append(np.zeros(embedding_matrix.shape[1]))
    return np.array(avg_embeddings)

# Build embedding matrix (random for now; replace with GloVe if available)
embedding_dim = 128
embedding_matrix = np.random.uniform(-0.05, 0.05, (max_words, embedding_dim))

X_train_avg = average_embedding(X_train_seq, embedding_matrix)
X_val_avg = average_embedding(X_val_seq, embedding_matrix)
X_test_avg = average_embedding(X_test_seq, embedding_matrix)

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([
    Dense(128, input_shape=(embedding_dim,), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [24]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_avg, y_train_enc,
    validation_data=(X_val_avg, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)

Epoch 1/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4687 - loss: 1.1105 - val_accuracy: 0.7561 - val_loss: 1.0819
Epoch 2/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 758us/step - accuracy: 0.6542 - loss: 1.0398 - val_accuracy: 0.5066 - val_loss: 1.0856
Epoch 3/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 745us/step - accuracy: 0.5803 - loss: 1.0323 - val_accuracy: 0.3621 - val_loss: 1.0900
Epoch 4/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 752us/step - accuracy: 0.2816 - loss: 1.1544 - val_accuracy: 0.7129 - val_loss: 1.0209
Epoch 5/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 742us/step - accuracy: 0.6732 - loss: 1.0822 - val_accuracy: 0.4991 - val_loss: 1.0472
Epoch 6/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 734us/step - accuracy: 0.6161 - loss: 1.0217 - val_accuracy: 0.4409 - val_loss: 1.0379
Epoch 7/20
[1m67/67[0m [32m

In [25]:
from sklearn.metrics import classification_report

# Validation report
val_preds = model.predict(X_val_avg)
val_preds_labels = np.argmax(val_preds, axis=1)
print("MLP on Averaged Embeddings Report:\n")
print(classification_report(y_val_enc, val_preds_labels, target_names=le.classes_))

# Test prediction
test_preds = model.predict(X_test_avg)
test_df['category'] = le.inverse_transform(np.argmax(test_preds, axis=1))

# Save CSV
test_df.to_csv("test_predictions_mlp_avg.csv", index=False)

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
MLP on Averaged Embeddings Report:

                        precision    recall  f1-score   support

            Homophobia       0.41      0.08      0.13        87
Non-anti-LGBT+ content       0.78      0.89      0.83       416
           Transphobia       0.09      0.13      0.11        30

              accuracy                           0.71       533
             macro avg       0.43      0.37      0.36       533
          weighted avg       0.68      0.71      0.68       533

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 406us/step
