In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import joblib

In [2]:
# Preprocess function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # remove links
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # remove punctuation
    text = re.sub(r"\d+", "", text)  # remove numbers
    text = re.sub(r"\s+", " ", text).strip()  # remove extra whitespace
    return text

In [27]:
# Load data
train_df = pd.read_csv('/Users/arunaa/codalab/Homophobia-Transphobia/Marathi/HT_Marathi_3_train.csv')
test_df = pd.read_csv('/Users/arunaa/codalab/Homophobia-Transphobia/Marathi/HT_Marathi_3_test_without_labels.csv')

# Preprocess
train_df['Text'] = train_df['Text'].apply(clean_text)
test_df['Text'] = test_df['Text'].apply(clean_text)

# Shuffle and split
train_df = shuffle(train_df, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    train_df['Text'], train_df['Category '], test_size=0.2, random_state=42
)

In [31]:
# Vectorize text
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(test_df['Text'])

### Bidirectional LSTM with Class Weights

In [33]:
# Step 1: Encode labels and compute class weights
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train_enc),
                                     y=y_train_enc)
class_weight_dict = dict(enumerate(class_weights))

In [37]:
# Step 2: Tokenization and Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df['Text']), maxlen=max_len)


In [39]:
# Step 3: Build BiLSTM Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [41]:
# Step 4: Train the Model
history = model.fit(
    X_train_seq, y_train_enc,
    validation_data=(X_val_seq, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict
)

Epoch 1/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.3099 - loss: 1.1113 - val_accuracy: 0.3243 - val_loss: 1.0915
Epoch 2/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - accuracy: 0.5584 - loss: 1.0379 - val_accuracy: 0.5729 - val_loss: 0.9254
Epoch 3/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - accuracy: 0.7528 - loss: 0.7349 - val_accuracy: 0.5600 - val_loss: 1.0473
Epoch 4/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.8961 - loss: 0.3500 - val_accuracy: 0.7143 - val_loss: 0.8809
Epoch 5/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 52ms/step - accuracy: 0.9637 - loss: 0.1361 - val_accuracy: 0.7314 - val_loss: 1.1357
Epoch 6/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 52ms/step - accuracy: 0.9789 - loss: 0.0631 - val_accuracy: 0.7200 - val_loss: 1.1782
Epoch 7/20
[1m88/88[0m [32m━━━━

In [43]:
# Step 5: Evaluate and Predict
from sklearn.metrics import classification_report

# Predict and decode
y_val_pred = model.predict(X_val_seq)
y_val_labels = np.argmax(y_val_pred, axis=1)
print("Classification Report for BiLSTM:\n")
print(classification_report(y_val_enc, y_val_labels, target_names=le.classes_))

# Predict on test
y_test_pred = model.predict(X_test_seq)
test_df['category'] = le.inverse_transform(np.argmax(y_test_pred, axis=1))

# Save to CSV
test_df.to_csv("test_predictions_bilstm.csv", index=False)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Classification Report for BiLSTM:

                        precision    recall  f1-score   support

            Homophobia       0.45      0.37      0.40       114
None of the categories       0.82      0.87      0.85       516
           Transphobia       0.35      0.29      0.31        70

              accuracy                           0.73       700
             macro avg       0.54      0.51      0.52       700
          weighted avg       0.71      0.73      0.72       700

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


### Text CNN

In [47]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Tokenize and pad
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df['Text']), maxlen=max_len)

# Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_enc), y=y_train_enc)
class_weight_dict = dict(enumerate(class_weights))

In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Conv1D(128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [51]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_seq, y_train_enc,
    validation_data=(X_val_seq, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)

Epoch 1/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.3344 - loss: 1.1292 - val_accuracy: 0.4300 - val_loss: 1.0823
Epoch 2/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5070 - loss: 1.0696 - val_accuracy: 0.5671 - val_loss: 0.9049
Epoch 3/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7339 - loss: 0.8303 - val_accuracy: 0.7243 - val_loss: 0.6683
Epoch 4/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9198 - loss: 0.3781 - val_accuracy: 0.6329 - val_loss: 0.9678
Epoch 5/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9546 - loss: 0.1517 - val_accuracy: 0.7214 - val_loss: 0.8631
Epoch 6/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9738 - loss: 0.0881 - val_accuracy: 0.7300 - val_loss: 0.9538


In [52]:
from sklearn.metrics import classification_report

# Evaluate on validation set
val_preds = model.predict(X_val_seq)
val_preds_labels = np.argmax(val_preds, axis=1)

print("Text CNN Classification Report:\n")
print(classification_report(y_val_enc, val_preds_labels, target_names=le.classes_))

# Predict on test set
test_preds = model.predict(X_test_seq)
test_df['category'] = le.inverse_transform(np.argmax(test_preds, axis=1))

# Save CSV
test_df.to_csv("test_predictions_text_cnn.csv", index=False)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Text CNN Classification Report:

                        precision    recall  f1-score   support

            Homophobia       0.41      0.58      0.48       114
None of the categories       0.87      0.82      0.84       516
           Transphobia       0.37      0.29      0.32        70

              accuracy                           0.72       700
             macro avg       0.55      0.56      0.55       700
          weighted avg       0.74      0.72      0.73       700

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


### GRU-Based Model (with Dropout + Class Weights)

In [55]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    GRU(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [57]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_seq, y_train_enc,
    validation_data=(X_val_seq, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)

Epoch 1/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.4212 - loss: 1.0988 - val_accuracy: 0.5886 - val_loss: 1.0589
Epoch 2/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.6538 - loss: 1.0114 - val_accuracy: 0.6157 - val_loss: 0.8469
Epoch 3/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.8166 - loss: 0.6770 - val_accuracy: 0.6757 - val_loss: 0.8868
Epoch 4/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.9480 - loss: 0.2828 - val_accuracy: 0.6814 - val_loss: 0.9934
Epoch 5/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.9719 - loss: 0.0929 - val_accuracy: 0.6800 - val_loss: 1.2034


In [58]:
from sklearn.metrics import classification_report

# Evaluate on validation set
val_preds = model.predict(X_val_seq)
val_preds_labels = np.argmax(val_preds, axis=1)

print("GRU Model Classification Report:\n")
print(classification_report(y_val_enc, val_preds_labels, target_names=le.classes_))

# Predict on test set
test_preds = model.predict(X_test_seq)
test_df['category'] = le.inverse_transform(np.argmax(test_preds, axis=1))

# Save to CSV
test_df.to_csv("test_predictions_gru.csv", index=False)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
GRU Model Classification Report:

                        precision    recall  f1-score   support

            Homophobia       0.35      0.32      0.33       114
None of the categories       0.87      0.70      0.78       516
           Transphobia       0.18      0.46      0.26        70

              accuracy                           0.62       700
             macro avg       0.47      0.49      0.46       700
          weighted avg       0.71      0.62      0.65       700

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


### MLP (Multi-Layer Perceptron) on Averaged Word Embeddings

In [61]:
# Compute averaged embeddings for each sequence
def average_embedding(sequences, embedding_matrix):
    avg_embeddings = []
    for seq in sequences:
        vecs = [embedding_matrix[idx] for idx in seq if idx != 0 and idx < len(embedding_matrix)]
        if vecs:
            avg_embeddings.append(np.mean(vecs, axis=0))
        else:
            avg_embeddings.append(np.zeros(embedding_matrix.shape[1]))
    return np.array(avg_embeddings)

# Build embedding matrix (random for now; replace with GloVe if available)
embedding_dim = 128
embedding_matrix = np.random.uniform(-0.05, 0.05, (max_words, embedding_dim))

X_train_avg = average_embedding(X_train_seq, embedding_matrix)
X_val_avg = average_embedding(X_val_seq, embedding_matrix)
X_test_avg = average_embedding(X_test_seq, embedding_matrix)

In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([
    Dense(128, input_shape=(embedding_dim,), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [65]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_avg, y_train_enc,
    validation_data=(X_val_avg, y_val_enc),
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)

Epoch 1/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.4118 - loss: 1.1026 - val_accuracy: 0.1857 - val_loss: 1.1041
Epoch 2/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 676us/step - accuracy: 0.2748 - loss: 1.0977 - val_accuracy: 0.2100 - val_loss: 1.1213
Epoch 3/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 696us/step - accuracy: 0.3199 - loss: 1.1105 - val_accuracy: 0.3257 - val_loss: 1.1022
Epoch 4/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 662us/step - accuracy: 0.4348 - loss: 1.0867 - val_accuracy: 0.3000 - val_loss: 1.1263
Epoch 5/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4555 - loss: 1.0728 - val_accuracy: 0.4443 - val_loss: 1.0716
Epoch 6/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 696us/step - accuracy: 0.4939 - loss: 1.0775 - val_accuracy: 0.3486 - val_loss: 1.1304
Epoch 7/20
[1m88/88[0m [32m━━

In [67]:
from sklearn.metrics import classification_report

# Validation report
val_preds = model.predict(X_val_avg)
val_preds_labels = np.argmax(val_preds, axis=1)
print("MLP on Averaged Embeddings Report:\n")
print(classification_report(y_val_enc, val_preds_labels, target_names=le.classes_))

# Test prediction
test_preds = model.predict(X_test_avg)
test_df['category'] = le.inverse_transform(np.argmax(test_preds, axis=1))

# Save CSV
test_df.to_csv("test_predictions_mlp_avg.csv", index=False)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
MLP on Averaged Embeddings Report:

                        precision    recall  f1-score   support

            Homophobia       0.27      0.22      0.24       114
None of the categories       0.88      0.50      0.64       516
           Transphobia       0.13      0.59      0.21        70

              accuracy                           0.47       700
             macro avg       0.43      0.44      0.37       700
          weighted avg       0.70      0.47      0.53       700

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 468us/step
