**SENTIMENT**

In [None]:
# =====================
# SETUP & INSTALLS
# =====================
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import re
import unicodedata
import json
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

%pip install -q langdetect indic-nlp-library indic-transliteration googletrans==4.0.0-rc1 lime eli5

# =====================
# LOAD TRAIN & TEST DATA
# =====================
# Keep only the columns this cell needs, then discard rows with missing
# values and exact duplicate rows.
# NOTE: test rows removed here never reach the predictions file written at
# the end of this cell, because that file is built from test_df['Ids'].
train_df = (
    pd.read_csv('Hindi Train HASOC 2025 - Updated.csv')[['OCR', 'Sentiment']]
    .dropna()
    .drop_duplicates()
)
test_df = (
    pd.read_csv('Hindi Test HASOC 2025 - Updated.csv')[['OCR', 'Ids']]
    .dropna()
    .drop_duplicates()
)

# =====================
# CLEANING & VULGAR REPLACEMENT
# =====================
def clean_text(text):
    """Return *text* reduced to lower-cased letters separated by single spaces.

    URLs (``http...`` / ``www...``) and e-mail-like tokens are removed first.
    After that every character that is not an ASCII letter, a Devanagari code
    point (U+0900-U+097F) or whitespace is dropped, whitespace runs collapse
    to one space, and the result is stripped and lower-cased.
    """
    without_links = re.sub(r'http\S+|www\S+|[\w\.-]+@[\w\.-]+', '', text)
    letters_only = re.sub(r'[^A-Za-z\u0900-\u097F\s]+', '', without_links)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip().lower()

HINDI_OFFENSIVE_WORDS_FILE = "hindi-offensive-words-original.json"
with open(HINDI_OFFENSIVE_WORDS_FILE, "rb") as f:
    vulgar_dict = json.load(f)

# Re-key the mapping longest phrase first, so replace_vulgar_phrases()
# substitutes a long phrase before any shorter phrase contained in it.
sorted_vulgar_dict = {
    phrase: vulgar_dict[phrase]
    for phrase in sorted(vulgar_dict, key=len, reverse=True)
}

def replace_vulgar_phrases(text):
    """Substitute every phrase listed in ``sorted_vulgar_dict`` inside *text*.

    Phrases are applied in the dictionary's iteration order (longest first)
    using plain substring replacement, so matches inside longer words are
    replaced as well.
    """
    rewritten = text
    for bad_phrase in sorted_vulgar_dict:
        rewritten = rewritten.replace(bad_phrase, sorted_vulgar_dict[bad_phrase])
    return rewritten

# Both splits go through the same two steps: character-level cleaning first,
# then offensive-phrase substitution.
for _split in (train_df, test_df):
    _split['OCR'] = _split['OCR'].apply(clean_text).apply(replace_vulgar_phrases)

# =====================
# TOKENIZATION, STOPWORDS, STEMMING
# =====================
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from indicnlp.tokenize import indic_tokenize
from langdetect import detect, DetectorFactory
from nltk.stem import PorterStemmer, WordNetLemmatizer

# NLTK resources used by word_tokenize, stopwords and WordNetLemmatizer.
for _resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_resource)

# Pin langdetect's seed so repeated runs of detect() use the same seed.
DetectorFactory.seed = 0

HINDI_STOPWORDS_FILE = "hindi_stopwords.json"
with open(HINDI_STOPWORDS_FILE, 'rb') as f:
    hindi_hinglish_stopwords = json.load(f)

# One combined stop list: NLTK English + the file's 'hindi' and 'hinglish' lists.
english_stopwords = set(stopwords.words('english'))
all_stopwords = english_stopwords.union(
    hindi_hinglish_stopwords['hindi'],
    hindi_hinglish_stopwords['hinglish'],
)

def detect_language(text):
    """Return langdetect's language code for *text*, or ``"other"`` on failure.

    Detection is best-effort: any ordinary exception raised by ``detect``
    (for example on text with no usable features) maps to ``"other"``.
    ``except Exception`` is used instead of a bare ``except`` so that
    KeyboardInterrupt / SystemExit still propagate and a long ``.apply`` run
    can be interrupted.
    """
    try:
        return detect(text)
    except Exception:
        return "other"

def tokenize_and_remove_stopwords_mix(text):
    """Tokenize *text* with a language-appropriate tokenizer and drop stopwords.

    Text detected as English (``'en'``) goes through NLTK's ``word_tokenize``;
    everything else goes through ``indic_tokenize.trivial_tokenize``. Tokens
    found in ``all_stopwords`` are removed; the rest are returned as a list.
    """
    if detect_language(text) == 'en':
        raw_tokens = word_tokenize(text)
    else:
        raw_tokens = indic_tokenize.trivial_tokenize(text)
    return list(filter(lambda tok: tok not in all_stopwords, raw_tokens))

# Shared NLTK helpers: process_tokens() calls stemmer.stem() and
# lemmatize_token() calls lemmatizer.lemmatize().
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def roman_hindi_light_stem(token):
    """Strip one common romanised-Hindi ending from *token*, if long enough.

    Endings are tried longest first. An ending is removed only when more than
    two characters would remain. At most one ending is removed; a token that
    matches nothing is returned unchanged.
    """
    # Already ordered by decreasing length.
    endings = ('liya', 'kiya', 'tha', 'thi', 'kar',
               'na', 'ne', 'ti', 'ta', 'ke', 'se', 'me', 'ho')
    for ending in endings:
        remaining = len(token) - len(ending)
        if remaining > 2 and token.endswith(ending):
            return token[:remaining]
    return token

def normalize(token):
    """Return *token* stripped, lower-cased and in Unicode NFC form."""
    trimmed_lower = token.strip().lower()
    return unicodedata.normalize("NFC", trimmed_lower)

def lemmatize_token(token):
    """Lemmatize *token* as a verb, falling back to the default POS.

    If the verb lemma differs from the input it is returned. Otherwise the
    result of ``lemmatizer.lemmatize(token)`` with its default part of speech
    is returned.
    """
    as_verb = lemmatizer.lemmatize(token, pos='v')
    if as_verb == token:
        return lemmatizer.lemmatize(token)
    return as_verb

def process_tokens(tokens):
    """Normalize each token and apply a script-dependent stem/lemma step.

    * Pure-ASCII tokens (English and romanised Hindi alike) are Porter-stemmed
      and then passed to ``lemmatize_token``.
    * Tokens containing at least one Devanagari code point are kept as-is
      after normalization.
    * Any other token goes through ``roman_hindi_light_stem``.

    Returns a new list with one processed token per input token.
    """
    processed = []
    for token in tokens:
        norm = normalize(token)
        if norm.isascii():
            processed.append(lemmatize_token(stemmer.stem(norm)))
        # Test against the whole Devanagari block. The previous bounds
        # ('ा' U+093E .. 'ह' U+0939) were reversed, so the range was empty
        # and this branch could never be taken.
        elif any('\u0900' <= ch <= '\u097F' for ch in norm):
            processed.append(norm)
        else:
            processed.append(roman_hindi_light_stem(norm))
    return processed

# Token lists -> processed token lists -> one space-joined string per row.
tokenized_df = train_df['OCR'].astype(str).apply(tokenize_and_remove_stopwords_mix)
norm_df = pd.DataFrame(tokenized_df.apply(process_tokens), columns=['OCR'])
norm_df['OCR'] = norm_df['OCR'].apply(' '.join)

# =====================
# LABEL ENCODING
# =====================
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(train_df['Sentiment'])

# =====================
# OVERSAMPLING
# =====================
# norm_df still carries train_df's gapped index (rows were dropped earlier),
# so it is reset to line up positionally with the freshly built label Series.
df = pd.concat(
    [norm_df['OCR'].reset_index(drop=True), pd.Series(labels, name='label')],
    axis=1,
)
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(df[['OCR']], df['label'])

# =====================
# TF-IDF + RANDOM FOREST
# =====================
# Word 1- to 5-gram TF-IDF features over the oversampled text.
tfidf = TfidfVectorizer(ngram_range=(1, 5))
X_tfidf = tfidf.fit_transform(X_ros['OCR'])

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_tfidf, y_ros)

# Persist everything needed to reproduce predictions later.
os.makedirs('saved_models', exist_ok=True)
for _artifact, _path in (
    (clf, 'saved_models/rf_model.pkl'),
    (tfidf, 'saved_models/tfidf_vectorizer.pkl'),
    (label_encoder, 'saved_models/label_encoder.pkl'),
):
    joblib.dump(_artifact, _path)

# =====================
# PROCESS TEST DATA
# =====================
# Same token pipeline as the training split.
tokenized_test_df = test_df['OCR'].astype(str).apply(tokenize_and_remove_stopwords_mix)
norm_test_df = pd.DataFrame(tokenized_test_df.apply(process_tokens), columns=['OCR'])
norm_test_df['OCR'] = norm_test_df['OCR'].apply(' '.join)

# Transform & Predict, using the artifacts as they were written to disk.
vectorizer = joblib.load('saved_models/tfidf_vectorizer.pkl')
model = joblib.load('saved_models/rf_model.pkl')
label_encoder = joblib.load('saved_models/label_encoder.pkl')

X_test_features = vectorizer.transform(norm_test_df['OCR'])
y_pred = model.predict(X_test_features)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Save Predictions: this creates the CSV that the later cells extend.
predictions_df = pd.DataFrame({'Ids': test_df['Ids'], 'Sentiment': y_pred_labels})
predictions_df.to_csv('FAST NUCES HASOC 2025 Predictions.csv', index=False)

print("Initial predictions (Sentiment) saved to 'FAST NUCES HASOC 2025 Predictions.csv'")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/981.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Initial predictions (Sentiment) saved to 'FAST NUCES HASOC 2025 Predictions.csv'


**SARCASM**

In [None]:
%pip install pandas numpy scikit-learn tensorflow

import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# --- Preprocessing ---
def clean_text(text):
    """Lower-case *text* and drop everything except letters, digits and spaces.

    Kept characters: ASCII letters and digits, Devanagari code points
    (U+0900-U+097F) and whitespace. The previous pattern kept ASCII only, so
    every Hindi-script word in the OCR text was deleted before tokenization;
    the other cleaners in this notebook preserve the Devanagari block.

    Non-string input (e.g. NaN from pandas) is converted with ``str()`` first.
    """
    text = str(text).lower()
    return re.sub(r'[^a-zA-Z0-9\u0900-\u097F\s]', '', text)

# --- Load training data ---
df = pd.read_csv("Hindi Train HASOC 2025 - Updated.csv")
df['OCR'] = df['OCR'].map(clean_text)

X_full, y_full = df['OCR'].values, df['Sarcasm'].values

# String labels are converted to integer codes; numeric labels pass through.
if df['Sarcasm'].dtype == 'O':
    le = LabelEncoder()
    y_full = le.fit_transform(y_full)

# --- Tokenization ---
# Vocabulary is capped at max_words; every text is padded/truncated at the
# end to exactly max_len token ids.
max_words, max_len = 10000, 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_full)
X_full_seq = tokenizer.texts_to_sequences(X_full)
X_full_pad = pad_sequences(
    X_full_seq,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

# --- CNN Model ---
# 1-D CNN text classifier: embedding -> one convolution -> global max pool
# -> dropout -> dense -> single sigmoid unit (binary output).
model = Sequential()
# NOTE(review): recent Keras releases reportedly warn that `input_length` is
# deprecated - confirm against the installed version.
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# --- Train on full training data ---
# No validation split is held out, so the accuracy printed per epoch is
# training accuracy only.
model.fit(X_full_pad, y_full, epochs=10, batch_size=32)

# --- Load and preprocess test data ---
# The test CSV is re-read in full here because the 'Ids' column is needed
# for the merge further down.
test_df = pd.read_csv("Hindi Test HASOC 2025 - Updated.csv")
test_df['OCR'] = test_df['OCR'].map(clean_text)
X_test = test_df['OCR'].values
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(
    X_test_seq,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

# --- Predict ---
# Sigmoid outputs above 0.5 count as the positive class.
test_preds_probs = model.predict(X_test_pad)
test_preds = (test_preds_probs > 0.5).astype(int).flatten()

# --- Convert predictions to labels ---
# NOTE(review): assumes encoded class 1 means "Sarcastic" - confirm against
# the label values in the training CSV.
label_map = {0: "Non-Sarcastic", 1: "Sarcastic"}
test_preds_labels = [label_map[class_id] for class_id in test_preds]

# --- Load existing predictions CSV and merge ---
predictions_df = pd.read_csv('FAST NUCES HASOC 2025 Predictions.csv')
# If this cell is re-run, the CSV already has a 'Sarcasm' column and the merge
# would emit 'Sarcasm_x' / 'Sarcasm_y' instead of replacing it. Dropping the
# stale column first makes the cell idempotent; on a first run nothing is
# dropped.
predictions_df = predictions_df.drop(columns=['Sarcasm'], errors='ignore')
sarcasm_predictions_df = pd.DataFrame({
    'Ids': test_df['Ids'],
    'Sarcasm': test_preds_labels
})
# Left join: the row set is whatever the Sentiment cell wrote to the CSV.
predictions_df = pd.merge(predictions_df, sarcasm_predictions_df, on='Ids', how='left')

# --- Save updated predictions ---
predictions_df.to_csv("FAST NUCES HASOC 2025 Predictions.csv", index=False)

print("✅ Sarcasm predictions ('Sarcastic'/'Non-Sarcastic') added to 'FAST NUCES HASOC 2025 Predictions.csv'")


Epoch 1/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 120ms/step - accuracy: 0.6181 - loss: 0.6581
Epoch 2/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 107ms/step - accuracy: 0.6688 - loss: 0.6268
Epoch 3/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 126ms/step - accuracy: 0.6852 - loss: 0.5641
Epoch 4/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 93ms/step - accuracy: 0.8418 - loss: 0.3896
Epoch 5/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 161ms/step - accuracy: 0.9294 - loss: 0.1860
Epoch 6/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 53ms/step - accuracy: 0.9128 - loss: 0.1624
Epoch 7/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 54ms/step - accuracy: 0.9416 - loss: 0.1113
Epoch 8/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.9451 - loss: 0.1262
Epoch 9/10
[1m36/36[0m [32m━━━━━━━━━━━━━

**VULGAR**

In [4]:
# === INSTALL DEPENDENCIES ===
%pip install pandas numpy scikit-learn tqdm matplotlib seaborn tensorflow keras opencv-python pillow indic-transliteration regex

# === MOUNT DRIVE ===
# Colab-only: IMAGE_DIR and TEST_IMAGE_DIR below point into /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# === IMPORT LIBRARIES ===
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from indic_transliteration.sanscript import SCHEMES, transliterate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# === CONFIG ===
# Side length (pixels) used for both the generator target size and the
# ResNet50 input shape.
IMG_SIZE = 224
# Batch size for the image generators.
BATCH_SIZE = 32
# Maximum epochs for the image model (early stopping may end sooner).
EPOCHS = 10
# Weights of the text and image probabilities in the final blend.
LINGUAL_WEIGHT = 0.5
VISUAL_WEIGHT = 0.5
# Blended probability at or above this value is labelled 'Vulgar'.
THRESHOLD = 0.5
# Training CSV and the Drive folder holding the training images.
CSV_PATH = "Hindi Train HASOC 2025 - Updated.csv"
IMAGE_DIR = "/content/drive/MyDrive/HASOC Abusive&Sarcasm/Hindi Training Images"
# Test CSV and the Drive folder holding the test images.
TEST_CSV = "Hindi Test HASOC 2025 - Updated.csv"
TEST_IMAGE_DIR = "/content/drive/MyDrive/HASOC Abusive&Sarcasm/Hindi Testing Images"

# === LOAD TRAIN DATA ===
df = pd.read_csv(CSV_PATH).dropna(subset=['OCR', 'Vulgar', 'Ids'])
# The label is stored as a string because the image generator is used with
# class_mode='binary'; the text model casts it back to int.
df['label'] = df['Vulgar'].map({'Non Vulgar': 0, 'Vulgar': 1}).astype(str)
df['path'] = [os.path.join(IMAGE_DIR, image_id) for image_id in df['Ids']]
# Keep only the rows whose image file is actually present.
image_found = df['path'].map(os.path.exists)
df = df[image_found]

# === TEXT PREPROCESSING ===
def convert_to_roman(text):
    """Transliterate Devanagari in *text* to IAST; return *text* on failure.

    ``transliterate`` takes the source and target schemes by name (the keys
    of ``SCHEMES``). The previous call passed the ``SCHEMES[...]`` objects
    themselves. NOTE(review): per the library's documented usage that lookup
    fails, and the bare ``except`` hid the error, so the text came back
    untouched - confirm against the installed indic-transliteration version.

    The best-effort fallback is kept, but narrowed to ``Exception`` so that
    KeyboardInterrupt / SystemExit are no longer swallowed.
    """
    try:
        return transliterate(text, 'devanagari', 'iast')
    except Exception:
        return text

def clean_text(text):
    """Lower-case *text*, then remove punctuation and digits.

    Two passes run in order: characters that are neither word characters nor
    whitespace are removed, then runs of digits. Leading and trailing
    whitespace is stripped; inner whitespace is left as-is. Non-string input
    is converted with ``str()`` first.
    """
    cleaned = str(text).lower()
    for pattern in (r'[^\w\s]', r'\d+'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

# Romanise first, then clean: the same order is applied to the test split.
df['OCR_clean'] = df['OCR'].map(lambda raw: clean_text(convert_to_roman(raw)))

# === TF-IDF + RANDOM FOREST ===
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(df['OCR_clean'])

# 'label' holds the strings '0'/'1'; cast back to int for the classifier.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_text, df['label'].astype(int))

# === RESNET50 ===
# Training images are read from df['path'] and passed through ResNet50's own
# preprocess_input; labels come from the string column 'label'.
datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
full_gen = datagen.flow_from_dataframe(
    df, x_col='path', y_col='label', target_size=(IMG_SIZE, IMG_SIZE),
    class_mode='binary', batch_size=BATCH_SIZE, shuffle=True
)

# ImageNet-pretrained ResNet50 without its classification head, followed by
# a small trainable head ending in one sigmoid unit.
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(128, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)
cnn_model = Model(inputs=base_model.input, outputs=output)

# Freeze ResNet base so only the new head is trained.
for layer in base_model.layers:
    layer.trainable = False

cnn_model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
# Early stopping watches the training loss; no validation data is passed to fit().
early_stop = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

cnn_model.fit(full_gen, epochs=EPOCHS, callbacks=[early_stop])

# === LOAD TEST DATA ===
# Test rows need OCR text, an id, and an image file that is actually present.
test_df = pd.read_csv(TEST_CSV).dropna(subset=['OCR', 'Ids'])
test_df['path'] = [os.path.join(TEST_IMAGE_DIR, image_id) for image_id in test_df['Ids']]
test_image_found = test_df['path'].map(os.path.exists)
test_df = test_df[test_image_found]
test_df['OCR_clean'] = test_df['OCR'].map(lambda raw: clean_text(convert_to_roman(raw)))

# === LINGUAL PREDICTIONS ===
# Text side: column 1 of predict_proba is taken as the probability of
# label 1, since the forest was fit on the 0/1 integer labels.
X_test_text = tfidf.transform(test_df['OCR_clean'])
test_lingual_probs = rf.predict_proba(X_test_text)[:, 1]

# === VISUAL PREDICTIONS ===
# shuffle=False keeps the generator in test_df row order, which the
# element-wise blend below relies on.
# NOTE(review): assumes the generator yields exactly one image per test_df
# row (none rejected by its own filename validation) - confirm.
test_gen = datagen.flow_from_dataframe(
    test_df, x_col='path', y_col=None, target_size=(IMG_SIZE, IMG_SIZE),
    class_mode=None, batch_size=BATCH_SIZE, shuffle=False
)
test_visual_probs = cnn_model.predict(test_gen).reshape(-1)

# === COMBINE PREDICTIONS ===
# Weighted blend of the two probabilities, then a hard threshold.
combined_probs = (LINGUAL_WEIGHT * test_lingual_probs) + (VISUAL_WEIGHT * test_visual_probs)
final_preds = (combined_probs >= THRESHOLD).astype(int)

# === LOAD existing predictions CSV and merge ===
predictions_df = pd.read_csv('FAST NUCES HASOC 2025 Predictions.csv')
# If this cell is re-run, the CSV already has a 'Vulgar' column and the merge
# would emit 'Vulgar_x' / 'Vulgar_y' instead of replacing it. Dropping the
# stale column first makes the cell idempotent; on a first run nothing is
# dropped.
predictions_df = predictions_df.drop(columns=['Vulgar'], errors='ignore')
vulgar_predictions_df = pd.DataFrame({'Ids': test_df['Ids'], 'Vulgar': final_preds})
vulgar_predictions_df['Vulgar'] = vulgar_predictions_df['Vulgar'].map({0: 'Non Vulgar', 1: 'Vulgar'})
# Left join: ids whose image was missing (filtered out of test_df above) keep
# an empty 'Vulgar' value.
predictions_df = pd.merge(predictions_df, vulgar_predictions_df, on='Ids', how='left')


# === SAVE OUTPUT CSV ===
predictions_df.to_csv("FAST NUCES HASOC 2025 Predictions.csv", index=False)

print("✅ Vulgar predictions added to 'FAST NUCES HASOC 2025 Predictions.csv'")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found 1133 validated image filenames belonging to 2 classes.
Epoch 1/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 6s/step - accuracy: 0.6411 - loss: 0.6664
Epoch 2/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 6s/step - accuracy: 0.7332 - loss: 0.5509
Epoch 3/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 6s/step - accuracy: 0.7455 - loss: 0.5213
Epoch 4/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 6s/step - accuracy: 0.8122 - loss: 0.4793
Epoch 5/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 6s/step - accuracy: 0.8118 - loss: 0.4372
Epoch 6/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 6s/step - accuracy: 0.8047 - loss: 0.4326
Epoch 7/10
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 6s/step - accuracy:

**ABUSE**

In [12]:
# ========== Install Required Packages ==========
%pip install pandas numpy tqdm scikit-learn imbalanced-learn torch torchvision transformers tensorflow gensim

# ========== Imports ==========
import os
import re
import json
import joblib
import pickle
import urllib.request
import gzip
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping

from gensim.models import KeyedVectors
from gensim.test.utils import datapath

import ssl
# NOTE(review): this replaces the default HTTPS context factory for the whole
# process with ssl's "unverified" one, so HTTPS requests made afterwards
# (including the FastText download below) presumably skip certificate
# verification. Confirm this is really needed; prefer fixing the CA bundle.
ssl._create_default_https_context = ssl._create_unverified_context

# ========== Load Abuse Word Dictionary ==========
with open('hindi-offensive-words-original.json', 'r', encoding='utf-8') as f:
    abuse_dict = json.load(f)
# Only the keys are needed: they are used for exact-token membership tests.
abuse_words = set(abuse_dict)

# ========== Clean and Preprocess Text ==========
def clean_text(text):
    """Lower-case *text* and keep only Devanagari, ASCII alphanumerics and spaces.

    Every other character (including tabs and newlines) is replaced by a
    space, then whitespace runs collapse to one space and the ends are
    stripped. Non-string input is converted with ``str()`` first.
    """
    lowered = str(text).lower()
    spaced = re.sub(r'[^\u0900-\u097F a-zA-Z0-9]', ' ', lowered)
    return re.sub(r'\s+', ' ', spaced).strip()

def contains_abuse_word(tokens):
    """Return 1 if any token is exactly one of ``abuse_words``, else 0."""
    return int(not abuse_words.isdisjoint(tokens))

# ========== Load FastText Embeddings (Auto-download) ==========
import urllib.request
import gzip
import shutil
from gensim.models import KeyedVectors

def setup_fasttext_embeddings():
    """Return the Hindi FastText vectors, downloading them only when needed.

    Steps, each skipped when its output already exists in the working
    directory:

    1. download ``cc.hi.300.vec.gz``;
    2. decompress it to ``cc.hi.300.vec``.

    The text vectors are then loaded with gensim and returned as a
    ``KeyedVectors`` object.

    The original version repeated the download and extraction on every run.
    Both steps now also write to a ``.part`` file and rename it on success,
    so an interrupted run does not leave a truncated file under the final
    name. Delete the local files to force a fresh download.
    """
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz"
    gz_path = "cc.hi.300.vec.gz"
    vec_path = "cc.hi.300.vec"

    if not os.path.exists(vec_path):
        if not os.path.exists(gz_path):
            # Download
            print("📥 Downloading...")
            partial_gz = gz_path + ".part"
            urllib.request.urlretrieve(url, partial_gz)
            os.replace(partial_gz, gz_path)
            print("✅ Downloaded:", gz_path)

        # Extract
        print("📂 Extracting...")
        partial_vec = vec_path + ".part"
        with gzip.open(gz_path, 'rb') as f_in, open(partial_vec, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_vec, vec_path)
        print("✅ Extracted:", vec_path)

    # Load
    print("📦 Loading FastText vectors...")
    model = KeyedVectors.load_word2vec_format(vec_path, binary=False, encoding='utf-8', unicode_errors='ignore')
    print("✅ Loaded FastText model.")
    return model


def encode(tokens):
    """Return the mean FastText vector of the known *tokens*.

    Tokens missing from ``fasttext_model`` are ignored. If none of the tokens
    is known (or *tokens* is empty), a zero vector of length
    ``embedding_dim`` is returned.
    """
    known_vectors = [fasttext_model[word] for word in tokens if word in fasttext_model]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(embedding_dim)

# ========== Dataset Class ==========
class AbuseDataset(Dataset):
    """Torch dataset of (features, abuse-word flag, label) triples.

    ``X`` and ``abuse_flags`` are stored as float32 tensors and ``y`` as a
    long tensor. ``abuse_flags`` receives an extra axis at position 1, so
    each item's flag has shape ``(1,)`` when a 1-D sequence is passed in.
    """

    def __init__(self, X, abuse_flags, y):
        flags = torch.tensor(abuse_flags, dtype=torch.float32)
        self.X = torch.tensor(X, dtype=torch.float32)
        # Insert an axis at position 1 so the flag can be concatenated to the
        # features along dim=1.
        self.abuse_flags = flags[:, None]
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.abuse_flags[idx], self.y[idx])
        return sample

# ========== Model ==========
class BiLSTMClassifier(nn.Module):
    """Two-layer feed-forward classifier over an embedding plus one flag.

    Despite the name, the network contains no LSTM: it is
    ``Linear(input_dim + 1, hidden_dim) -> ReLU -> Linear(hidden_dim, output_dim)``.
    The extra input unit carries the abuse-word flag. ``forward`` returns the
    raw output of the last linear layer (no softmax applied).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # +1 input feature for the abuse-word flag appended in forward().
        self.fc1 = nn.Linear(input_dim + 1, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, abuse_flag):
        features = torch.cat((x, abuse_flag), dim=1)
        hidden = self.relu(self.fc1(features))
        return self.fc2(hidden)


# ========== Load FastText Hindi Vectors ==========
# Module-level globals read by encode(): the loaded vectors and the length of
# the zero vector used when no token is in the vocabulary.
fasttext_model = setup_fasttext_embeddings()
# Must equal the vector size of the loaded file (cc.hi.300) and the
# input_dim passed to the classifier below.
embedding_dim = 300

# ========== Load Training Data ==========
df = pd.read_csv("Hindi Train HASOC 2025 - Updated.csv")[['OCR', 'Abuse']].dropna()
df['cleaned'] = df['OCR'].apply(clean_text)
df['tokens'] = df['cleaned'].apply(str.split)
df['has_abuse_word'] = df['tokens'].apply(contains_abuse_word)
# Despite the column name, 'input_ids' holds one averaged embedding vector
# per row, not token ids.
df['input_ids'] = df['tokens'].apply(encode)

X_input_ids = np.array(df['input_ids'].tolist())
X_abuse_flag = df['has_abuse_word'].values
y_labels = df['Abuse'].values

label_encoder = LabelEncoder()
y_labels_encoded = label_encoder.fit_transform(y_labels)

# The flag is appended as the last column so the embedding and the flag are
# oversampled together, then the two are split apart again.
ros = RandomOverSampler(random_state=42)
X_combined = np.column_stack((X_input_ids, X_abuse_flag))
X_resampled, y_resampled_encoded = ros.fit_resample(X_combined, y_labels_encoded)

X_input_ids, X_abuse_flag = X_resampled[:, :-1], X_resampled[:, -1]
y_resampled = y_resampled_encoded

train_dataset = AbuseDataset(X_input_ids, X_abuse_flag, y_resampled)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# ========== Train Model ==========
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 300 = embedding size, 128 hidden units, 2 output classes.
model = BiLSTMClassifier(300, 128, 2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, abuse_flag, y_batch in train_loader:
        X_batch, abuse_flag, y_batch = X_batch.to(device), abuse_flag.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch, abuse_flag)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

# ========== Predict on Test Data ==========
test_df = pd.read_csv("Hindi Test HASOC 2025 - Updated.csv")[['Ids', 'OCR']].dropna()
test_df['cleaned'] = test_df['OCR'].apply(clean_text)
test_df['tokens'] = test_df['cleaned'].apply(str.split)
test_df['has_abuse_word'] = test_df['tokens'].apply(contains_abuse_word)
test_df['input_ids'] = test_df['tokens'].apply(encode)

X_test_input_ids = np.array(test_df['input_ids'].tolist())
X_test_abuse_flag = test_df['has_abuse_word'].values

# AbuseDataset requires labels; zeros are passed as placeholders and ignored
# in the prediction loop.
pred_dataset = AbuseDataset(X_test_input_ids, X_test_abuse_flag, np.zeros(len(test_df)))
pred_loader = DataLoader(pred_dataset, batch_size=32)

model.eval()
all_preds = []
with torch.no_grad():
    for X_batch, abuse_flag, _ in pred_loader:
        logits = model(X_batch.to(device), abuse_flag.to(device))
        # Predicted class = index of the larger of the two outputs.
        all_preds.extend(logits.argmax(dim=1).cpu().numpy())

# ========== Merge Predictions ==========
predictions_df = pd.read_csv('FAST NUCES HASOC 2025 Predictions.csv')
# If this cell is re-run, the CSV already has an 'Abuse' column and the merge
# would emit 'Abuse_x' / 'Abuse_y' instead of replacing it. Dropping the
# stale column first makes the cell idempotent; on a first run nothing is
# dropped.
predictions_df = predictions_df.drop(columns=['Abuse'], errors='ignore')

# Decode class ids with the encoder that produced them. The previous
# hard-coded {0: "Abusive", 1: "Non-abusive"} map was only right if the
# training labels happened to sort into that order. This matches how the
# Sentiment cell decodes its predictions.
abuse_predictions_df = pd.DataFrame({
    'Ids': test_df['Ids'],
    'Abuse': label_encoder.inverse_transform(np.asarray(all_preds, dtype=int)),
})

predictions_df = pd.merge(predictions_df, abuse_predictions_df, on='Ids', how='left')
predictions_df.to_csv("FAST NUCES HASOC 2025 Predictions.csv", index=False)
print("✅ Abuse predictions added to 'FAST NUCES HASOC 2025 Predictions.csv'")


📥 Downloading...
✅ Downloaded: cc.hi.300.vec.gz
📂 Extracting...
✅ Extracted: cc.hi.300.vec
📦 Loading FastText vectors...
✅ Loaded FastText model.
Epoch 1/5, Loss: 0.6867
Epoch 2/5, Loss: 0.6556
Epoch 3/5, Loss: 0.6232
Epoch 4/5, Loss: 0.5923
Epoch 5/5, Loss: 0.5753
✅ Abuse predictions added to 'FAST NUCES HASOC 2025 Predictions.csv'
