In [1]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
from sklearn.svm import SVC
from xgboost import XGBClassifier
import time
from sklearn.linear_model import SGDClassifier
from IPython.display import clear_output
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
import gdown
files = {
   "mal_full_offensive_train.csv": "1TX-2hn2dsFvmrU-t1PULOYeAF96SnI0h",
   "mal_full_offensive_dev.csv": '1p4VRR9pP-WvOGh36Pee4iQm__jdCtQAk'
}
for file_name, file_id in files.items():
  url = f'https://drive.google.com/uc?export=download&id={file_id}'
  gdown.download(url, file_name, quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1TX-2hn2dsFvmrU-t1PULOYeAF96SnI0h
To: /content/mal_full_offensive_train.csv
100%|██████████| 2.02M/2.02M [00:00<00:00, 77.8MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1p4VRR9pP-WvOGh36Pee4iQm__jdCtQAk
To: /content/mal_full_offensive_dev.csv
100%|██████████| 258k/258k [00:00<00:00, 27.2MB/s]


In [6]:

class MuRILVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, batch_size=512):
        self.model_name = "setu4993/LaBSE"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        self.batch_size = batch_size
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, pd.Series):
            X = X.tolist()

        embeddings = []
        for i in range(0, len(X), self.batch_size):
            batch = X[i:i + self.batch_size]
            batch_embeddings = self.get_embeddings(batch)
            embeddings.append(batch_embeddings)
        return np.concatenate(embeddings, axis=0)

    def get_embeddings(self, texts):
        inputs = self.tokenizer(texts, max_length=256, padding='max_length',  truncation=True, return_tensors='pt')
        inputs = {key: value.to(self.device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)

        # embeddings = outputs.last_hidden_state.mean(dim=1)
        out = outputs.last_hidden_state
        mean_pooling = torch.mean(out, 1)
        max_pooling, _ = torch.max(out, 1)
        embed = torch.cat((mean_pooling, max_pooling), 1)
        return embed.cpu().numpy()


class BoCVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.vectorizer = CountVectorizer()

    def fit(self, X, y=None):
        return self.vectorizer.fit(X)

    def transform(self, X):
        return self.vectorizer.transform(X)

class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, label_encoding=False):
        self.label_encoding = label_encoding
        self.encoder = LabelEncoder() if label_encoding else None

    def fit(self, X, y=None):
        if self.label_encoding:
            self.encoder.fit(y)
        return self

    def transform(self, X, y=None):
        # Ensure X is a list of strings before lowercasing
        X_list = X.tolist() if isinstance(X, pd.Series) else X
        X_transformed = [str(text).lower() for text in X_list]

        if self.label_encoding and y is not None:
            y_transformed = self.encoder.transform(y)
            return X_transformed, y_transformed
        return X_transformed

    def fit_transform(self, X, y=None):
        # Ensure X is a list of strings before lowercasing
        return X_transformed, y # return original y if no label encoding

feature_sets = {
    # "Word2Vec": make_pipeline(Preprocessor(), Word2VecVectorizer()),
    "LaBSE Embedding": make_pipeline(MuRILVectorizer()),
    # "BoW": make_pipeline(CountVectorizer(token_pattern=r'[\u0D00-\u0D7F]+|[a-zA-Z]+')),
    # "TF-IDF": make_pipeline(TfidfVectorizer(token_pattern=r'[\u0D00-\u0D7F]+|[a-zA-Z]+')),
}
classifiers = {
    "RFC": make_pipeline(RandomForestClassifier(n_estimators=500)),
    "AdaBoost": make_pipeline(StandardScaler(with_mean=False), AdaBoostClassifier()),
    "Bagging": make_pipeline(StandardScaler(with_mean=False), BaggingClassifier()),
    "DecisionTree": make_pipeline(StandardScaler(with_mean=False), DecisionTreeClassifier()),
    "ExtraTrees": make_pipeline(StandardScaler(with_mean=False), ExtraTreesClassifier()),
    "KNeighbors": make_pipeline(
        StandardScaler(with_mean=False), KNeighborsClassifier()
    ),
    "LinearSVC": make_pipeline(
        StandardScaler(with_mean=False), SVC(kernel="linear",  probability=True)
    ),
    "LogisticRegression": make_pipeline(
        StandardScaler(with_mean=False), LogisticRegression(max_iter=1000)
    ),
    "MLP": make_pipeline(
        StandardScaler(with_mean=False), MLPClassifier(max_iter=1000)
    ),
    "NearestCentroid": make_pipeline(StandardScaler(with_mean=False), NearestCentroid()),
    "OneVsOne": make_pipeline(StandardScaler(with_mean=False), OneVsOneClassifier(LinearSVC())),
    "OneVsRest": make_pipeline(StandardScaler(with_mean=False), OneVsRestClassifier(LinearSVC())),
    "PassiveAggressive": make_pipeline(
        StandardScaler(with_mean=False), PassiveAggressiveClassifier(max_iter=1000)
    ),
    "Perceptron": make_pipeline(StandardScaler(with_mean=False), Perceptron(max_iter=1000)),
    "RidgeClassifier": make_pipeline(StandardScaler(with_mean=False), RidgeClassifier()),
    "SGDClassifier": make_pipeline(
        StandardScaler(with_mean=False), SGDClassifier(max_iter=1000, tol=1e-3)
    ),
    "SVC-GC": make_pipeline(
        StandardScaler(with_mean=False), SVC(kernel="poly",  probability=True)
    ),
    "SVM_RBF": make_pipeline(StandardScaler(with_mean=False), SVC(kernel="rbf",  probability=True)),
}

In [4]:
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, roc_auc_score, precision_recall_fscore_support)
from sklearn.model_selection import cross_val_score, cross_val_predict
import re
from sklearn.metrics import f1_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

In [None]:
from sklearn.preprocessing import LabelEncoder

def train():
    np.set_printoptions(precision=5)
    pd.set_option("display.float_format", "{:.5f}".format)
    train_df = pd.read_csv("/content/mal_full_offensive_train.csv") # Swapped to use train file for training
    test_df = pd.read_csv("/content/mal_full_offensive_dev.csv") # Swapped to use dev file for testing

    X_train, X_test, y_train, y_test = (
        train_df["Text"],
        test_df["Text"],
        train_df["Labels"],
        test_df["Labels"],
    )
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)
    feature_cache = {}
    results = []
    try:
      for f_name, f_pipe in feature_sets.items():
        if f_name not in feature_cache:
            X_train_transformed = f_pipe.fit_transform(X_train, y_train)
            X_test_transformed = f_pipe.transform(X_test)
            feature_cache[f_name] = X_train_transformed, X_test_transformed
        X_feat_train, X_feat_test = feature_cache[f_name]
        for c_name, clf in classifiers.items():
            clf.fit(X_feat_train, y_train)
            y_pred = clf.predict(X_feat_test)
            y_true = y_test
            acc = accuracy_score(y_true, y_pred)
            precision, recall, f1, _ = precision_recall_fscore_support(
                y_true, y_pred, average='weighted', zero_division=0
            )
            f1_macro = f1_score(y_true, y_pred, average="macro")  # Macro F1
            f1_micro = f1_score(y_true, y_pred, average="micro")  # Micro F1
            ytest_bin = label_binarize(y_true, classes=[0, 1, 2, 3, 4])  # Adjust classes accordingly
            ypred_bin = label_binarize(y_pred, classes=[0, 1, 2, 3, 4])  # Adjust classes accordingly

            fpr = {}
            tpr = {}
            roc_auc = {}
            for i in range(ytest_bin.shape[1]):
                fpr[i], tpr[i], _ = roc_curve(ytest_bin[:, i], ypred_bin[:, i])
                roc_auc[i] = auc(fpr[i], tpr[i])

            fpr["macro"], tpr["macro"], _ = roc_curve(ytest_bin.ravel(), ypred_bin.ravel())
            roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
            print(
                f"{f_name} + {c_name} → Acc: {acc:.4f}, Prec: {precision:.4f}, "
                f"Recall: {recall:.4f}, F1: {f1:.4f} "
                f"F1_macro: {f1_macro:.4f}, F1_micro: {f1_micro:.4f}, "
                f"ROC_AUC:{roc_auc['macro']:.4f}"
            )
            results.append({
                "feature": f_name,
                "classifier": c_name,
                "accuracy": acc,
                "precision": precision,
                "recall": recall,
                "f1": f1,
                'f1_macro': f1_macro,
                'f1_micro': f1_micro,
                'roc_auc': roc_auc["macro"],
            })
    except KeyboardInterrupt:
      return results
    return results
if __name__ == "__main__":
    results = train()
    clear_output(wait=True)
    df = pd.DataFrame(results)
    df = df.sort_values(by=["f1"], ascending=False)
    df = df.reset_index(drop=True)
    print(df)
    df.to_csv("results.csv", index=False)



LaBSE Embedding + RFC → Acc: 0.9555, Prec: 0.9572, Recall: 0.9555, F1: 0.9504 F1_macro: 0.7561, F1_micro: 0.9555, ROC_AUC:0.9722
LaBSE Embedding + AdaBoost → Acc: 0.9070, Prec: 0.8726, Recall: 0.9070, F1: 0.8828 F1_macro: 0.2820, F1_micro: 0.9070, ROC_AUC:0.9418
LaBSE Embedding + Bagging → Acc: 0.9480, Prec: 0.9475, Recall: 0.9480, F1: 0.9421 F1_macro: 0.6710, F1_micro: 0.9480, ROC_AUC:0.9675
