In [None]:
import os

# Environment flags for CUDA and Weights & Biases; set in the very first cell,
# before any of the ML libraries are imported.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': "1",
    'WANDB_DISABLED': "true",
})

In [None]:
# Download the dataset archive by file ID with gdown
# (presumably processed_data.zip, which the next cell unpacks — TODO confirm).
!gdown 1EI12smTmdi4Mlu_lLNULN6p6g9dYEWql

In [None]:
# Remove any previous extraction so stale files cannot survive, then unpack
# the archive quietly into ./processed_data.
!rm -rf processed_data

!unzip -q processed_data.zip

In [None]:
# Let git remember credentials (the `store` helper keeps them on disk) so that
# later pushes do not prompt again.
!git config --global credential.helper store

In [None]:
import os

from huggingface_hub import login

# Never paste the access token into the notebook: it would be saved with the
# file and leak on commit. Read it from the HF_TOKEN environment variable
# instead; when the variable is unset or empty, pass None so that `login`
# falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
from huggingface_hub import Repository

# Hub repository that holds the trained classifiers (replace with your own repo URL).
repo_url = "https://huggingface.co/ErfanSadegh/SemEval2025-Task11-TrackA"

# Local working copy; later cells read and write `{local_dir}/models/<lang>` inside it.
local_dir = "SemEval2025-Task11-TrackA"

# Clone the repository into `local_dir`.
# NOTE(review): `Repository` is deprecated in recent huggingface_hub releases —
# confirm the installed version still provides it.
repo = Repository(local_dir=local_dir, clone_from=repo_url)

In [None]:
!pip install -q transformers[torch] accelerate -U
!pip install -q sentence_transformers

!pip install -q datasets

!pip install -q transformers

!pip install -q iterative-stratification

!pip install -q auto-gptq optimum bitsandbytes

In [None]:
import numpy as np

import pandas as pd

import os

from tqdm.notebook import tqdm

from tqdm import tqdm

from transformers import BitsAndBytesConfig

from torch.utils.data import DataLoader,Dataset

import torch

from sklearn.utils.class_weight import compute_class_weight

import pickle

from sentence_transformers import SentenceTransformer

import xgboost as xgb

from sklearn.svm import SVC


In [None]:
# Maximum sequence length applied to the embedding model in load_model().
MAX_LENGTH = 512
# kind="dev"
# task="track_c"

# langs=["afr","amh","deu","eng","oro","ptbr","rus","som","sum","tir"]
# labels=['Anger','Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']
# langs=["amh","arq","ary","chn","deu","esp","hau","hin","ibo","kin","mar","orm","pcm","ptbr","ptmz","ron","rus","som","sun","swa","swe","tat","tir","ukr","vmw","yor","ind","jav","xho","zul"]
# langs=["esp"]
# langs=["eng"]
# langs=["afr"]

# Default label set. This module-level name is read by read_csv() and
# get_class_weights_tensor(), and is reassigned per language by the training cells.
# label_columns =['anger', 'fear', 'joy', 'sadness', 'surprise'] #eng
label_columns =['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
# label_columns =['anger', 'disgust', 'fear', 'joy', 'sadness'] #afr
# label_columns =['anger', 'disgust', 'fear', 'joy', 'sadness'] #afr


In [None]:
def read_csv(tasks,kinds,langs,seed=None):
  """Load, merge, clean and shuffle the processed CSV files of the given splits.

  Every existing ``processed_data/{kind}/{task}/{lang}.csv`` is read, its column
  names are lower-cased and the frames are concatenated. Rows without a
  ``clean_message`` are dropped, remaining missing values become 0 and float
  columns are cast to int.

  The label columns are taken from the module-level ``label_columns`` list.

  Args:
    tasks: iterable of task directory names (e.g. "track_a").
    kinds: iterable of split directory names (e.g. "train", "dev", "test").
    langs: iterable of language codes used as CSV file names.
    seed: optional random state for the shuffle; ``None`` keeps it unseeded.

  Returns:
    ``(data, texts, labels)``: the shuffled DataFrame, the list of
    ``clean_message`` strings and the list of per-row label lists, or
    ``(None, None, None)`` when no usable row was found.
  """
  frames=[]
  for task in tasks:
    for kind in kinds:
      for lang in langs:
        processed_path=f"processed_data/{kind}/{task}/{lang}.csv"
        if not os.path.isfile(processed_path):
          print("not found:",processed_path)
          continue
        frame=pd.read_csv(processed_path)
        frame.columns = frame.columns.str.lower()
        frames.append(frame)

  # Concatenate once instead of growing a DataFrame inside the loop.
  data=pd.concat(frames,ignore_index=True) if frames else pd.DataFrame()
  if len(data)>0:
    # Drop rows without text BEFORE fillna(0); afterwards the missing texts
    # would already have been replaced by 0 and silently kept as "0".
    data=data[data['clean_message'].notna()].reset_index(drop=True)
  if len(data)==0:
    # Report the requested languages; the loop variable may not even exist
    # here when one of the input lists is empty.
    print(f"{langs} with {kinds} and {tasks} is empty")
    return None,None,None

  data=data.fillna(0)
  # Columns that contained NaN were parsed as float; convert them back to int.
  float_columns = data.select_dtypes(include=['float64']).columns
  data[float_columns] = data[float_columns].astype('int')
  data['clean_message']=data['clean_message'].astype(str)
  data=data.sample(frac=1,random_state=seed)
  x_train = data['clean_message'].values.tolist()
  y_train = data[label_columns].values.tolist()
  return data,x_train,y_train


In [None]:
# Compute device used by load_model(), get_class_weights_tensor() and XGBoost.
device = 'cpu'
if torch.cuda.is_available():
    device = "cuda"

In [None]:
#update label column name

def get_class_weights_tensor(y_train):
    """Return the 'balanced' positive-class weight of every label as a tensor.

    For each column of ``y_train`` the weight is
    ``n_samples / (2 * n_positive)``, i.e. the class-1 entry of scikit-learn's
    ``compute_class_weight('balanced', classes=[0, 1], ...)``, computed directly
    from the counts instead of materialising one dummy list per label.

    A label with no positive example gets the neutral weight 1.0 instead of
    raising, so one degenerate label does not abort the whole run.

    Uses the module-level ``label_columns`` (column names / expected width) and
    ``device`` (where the tensor is created).

    Args:
        y_train: sequence of rows, each holding one 0/1 value per label.

    Returns:
        1-D ``torch.float`` tensor with one weight per entry of ``label_columns``.

    Raises:
        ValueError: if ``y_train`` is empty or ``None``.
    """
    # Building the frame also validates that each row has one value per label.
    y_train_df = pd.DataFrame(y_train, columns=label_columns)
    if y_train_df.empty:
        raise ValueError("y_train is empty: cannot compute class weights")

    labels = y_train_df.to_numpy()
    counts_0 = (labels == 0).sum(axis=0)
    counts_1 = (labels == 1).sum(axis=0)
    totals = counts_0 + counts_1

    weights = np.ones(len(label_columns), dtype=np.float64)
    has_positive = counts_1 > 0
    weights[has_positive] = totals[has_positive] / (2.0 * counts_1[has_positive])

    return torch.tensor(weights, dtype=torch.float, device=device)

In [None]:
# 4-bit NF4 quantisation settings with double quantisation and bfloat16 compute.
# NOTE(review): not referenced by any other cell shown here — confirm it is still needed.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:

def load_model(model_name):
    """Load a SentenceTransformer embedding model ready for inference.

    The model is created on the GPU in bfloat16 when CUDA is available and on
    the CPU in float32 otherwise, its ``max_seq_length`` is set to the
    module-level ``MAX_LENGTH``, it is switched to eval mode and moved to the
    module-level ``device``.

    Args:
        model_name: model identifier passed to ``SentenceTransformer``.

    Returns:
        The loaded model with all parameter tensors made contiguous.
    """
    # Release cached GPU memory (called repeatedly, as before) ahead of loading.
    for _attempt in range(20):
        torch.cuda.empty_cache()
    print(model_name)

    if torch.cuda.is_available():
        target_device, weights_dtype = "cuda", torch.bfloat16
    else:
        target_device, weights_dtype = "cpu", torch.float32

    embedding_model = SentenceTransformer(
        model_name,
        trust_remote_code=True,
        device=target_device,
        model_kwargs={"torch_dtype": weights_dtype},
    )
    # Cap the sequence length for faster encoding.
    embedding_model.max_seq_length = MAX_LENGTH
    embedding_model.eval()
    embedding_model = embedding_model.to(device)
    for weight in embedding_model.parameters():
        weight.data = weight.data.contiguous()
    return embedding_model

In [None]:
# Custom Dataset for batching
class TextDataset(Dataset):
    """Map-style dataset that exposes a sequence of texts to a DataLoader."""

    def __init__(self, texts):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.texts = texts

    def __len__(self):
        n_items = len(self.texts)
        return n_items

    def __getitem__(self, idx):
        item = self.texts[idx]
        return item


In [None]:

# Batch extraction function
def extract_embeddings(texts, model, batch_size=32):
    """Encode ``texts`` batch by batch and stack the results.

    Args:
        texts: sequence of input strings.
        model: object providing ``eval()`` and ``encode(batch)``.
        batch_size: number of texts handed to ``model.encode`` per call.

    Returns:
        The per-batch outputs of ``model.encode`` joined with ``np.vstack``,
        in the original order of ``texts``.
    """
    loader = DataLoader(TextDataset(texts), batch_size=batch_size, shuffle=False)
    model.eval()

    chunks = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for text_batch in tqdm(loader, desc="Extracting embeddings"):
            chunks.append(model.encode(text_batch))

    return np.vstack(chunks)

In [None]:
def load_models(dir):
    """Load every pickled per-label classifier found in a directory.

    Files named ``svm_model_<label>.pkl`` or ``xgb_model_<label>.pkl`` are
    unpickled; everything else is ignored.

    Args:
        dir: directory to scan (not recursive).

    Returns:
        Dict mapping ``<label>`` to the unpickled object.
    """
    loaded = {}
    for filename in os.listdir(dir):
        if not filename.endswith(".pkl"):
            continue
        for prefix in ("svm_model_", "xgb_model_"):
            if filename.startswith(prefix):
                # The label is whatever sits between the prefix and ".pkl".
                label = filename[len(prefix):-len(".pkl")]
                # NOTE: unpickling executes code from the file; only load
                # model files that come from a trusted source.
                with open(os.path.join(dir, filename), 'rb') as handle:
                    loaded[label] = pickle.load(handle)
                break
    return loaded


In [None]:
def save_models(models,dir,isxgb=False):
    """Pickle the per-label models and bundle them into one zip inside ``dir``.

    Steps: create ``dir`` if needed, delete every existing ``*.zip`` in it,
    write one ``{svm|xgb}_model_<label>.pkl`` per model, archive every ``*.pkl``
    present in ``dir`` into ``{svm|xgb}_models_pkl.zip`` and finally delete
    those ``*.pkl`` files, leaving only the archive behind.

    Args:
        models: dict mapping label name to a picklable model.
        dir: target directory; it is created when it does not exist yet.
        isxgb: use the ``xgb`` file-name prefix instead of ``svm``.
    """
    import zipfile

    # The target directory (e.g. "models/all") may not exist yet; without this
    # the os.listdir() call below raises FileNotFoundError.
    os.makedirs(dir, exist_ok=True)

    # Remove old archives so only the freshly written one remains.
    for filename in os.listdir(dir):
        if filename.endswith(".zip"):
            os.remove(os.path.join(dir,filename))

    prefix = "xgb" if isxgb else "svm"
    for label, model in models.items():
        with open(os.path.join(dir,f"{prefix}_model_{label}.pkl"), 'wb') as f:
            pickle.dump(model, f)

    # List once (sorted for a deterministic archive order) so exactly the
    # archived files are the ones deleted afterwards.
    pkl_files = sorted(name for name in os.listdir(dir) if name.endswith(".pkl"))
    zip_file_path = os.path.join(dir, f"{prefix}_models_pkl.zip")
    with zipfile.ZipFile(zip_file_path, 'w') as z:
        for filename in pkl_files:
            z.write(os.path.join(dir,filename), filename)
    for filename in pkl_files:
        os.remove(os.path.join(dir,filename))

In [None]:
# Embedding model -> languages whose classifiers are trained on its embeddings.
model2lang={
    "intfloat/multilingual-e5-large-instruct": [
        "afr","amh","ary","deu","esp","hin","ibo","kin","mar","orm","pcm","ptmz",
        "ron","som","sun","swa","swe","tat","tir","yor","ind","jav","xho","zul",
    ],
    "Abdou/arabert-large-algerian": ["arq"],
    "iampanda/zpoint_large_embedding_zh": ["chn"],
    "infgrad/jasper_en_vision_language_v1": ["eng"],
    "konstantindobler/xlm-roberta-base-focus-hausa": ["hau"],
    "neuralmind/bert-large-portuguese-cased": ["ptbr"],
    "ai-forever/FRIDA": ["rus"],
    "jinaai/jina-embeddings-v3": ["ukr","vmw"],
}
# Filled by the per-language cell with the languages that have no model
# directory of their own; the shared-model cell then trains on them together.
allLangs=[]
# Embedding model used for that shared classifier.
allModel="intfloat/multilingual-e5-large-instruct"

In [None]:
# Label columns per language; any language not listed uses the "others" entry.
lang2labelcolumns=dict(
    eng=['anger', 'fear', 'joy', 'sadness', 'surprise'],
    afr=['anger', 'disgust', 'fear', 'joy', 'sadness'],
    others=['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'],
)

In [None]:
import os
import zipfile

def unzip_models(local_dir, langpath,zip_file_name="svm_models_pkl.zip"):
    """Extract a model archive next to itself.

    Args:
        local_dir: root of the local repository clone.
        langpath: sub-directory of ``<local_dir>/models`` holding the archive.
        zip_file_name: archive file name inside that directory.

    The members are written to ``<local_dir>/models/<langpath>``.
    """
    extract_dir = os.path.join(local_dir, "models", langpath)
    zip_file_path = os.path.join(extract_dir, zip_file_name)

    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        archive.extractall(extract_dir)

In [None]:
# Per-language pipeline: for every embedding model, train one SVM per emotion
# label for each language that has its own model directory in the cloned repo,
# save the models and write test-set predictions for both tracks.
for model in model2lang:
    # Load the embedding model shared by all languages mapped to it
    embedding_model = load_model(model)

    for lang in model2lang[model]:
        # read_csv() and get_class_weights_tensor() read this module-level name
        label_columns=lang2labelcolumns.get(lang,lang2labelcolumns["others"])
        langpath=lang

        # Languages without a dedicated model directory are trained together by
        # the shared-model cell. Defer them BEFORE loading data or computing
        # embeddings, which would only be thrown away.
        if not os.path.isdir(f"{local_dir}/models/{langpath}"):
            if lang not in allLangs:  # keeps the cell idempotent when re-run
                allLangs.append(lang)
            continue

        # Read the training data (train + dev of both tracks) of that language
        train_data,x_train,y_train=read_csv(tasks=["track_a","track_c"],kinds=["train","dev"],langs=[lang])
        if train_data is None:
            print(f"No training data found for lang:{lang}")
            continue

        # Positive-class weight per label, used to balance each SVM
        class_weights_tensor = get_class_weights_tensor(y_train)

        # Extract embeddings
        train_embeddings = extract_embeddings(x_train, embedding_model, batch_size=32)

        # (n_samples, n_labels) matrix; one binary classifier is fit per column
        y_train_flat = np.array(y_train)

        svm_models = {}
        for i, label in enumerate(label_columns):
            print(f"Training SVM for label: {label}")

            # Use the computed class weights for balancing the classes
            class_weight = {0: 1, 1: class_weights_tensor[i].item()}
            svm_model = SVC(class_weight=class_weight)

            svm_model.fit(train_embeddings, y_train_flat[:, i])
            svm_models[label] = svm_model
        models=svm_models
        save_models(models,f"{local_dir}/models/{langpath}",isxgb=False)

        # Predict labels for the test data
        for task in ["track_a","track_c"]:
            test_data,x_test,y_test=read_csv(tasks=[task],kinds=["test"],langs=[lang])
            if test_data is None:
                continue
            test_embeddings = extract_embeddings(x_test, embedding_model, batch_size=32)
            test_predictions = {}
            for label in label_columns:
                print(f"Predicting for label: {label}")
                test_predictions[label] = models[label].predict(test_embeddings)

            predictions_df = pd.DataFrame(test_predictions)
            # read_csv() shuffles the rows; renumber them so that they line up
            # with the positional index of predictions_df in the concat below.
            test_data = test_data.reset_index(drop=True)

            output_df = pd.concat([test_data[['id','lang']], predictions_df], axis=1)

            output_dir = task
            os.makedirs(output_dir, exist_ok=True)

            # One prediction file per value of the 'lang' column. A separate
            # loop variable is used on purpose: reusing `lang` here would
            # overwrite the outer loop's language before the next task is read.
            for out_lang in output_df['lang'].unique():
                # Build a new frame instead of mutating a slice in place
                lang_df = (
                    output_df[output_df['lang'] == out_lang]
                    .drop(columns=['lang'])
                    .sort_values(by='id')
                )

                # Define the output file path for the current language
                lang_csv_file = os.path.join(output_dir, f"pred_{out_lang}.csv")

                # Save the filtered DataFrame to a CSV file
                lang_df.to_csv(lang_csv_file, index=False)

                # Print a confirmation message
                print(f"Predictions for language '{out_lang}' saved to {lang_csv_file}")
    # Drop the reference so the next model does not share memory with this one
    del embedding_model

In [None]:
# Shared model: train one classifier per label on the pooled data of all
# languages deferred by the per-language cell (`allLangs`), then write
# test-set predictions for each of those languages.
if not allLangs:
    # Nothing was deferred; `allLangs[0]` below would raise IndexError.
    print("No languages were deferred to the shared model; nothing to train.")
else:
    # Load the embedding model used for the shared classifier
    embedding_model = load_model(allModel)

    # read_csv() and get_class_weights_tensor() read this module-level name
    label_columns=lang2labelcolumns["others"]
    # Read the pooled training data of all deferred languages
    train_data,x_train,y_train=read_csv(tasks=["track_a","track_c"],kinds=["train","dev"],langs=allLangs)
    if train_data is None:
        raise RuntimeError(f"No training data found for the shared model languages: {allLangs}")

    # Get class weights tensor
    class_weights_tensor = get_class_weights_tensor(y_train)

    # Extract embeddings
    train_embeddings = extract_embeddings(x_train, embedding_model, batch_size=32)

    # (n_samples, n_labels) matrix; one binary classifier is fit per column
    y_train_flat = np.array(y_train)
    langpath=allLangs[0]

    if not os.path.isdir(f"{local_dir}/models/{langpath}"):
        langpath="all"
        xgb_models = {}

        for i, label in enumerate(label_columns):
            print(f"Training XGBoost for label: {label}")
            # Use the computed class weights for the positive class
            scale_pos_weight = class_weights_tensor[i].item()
            # Initialize and train XGBoost classifier
            xgb_model = xgb.XGBClassifier(
                objective='binary:logistic',
                eval_metric='logloss',
                use_label_encoder=False,
                n_estimators=100,
                learning_rate=0.1,
                max_depth=6,
                tree_method='hist',
                scale_pos_weight=scale_pos_weight,  # Apply the class weight here
                device=device
            )
            xgb_model.fit(train_embeddings, y_train_flat[:, i])
            xgb_models[label] = xgb_model
        models=xgb_models
        # "models/all" may not exist in a fresh clone; create it before saving
        os.makedirs(f"{local_dir}/models/{langpath}", exist_ok=True)
        save_models(models,f"{local_dir}/models/{langpath}",isxgb=True)
    else:
        # Only taken when the first deferred language has a model directory
        svm_models = {}

        for i, label in enumerate(label_columns):
            print(f"Training SVM for label: {label}")

            # Use the computed class weights for balancing the classes
            class_weight = {0: 1, 1: class_weights_tensor[i].item()}
            svm_model = SVC(class_weight=class_weight)

            svm_model.fit(train_embeddings, y_train_flat[:, i])
            svm_models[label] = svm_model
        models=svm_models
        save_models(models,f"{local_dir}/models/{langpath}",isxgb=False)

    # Predict labels for the test data
    for lang in allLangs:
        for task in ["track_a","track_c"]:
            test_data,x_test,y_test=read_csv(tasks=[task],kinds=["test"],langs=[lang])
            if test_data is None:
                continue
            test_embeddings = extract_embeddings(x_test, embedding_model, batch_size=32)
            test_predictions = {}
            for label in label_columns:
                print(f"Predicting for label: {label}")
                test_predictions[label] = models[label].predict(test_embeddings)

            predictions_df = pd.DataFrame(test_predictions)
            # read_csv() shuffles the rows; renumber them so that they line up
            # with the positional index of predictions_df in the concat below.
            test_data = test_data.reset_index(drop=True)

            output_df = pd.concat([test_data[['id','lang']], predictions_df], axis=1)

            output_dir = task
            os.makedirs(output_dir, exist_ok=True)

            # One prediction file per value of the 'lang' column. A separate
            # loop variable is used on purpose: reusing `lang` here would
            # overwrite the outer loop's language before the next task is read.
            for out_lang in output_df['lang'].unique():
                # Build a new frame instead of mutating a slice in place
                lang_df = (
                    output_df[output_df['lang'] == out_lang]
                    .drop(columns=['lang'])
                    .sort_values(by='id')
                )

                # Define the output file path for the current language
                lang_csv_file = os.path.join(output_dir, f"pred_{out_lang}.csv")

                # Save the filtered DataFrame to a CSV file
                lang_df.to_csv(lang_csv_file, index=False)

                # Print a confirmation message
                print(f"Predictions for language '{out_lang}' saved to {lang_csv_file}")

In [None]:
# Bundle the track_a prediction directory into track_a.zip.
task="track_a"
!zip -r {task}.zip {task}

In [None]:
# Bundle the track_c prediction directory into track_c.zip.
task="track_c"
!zip -r {task}.zip {task}

In [None]:
# Set up Git LFS before pushing
# (presumably the model archives are tracked through LFS — TODO confirm).
!git lfs install

In [None]:
!cd {local_dir}
!git config --global user.email "sadeghpoolaee@gmail.com"
!git config --global user.name "mspoulaei"
!git remote set-url origin https://huggingface.co/ErfanSadegh/SemEval2025-Task11-TrackA
!git add .
!git commit -m "Version 2"
!git push