# Libraries

In [1]:
%pip install -q -U datasets transformers accelerate sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the 

In [2]:
import os
import random
import numpy as np
import pandas as pd
import torch
import transformers
import unicodedata
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from pprint import pprint

# Config

In [3]:
# Seed handed to set_seed() below.
seed = 42
# Language code for this run.
# NOTE(review): `lang` is not read by any cell visible in this notebook — confirm it is still needed.
lang = 'sun'
# Destination CSV for the predictions written in the last cell.
save_path = 'pred_sun.csv'

# NOTE(review): the checkpoint tagged BEST is commented out and a different one is active — confirm this is intended.
# hf_model_id = 'alxxtexxr/RoBERTa-Base-SE2025T11A-sun-v20241227113119' # BEST
hf_model_id = 'alxxtexxr/RoBERTa-Base-SE2025T11A-sun-v20250112051102'
# The tokenizer is loaded from the same Hub repo as the model.
hf_tokenizer_id = hf_model_id
# Hub dataset repo and the configuration name passed to load_dataset().
hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
hf_data_config = 'track_a_sun_raw_v2'

In [4]:
def set_seed(seed):
    """Seed every random number generator this notebook touches.

    Seeds NumPy, PyTorch (CPU and all CUDA devices), the `transformers`
    seeding helper and Python's `random` module, switches cuDNN to its
    deterministic configuration, exports PYTHONHASHSEED, and prints a
    confirmation line.

    Args:
        seed: Integer seed applied to all generators.
    """
    # NumPy global generator
    np.random.seed(seed)

    # PyTorch: CPU generator, current CUDA device, then every CUDA device
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(seed)

    # Prefer reproducible cuDNN kernels over auto-tuned (potentially non-deterministic) ones
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Seeding helper shipped with the transformers library
    transformers.set_seed(seed)

    # Python's built-in random module
    random.seed(seed)

    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this export
    # presumably only influences child processes launched later — confirm that is the intent.
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


In [5]:
# Load the dataset configuration selected in the config cell and list its splits
datasets = load_dataset(hf_data_id, hf_data_config)
splits = list(datasets.keys())
print("Splits:", splits)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

public_data_dev/track_a/train/sun.csv:   0%|          | 0.00/104k [00:00<?, ?B/s]

public_data_dev/track_a/dev/sun.csv:   0%|          | 0.00/21.3k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Splits: ['train', 'dev']


In [6]:
# Turn the dev split into a pandas DataFrame and remember its original columns
dev_df = pd.DataFrame(datasets['dev'])
cols = list(dev_df.columns)

# Every column that is not an identifier/text/bookkeeping field is an emotion label
non_label_cols = ['Unnamed: 0', 'id', 'text', 'emotion']
emotion_cols_raw = [name for name in cols if name not in non_label_cols]

# Extra 'Neutral' label column (empty for now), appended after the dataset's own labels
dev_df['Neutral'] = None
emotion_cols = [*emotion_cols_raw, 'Neutral']

print("Data columns:", cols)
print("Emotions columns:", emotion_cols)

Data columns: ['id', 'text', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
Emotions columns: ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'Neutral']


In [7]:
def contains_non_ascii(text):
    """Return True when `text` holds at least one character outside ASCII.

    An empty string counts as pure ASCII and therefore yields False.
    """
    return not text.isascii()

# Flag each dev text that has any non-ASCII character, then report how many were flagged
non_ascii_mask = dev_df['text'].apply(contains_non_ascii)
print("Total data with non-ASCII chars:", int(non_ascii_mask.sum()))

Total data with non-ASCII chars: 83


In [8]:
def normalize_to_ascii(text):
    """Return an ASCII-only approximation of `text`.

    The string is NFKD-normalised first, so accented letters and
    compatibility characters are split into a base character plus marks;
    anything that still cannot be encoded as ASCII is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

# Overwrite every dev text with its ASCII approximation
dev_df['text'] = dev_df['text'].apply(normalize_to_ascii)

# Re-run the non-ASCII check on the normalised texts
remaining_non_ascii = int(dev_df['text'].apply(contains_non_ascii).sum())
print("Total data with non-ASCII chars (after normalizing them):", remaining_non_ascii)

Total data with non-ASCII chars (after normalizing them): 0


In [9]:
# Label name -> integer id, numbered by position in emotion_cols
class2id = {label: idx for idx, label in enumerate(emotion_cols)}
# Reverse lookup: integer id -> label name
id2class = {idx: label for label, idx in class2id.items()}

print("Class to ID:")
pprint(class2id, width=1)
print()
print("ID to Class:")
pprint(id2class, width=1)

Class to ID:
{'Neutral': 6,
 'anger': 0,
 'disgust': 1,
 'fear': 2,
 'joy': 3,
 'sadness': 4,
 'surprise': 5}

ID to Class:
{0: 'anger',
 1: 'disgust',
 2: 'fear',
 3: 'joy',
 4: 'sadness',
 5: 'surprise',
 6: 'Neutral'}


In [10]:
# Load the checkpoint named in the config cell as a multi-label classifier
# with one output per entry of emotion_cols, then show its classification head
model = AutoModelForSequenceClassification.from_pretrained(
    hf_model_id,
    problem_type="multi_label_classification",
    num_labels=len(emotion_cols),
    label2id=class2id,
    id2label=id2class,
)
print(model.classifier)

config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=7, bias=True)
)


In [11]:
# Load the tokenizer from hf_tokenizer_id (set to the model's own repo in the config cell)
tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_id)

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/786k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/445k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.53M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

In [12]:
# A label is predicted as present when its sigmoid probability exceeds this value
threshold = 0.3

def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)) for a NumPy array or scalar."""
    return 1/(1 + np.exp(-x))

# Make sure dropout in the classification head is disabled (no-op if already in eval mode)
model.eval()

# Score the dev texts one at a time and collect one 0/1 vector per text.
pred_rows = []
with torch.no_grad():  # inference only: skip building the autograd graph
    for text in dev_df['text']:
        # truncation=True clips texts longer than the tokenizer's maximum length
        # instead of letting the model fail on an over-long sequence
        inputs = tokenizer(text, return_tensors='pt', truncation=True).to(model.device)
        logits = model(**inputs).logits
        # Drop the batch dimension (batch size is 1), then sigmoid + threshold per label
        probs = sigmoid(logits.squeeze(0).cpu().numpy())
        pred_rows.append((probs > threshold).astype(int))

# Write all predictions in one assignment (rows are in dev_df order);
# the reshape also fails loudly if the model's label count differs from emotion_cols
dev_df[emotion_cols] = np.array(pred_rows, dtype=int).reshape(len(dev_df), len(emotion_cols))

In [13]:
# Keep only the id and the dataset's own emotion columns (the added 'Neutral' column is left out)
save_cols = ['id', *emotion_cols_raw]
save_df = dev_df[save_cols]
save_df.head()

Unnamed: 0,id,anger,disgust,fear,joy,sadness,surprise
0,sun_dev_track_a_00001,0,0,0,1,0,1
1,sun_dev_track_a_00002,0,0,0,1,0,0
2,sun_dev_track_a_00003,1,0,0,0,0,0
3,sun_dev_track_a_00004,0,0,0,1,0,1
4,sun_dev_track_a_00005,0,0,0,1,0,0


In [15]:
# Write the predictions to save_path; index=False leaves the DataFrame index out of the CSV
save_df.to_csv(save_path, index=False)