In [1]:
# https://platform.olimpiada-ai.ro/problems/30

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [2]:
# Read the competition's train and test splits from the Kaggle input directory.
# Both CSVs are loaded the same way, in (train, test) order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/text-based-emotion-classification/train.csv",
        "/kaggle/input/text-based-emotion-classification/test.csv",
    )
)

In [9]:
# Fixed mapping from emotion name to the integer class id used as the training target.
class2idx = {
    'sadness': 0,
    'anger': 1,
    'love': 2,
    'surprise': 3,
    'fear': 4,
    'joy': 5
}

# Inverse mapping (class id -> emotion name), used to decode predictions.
idx2class = {v: k for k, v in class2idx.items()}

# Encode the target column in place.
# The dtype guard keeps this cell idempotent: once the column already holds integer
# ids, mapping it again would look up ints in a str-keyed dict and turn every label
# into a missing value.
if not pd.api.types.is_integer_dtype(train['label']):
    encoded_labels = train['label'].map(class2idx)

    # Fail loudly on anything outside the known classes instead of silently
    # feeding missing targets into the split / training steps.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown_labels = sorted(train.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected values in train['label']: {unknown_labels}")

    train['label'] = encoded_labels.astype('int64')

In [12]:
from catboost import Pool
from sklearn.model_selection import train_test_split

# The single 'text' column is both the full feature set and the column
# handed to Pool as a text feature.
features = ['text']
text_features = ['text']

X = train[features]
y = train['label']

# Hold out 10% of the rows for validation, stratified on the label,
# with a fixed seed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# Wrap each split in a CatBoost Pool for fitting / evaluation.
train_pool = Pool(data=X_train, label=y_train, text_features=text_features)
valid_pool = Pool(data=X_valid, label=y_valid, text_features=text_features)

In [19]:
from catboost import CatBoostClassifier

# Training configuration: multiclass objective, macro-averaged F1 as the
# evaluation metric, reported every 25 iterations, depth-4 trees.
params = dict(
    iterations=1000,
    loss_function='MultiClass',
    eval_metric='TotalF1:average=Macro',
    metric_period=25,
    max_depth=4,
)

model = CatBoostClassifier(**params)

# Fit on the training pool while tracking the metric on the validation pool.
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.113892
0:	learn: 0.5474287	test: 0.6133631	best: 0.6133631 (0)	total: 231ms	remaining: 3m 50s
25:	learn: 0.6387257	test: 0.6879484	best: 0.6879484 (25)	total: 6.26s	remaining: 3m 54s
50:	learn: 0.6840588	test: 0.7250471	best: 0.7250471 (50)	total: 11.9s	remaining: 3m 40s
75:	learn: 0.6989875	test: 0.7377022	best: 0.7377022 (75)	total: 17.4s	remaining: 3m 31s
100:	learn: 0.7126844	test: 0.7508305	best: 0.7508305 (100)	total: 22.7s	remaining: 3m 21s
125:	learn: 0.7186493	test: 0.7620243	best: 0.7620243 (125)	total: 28.2s	remaining: 3m 15s
150:	learn: 0.7239404	test: 0.7725448	best: 0.7725448 (150)	total: 33.6s	remaining: 3m 9s
175:	learn: 0.7297793	test: 0.7726728	best: 0.7726728 (175)	total: 39.1s	remaining: 3m 2s
200:	learn: 0.7381477	test: 0.7769422	best: 0.7769422 (200)	total: 44.5s	remaining: 2m 56s
225:	learn: 0.7413441	test: 0.7770605	best: 0.7770605 (225)	total: 50s	remaining: 2m 51s
250:	learn: 0.7470384	test: 0.7743223	best: 0.7770605 (225)	total: 55.6s	r

<catboost.core.CatBoostClassifier at 0x7a39e4836ad0>

In [20]:
from sklearn.metrics import f1_score

# Predict class ids for the held-out split and collapse the result to 1-D
# before comparing it with the true labels.
y_pred = model.predict(X_valid).reshape(-1)

# Macro-averaged F1 on the validation split.
score = f1_score(y_true=y_valid, y_pred=y_pred, average='macro')
print(f"F1 Score: {score:.5f}")

F1 Score: 0.81763


In [22]:
# Predict class ids for the test set using the same feature columns as training.
X_test = test[features]
y_pred = model.predict(X_test).flatten()

# Build the submission frame: one row per test sample, with predicted ids
# decoded back to emotion names through idx2class.
subm = pd.DataFrame({
    'SampleID': test['SampleID'],
    'label': [idx2class.get(class_id) for class_id in y_pred],
})

subm

Unnamed: 0,SampleID,label
0,16001,sadness
1,16002,sadness
2,16003,sadness
3,16004,joy
4,16005,sadness
...,...,...
1995,17996,anger
1996,17997,anger
1997,17998,joy
1998,17999,joy


In [24]:
# Persist the submission frame to the working directory, without the index column.
subm.to_csv(path_or_buf="submission.csv", index=False)