In [1]:
# https://platform.olimpiada-ai.ro/problems/68

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [2]:
train = pd.read_csv("/kaggle/input/author-of-verses/train.csv")
test = pd.read_csv("/kaggle/input/author-of-verses/test.csv")

train['Versuri'] = train['Versuri'].map(lambda x: x.replace('\n', ' '))
test['Versuri'] = test['Versuri'].map(lambda x: x.replace('\n', ' '))

train.shape, test.shape

((3415, 3), (854, 2))

In [3]:
train.head(3)

Unnamed: 0,Id,Versuri,Autor
0,Zho5dMCjumAcYUawccJqMy,"Pe barbari de-i risipeşte, Ş-apoi vecinic priv...",Vasile Alecsandri
1,DphNgshYSbs9vZGUempJMG,Sau va avea mulţi copii. Pentru că totuna este...,Grigore Vieru
2,KnZnz7NwJqW7zdgnUFwk6R,"Copilăriei noastre. Frate, ei urăsc Cântecul t...",Grigore Vieru


In [4]:
train['Autor'].value_counts()

Autor
Grigore Vieru        949
Mihai Eminescu       759
Vasile Alecsandri    541
Ana Blandiana        387
George Toparceanu    348
Lucian Blaga         242
George Bacovia       189
Name: count, dtype: int64

In [5]:
train['Versuri'][0]

'Pe barbari de-i risipeşte, Ş-apoi vecinic priveghează, Sentinelă mult vetează, Şi te-aţine la hotare'

In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-large")

2025-12-27 05:55:15.366982: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766814915.556209      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766814915.610347      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766814916.066218      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766814916.066260      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766814916.066263      55 computation_placer.cc:177] computation placer alr

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [7]:
sentences = [
    "Vremea este frumoasă astăzi.",
    "E atât de soare afară!",
    "A condus spre stadion."
]
embeddings = model.encode(sentences)

embeddings.shape

(3, 1024)

In [8]:
def get_embeddings(df, batch_size=256):
    embeddings = []
    for i in tqdm(range(0, len(df), batch_size), desc='Getting Embeddings'):
        sentences = df.iloc[i:i+batch_size, 1].tolist()
        emb = model.encode(sentences)
        embeddings.append(emb)
    embeddings = np.concatenate(embeddings, axis=0)
    return embeddings

train_embeddings = get_embeddings(train)
test_embeddings = get_embeddings(test)

Getting Embeddings:   0%|          | 0/14 [00:00<?, ?it/s]

Getting Embeddings:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
train_embeddings.shape, test_embeddings.shape

((3415, 1024), (854, 1024))

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['Autor'] = le.fit_transform(train['Autor'])

In [11]:
from sklearn.model_selection import train_test_split

y = train['Autor']

train_embeddings, valid_embeddings, y_train, y_valid = train_test_split(train_embeddings, y, stratify=y, test_size=0.1, random_state=42)
train_embeddings.shape, valid_embeddings.shape, y_train.shape, y_valid.shape

((3073, 1024), (342, 1024), (3073,), (342,))

In [12]:
# from catboost import CatBoostClassifier

# params = {
#     'iterations': 1000,
#     'loss_function': 'MultiClass',
#     'eval_metric': 'Accuracy',
#     'metric_period': 100,
#     'max_depth': 4,
#     'random_state': 42,
#     'task_type': 'GPU'
# }

# model = CatBoostClassifier(**params)

# model.fit(
#     train_embeddings, y_train,
#     eval_set=(valid_embeddings, y_valid)
# )

In [13]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(max_iter=50,
                      hidden_layer_sizes=(256, 128),
                      early_stopping=False,
                      random_state=42)

model.fit(train_embeddings, y_train)



In [14]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(valid_embeddings)

score = accuracy_score(y_valid, y_pred)

print(f'Score: {score:.5f}')

Score: 0.69006


In [15]:
y_pred = model.predict(test_embeddings)

subm = pd.DataFrame({
    'Id': test['Id'],
    'Autor': le.inverse_transform(y_pred)
})

subm.to_csv("submission.csv", index=False)
subm.head()

Unnamed: 0,Id,Autor
0,7oYYGUufTgpKPCcVUhTpAS,Vasile Alecsandri
1,EGWuQxkFSjW59anvGxW7nS,Grigore Vieru
2,XfHVBFedR9cDb4DBvuwtsF,Mihai Eminescu
3,E3EbfZuVsZhUxqRYvBbywM,Lucian Blaga
4,kAxEyKrjhpvUu25CAjh5F6,Mihai Eminescu
