In [50]:
# https://platform.olimpiada-ai.ro/problems/34

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [51]:
# Read the labelled training table and the unlabelled test table from the
# competition's Kaggle input directory.
train, test = map(pd.read_csv, (
    "/kaggle/input/students-academic-status/train.csv",
    "/kaggle/input/students-academic-status/test.csv",
))

In [53]:
# Columns that are cast to strings and handed to CatBoost as categorical features.
cat_features = [
    'Marital status',
    'Application mode',
    'Course',
    'Previous qualification',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

# Class name -> integer id used as the training label.
class2idx = dict(Graduate=0, Dropout=1, Enrolled=2)

# Integer id -> class name, used to decode predictions.
idx2class = dict(zip(class2idx.values(), class2idx.keys()))

In [54]:
def process_df(df, cat_cols=None, label_map=None):
    """Return a model-ready copy of ``df``; the input frame is left untouched.

    Transformations applied to the copy:

    * ``Nacionality`` becomes a 0/1 flag that is 1 where the value equals 1.
    * ``Target`` (only when the column is present) is encoded from class
      names to integer ids through ``label_map``.
    * Every column listed in ``cat_cols`` is cast to ``str``.

    The function is safe to run more than once (e.g. when a notebook cell is
    re-executed): the flag is already 0/1 after the first pass, and targets
    that are already integer ids are kept as they are instead of being
    turned into missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Nacionality``, the categorical columns, and
        optionally ``Target``.
    cat_cols : iterable of str, optional
        Columns to cast to ``str``. Defaults to the notebook-level
        ``cat_features`` list.
    label_map : dict, optional
        Class name -> integer id. Defaults to the notebook-level
        ``class2idx`` dict. Its keys and values must not overlap, otherwise
        an already-encoded id cannot be told apart from a raw label.

    Returns
    -------
    pandas.DataFrame
        A new, processed frame.

    Raises
    ------
    ValueError
        If ``Target`` holds a value that is neither a key nor a value of
        ``label_map`` (this includes missing values).
    """
    # Resolve the defaults at call time so the notebook globals are only
    # needed when the caller does not pass explicit values.
    cat_cols = cat_features if cat_cols is None else cat_cols
    label_map = class2idx if label_map is None else label_map

    # Work on a copy: mutating the caller's frame makes re-running the cell
    # apply the transformation to already-transformed data.
    out = df.copy()

    out['Nacionality'] = (out['Nacionality'] == 1).astype(int)

    if 'Target' in out.columns:
        encoded_ids = set(label_map.values())

        def _encode(label):
            if label in label_map:
                return label_map[label]
            if label in encoded_ids:
                # Already encoded by a previous pass - keep the id.
                return label
            raise ValueError(f'Unknown Target label: {label!r}')

        out['Target'] = out['Target'].map(_encode)

    for col in cat_cols:
        out[col] = out[col].astype(str)

    return out

# Run the same preprocessing over both tables, training table first.
train, test = map(process_df, (train, test))

In [55]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Model inputs: every training column except the row identifier and the label,
# in their original order.
features = [name for name in train.columns if name not in {'SampleID', 'Target'}]

X = train[features]
y = train['Target']
X_test = test[features]

# Hold out 20% of the labelled rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Wrap both splits as CatBoost Pools, declaring which columns are categorical.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features)

In [67]:
from catboost import CatBoostClassifier

# Multi-class CatBoost setup; macro-averaged F1 is evaluated every 10 iterations.
params = dict(
    iterations=100,
    loss_function='MultiClass',
    eval_metric='TotalF1:average=Macro',
    metric_period=10,
    max_depth=6,
    random_state=42,
)

model = CatBoostClassifier(**params)

# Fit on the training pool, tracking the metric on the validation pool.
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.265685
0:	learn: 0.6750581	test: 0.6583064	best: 0.6583064 (0)	total: 10.8ms	remaining: 1.07s
10:	learn: 0.6723890	test: 0.6532476	best: 0.6583064 (0)	total: 109ms	remaining: 886ms
20:	learn: 0.7156952	test: 0.6683041	best: 0.6683041 (20)	total: 207ms	remaining: 780ms
30:	learn: 0.7397244	test: 0.6715990	best: 0.6715990 (30)	total: 297ms	remaining: 660ms
40:	learn: 0.7584741	test: 0.6717782	best: 0.6717782 (40)	total: 393ms	remaining: 566ms
50:	learn: 0.7821078	test: 0.6876870	best: 0.6876870 (50)	total: 488ms	remaining: 469ms
60:	learn: 0.8047641	test: 0.7079577	best: 0.7079577 (60)	total: 585ms	remaining: 374ms
70:	learn: 0.8079281	test: 0.6977091	best: 0.7079577 (60)	total: 690ms	remaining: 282ms
80:	learn: 0.8236237	test: 0.6966015	best: 0.7079577 (60)	total: 788ms	remaining: 185ms
90:	learn: 0.8332393	test: 0.7080427	best: 0.7080427 (90)	total: 885ms	remaining: 87.5ms
99:	learn: 0.8447289	test: 0.7110946	best: 0.7110946 (99)	total: 971ms	remaining: 0us

best

<catboost.core.CatBoostClassifier at 0x7ac90ff02ad0>

In [72]:
from sklearn.metrics import f1_score

# Predicted class ids for the held-out rows, collapsed to one dimension.
y_pred = model.predict(X_valid).ravel()

# Macro-averaged F1 between the held-out labels and the predictions.
score = f1_score(y_true=y_valid, y_pred=y_pred, average='macro')

print(f'Score: {score:.5f}')

Score: 0.71109


In [73]:
# Predicted class ids for the test rows, collapsed to one dimension.
y_pred = model.predict(X_test).ravel()

# Submission table: the test row ids plus the predictions decoded back to
# class names.
subm = test[['SampleID']].copy()
subm['Target'] = pd.Series(y_pred, index=subm.index).map(idx2class.get)

subm

Unnamed: 0,SampleID,Target
0,1853,Graduate
1,2399,Graduate
2,510,Dropout
3,242,Graduate
4,3392,Graduate
...,...,...
880,555,Enrolled
881,2418,Graduate
882,2367,Graduate
883,1801,Graduate


In [75]:
# Write the submission to the working directory, leaving out the index column.
subm.to_csv(path_or_buf="submission.csv", index=False)