In [49]:
# https://platform.olimpiada-ai.ro/problems/49

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [50]:
# Load the competition splits. The 'Unnamed: 0' column is discarded from both
# (presumably a saved row index -- confirm against the CSVs).
train = (
    pd.read_csv("/kaggle/input/elite-high-school/train.csv")
    .drop(columns='Unnamed: 0')
)
test = (
    pd.read_csv("/kaggle/input/elite-high-school/test.csv")
    .drop(columns='Unnamed: 0')
)

In [51]:
# Mean of the target within each 'gen' group (i.e. the admission rate per group).
train['status_admitere'].groupby(train['gen']).mean()

gen
F    0.276596
M    0.248062
Name: status_admitere, dtype: float64

In [52]:
# Mean of the target per 'judet', listed from highest to lowest.
(
    train
    .groupby('judet')['status_admitere']
    .agg('mean')
    .sort_values(ascending=False)
)

judet
AR    1.000000
TR    0.833333
CS    0.800000
VS    0.571429
CT    0.500000
TL    0.500000
BZ    0.500000
OT    0.444444
VN    0.400000
IL    0.375000
MM    0.333333
GJ    0.333333
SJ    0.333333
BT    0.333333
BR    0.333333
BC    0.333333
HR    0.285714
CL    0.250000
CV    0.250000
PH    0.250000
VL    0.250000
GR    0.250000
AG    0.250000
AB    0.222222
BN    0.200000
IS    0.200000
MS    0.181818
B     0.166667
DJ    0.166667
TM    0.166667
CJ    0.142857
SB    0.142857
NT    0.125000
SV    0.111111
HD    0.111111
BH    0.083333
SM    0.000000
BV    0.000000
DB    0.000000
GL    0.000000
MH    0.000000
IF    0.000000
Name: status_admitere, dtype: float64

In [53]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Columns handed to CatBoost as categorical, followed by the remaining inputs.
cat_features = ['gen', 'judet']
features = cat_features + ['NT', 'MEV', 'MATE', 'MGIM']

X = train[features]
y = train['status_admitere']
X_test = test[features]

# Hold out 10% of the training rows for validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features)

In [54]:
from catboost import CatBoostClassifier

# One boosting iteration with depth-1 trees; accuracy is reported every iteration.
params = dict(
    iterations=1,
    loss_function='Logloss',
    eval_metric='Accuracy',
    metric_period=1,
    max_depth=1,
)

model = CatBoostClassifier(**params)

# Kept as the last expression so the cell still displays the fitted model.
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.450976
0:	learn: 0.9835391	test: 0.9629630	best: 0.9629630 (0)	total: 308us	remaining: 0us

bestTest = 0.962962963
bestIteration = 0



<catboost.core.CatBoostClassifier at 0x7f32ee709650>

In [55]:
from sklearn.metrics import accuracy_score

# Predicted labels for the held-out rows, flattened to 1-D.
y_pred = model.predict(X_valid).flatten()

# Fraction of held-out rows whose predicted label matches the true label.
score = accuracy_score(y_true=y_valid, y_pred=y_pred)

print(f"Score: {score:.5f}")

Score: 0.96296


In [56]:
# Build the long-format submission: three rows per test datapoint, one per subtask.
#   subtask 1 -> NT - MEV, rounded to 2 decimals
#   subtask 2 -> rank of the row by MEV, descending (rank 1 = highest MEV)
#   subtask 3 -> the model's predicted label

# A stable sort keeps rows with equal MEV in their original order, so tied
# ranks are reproducible instead of depending on the sort implementation.
rankings = test['MEV'].sort_values(ascending=False, kind='stable').index.tolist()
rank_map = {label: rank for rank, label in enumerate(rankings, start=1)}

y_pred = model.predict(X_test).flatten()

subm = []

# Walk the rows positionally (index label + raw column values) rather than
# using the loop counter as an index label, which is only correct when `test`
# happens to carry a default 0..n-1 index.
rows = zip(
    test.index,
    test['id'].to_numpy(),
    test['NT'].to_numpy(),
    test['MEV'].to_numpy(),
    y_pred,
)
for label, datapoint_id, nt, mev, pred in rows:
    answers = {
        1: round(nt - mev, 2),
        2: rank_map[label],
        3: pred,
    }
    for sid, answer in answers.items():
        subm.append({
            'subtaskID': sid,
            'datapointID': datapoint_id,
            'answer': answer
        })

subm = pd.DataFrame(subm)

subm

Unnamed: 0,subtaskID,datapointID,answer
0,1,289,3.28
1,2,289,168.00
2,3,289,1.00
3,1,312,-1.00
4,2,312,30.00
...,...,...,...
535,2,362,151.00
536,3,362,0.00
537,1,372,-3.57
538,2,372,175.00


In [57]:
# Write the submission to the working directory without the DataFrame index.
subm.to_csv(path_or_buf="submission.csv", index=False)