In [67]:
# https://platform.olimpiada-ai.ro/problems/31

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [68]:
# Columns that are never model inputs: the regression target and the row id.
NON_FEATURE_COLUMNS = ['Exam_Score', 'SampleID']


def _int_columns_to_float(df, skip=NON_FEATURE_COLUMNS):
    """Cast every int64 column of `df` to float64, except those listed in `skip`.

    The frame is modified in place and also returned for convenience.
    Presumably the cast exists so that scaled (float) values can later be
    written back into these columns without a dtype change -- confirm.
    """
    for c in df.columns:
        if c not in skip and df[c].dtype == 'int64':
            df[c] = df[c].astype(np.float64)
    return df


train = pd.read_csv("/kaggle/input/academic-performance/train.csv")
test = pd.read_csv("/kaggle/input/academic-performance/test.csv")

# The skip list previously spelled the target as 'Exam_score', which does not
# match the 'Exam_Score' column used elsewhere in the notebook, so the target
# was not actually excluded from the cast. One helper now serves both frames.
train = _int_columns_to_float(train)
test = _int_columns_to_float(test)

train.shape, test.shape

((5102, 21), (1276, 20))

In [69]:
from sklearn.model_selection import train_test_split
from catboost import Pool
from sklearn.preprocessing import StandardScaler

# Model inputs are all columns except the target and the row identifier.
features = [c for c in train.columns if c not in ['Exam_Score', 'SampleID']]
# Non-object columns are standardised; object columns are handed to CatBoost
# as categorical features.
features_to_scale = [c for c in features if train[c].dtype!='object']
cat_features = [c for c in features if train[c].dtype=='object']

# Explicit copies: X and X_test are written to below via .loc. Without
# .copy() they are selections of `train` / `test`, and pandas emits
# SettingWithCopyWarning on those writes.
X, y = train[features].copy(), train['Exam_Score']
X_test = test[features].copy()

scaler = StandardScaler()
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/validation split below, so validation rows contribute to the
# mean/std statistics.
scaler.fit(X[features_to_scale])

X.loc[:, features_to_scale] = scaler.transform(X[features_to_scale])
X_test.loc[:, features_to_scale] = scaler.transform(X_test[features_to_scale])


# Hold out 10% of the rows for validation; fixed seed for a repeatable split.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42, test_size=0.1)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

In [70]:
from catboost import CatBoostRegressor

# Hyper-parameters for the regressor: RMSE is both the training objective and
# the metric evaluated on the validation pool.
params = dict(
    iterations=200,
    loss_function='RMSE',
    eval_metric='RMSE',
    metric_period=100,
    max_depth=6,
    random_state=42,
)

model = CatBoostRegressor(**params)

# Train on the training pool and track the metric on the validation pool.
# Kept as the last expression so the cell still displays the fitted model.
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.172658
0:	learn: 3.5286082	test: 4.2349555	best: 4.2349555 (0)	total: 9.6ms	remaining: 1.91s
100:	learn: 1.7666019	test: 3.1103602	best: 3.1103602 (100)	total: 611ms	remaining: 599ms
199:	learn: 1.5820302	test: 3.0990200	best: 3.0990200 (199)	total: 1.23s	remaining: 0us

bestTest = 3.09902001
bestIteration = 199



<catboost.core.CatBoostRegressor at 0x7a7cd2abb210>

In [71]:
from sklearn.metrics import mean_squared_error

# Predictions for the held-out validation rows.
y_pred = model.predict(X_valid).flatten()

# Root-mean-squared error, computed as sqrt(MSE). The previous call relied on
# `squared=False`, which is deprecated in scikit-learn 1.4 and removed in 1.6,
# and imported the metric under the misleading alias `mae` even though the
# value is an RMSE.
score = np.sqrt(mean_squared_error(y_valid, y_pred))

print(f"Score: {score:.5f}")

Score: 3.09902


In [72]:
# Predict the target for the test rows with the fitted model.
y_pred = model.predict(X_test).flatten()

# Submission frame: each SampleID paired with its predicted Exam_Score.
subm = test[['SampleID']].assign(Exam_Score=y_pred)

subm.head()

Unnamed: 0,SampleID,Exam_Score
0,2719,73.854693
1,4276,65.864823
2,4142,70.645244
3,1200,71.465829
4,4547,68.532923


In [73]:
# Write the submission file without the DataFrame index column.
subm.to_csv(path_or_buf="submission.csv", index=False)