In [1]:
# https://platform.olimpiada-ai.ro/problems/31
# https://platform.olimpiada-ai.ro/problems/27

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Problem 31

In [2]:
# Load the Problem 31 train/test splits from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/academic-performance/train.csv")
test = pd.read_csv("/kaggle/input/academic-performance/test.csv")

# Columns that are not model inputs and must keep their original dtype.
# The target column is spelled 'Exam_Score' (capital S) wherever it is read
# in this notebook; the previous lowercase 'Exam_score' never matched, so an
# int64 target would have been cast to float along with the features.
non_feature_cols = ['Exam_Score', 'SampleID']

# Promote int64 feature columns to float64 in both frames. A single loop keeps
# the train and test handling from drifting apart.
for frame in (train, test):
    for col in frame.columns:
        if col not in non_feature_cols and frame[col].dtype == 'int64':
            frame[col] = frame[col].astype(np.float64)

train.shape, test.shape

((5102, 21), (1276, 20))

In [3]:
from sklearn.model_selection import train_test_split
from catboost import Pool
from sklearn.preprocessing import StandardScaler

# Every column except the target and the row identifier is a model input.
features = [c for c in train.columns if c not in ['Exam_Score', 'SampleID']]
# Non-object columns are standardized; object columns are handed to CatBoost
# as categorical features.
features_to_scale = [c for c in features if train[c].dtype != 'object']
cat_features = [c for c in features if train[c].dtype == 'object']

# Take explicit copies so the scaled values are written into frames we own
# rather than into a selection derived from `train` / `test` (which triggers
# pandas' SettingWithCopyWarning).
X, y = train[features].copy(), train['Exam_Score']
X_test = test[features].copy()

# Split BEFORE fitting the scaler so the held-out rows do not contribute to
# the mean/std used for preprocessing (no validation leakage).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42, test_size=0.1)
X_train, X_valid = X_train.copy(), X_valid.copy()

scaler = StandardScaler()
scaler.fit(X_train[features_to_scale])

# Apply the training-set statistics to every split. `X` itself stays unscaled.
for part in (X_train, X_valid, X_test):
    part.loc[:, features_to_scale] = scaler.transform(part[features_to_scale])

train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

In [4]:
from catboost import CatBoostRegressor

# Hyperparameters for the Problem 31 regressor: RMSE is both the training
# objective and the metric evaluated on the validation pool.
params = dict(
    iterations=200,
    loss_function='RMSE',
    eval_metric='RMSE',
    metric_period=100,
    max_depth=6,
    random_state=42,
)

model = CatBoostRegressor(**params)

# Train on the training pool while tracking the metric on the validation pool.
# `fit` returns the estimator, which the cell displays as its last expression.
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.172658
0:	learn: 3.5286082	test: 4.2349555	best: 4.2349555 (0)	total: 63ms	remaining: 12.5s
100:	learn: 1.7666019	test: 3.1103602	best: 3.1103602 (100)	total: 995ms	remaining: 976ms
199:	learn: 1.5820302	test: 3.0990200	best: 3.0990200 (199)	total: 1.9s	remaining: 0us

bestTest = 3.09902001
bestIteration = 199



<catboost.core.CatBoostRegressor at 0x7d74496ae990>

In [5]:
# Import under its real name: this is the mean *squared* error, and the old
# alias `mae` suggested mean absolute error.
from sklearn.metrics import mean_squared_error

y_pred = model.predict(X_valid).flatten()

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which is
# deprecated and removed in newer scikit-learn releases; this form works on
# every version.
score = np.sqrt(mean_squared_error(y_valid, y_pred))

print(f"Score: {score:.5f}")

Score: 3.09902


In [6]:
# Score the test features and flatten the predictions to one dimension.
y_pred = model.predict(X_test).ravel()

# Pair each test identifier with its predicted exam score.
subm = test[['SampleID']].assign(Exam_Score=y_pred)

subm.head()

Unnamed: 0,SampleID,Exam_Score
0,2719,73.854693
1,4276,65.864823
2,4142,70.645244
3,1200,71.465829
4,4547,68.532923


In [7]:
# Persist the Problem 31 predictions; the DataFrame index is not written.
subm.to_csv(path_or_buf="submission31.csv", index=False)

# Problem 27

In [8]:
# Load the Problem 27 train/test splits from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/academic-performance-2/train.csv")
test = pd.read_csv("/kaggle/input/academic-performance-2/test.csv")

# Columns that are not model inputs and must keep their original dtype.
# The target column is spelled 'Exam_Score' (capital S) wherever it is read
# in this notebook; the previous lowercase 'Exam_score' never matched, so an
# int64 target would have been cast to float along with the features.
non_feature_cols = ['Exam_Score', 'ID']

# Categorical columns whose missing values are replaced by an explicit
# 'noinfo' level.
cat_cols_to_fill = ['Parental_Education_Level', 'Distance_from_Home', 'Teacher_Quality']

# Apply identical preprocessing to both frames so they cannot drift apart.
for frame in (train, test):
    for col in frame.columns:
        if col not in non_feature_cols and frame[col].dtype == 'int64':
            frame[col] = frame[col].astype(np.float64)
    for col in cat_cols_to_fill:
        frame[col] = frame[col].fillna('noinfo')

train.shape, test.shape

((5285, 21), (1322, 20))

In [9]:
from sklearn.model_selection import train_test_split
from catboost import Pool
from sklearn.preprocessing import StandardScaler

# Every column except the target and the row identifier is a model input.
features = [c for c in train.columns if c not in ['Exam_Score', 'ID']]
# Non-object columns are standardized; object columns are handed to CatBoost
# as categorical features.
features_to_scale = [c for c in features if train[c].dtype != 'object']
cat_features = [c for c in features if train[c].dtype == 'object']

# Take explicit copies so the scaled values are written into frames we own
# rather than into a selection derived from `train` / `test` (which triggers
# pandas' SettingWithCopyWarning). `train` and `test` keep their raw values.
X, y = train[features].copy(), train['Exam_Score']
X_test = test[features].copy()

# Split BEFORE fitting the scaler so the held-out rows do not contribute to
# the mean/std used for preprocessing (no validation leakage).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42, test_size=0.1)
X_train, X_valid = X_train.copy(), X_valid.copy()

scaler = StandardScaler()
scaler.fit(X_train[features_to_scale])

# Apply the training-set statistics to every split. `X` itself stays unscaled.
for part in (X_train, X_valid, X_test):
    part.loc[:, features_to_scale] = scaler.transform(part[features_to_scale])

train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

In [10]:
from catboost import CatBoostRegressor

# Hyperparameters for the Problem 27 regressor: MAE is both the training
# objective and the metric evaluated on the validation pool.
params = dict(
    iterations=2000,
    loss_function='MAE',
    eval_metric='MAE',
    metric_period=100,
    max_depth=6,
    random_state=42,
)

model = CatBoostRegressor(**params)

# Train on the training pool while tracking the metric on the validation pool.
# `fit` returns the estimator, which the cell displays as its last expression.
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 2.8015718	test: 2.6184489	best: 2.6184489 (0)	total: 13.1ms	remaining: 26.2s
100:	learn: 1.0965740	test: 1.0057174	best: 1.0057174 (100)	total: 1.11s	remaining: 20.9s
200:	learn: 0.7400800	test: 0.6671031	best: 0.6671031 (200)	total: 2.05s	remaining: 18.4s
300:	learn: 0.5895634	test: 0.5214906	best: 0.5214906 (300)	total: 2.89s	remaining: 16.3s
400:	learn: 0.5175625	test: 0.4633531	best: 0.4633531 (400)	total: 3.71s	remaining: 14.8s
500:	learn: 0.4806305	test: 0.4373959	best: 0.4373959 (500)	total: 4.51s	remaining: 13.5s
600:	learn: 0.4577065	test: 0.4260045	best: 0.4260045 (600)	total: 5.3s	remaining: 12.3s
700:	learn: 0.4432630	test: 0.4200944	best: 0.4200944 (700)	total: 6.1s	remaining: 11.3s
800:	learn: 0.4326658	test: 0.4178572	best: 0.4178572 (800)	total: 6.9s	remaining: 10.3s
900:	learn: 0.4233358	test: 0.4159046	best: 0.4159046 (900)	total: 7.81s	remaining: 9.53s
1000:	learn: 0.4162968	test: 0.4146340	best: 0.4146340 (1000)	total: 8.62s	remaining: 8.6s
1100:	learn: 0.

<catboost.core.CatBoostRegressor at 0x7d74595ae690>

In [11]:
# Training-set statistics reused for every test row.
hours_studied_avg = train['Hours_Studied'].mean()
motivation_level_dict = train['Motivation_Level'].value_counts().to_dict()

y_pred = model.predict(X_test)

# Column-wise accumulator: five entries (subtasks 1..5) per test row.
answer_columns = {
    'subtaskID': [],
    'datapointID': [],
    'answer': []
}

for i in range(len(test)):
    # Answers in subtask order 1..5 for this row.
    row_answers = (
        # 1: absolute distance of Hours_Studied from the training mean
        abs(test['Hours_Studied'][i]-hours_studied_avg),
        # 2: whether Sleep_Hours is below 7
        test['Sleep_Hours'][i] < 7,
        # 3: number of training rows with Previous_Scores >= this row's value
        (train['Previous_Scores'] >= test['Previous_Scores'][i]).sum(),
        # 4: training frequency of this row's Motivation_Level (0 if unseen)
        motivation_level_dict.get(test['Motivation_Level'][i], 0),
        # 5: the model's prediction for this row
        y_pred[i],
    )
    for sid, answer in enumerate(row_answers, start=1):
        answer_columns['subtaskID'].append(sid)
        answer_columns['datapointID'].append(test['ID'][i])
        answer_columns['answer'].append(answer)

subm = pd.DataFrame(answer_columns)

subm.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,5286,0.032923
1,2,5286,False
2,3,5286,1424
3,4,5286,1039
4,5,5286,64.529961


In [12]:
# Persist the Problem 27 answers; the DataFrame index is not written.
subm.to_csv(path_or_buf="submission27.csv", index=False)