In [63]:
# https://platform.olimpiada-ai.ro/en/problems/82

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [64]:
train = pd.read_csv("/kaggle/input/educational-performance/train.csv")
test = pd.read_csv("/kaggle/input/educational-performance/test.csv")

train.shape, test.shape

((8167, 14), (869, 13))

In [65]:
train.head()

Unnamed: 0,SampleID,County,School Type,Alternative Education Accountability,Charter,Number of Students,% Economically Disadvantaged,% EB/EL Students,Overall Rating,Student Achievement Rating,School Progress Rating,Academic Growth Rating,Closing the Gaps Rating,Relative Performance Rating
0,1,MCLENNAN,Elementary,No,No,502,0.584,0.082,B,B,B,C,B,B
1,2,DALLAS,High School,No,No,447,0.718,0.233,A,A,A,B,A,A
2,3,DALLAS,Elementary,No,No,473,0.812,0.609,D,D,D,D,D,D
3,4,REFUGIO,Middle School,No,No,139,0.676,0.072,B,C,B,C,C,B
4,5,TARRANT,Middle School,No,No,793,0.916,0.327,C,F,C,C,C,F


In [66]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8167 entries, 0 to 8166
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   SampleID                              8167 non-null   int64  
 1   County                                8167 non-null   object 
 2   School Type                           8167 non-null   object 
 3   Alternative Education Accountability  8167 non-null   object 
 4   Charter                               8167 non-null   object 
 5   Number of Students                    8167 non-null   int64  
 6   % Economically Disadvantaged          8167 non-null   float64
 7   % EB/EL Students                      8167 non-null   float64
 8   Overall Rating                        8167 non-null   object 
 9   Student Achievement Rating            8167 non-null   object 
 10  School Progress Rating                8167 non-null   object 
 11  Academic Growth R

In [67]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 869 entries, 0 to 868
Data columns (total 13 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   SampleID                              869 non-null    int64  
 1   County                                869 non-null    object 
 2   School Type                           869 non-null    object 
 3   Alternative Education Accountability  869 non-null    object 
 4   Charter                               869 non-null    object 
 5   Number of Students                    869 non-null    int64  
 6   % Economically Disadvantaged          869 non-null    float64
 7   % EB/EL Students                      869 non-null    float64
 8   Overall Rating                        869 non-null    object 
 9   Student Achievement Rating            869 non-null    object 
 10  School Progress Rating                869 non-null    object 
 11  Academic Growth Rat

In [68]:
train['Relative Performance Rating'].value_counts(normalize=True).sort_index()

Relative Performance Rating
A    0.100282
B    0.352761
C    0.297539
D    0.143872
F    0.105547
Name: proportion, dtype: float64

In [69]:
from sklearn.model_selection import train_test_split
from catboost import Pool

features = [c for c in train.columns if c not in ['SampleID', 'Relative Performance Rating']]
cat_features = [c for c in train.select_dtypes('object').columns if c in features]
target_col = 'Relative Performance Rating'

X, y = train[features], train[target_col]
X_test = test[features]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.1, random_state=42)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

In [70]:
from catboost import CatBoostClassifier

params = {
    'iterations': 300,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'metric_period': 100,
    'max_depth': 6,
    'random_state': 42
}

model = CatBoostClassifier(**params)

model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.177989
0:	learn: 0.7997279	test: 0.7980416	best: 0.7980416 (0)	total: 79.2ms	remaining: 23.7s
100:	learn: 0.9111565	test: 0.8935129	best: 0.8935129 (100)	total: 8.44s	remaining: 16.6s
200:	learn: 0.9356463	test: 0.8947368	best: 0.8947368 (200)	total: 16.7s	remaining: 8.25s
299:	learn: 0.9519728	test: 0.8922889	best: 0.8947368 (200)	total: 25.1s	remaining: 0us

bestTest = 0.8947368421
bestIteration = 200

Shrink model to first 201 iterations.


<catboost.core.CatBoostClassifier at 0x791033d82390>

In [71]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_valid).flatten()

score = accuracy_score(y_valid, y_pred)

print(f'Score: {score:.5f}')

Score: 0.89474


In [72]:
y_pred = model.predict(X_test).flatten()

subm = pd.DataFrame({
    'subtaskID': [1, 2] + [3] * len(test),
    'datapointID': [1, 2] + test['SampleID'].tolist(),
    'answer': (
        [((test['County']=='ANDERSON') & (test['School Type']=='Elementary')).sum().item()] +
        [test['Academic Growth Rating'].value_counts().sort_values().index[-1]] + 
        y_pred.tolist()
    )
})

subm.to_csv("submission.csv", index=False)

subm.head(7)

Unnamed: 0,subtaskID,datapointID,answer
0,1,1,1
1,2,2,C
2,3,8168,B
3,3,8169,C
4,3,8170,D
5,3,8171,C
6,3,8172,C
