In [1]:
# https://platform.olimpiada-ai.ro/en/problems/85

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("/kaggle/input/bac-scrutiny/train.csv")
test = pd.read_csv('/kaggle/input/bac-scrutiny/test.csv')

train.shape, test.shape

((120759, 10), (13017, 8))

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120759 entries, 0 to 120758
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  120759 non-null  int64  
 1   an                  120759 non-null  int64  
 2   materie             120759 non-null  object 
 3   liceu               120759 non-null  object 
 4   judet               120759 non-null  object 
 5   medie               120759 non-null  float64
 6   procent_reusita     120759 non-null  float64
 7   numar_candidati     120759 non-null  int64  
 8   preferinta_materie  120759 non-null  float64
 9   anomalie            120759 non-null  int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 9.2+ MB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13017 entries, 0 to 13016
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  13017 non-null  int64  
 1   an                  13017 non-null  int64  
 2   materie             13017 non-null  object 
 3   liceu               13017 non-null  object 
 4   judet               13017 non-null  object 
 5   procent_reusita     13017 non-null  float64
 6   numar_candidati     13017 non-null  int64  
 7   preferinta_materie  13017 non-null  float64
dtypes: float64(2), int64(3), object(3)
memory usage: 813.7+ KB


In [5]:
train.head(2)

Unnamed: 0,id,an,materie,liceu,judet,medie,procent_reusita,numar_candidati,preferinta_materie,anomalie
0,1,2014,"ANATOMIE SI FIZIOLOGIE UMANA, GENETICA SI ECOL...","LICEUL TEHNOLOGIC AGRICOL ""MIHAIL KOGALNICEANU...",Iași,10.0,100.0,1,2.9,0
1,2,2014,"ANATOMIE SI FIZIOLOGIE UMANA, GENETICA SI ECOL...",LICEUL TEORETIC GATAIA,Timiș,10.0,100.0,1,1.3,0


In [6]:
train['materie'].value_counts(normalize=True)

materie
LIMBA ROMANA                                                0.110841
GENERAL                                                     0.110708
GEOGRAFIE                                                   0.086370
BIOLOGIE VEGETALA SI ANIMALA                                0.072906
ANATOMIE SI FIZIOLOGIE UMANA, GENETICA SI ECOLOGIE UMANA    0.065370
MATEMATICA TEHN                                             0.065221
ISTORIE                                                     0.060269
MATEMATICA MATE-INFO                                        0.049007
LOGICA, ARGUMENTARE SI COMUNICARE                           0.046680
MATEMATICA ST-NAT                                           0.042556
PSIHOLOGIE                                                  0.036925
FIZICA TEO                                                  0.034142
INFORMATICA MI C-C++                                        0.031517
CHIMIE ORGANICA TEO NIVEL I-II                              0.028072
FIZICA TEH                

In [7]:
train['liceu'].value_counts(normalize=True)

liceu
LICEUL TEORETIC "NICOLAE IORGA"                 0.004339
COLEGIUL TEHNIC "COSTIN D. NENITESCU"           0.003619
LICEUL TEORETIC "ION BARBU"                     0.002393
LICEUL TEORETIC "TRAIAN"                        0.001375
COLEGIUL NATIONAL "ANDREI MURESANU" BISTRITA    0.001342
                                                  ...   
LICEUL TEHNOLOGIC SPECIAL"PELENDAVA" CRAIOVA    0.000033
GRADINITA SFANTUL ANDREI                        0.000033
SCOALA PROFESIONALA RUSCOVA                     0.000033
SCOALA GIMNAZIALA "LUCIAN BLAGA" FARCASA        0.000017
SCOALA PROFESIONALA POIENILE DE SUB MUNTE       0.000017
Name: proportion, Length: 1772, dtype: float64

In [8]:
train['judet'].value_counts(normalize=True)

judet
București          0.100721
Cluj               0.042788
Constanța          0.037902
Prahova            0.037471
Iași               0.034084
Bihor              0.033985
Timiș              0.032817
Suceava            0.031675
Dolj               0.028644
Brașov             0.028445
Argeș              0.028288
Bacău              0.027128
Mureș              0.027062
Maramureș          0.025100
Arad               0.024843
Galați             0.024205
Harghita           0.023949
Hunedoara          0.022657
Neamț              0.022359
Alba               0.021646
Sibiu              0.020851
Gorj               0.020346
Dâmbovița          0.020272
Botoșani           0.019369
Buzău              0.018475
Vâlcea             0.018400
Olt                0.018061
Satu-Mare          0.017845
Brăila             0.017705
Caraș-Severin      0.017332
Bistrița-Năsăud    0.016371
Vaslui             0.016214
Vrancea            0.014864
Teleorman          0.014434
Sălaj              0.014036
Covasna       

In [9]:
train['an'].value_counts(normalize=True)

an
2020    0.113615
2015    0.113515
2014    0.113424
2017    0.110890
2016    0.110725
2019    0.110327
2021    0.110029
2018    0.109433
2022    0.108042
Name: proportion, dtype: float64

In [10]:
train, valid = train[train['an']<2022], train[train['an']==2022]

train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)

In [11]:
from catboost import Pool

features = [c for c in train.columns if c not in ['medie', 'anomalie']]
cat_features = [c for c in train.select_dtypes('object').columns if c in features]
target_col = 'medie'

X_train, y_train = train[features], train[target_col]
X_valid, y_valid = valid[features], valid[target_col]

train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

In [12]:
from catboost import CatBoostRegressor

params = {
    'iterations': 3500,
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'metric_period': 500,
    'random_state': 42,
    'max_depth': 4,
    'task_type': 'GPU'
}

model = CatBoostRegressor(**params)

model.fit(train_pool, eval_set=valid_pool)

0:	learn: 1.5077881	test: 1.4461026	best: 1.4461026 (0)	total: 3.7s	remaining: 3h 35m 59s
500:	learn: 0.5036641	test: 0.5023163	best: 0.5023163 (500)	total: 8.2s	remaining: 49.1s
1000:	learn: 0.4791705	test: 0.4831252	best: 0.4831252 (1000)	total: 12.6s	remaining: 31.5s
1500:	learn: 0.4681916	test: 0.4755785	best: 0.4755785 (1500)	total: 17.1s	remaining: 22.8s
2000:	learn: 0.4613115	test: 0.4716112	best: 0.4716112 (2000)	total: 21.4s	remaining: 16s
2500:	learn: 0.4559398	test: 0.4696136	best: 0.4696136 (2500)	total: 25.8s	remaining: 10.3s
3000:	learn: 0.4518598	test: 0.4686532	best: 0.4686532 (3000)	total: 30.1s	remaining: 5s
3499:	learn: 0.4484616	test: 0.4686990	best: 0.4686532 (3000)	total: 34.5s	remaining: 0us
bestTest = 0.4686531821
bestIteration = 3000
Shrink model to first 3001 iterations.


<catboost.core.CatBoostRegressor at 0x7b3dde0e2ab0>

In [13]:
y_pred1 = model.predict(test[features])
y_pred1.shape

(13017,)

In [14]:
features = [c for c in train.columns if c not in ['medie', 'anomalie']]
cat_features = [c for c in train.select_dtypes('object').columns if c in features]
target_col = 'anomalie'

X_train, y_train = train[features], train[target_col]
X_valid, y_valid = valid[features], valid[target_col]

train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

In [15]:
from catboost import CatBoostClassifier

params = {
    'iterations': 1000,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'metric_period': 100,
    'random_state': 42,
    'max_depth': 4,
    'task_type': 'GPU'
}

model = CatBoostClassifier(**params)

model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.047839
0:	test: 0.8363491	best: 0.8363491 (0)	total: 5.66s	remaining: 1h 34m 16s
100:	test: 0.9996500	best: 0.9996500 (100)	total: 6.98s	remaining: 1m 2s
200:	test: 0.9996637	best: 0.9996637 (200)	total: 8.27s	remaining: 32.9s
300:	test: 0.9996770	best: 0.9996770 (300)	total: 9.54s	remaining: 22.2s
400:	test: 0.9996845	best: 0.9996845 (400)	total: 10.8s	remaining: 16.1s
500:	test: 0.9996890	best: 0.9996890 (500)	total: 12s	remaining: 11.9s
600:	test: 0.9996927	best: 0.9996927 (600)	total: 13.3s	remaining: 8.8s
700:	test: 0.9996950	best: 0.9996950 (700)	total: 14.5s	remaining: 6.19s
800:	test: 0.9997005	best: 0.9997005 (800)	total: 15.8s	remaining: 3.92s
900:	test: 0.9997029	best: 0.9997029 (900)	total: 17.1s	remaining: 1.88s
999:	test: 0.9997050	best: 0.9997050 (999)	total: 18.3s	remaining: 0us
bestTest = 0.999704957
bestIteration = 999


<catboost.core.CatBoostClassifier at 0x7b3dddba00e0>

In [16]:
y_pred2 = model.predict_proba(test[features])[:, 1]
y_pred2.shape

(13017,)

In [17]:
test[test['liceu']=='COLEGIUL NATIONAL "UNIREA" FOCSANI'].sort_values('numar_candidati', ascending=False).head(3)

Unnamed: 0,id,an,materie,liceu,judet,procent_reusita,numar_candidati,preferinta_materie
3565,124325,2023,GENERAL,"COLEGIUL NATIONAL ""UNIREA"" FOCSANI",Vrancea,98.6,212,100.0
7860,128620,2023,LIMBA ROMANA,"COLEGIUL NATIONAL ""UNIREA"" FOCSANI",Vrancea,99.5,212,100.0
10122,130882,2023,MATEMATICA MATE-INFO,"COLEGIUL NATIONAL ""UNIREA"" FOCSANI",Vrancea,100.0,115,54.2


In [18]:
train = pd.read_csv("/kaggle/input/bac-scrutiny/train.csv")

train[(train['liceu']=='COLEGIUL NATIONAL "UNIREA" FOCSANI') & (train['materie']=='INFORMATICA MI C-C++')].sort_values('preferinta_materie', ascending=False).head(3)

Unnamed: 0,id,an,materie,liceu,judet,medie,procent_reusita,numar_candidati,preferinta_materie,anomalie
113940,113941,2022,INFORMATICA MI C-C++,"COLEGIUL NATIONAL ""UNIREA"" FOCSANI",Vrancea,9.6,100.0,36,18.8,0
100860,100861,2021,INFORMATICA MI C-C++,"COLEGIUL NATIONAL ""UNIREA"" FOCSANI",Vrancea,9.51,100.0,34,18.7,0
47589,47590,2017,INFORMATICA MI C-C++,"COLEGIUL NATIONAL ""UNIREA"" FOCSANI",Vrancea,9.02,100.0,32,15.2,0


In [19]:
subm = {
    'id': [1, 2] + test['id'].tolist() * 2,
    'subtaskID': [1, 2] + [3] * len(test) + [4] * len(test),
    'answer': ['MATEMATICA MATE-INFO', 2022] + y_pred1.tolist() + y_pred2.tolist()
}

subm = pd.DataFrame(subm)
subm.to_csv("submission.csv", index=False)

subm.head()

Unnamed: 0,id,subtaskID,answer
0,1,1,MATEMATICA MATE-INFO
1,2,2,2022
2,120760,3,7.603672
3,120761,3,6.833541
4,120762,3,8.391938
