In [1]:
# dependencies
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [2]:
# Load data: read the train and test CSV files.
# Both locations are relative to the notebook's working directory.
data_path = Path('../data')
submissions_path = Path('../submissions')

# Read both splits in one expression; the order of the file names matches the
# order of the unpacked names on the left-hand side.
train_df, test_df = (
    pd.read_csv(data_path / file_name) for file_name in ('train.csv', 'test.csv')
)

# Preview the first rows of the training frame as the cell output.
train_df.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [3]:
# Prepare data: choose the model inputs and flag the categorical columns.
target = 'Target'

# Every column except the row identifier and the label is a model input.
features = [name for name in train_df.columns if name not in ('id', target)]

# Columns handed to the model as categorical features; everything else in
# `features` is treated as numeric.
cat_features = [
    'Marital status', 'Application mode', 'Course',
    'Previous qualification', 'Nacionality',
    "Mother's qualification", "Father's qualification",
    "Mother's occupation", "Father's occupation",
]
num_features = [name for name in features if name not in cat_features]

# Cast the categorical columns in place on both frames. Each frame infers its
# category set from its own values.
for frame in (train_df, test_df):
    for name in cat_features:
        frame[name] = frame[name].astype('category')

In [4]:
# Print the relative frequency of every level of each categorical column,
# most frequent first (the default ordering of value_counts), followed by a
# separator line.
separator = "-" * 20
for col in cat_features:
    proportions = train_df[col].value_counts(normalize=True)
    print(f"Column: {col}", proportions, separator, sep="\n")

Column: Marital status
Marital status
1    0.917287
2    0.069212
4    0.011318
5    0.001516
6    0.000457
3    0.000209
Name: proportion, dtype: float64
--------------------
Column: Application mode
Application mode
1     0.460375
17    0.215936
39    0.189746
44    0.039363
43    0.034907
7     0.019577
18    0.016414
42    0.007201
51    0.005776
16    0.003463
53    0.002940
15    0.002392
5     0.001137
10    0.000562
2     0.000105
27    0.000026
4     0.000013
26    0.000013
35    0.000013
12    0.000013
9     0.000013
3     0.000013
Name: proportion, dtype: float64
--------------------
Column: Course
Course
9500    0.157793
9773    0.107347
9238    0.103701
9147    0.101166
9254    0.070898
9085    0.070219
9670    0.062208
9991    0.053020
9003    0.048786
9070    0.042879
9853    0.041794
9119    0.039259
171     0.037364
8014    0.031862
9130    0.020989
9556    0.009749
33      0.000941
39      0.000013
979     0.000013
Name: proportion, dtype: float64
--------------------

In [5]:
# Split data: hold out 20% of the training rows for validation.
X = train_df[features]
y = train_df[target]
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is identical on every run
)

In [6]:
# Model: CatBoost classifier that is told which columns are categorical.
model = CatBoostClassifier(
    cat_features=cat_features,
    random_state=42,
)

# The held-out split is passed as the evaluation set; progress is logged every
# 250 iterations. fit() is the last expression, so its return value is the
# cell output.
validation_sets = [(X_val, y_val)]
model.fit(
    X_train,
    y_train,
    eval_set=validation_sets,
    verbose_eval=250,
)

Learning rate set to 0.117237
0:	learn: 0.9743782	test: 0.9744517	best: 0.9744517 (0)	total: 363ms	remaining: 6m 2s
250:	learn: 0.4181911	test: 0.4346076	best: 0.4346076 (250)	total: 1m 6s	remaining: 3m 18s
500:	learn: 0.3983532	test: 0.4302010	best: 0.4302010 (500)	total: 2m 16s	remaining: 2m 16s
750:	learn: 0.3823362	test: 0.4283352	best: 0.4283352 (750)	total: 3m 29s	remaining: 1m 9s
999:	learn: 0.3678556	test: 0.4281922	best: 0.4279657 (855)	total: 4m 43s	remaining: 0us

bestTest = 0.4279656876
bestIteration = 855

Shrink model to first 856 iterations.


<catboost.core.CatBoostClassifier at 0x125d3690610>

In [7]:
# Predict labels for the test set and write them out as a submission CSV.
predictions = model.predict(test_df[features])

# Flatten the predictions to a 1-D array of labels. ravel() gives a 1-D result
# for both (n,) and (n, 1) inputs, whereas np.squeeze would collapse a
# single-row prediction to a 0-D array.
output = pd.DataFrame({
    'id': test_df['id'],
    'Target': np.asarray(predictions).ravel(),
})

# Create the output directory if it is missing (e.g. on a fresh checkout) so
# that to_csv does not fail with an OSError.
submissions_path.mkdir(parents=True, exist_ok=True)
output.to_csv(submissions_path / 'base.csv', index=False)