In [43]:
# https://platform.olimpiada-ai.ro/problems/63

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [44]:
# Folder that holds the competition CSVs. Keeping it in one constant means the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = "/kaggle/input/alzheimer"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

def process_df(df):
    """Return a copy of ``df`` with the category-code columns cast to strings.

    ``Ethnicity`` and ``EducationLevel`` are converted with ``astype(str)``;
    every other column, and the column order, is left as it was.

    The input frame is NOT modified: the conversion is done on a copy, so the
    function is safe to call on slices/views and re-running a cell never
    changes a frame that was displayed earlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``Ethnicity`` and ``EducationLevel`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the two columns holding string values.

    Raises
    ------
    KeyError
        If either of the two columns is missing from ``df``.
    """
    # DataFrame.astype with a {column: dtype} mapping returns a new object and
    # only touches the listed columns.
    return df.astype({'Ethnicity': str, 'EducationLevel': str})

# Run the identical dtype clean-up over both splits (train first, then test)
# so the two frames keep matching column types.
train, test = (process_df(frame) for frame in (train, test))

# Displayed as the cell result: (rows, columns) of each split.
(train.shape, test.shape)

((1719, 34), (430, 33))

In [45]:
# Class balance of the target: number of training rows per Diagnosis value.
train['Diagnosis'].value_counts()

Diagnosis
0    1111
1     608
Name: count, dtype: int64

In [46]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Column roles: the label, the columns CatBoost should treat as categorical,
# and every remaining column except the identifier and the label as a feature.
target_col = 'Diagnosis'
cat_features = ['Ethnicity', 'EducationLevel']
non_feature_cols = {'PatientID', target_col}
features = [col for col in train.columns if col not in non_feature_cols]

X = train[features]
y = train[target_col]
X_test = test[features]

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# One Pool for fitting, one for evaluation, and one over all labelled rows.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features)
full_pool = Pool(data=X, label=y, cat_features=cat_features)

In [47]:
from catboost import CatBoostClassifier

# Hyper-parameters for the classifier; kept in a dict so the same settings
# can be passed to another CatBoostClassifier later on.
params = {
    # optimisation target and the metric reported on the eval set
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    # model size
    'iterations': 100,
    'max_depth': 4,
    # logging cadence (one log line every 10 iterations) and seed
    'metric_period': 10,
    'random_state': 42
}

model = CatBoostClassifier(**params)

# Fit on the training pool while tracking the eval metric on the hold-out pool.
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.09334
0:	test: 0.8699047	best: 0.8699047 (0)	total: 4.82ms	remaining: 477ms
10:	test: 0.9408876	best: 0.9408876 (10)	total: 36.6ms	remaining: 296ms
20:	test: 0.9454106	best: 0.9454106 (20)	total: 64.9ms	remaining: 244ms
30:	test: 0.9466844	best: 0.9466844 (30)	total: 93.3ms	remaining: 208ms
40:	test: 0.9467582	best: 0.9467582 (40)	total: 122ms	remaining: 176ms
50:	test: 0.9451706	best: 0.9467582 (40)	total: 151ms	remaining: 145ms
60:	test: 0.9458352	best: 0.9467582 (40)	total: 180ms	remaining: 115ms
70:	test: 0.9452075	best: 0.9467582 (40)	total: 210ms	remaining: 85.6ms
80:	test: 0.9452444	best: 0.9467582 (40)	total: 237ms	remaining: 55.5ms
90:	test: 0.9469798	best: 0.9469798 (90)	total: 265ms	remaining: 26.2ms
99:	test: 0.9475336	best: 0.9475336 (99)	total: 290ms	remaining: 0us

bestTest = 0.9475335992
bestIteration = 99



<catboost.core.CatBoostClassifier at 0x7dc02f687690>

In [48]:
from sklearn.metrics import roc_auc_score

# predict_proba returns one column per class; column 1 is taken as the score
# for Diagnosis == 1 on each hold-out row.
valid_proba = model.predict_proba(X_valid)
y_pred = valid_proba[:, 1]

# ROC AUC of those scores against the hold-out labels.
score = roc_auc_score(y_true=y_valid, y_score=y_pred)

print(f'Score: {score:.5f}')

Score: 0.94753


In [49]:
# Refit a fresh classifier with the same hyper-parameters on full_pool (built
# from all of X and y). No eval_set is passed, so nothing is held out here.
# Note: this rebinds `model`, replacing the validation-run model.
model = CatBoostClassifier(**params)

model.fit(full_pool)

Learning rate set to 0.10725
0:	total: 4.67ms	remaining: 462ms
10:	total: 35ms	remaining: 283ms
20:	total: 67.1ms	remaining: 252ms
30:	total: 97.1ms	remaining: 216ms
40:	total: 128ms	remaining: 184ms
50:	total: 158ms	remaining: 152ms
60:	total: 187ms	remaining: 120ms
70:	total: 219ms	remaining: 89.3ms
80:	total: 250ms	remaining: 58.5ms
90:	total: 281ms	remaining: 27.8ms
99:	total: 307ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7dc0a825fd10>

In [50]:
# Build the long-format submission: three rows per test patient, in the order
# subtask 1, subtask 2, subtask 3.
#   subtask 1 - number of training patients with the same Age
#   subtask 2 - percentage of those same-age training patients with Smoking == 1,
#               rounded to 2 decimals
#   subtask 3 - predicted probability of Diagnosis == 1 from the fitted model
sids, dpids, answers = [], [], []

# Lookup tables keyed by Age, computed once from the training data.
age_dict = train['Age'].value_counts().to_dict()
age_smoker_dict = train[train['Smoking'] == 1]['Age'].value_counts().to_dict()

y_pred = model.predict_proba(X_test)[:, 1]

# Predictions are matched to patients by POSITION, so the two must line up.
assert len(y_pred) == len(test), "predictions and test rows are misaligned"

# zip pairs each patient with its prediction positionally; indexing y_pred with
# the DataFrame index label would only be right for a default 0..n-1 index.
for patient_id, age, proba in zip(test['PatientID'], test['Age'], y_pred):
    sids.append(1)
    dpids.append(patient_id)
    answers.append(age_dict.get(age, 0))

    sids.append(2)
    dpids.append(patient_id)
    # Default denominator of 1 avoids division by zero for ages absent from train
    # (the numerator is 0 in that case, so the answer is 0.0).
    answers.append(round(100 * age_smoker_dict.get(age, 0) / age_dict.get(age, 1), 2))

    sids.append(3)
    dpids.append(patient_id)
    answers.append(proba)

subm = pd.DataFrame({
    'subtaskID': sids,
    'datapointID': dpids,
    'answer': answers
})

subm.to_csv("submission.csv", index=False)

subm.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,5202,44.0
1,2,5202,34.09
2,3,5202,0.93513
3,1,6831,66.0
4,2,6831,30.3
