In [147]:
# https://platform.olimpiada-ai.ro/en/problems/79

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation/behaviour-change notices — confirm that is intended.
warnings.filterwarnings('ignore')

In [148]:
# Directory that holds the competition CSV files; edit this single constant to
# run the notebook somewhere other than the Kaggle input mount.
DATA_DIR = "/kaggle/input/analysis-parkinson"

# Read both splits from DATA_DIR.
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# Show (rows, columns) for each split.
train.shape, test.shape

((1684, 39), (421, 38))

In [149]:
# Preview the first five rows of the training frame.
train.head(5)

Unnamed: 0,PatientID,Age,AgeGroup,AlcoholConsumption,BMI,Bradykinesia,CholesterolHDL,CholesterolLDL,CholesterolTotal,CholesterolTriglycerides,...,SleepDisorders,SleepQuality,Smoking,SpeechProblems,Stroke,SystolicBP,TraumaticBrainInjury,Tremor,UPDRS,Diagnosis
0,d4e0fb07-37c6-4c7d-a9d5-bfadc41b8164,56,1,14.40175,38.165782,False,98.305359,185.601755,214.446455,177.613258,...,False,8.839484,0,True,0,173,0,False,114.941744,0
1,97cb29cd-aa0c-4802-80ee-4fb82c0d5059,84,3,15.545237,33.877785,False,29.089431,130.446298,168.545178,237.987107,...,False,6.183109,0,False,0,111,0,True,191.992824,1
2,d9584f0e-f6fb-4821-8737-679a25f4c5cd,53,1,5.942235,30.111818,False,40.764986,186.558645,291.316103,342.071323,...,False,8.590509,0,False,0,161,0,False,121.425375,0
3,d3cb6b85-7286-4b09-9f6f-349ad8e34bd1,88,3,7.315375,19.931085,False,38.752199,191.811289,174.858648,375.127417,...,False,6.359914,0,False,0,122,1,False,30.952378,0
4,d6b091f4-099a-4f85-8417-2cb7110e4f93,77,3,6.037814,32.591481,True,32.477083,118.043431,231.507811,385.517466,...,False,4.679,0,False,0,113,0,True,63.069273,1


In [150]:
# Count distinct PatientID values in train; compare with the row count to see whether IDs repeat.
train['PatientID'].nunique()

1684

In [151]:
# Print each column's dtype and non-null count.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1684 entries, 0 to 1683
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   PatientID                 1684 non-null   object 
 1   Age                       1684 non-null   int64  
 2   AgeGroup                  1684 non-null   int64  
 3   AlcoholConsumption        1684 non-null   float64
 4   BMI                       1684 non-null   float64
 5   Bradykinesia              1684 non-null   bool   
 6   CholesterolHDL            1684 non-null   float64
 7   CholesterolLDL            1684 non-null   float64
 8   CholesterolTotal          1684 non-null   float64
 9   CholesterolTriglycerides  1684 non-null   float64
 10  Constipation              1684 non-null   bool   
 11  Depression                1684 non-null   int64  
 12  Diabetes                  1684 non-null   int64  
 13  DiastolicBP               1684 non-null   int64  
 14  DietQual

In [152]:
# Number of PatientID values that occur in both the test and the train split.
len(set(test['PatientID']) & set(train['PatientID']))

0

In [153]:
def process_df(df):
    """Return a processed copy of ``df`` with int-encoded flags and two risk scores.

    Steps:
      1. Every ``bool`` column is replaced by its 0/1 integer encoding.
      2. ``CardiometabolicRiskScore`` is added: the number of conditions met among
         ``Hypertension == 1``, ``Diabetes == 1`` and ``BMI > 30``.
      3. ``LifestyleRiskIndex`` is added: the number of conditions met among
         ``Smoking == 1``, ``AlcoholConsumption > 2`` and ``PhysicalActivity < 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Hypertension``, ``Diabetes``, ``BMI``,
        ``Smoking``, ``AlcoholConsumption`` and ``PhysicalActivity``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified, so re-running the
        calling cell always starts from the same data.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame is never mutated behind its back.
    out = df.copy()

    bool_cols = out.select_dtypes('bool').columns.tolist()
    if bool_cols:
        # Replace the columns outright. Assigning through `.loc[:, cols]` writes
        # into the existing columns instead, which does not reliably leave them
        # with an integer dtype.
        out[bool_cols] = out[bool_cols].astype(int)

    # Each comparison yields a boolean Series; casting to int and summing counts
    # how many of the three conditions hold for each row (range 0..3).
    out['CardiometabolicRiskScore'] = (
        (out['Hypertension'] == 1).astype(int)
        + (out['Diabetes'] == 1).astype(int)
        + (out['BMI'] > 30).astype(int)
    )
    out['LifestyleRiskIndex'] = (
        (out['Smoking'] == 1).astype(int)
        + (out['AlcoholConsumption'] > 2).astype(int)
        + (out['PhysicalActivity'] < 1).astype(int)
    )
    return out

# Run the same preprocessing on both splits; each name is rebound to the value process_df returns.
train = process_df(train)
test = process_df(test)

In [154]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Column roles: the label, the columns handed to CatBoost as categorical,
# and everything else in `train` except the label and the patient identifier.
target_col = 'Diagnosis'
cat_features = ['DoctorInCharge', 'EyeColor']
non_feature_cols = {target_col, 'PatientID'}
features = [col for col in train.columns if col not in non_feature_cols]

X = train[features]
y = train[target_col]
X_test = test[features]

# Hold out 10% of the training rows, stratified on the label, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# Wrap both parts in CatBoost Pools so the categorical columns are declared once per split.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features)

In [155]:
from catboost import CatBoostClassifier

# Training configuration, kept in one mapping and unpacked into the classifier below.
params = dict(
    iterations=100,
    loss_function='Logloss',
    eval_metric='AUC',
    metric_period=10,
    max_depth=4,
    random_state=42,
)

model = CatBoostClassifier(**params)

# Fit on the training pool while evaluating the eval metric on the validation pool.
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.095603
0:	test: 0.8451637	best: 0.8451637 (0)	total: 4.35ms	remaining: 431ms
10:	test: 0.9207589	best: 0.9207589 (10)	total: 35.6ms	remaining: 288ms
20:	test: 0.9424107	best: 0.9424107 (20)	total: 68.6ms	remaining: 258ms
30:	test: 0.9491071	best: 0.9491071 (30)	total: 101ms	remaining: 224ms
40:	test: 0.9541667	best: 0.9541667 (40)	total: 132ms	remaining: 189ms
50:	test: 0.9586310	best: 0.9586310 (50)	total: 164ms	remaining: 158ms
60:	test: 0.9602679	best: 0.9602679 (60)	total: 197ms	remaining: 126ms
70:	test: 0.9630952	best: 0.9630952 (70)	total: 230ms	remaining: 94.1ms
80:	test: 0.9645833	best: 0.9645833 (80)	total: 263ms	remaining: 61.7ms
90:	test: 0.9644345	best: 0.9645833 (80)	total: 297ms	remaining: 29.3ms
99:	test: 0.9668155	best: 0.9668155 (99)	total: 327ms	remaining: 0us

bestTest = 0.9668154762
bestIteration = 99



<catboost.core.CatBoostClassifier at 0x7d6aa04d3b00>

In [156]:
from sklearn.metrics import roc_auc_score

# Probability of the second class (column index 1) for every validation row.
y_pred = model.predict_proba(X_valid)[:, 1]
# ROC AUC on the hold-out rows. These are the same rows that were passed to
# fit() as eval_set, so this is not a score on data the training run never saw.
score = roc_auc_score(y_valid, y_pred)

print(f"Score: {score:.5f}")

Score: 0.96682


In [157]:
# Probability of the second class (column index 1) for every test row.
y_pred = model.predict_proba(X_test)[:, 1]

# One answer list per subtask, in the order the blocks are stacked in the file.
n_rows = len(test)
answers_by_task = {
    'Task1': test['CardiometabolicRiskScore'].tolist(),
    'Task2': test['LifestyleRiskIndex'].tolist(),
    'Task3': y_pred.tolist(),
}

# Long format: the full list of patient IDs is repeated once per subtask.
subm = pd.DataFrame({
    'PatientID': [pid for _ in answers_by_task for pid in test['PatientID'].tolist()],
    'subtaskID': [task for task in answers_by_task for _ in range(n_rows)],
    'Answer': [value for values in answers_by_task.values() for value in values],
})

subm.to_csv("submission.csv", index=False)

subm.head()

Unnamed: 0,PatientID,subtaskID,Answer
0,a00fa494-651e-4674-8fb8-aa006b14cbf7,Task1,0.0
1,f31cda69-4e3f-4265-928c-3fe9b83f0896,Task1,1.0
2,e4d181d2-803a-42f5-8065-239dd591fb9d,Task1,0.0
3,c585efb2-37e3-4620-aeda-a2299191d3a9,Task1,2.0
4,74fbfa2b-901e-4028-b325-9aaaa07cbf4d,Task1,0.0
