In [126]:
# https://platform.olimpiada-ai.ro/problems/24
# https://platform.olimpiada-ai.ro/problems/33

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [127]:
# Directory holding the competition CSVs. Defaults to the Kaggle mount point;
# set the PATIENT_STATUS_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get("PATIENT_STATUS_DIR", "/kaggle/input/patient-status")

train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# Row counts of the two frames, shown as the cell output.
len(train), len(test)

(3219, 805)

In [128]:
def process_df(df):
    """Return a preprocessed copy of ``df``; the input frame is left untouched.

    Steps:
      * ``Tumor Size`` / ``6th Stage``: add a 0/1 ``... NoInfo`` indicator for
        missing values, then fill the gaps with ``0`` / ``'noinfo'``.
      * ``Hormone_Status``: split on ``'_'`` into ``Hormone_Status_1`` and
        ``Hormone_Status_2`` (first and second token).
      * ``Status`` (only if the column is present): encode ``'Alive'`` as 0
        and every other value as 1.

    The function is idempotent: feeding its own output back in returns an
    equal frame, so re-running the calling cell does not corrupt the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``Tumor Size``, ``6th Stage`` and
        ``Hormone_Status``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added/encoded columns.
    """
    # Work on a copy so the caller's frame is never mutated behind its back.
    df = df.copy()

    for column, flag, fill_value in (
        ('Tumor Size', 'Tumor NoInfo', 0),
        ('6th Stage', '6th Stage NoInfo', 'noinfo'),
    ):
        # Only derive the indicator once: after the fill below there are no
        # NaNs left, so recomputing it on a second pass would zero it out.
        if flag not in df.columns:
            df[flag] = df[column].isna().astype(int)
        df[column] = df[column].fillna(fill_value)

    # Vectorised split; a missing Hormone_Status yields NaN instead of raising.
    hormone_parts = df['Hormone_Status'].str.split('_')
    df['Hormone_Status_1'] = hormone_parts.str[0]
    df['Hormone_Status_2'] = hormone_parts.str[1]

    # Skip an already-encoded (numeric) target: comparing 0/1 with 'Alive'
    # would otherwise turn every label into 1.
    if 'Status' in df.columns and not pd.api.types.is_numeric_dtype(df['Status']):
        df['Status'] = (df['Status'] != 'Alive').astype(int)
    return df

# Run the shared preprocessing over both frames, train first, then test.
train, test = process_df(train), process_df(test)

In [129]:
# Show the column names of `train` after process_df has added its derived columns.
train.columns

Index(['ID', 'Age', 'Race', 'Marital Status', 'T Stage', 'N Stage',
       '6th Stage', 'differentiate', 'Grade', 'A Stage', 'Tumor Size',
       'Estrogen Status', 'Progesterone Status', 'Regional Node Examined',
       'Reginol Node Positive', 'Status', 'T_N_Stage', 'Hormone_Status',
       'Reginol Node Negative', 'Blood Pressure', 'Diastolic Pressure',
       'Cholesterol', 'Body Temperature', 'Oxygen Saturation',
       'Respiratory Rate', 'Blood Glucose', 'BMI', 'Heart Rate',
       'Serum Creatinine', 'Uric Acid', 'Hemoglobin', 'GFR', 'Serum Sodium',
       'Serum Potassium', 'Serum Albumin', 'Lactate', 'Tumor NoInfo',
       '6th Stage NoInfo', 'Hormone_Status_1', 'Hormone_Status_2'],
      dtype='object')

In [130]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Every column except the row identifier and the target is a model input.
features = [c for c in train.columns if c not in ['ID', 'Status']]

# Treat every non-numeric column as categorical. Checking for "not numeric"
# rather than `dtype == 'object'` also catches string/category dtypes, which
# would otherwise be passed to CatBoost as (unparseable) numeric features.
cat_features = [c for c in features if not pd.api.types.is_numeric_dtype(train[c])]

X, y = train[features], train['Status']
X_test = test[features]

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

In [136]:
from catboost import CatBoostClassifier

# Classifier settings: Logloss objective, Precision as the evaluation metric,
# metrics reported every second iteration, fixed seed.
params = dict(
    iterations=11,
    learning_rate=0.1,
    loss_function='Logloss',
    eval_metric='Precision',
    metric_period=2,
    max_depth=8,
    random_state=42,
)

model = CatBoostClassifier(**params)

# Fit on the training pool and evaluate against the hold-out pool; the call's
# return value is the cell output.
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 0.7631579	test: 0.4666667	best: 0.4666667 (0)	total: 22.6ms	remaining: 226ms
2:	learn: 1.0000000	test: 0.0000000	best: 0.4666667 (0)	total: 70.6ms	remaining: 188ms
4:	learn: 1.0000000	test: 0.5000000	best: 0.5000000 (4)	total: 116ms	remaining: 139ms
6:	learn: 1.0000000	test: 0.0000000	best: 0.5000000 (4)	total: 151ms	remaining: 86.5ms
8:	learn: 1.0000000	test: 0.0000000	best: 0.5000000 (4)	total: 185ms	remaining: 41.2ms
10:	learn: 1.0000000	test: 0.6666667	best: 0.6666667 (10)	total: 229ms	remaining: 0us

bestTest = 0.6666666667
bestIteration = 10



<catboost.core.CatBoostClassifier at 0x7c1e69d10f10>

In [137]:
from sklearn.metrics import precision_score

# Predicted labels for the hold-out rows, flattened to a 1-D array.
y_pred = np.ravel(model.predict(X_valid))

# Precision of those predictions against the hold-out labels.
score = precision_score(y_true=y_valid, y_pred=y_pred)

print(f"Score: {score:.5f}")

Score: 0.66667


In [152]:
# Reference statistics computed on the training frame: the three quartile
# cut-points of Serum Creatinine, the BMI median, and the number of training
# rows per T Stage value.
quantiles = np.quantile(train['Serum Creatinine'].to_numpy(), [0.25, 0.5, 0.75])
bmi_median = np.median(train['BMI'])
t_stage_counts = train['T Stage'].value_counts().to_dict()

# Model predictions for the test rows, flattened to a 1-D array.
y_pred = model.predict(X_test).flatten()

# One record per (test row, subtask); collected in a list and converted to a
# DataFrame once at the end.
subm_rows = []

# Walk the needed columns positionally. This does not depend on `test` having
# a default 0..n-1 index (label-based `test[col][i]` does), and each value is
# read once per row instead of once per subtask.
test_rows = zip(
    test['ID'].to_numpy(),
    test['GFR'].to_numpy(),
    test['Serum Creatinine'].to_numpy(),
    test['BMI'].to_numpy(),
    test['T Stage'].to_numpy(),
    y_pred,
)

for datapoint_id, gfr, scr, bmi, t_stage, status_pred in test_rows:
    # Subtask 1: GFR category.
    if gfr >= 90:
        gfr_answer = 'Normal'
    elif gfr >= 60:
        gfr_answer = 'Mildly Decreased'
    else:
        # NOTE(review): for GFR < 60 (or missing) no category is defined here,
        # so the answer falls back to the model prediction exactly as before.
        # This is probably not the intended label -- confirm against the task
        # statement and replace with the proper category.
        gfr_answer = status_pred

    # Subtask 2: Serum Creatinine bucket relative to the training quartiles
    # (upper bounds inclusive; anything above the 3rd quartile, or NaN, lands
    # in the last bucket).
    if scr <= quantiles[0]:
        scr_answer = 'Very Low'
    elif scr <= quantiles[1]:
        scr_answer = 'Low'
    elif scr <= quantiles[2]:
        scr_answer = 'High'
    else:
        scr_answer = 'Very High'

    # Subtask 3: 1 if BMI is strictly above the training median, else 0.
    bmi_answer = 1 if bmi > bmi_median else 0

    # Subtask 4: number of training rows sharing this T Stage; a stage never
    # seen in training counts as 0 instead of raising KeyError.
    t_stage_answer = t_stage_counts.get(t_stage, 0)

    # Subtask 5: the model prediction itself.
    answers = (gfr_answer, scr_answer, bmi_answer, t_stage_answer, status_pred)

    for sid, answer in enumerate(answers, start=1):
        subm_rows.append({
            'subtaskID': sid,
            'datapointID': datapoint_id,
            'answer': answer
        })

subm = pd.DataFrame(subm_rows)

subm.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,3220,Normal
1,2,3220,High
2,3,3220,0
3,4,3220,1281
4,5,3220,0


In [153]:
# Write the submission table to disk without the DataFrame index column.
subm.to_csv(path_or_buf="submission.csv", index=False)