In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [129]:
# Parse train.csv a single time, then split it into features and target.
train = pd.read_csv("train.csv")
# Diagnosis is the prediction target; the other three columns are excluded
# from the feature matrix.
X = train.drop(columns=["PatientID", "Diagnosis", "EyeColor", "DoctorInCharge"])
y = train["Diagnosis"]

In [130]:
# Summarise the feature frame: column names, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1684 entries, 0 to 1683
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1684 non-null   int64  
 1   AgeGroup                  1684 non-null   int64  
 2   AlcoholConsumption        1684 non-null   float64
 3   BMI                       1684 non-null   float64
 4   Bradykinesia              1684 non-null   bool   
 5   CholesterolHDL            1684 non-null   float64
 6   CholesterolLDL            1684 non-null   float64
 7   CholesterolTotal          1684 non-null   float64
 8   CholesterolTriglycerides  1684 non-null   float64
 9   Constipation              1684 non-null   bool   
 10  Depression                1684 non-null   int64  
 11  Diabetes                  1684 non-null   int64  
 12  DiastolicBP               1684 non-null   int64  
 13  DietQuality               1684 non-null   float64
 14  Dyslipid

In [131]:
# Preview the first rows of the feature frame as loaded (boolean columns are
# still True/False at this point).
X.head()

Unnamed: 0,Age,AgeGroup,AlcoholConsumption,BMI,Bradykinesia,CholesterolHDL,CholesterolLDL,CholesterolTotal,CholesterolTriglycerides,Constipation,...,Rigidity,SleepDisorders,SleepQuality,Smoking,SpeechProblems,Stroke,SystolicBP,TraumaticBrainInjury,Tremor,UPDRS
0,56,1,14.40175,38.165782,False,98.305359,185.601755,214.446455,177.613258,True,...,False,False,8.839484,0,True,0,173,0,False,114.941744
1,84,3,15.545237,33.877785,False,29.089431,130.446298,168.545178,237.987107,False,...,False,False,6.183109,0,False,0,111,0,True,191.992824
2,53,1,5.942235,30.111818,False,40.764986,186.558645,291.316103,342.071323,False,...,False,False,8.590509,0,False,0,161,0,False,121.425375
3,88,3,7.315375,19.931085,False,38.752199,191.811289,174.858648,375.127417,False,...,True,False,6.359914,0,False,0,122,1,False,30.952378
4,77,3,6.037814,32.591481,True,32.477083,118.043431,231.507811,385.517466,False,...,True,False,4.679,0,False,0,113,0,True,63.069273


In [132]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

def feature_engineering(df):
    """Add the two derived risk scores to ``df`` in place and return it.

    Steps:
      1. Every ``bool`` column is cast to ``int`` (True -> 1, False -> 0).
      2. ``CardiometabolicRiskScore`` = number of satisfied conditions among
         ``Hypertension == 1``, ``Diabetes == 1`` and ``BMI > 30``.
      3. ``LifestyleRiskIndex`` = number of satisfied conditions among
         ``Smoking == 1``, ``AlcoholConsumption > 2`` and
         ``int(PhysicalActivity) < 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``Hypertension``, ``Diabetes``,
        ``BMI``, ``Smoking``, ``AlcoholConsumption`` and ``PhysicalActivity``.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the call can be chained.
        Existing callers that ignore the return value keep working.

    Raises
    ------
    KeyError
        If a required column is missing. The check runs before any mutation,
        so a failing call leaves ``df`` untouched.
    """
    required = (
        "Hypertension",
        "Diabetes",
        "BMI",
        "Smoking",
        "AlcoholConsumption",
        "PhysicalActivity",
    )
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Fail before touching the frame so the caller never sees a
        # half-transformed DataFrame.
        raise KeyError(f"feature_engineering: missing required column(s): {missing}")

    bool_cols = df.select_dtypes(include=['bool']).columns
    if len(bool_cols) > 0:
        df[bool_cols] = df[bool_cols].astype(int)

    # Each comparison yields a boolean Series; casting to int turns the sum
    # into a count of the conditions that hold for the row.
    df["CardiometabolicRiskScore"] = (
        (df["Hypertension"].astype(int) == 1).astype(int)
        + (df["Diabetes"].astype(int) == 1).astype(int)
        + (df["BMI"] > 30).astype(int)
    )
    df["LifestyleRiskIndex"] = (
        (df["Smoking"].astype(int) == 1).astype(int)
        + (df["AlcoholConsumption"] > 2).astype(int)
        + (df["PhysicalActivity"].astype(int) < 1).astype(int)
    )

    return df

# Adds the engineered score columns to X in place.
feature_engineering(df=X)

# Column-wise preprocessing: object columns are one-hot encoded, numeric
# columns are standardised. handle_unknown="ignore" keeps transform() from
# raising if the test data contains a category that was not seen during fit.
preprocessing = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include=object)),
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
)

# n_jobs=-1 builds the 2000 trees on all available cores; the fixed
# random_state keeps the fitted forest reproducible.
model = RandomForestClassifier(
    n_estimators=2000,
    random_state=42,
    class_weight='balanced',
    min_samples_leaf=2,
    n_jobs=-1,
)

pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("model", model),
])

# 10-fold cross-validated ROC AUC; the preprocessing is re-fitted inside each
# fold because it is part of the pipeline.
cross_val_score(pipeline, X, y, cv=10, scoring="roc_auc")

array([0.97692308, 0.96369048, 0.95550595, 0.98407738, 0.94411058,
       0.94005409, 0.94711538, 0.91601562, 0.97265625, 0.9438101 ])

In [133]:
# Preview X again: feature_engineering() modified it in place, so boolean
# columns now show 0/1 and the two engineered score columns are present.
X.head()

Unnamed: 0,Age,AgeGroup,AlcoholConsumption,BMI,Bradykinesia,CholesterolHDL,CholesterolLDL,CholesterolTotal,CholesterolTriglycerides,Constipation,...,SleepQuality,Smoking,SpeechProblems,Stroke,SystolicBP,TraumaticBrainInjury,Tremor,UPDRS,CardiometabolicRiskScore,LifestyleRiskIndex
0,56,1,14.40175,38.165782,0,98.305359,185.601755,214.446455,177.613258,1,...,8.839484,0,1,0,173,0,0,114.941744,1,1
1,84,3,15.545237,33.877785,0,29.089431,130.446298,168.545178,237.987107,0,...,6.183109,0,0,0,111,0,1,191.992824,1,1
2,53,1,5.942235,30.111818,0,40.764986,186.558645,291.316103,342.071323,0,...,8.590509,0,0,0,161,0,0,121.425375,1,1
3,88,3,7.315375,19.931085,0,38.752199,191.811289,174.858648,375.127417,0,...,6.359914,0,0,0,122,1,0,30.952378,0,1
4,77,3,6.037814,32.591481,1,32.477083,118.043431,231.507811,385.517466,0,...,4.679,0,0,0,113,0,1,63.069273,1,1


In [134]:
# Load the test set with the same excluded columns as training; PatientID is
# kept here because the submission needs it.
X_test = pd.read_csv("test.csv").drop(columns=["DoctorInCharge", "EyeColor"])
feature_engineering(df=X_test)

# Fit on the full training data, then predict on the test features
# (PatientID is not a model input).
pipeline.fit(X, y)
y_test = pipeline.predict(X_test.drop(columns=["PatientID"]))

# One answer vector per subtask: the two engineered scores and the model
# prediction.
answers_by_task = {
    "Task1": X_test["CardiometabolicRiskScore"],
    "Task2": X_test["LifestyleRiskIndex"],
    "Task3": y_test,
}

submission = pd.concat(
    [
        pd.DataFrame({
            "subtaskID": task_id,
            "PatientID": X_test["PatientID"],
            "Answer": answer,
        })
        for task_id, answer in answers_by_task.items()
    ],
    # Without this the row labels 0..n-1 would repeat once per subtask.
    ignore_index=True,
)

# index=False: write only subtaskID / PatientID / Answer, not an unnamed
# leading column holding the row labels.
submission.to_csv("submission.csv", index=False)