In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb

In [4]:
# Read the three competition files: training features, training labels
# and the test features that predictions are produced for.
df, df_label, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

In [5]:
# Columns that are one-hot encoded further down.
# 'slope_of_peak_exercise_st_segment' and 'num_major_vessels' are really
# numeric, but the author's experiments gave better results when they were
# treated as categorical.
category_cols = [
    'chest_pain_type',
    'fasting_blood_sugar_gt_120_mg_per_dl',
    'resting_ekg_results',
    'sex',
    'exercise_induced_angina',
    'slope_of_peak_exercise_st_segment',
    'num_major_vessels',
]

# Columns that are rescaled as continuous features.
numerical_cols = [
    'resting_blood_pressure',
    'serum_cholesterol_mg_per_dl',
    'oldpeak_eq_st_depression',
    'age',
    'max_heart_rate_achieved',
]

### Quality

* The column `thal` holds strings and needs to be encoded as numbers.
* The column `fasting_blood_sugar_gt_120_mg_per_dl` is a boolean flag; convert it to the `bool` dtype to avoid confusion.
* The column `sex` is a boolean flag; convert it to the `bool` dtype to avoid confusion.
* The column `exercise_induced_angina` is a boolean flag; convert it to the `bool` dtype to avoid confusion.

### Tidiness

There are no structural issues with the data.

## Cleaning

In [6]:
# Clean on copies so the frames read from disk stay untouched.
df_clean, df_test_clean = df.copy(), df_test.copy()

In [7]:
# Encode `thal` as integers; the encoder is fitted on the training column only
# and then applied unchanged to both the training and the test column.
lbl_encoder = LabelEncoder().fit(df_clean['thal'])
for frame in (df_clean, df_test_clean):
    frame['thal'] = lbl_encoder.transform(frame['thal'])

# Cast the flag columns to bool in both frames.
bool_cols = ['fasting_blood_sugar_gt_120_mg_per_dl', 'sex', 'exercise_induced_angina']
for flag_col in bool_cols:
    for frame in (df_clean, df_test_clean):
        frame[flag_col] = frame[flag_col].astype(bool)

In [8]:
# Attach the label to every training row by patient_id; the left join keeps
# all rows of the cleaned feature frame.
df_train = pd.merge(df_clean, df_label, how='left', on='patient_id')

# Feature frame used for the final predictions: test values without the id column.
X_preds = df_test_clean.drop(columns=['patient_id'])

## One Hot Encoding

In [9]:
# Separate copies for the one-hot stage, leaving df_train and X_preds as they are.
df_train_onehot, X_preds_onehot = df_train.copy(), X_preds.copy()

In [10]:
# One-hot encode every categorical column. Each encoder is fitted on the
# training frame only and then reused for the prediction frame, so both frames
# end up with the same indicator columns in the same order.
for col in category_cols:
    train_values = df_train_onehot[col].values.reshape(-1, 1)
    test_values = X_preds_onehot[col].values.reshape(-1, 1)

    # The dense-output keyword was renamed from `sparse` to `sparse_output` in
    # scikit-learn 1.2 and the old spelling was removed in 1.4. Keeping the
    # default (sparse) output and densifying with .toarray() runs on both old
    # and new releases.
    encoder = OneHotEncoder(categories='auto')
    encoder.fit(train_values)

    # One indicator column per category seen in training, named "<column>_<value>".
    feature_names = [col + "_" + str(val) for val in encoder.categories_[0]]

    # Replace the raw column by its indicator columns; the original index is
    # reused so the concat aligns row by row.
    onehot_cols = pd.DataFrame(encoder.transform(train_values).toarray(),
                               columns=feature_names, index=df_train_onehot.index)
    df_train_onehot = pd.concat([df_train_onehot.drop([col], axis=1), onehot_cols], axis=1)

    onehot_cols_test = pd.DataFrame(encoder.transform(test_values).toarray(),
                                    columns=feature_names, index=X_preds_onehot.index)
    X_preds_onehot = pd.concat([X_preds_onehot.drop([col], axis=1), onehot_cols_test], axis=1)

## MinMax Scaling

In [11]:
# Rescale the numeric columns on copies of the one-hot encoded frames.
df_scaled = df_train_onehot.copy()
X_pred_scaled = X_preds_onehot.copy()

# MinMaxScaler is the scaler this notebook imports and the one the section
# title names; StandardScaler is not imported anywhere, so referencing it
# fails with a NameError on a fresh kernel.
for col in numerical_cols:
    scaler = MinMaxScaler()
    # Fit on the training column only, then apply the same min/max to the
    # prediction column. ravel() turns the (n, 1) result into a 1-D column.
    df_scaled[col] = scaler.fit_transform(np.array(df_scaled[col]).reshape(-1,1)).ravel()
    X_pred_scaled[col] = scaler.transform(np.array(X_pred_scaled[col]).reshape(-1,1)).ravel()

In [12]:
# Split the scaled training frame into the target vector and the feature
# matrix (everything except the id and the target).
target_col = 'heart_disease_present'
y = df_scaled[target_col]
X = df_scaled.drop(columns=['patient_id', target_col])

## XGBoost Classifier

In [15]:
# Hyper-parameters for the boosted-tree classifier, collected in one place.
# NOTE(review): tree_method='gpu_hist' and n_gpus=1 presumably need a GPU build
# of xgboost, and their handling differs between xgboost versions; confirm
# against the installed version.
xgb_params = dict(
    n_estimators=100,
    max_depth=9,
    learning_rate=0.05,
    subsample=0.5,
    colsample_bytree=0.5,
    seed=1,
    tree_method='gpu_hist',
    n_gpus=1,
    n_jobs=4,
)
xg = xgb.XGBClassifier(**xgb_params)

In [16]:
# Train the classifier on the complete training feature matrix X and target y;
# no hold-out split is made in this cell. The fitted estimator is the cell's
# last expression, so its repr is displayed as the cell output.
xg.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=None, n_estimators=100, n_gpus=1,
              n_jobs=4, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=1, silent=None, subsample=0.5, tree_method='gpu_hist',
              verbosity=1)

In [17]:
# Predict class probabilities for the scaled test features and keep column 1
# only (presumably P(heart_disease_present = 1); confirm against xg.classes_).
predicts = xg.predict_proba(X_pred_scaled)[:,1]

In [18]:
import csv

# Write the predictions to test.csv: a header row, then one
# (patient_id, predicted probability) row per entry of `predicts`.
# Patient ids are looked up in df_test by the label equal to the prediction's
# position, which assumes df_test still carries its default 0..n-1 index.
with open('test.csv', 'w', newline='') as out_file:
    submission = csv.writer(out_file)
    submission.writerow(["patient_id","heart_disease_present"])
    for row_idx, pred in enumerate(predicts):
        submission.writerow([df_test.loc[row_idx, "patient_id"], pred])