In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

In [8]:
df = pd.read_csv('train_values.csv')
df_label = pd.read_csv('train_labels.csv')
df_test = pd.read_csv('test_values.csv')

## Data Assessment

In [9]:
df.shape

(180, 14)

In [10]:
df_label.shape

(180, 2)

In [11]:
df_test.shape

(90, 14)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 14 columns):
patient_id                              180 non-null object
slope_of_peak_exercise_st_segment       180 non-null int64
thal                                    180 non-null object
resting_blood_pressure                  180 non-null int64
chest_pain_type                         180 non-null int64
num_major_vessels                       180 non-null int64
fasting_blood_sugar_gt_120_mg_per_dl    180 non-null int64
resting_ekg_results                     180 non-null int64
serum_cholesterol_mg_per_dl             180 non-null int64
oldpeak_eq_st_depression                180 non-null float64
sex                                     180 non-null int64
age                                     180 non-null int64
max_heart_rate_achieved                 180 non-null int64
exercise_induced_angina                 180 non-null int64
dtypes: float64(1), int64(11), object(2)
memory usage: 19.8+ KB


In [13]:
df_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 2 columns):
patient_id               180 non-null object
heart_disease_present    180 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.9+ KB


In [14]:
df['thal'].value_counts()

normal               98
reversible_defect    74
fixed_defect          8
Name: thal, dtype: int64

In [15]:
df.describe()

Unnamed: 0,slope_of_peak_exercise_st_segment,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
count,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0
mean,1.55,131.311111,3.155556,0.694444,0.161111,1.05,249.211111,1.01,0.688889,54.811111,149.483333,0.316667
std,0.618838,17.010443,0.938454,0.969347,0.368659,0.998742,52.717969,1.121357,0.464239,9.334737,22.063513,0.466474
min,1.0,94.0,1.0,0.0,0.0,0.0,126.0,0.0,0.0,29.0,96.0,0.0
25%,1.0,120.0,3.0,0.0,0.0,0.0,213.75,0.0,0.0,48.0,132.0,0.0
50%,1.0,130.0,3.0,0.0,0.0,2.0,245.5,0.8,1.0,55.0,152.0,0.0
75%,2.0,140.0,4.0,1.0,0.0,2.0,281.25,1.6,1.0,62.0,166.25,1.0
max,3.0,180.0,4.0,3.0,1.0,2.0,564.0,6.2,1.0,77.0,202.0,1.0


In [16]:
df['slope_of_peak_exercise_st_segment'].value_counts()

1    93
2    75
3    12
Name: slope_of_peak_exercise_st_segment, dtype: int64

In [17]:
df['resting_blood_pressure'].value_counts()

130    26
140    22
120    21
110    11
150    10
160     9
112     8
128     8
125     7
108     5
124     5
135     4
118     4
145     4
132     3
180     3
138     3
134     2
178     2
126     2
142     2
100     2
152     2
136     2
115     2
172     1
102     1
105     1
106     1
117     1
170     1
156     1
155     1
122     1
144     1
94      1
Name: resting_blood_pressure, dtype: int64

In [18]:
df['chest_pain_type'].value_counts()

4    82
3    57
2    28
1    13
Name: chest_pain_type, dtype: int64

In [19]:
df['num_major_vessels'].value_counts()

0    106
1     37
2     23
3     14
Name: num_major_vessels, dtype: int64

In [20]:
df['fasting_blood_sugar_gt_120_mg_per_dl'].value_counts()

0    151
1     29
Name: fasting_blood_sugar_gt_120_mg_per_dl, dtype: int64

In [21]:
df['resting_ekg_results'].value_counts()

2    94
0    85
1     1
Name: resting_ekg_results, dtype: int64

In [22]:
df['chest_pain_type'].value_counts()

4    82
3    57
2    28
1    13
Name: chest_pain_type, dtype: int64

In [23]:
df['serum_cholesterol_mg_per_dl'].value_counts()

204    4
239    4
219    3
254    3
309    3
226    3
303    3
211    3
282    3
233    3
234    3
256    3
263    3
277    2
265    2
203    2
199    2
197    2
188    2
177    2
258    2
149    2
330    2
308    2
283    2
266    2
304    2
267    2
302    2
299    2
      ..
215    1
218    1
200    1
201    1
213    1
205    1
206    1
207    1
209    1
180    1
175    1
321    1
126    1
322    1
325    1
326    1
327    1
335    1
353    1
360    1
141    1
174    1
222    1
160    1
417    1
164    1
167    1
168    1
220    1
172    1
Name: serum_cholesterol_mg_per_dl, Length: 118, dtype: int64

In [24]:
df['oldpeak_eq_st_depression'].value_counts()

0.0    56
1.4    11
1.6    10
1.2     9
1.0     7
0.2     7
0.6     7
0.4     7
0.8     7
1.8     6
2.0     5
0.1     5
0.5     4
2.2     4
2.4     3
1.5     3
1.9     3
2.6     3
2.8     2
3.0     2
2.3     2
0.9     2
4.2     2
0.3     2
3.4     2
1.1     1
2.5     1
1.3     1
3.8     1
3.2     1
5.6     1
3.1     1
0.7     1
6.2     1
Name: oldpeak_eq_st_depression, dtype: int64

In [25]:
df['sex'].value_counts()

1    124
0     56
Name: sex, dtype: int64

In [26]:
df['age'].value_counts()

54    12
58    10
64     9
51     9
44     9
57     8
60     7
62     7
59     6
55     6
65     6
48     6
67     6
41     6
52     5
45     5
63     5
49     4
42     4
50     4
53     4
56     4
66     4
61     3
46     3
68     3
70     3
71     3
40     3
47     2
69     2
43     2
39     2
35     2
34     1
37     1
77     1
38     1
74     1
29     1
Name: age, dtype: int64

In [27]:
df['max_heart_rate_achieved'].value_counts()

162    10
132     6
142     6
150     5
170     5
172     5
173     5
152     5
125     5
165     4
145     4
122     4
158     4
147     4
151     4
154     3
155     3
131     3
160     3
163     3
179     3
130     3
168     3
144     3
140     3
111     3
175     3
126     3
169     3
138     2
       ..
181     2
182     2
103     2
161     2
115     1
202     1
113     1
112     1
120     1
108     1
106     1
105     1
99      1
97      1
117     1
149     1
121     1
123     1
133     1
134     1
137     1
141     1
192     1
167     1
177     1
180     1
184     1
186     1
188     1
96      1
Name: max_heart_rate_achieved, Length: 73, dtype: int64

In [28]:
df['exercise_induced_angina'].value_counts()

0    123
1     57
Name: exercise_induced_angina, dtype: int64

### Quality

* the column `thal` is a string and needs to be encoded
* the column `fasting_blood_sugar_gt_120_mg_per_dl` is a boolean, convert it to bool datatype to avoid confusion
* the column `sex` is a boolean, convert it to bool datatype to avoid confusion
* the column `exercise_induced_angina` is a boolean, convert it to bool datatype to avoid confusion


### Tidyness

the data doesn't have structural issue

In [29]:
## Categorical column

category_cols = ['chest_pain_type', 'fasting_blood_sugar_gt_120_mg_per_dl', 'resting_ekg_results', 'sex',
                'exercise_induced_angina']

## Numeric columns

numerical_cols = ['slope_of_peak_exercise_st_segment', 'resting_blood_pressure', 'num_major_vessels',
                 'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'age', 'max_heart_rate_achieved']

## Cleaning

In [30]:
df_clean = df.copy()
df_test_clean = df_test.copy()

In [31]:
# encode strings in thal column

In [32]:
lbl_encoder = LabelEncoder()

In [33]:
df['thal'].value_counts()

normal               98
reversible_defect    74
fixed_defect          8
Name: thal, dtype: int64

In [34]:
df_test['thal'].value_counts()

normal               54
reversible_defect    30
fixed_defect          6
Name: thal, dtype: int64

In [35]:
lbl_encoder.fit(df_clean['thal'])
df_clean['thal'] = lbl_encoder.transform(df_clean['thal'])

In [36]:
df_test_clean['thal'] = lbl_encoder.transform(df_test_clean['thal'])

In [37]:
# fix data type in blood sugar, sex and exercise induced

In [38]:
df_clean['fasting_blood_sugar_gt_120_mg_per_dl'] = df_clean['fasting_blood_sugar_gt_120_mg_per_dl'].astype(bool)

In [39]:
df_test_clean['fasting_blood_sugar_gt_120_mg_per_dl'] = df_test_clean['fasting_blood_sugar_gt_120_mg_per_dl'].astype(bool)

In [40]:
df_clean['sex'] = df_clean['sex'].astype(bool)

In [41]:
df_test_clean['sex'] = df_test_clean['sex'].astype(bool)

In [42]:
df_clean['exercise_induced_angina'] = df_clean['exercise_induced_angina'].astype(bool)

In [43]:
df_test_clean['exercise_induced_angina'] = df_test_clean['exercise_induced_angina'].astype(bool)

## Decision Tree

In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [45]:
df_train = df_clean.merge(df_label, on='patient_id', how='left')

In [46]:
df_train.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,heart_disease_present
0,0z64un,1,1,128,2,0,False,2,308,0.0,True,45,170,False,0
1,ryoo3j,2,1,110,3,0,False,0,214,1.6,False,54,158,False,0
2,yt1s1x,1,1,125,4,3,False,2,304,0.0,True,77,162,True,1
3,l2xjde,1,2,152,4,0,False,0,223,0.0,True,40,181,False,1
4,oyt4ek,3,2,178,1,0,False,2,270,4.2,True,59,145,False,0


In [47]:
X = df_train.drop(['patient_id', 'heart_disease_present'], axis=1)
y = df_train['heart_disease_present']

In [48]:
X_preds = df_test_clean.drop(['patient_id'], axis=1)

In [41]:
y.value_counts()

0    100
1     80
Name: heart_disease_present, dtype: int64

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [80]:
tree_clf = DecisionTreeClassifier(max_depth=9)

In [83]:
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [87]:
tree_clf.score(X_test, y_test)

0.8888888888888888

In [104]:
y_preds = tree_clf.predict_proba(X_test)[:,1]

In [105]:
# metrics used in competition
log_loss(y_test, y_preds)

3.837664032696471

In [107]:
# Use GridSearchCV for parameter tuning

In [108]:
tree = DecisionTreeClassifier()
params = {
    "max_depth":[3,6,9,12,15],
    "min_samples_leaf":[2,3,4,5,6,7,8]
}

In [109]:
grid_cv = GridSearchCV(tree, params, cv=5)

In [110]:
grid_cv.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 6, 9, 12, 15],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8]},
          

In [111]:
grid_cv.best_params_

{'max_depth': 6, 'min_samples_leaf': 3}

In [119]:
y_preds = grid_cv.predict_proba(X_test)
print(log_loss(y_test, y_preds))

3.954645366042257


## Random Forest

In [89]:
from sklearn.ensemble import RandomForestClassifier

In [90]:
rf = RandomForestClassifier(n_estimators = 100, max_depth=9)

In [120]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [121]:
y_preds = rf.predict_proba(X_test)
print(log_loss(y_test, y_preds))

0.4246994545864953


In [93]:
rf = RandomForestClassifier(n_estimators=1000)
params = {
    "max_depth":[3,6,9,12,15],
    "min_samples_leaf":[2,3,4,5,6,7,8]
}
grid_rf = GridSearchCV(rf, params, cv=5)

In [94]:
grid_rf.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=1000, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='

In [95]:
grid_rf.best_params_

{'max_depth': 6, 'min_samples_leaf': 6}

In [96]:
y_preds = grid_rf.predict_proba(X_test)
print(log_loss(y_test, y_preds))

0.3278829713734511


## XGBoost

In [62]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score

In [123]:
X_train, X_test, y_train, y_test = train_test_split(numeric_interaction, y, test_size=0.2, stratify=y)

In [124]:
xg = xgb.XGBClassifier(n_estimators=1000,
                        tree_method='gpu_hist',
                        n_jobs=4,
                        n_gpus=1,
                        max_depth=9,
                        seed=1,
                        learning_rate=0.05,
                        subsample=0.5,
                        colsample_bytree=0.5)

In [125]:
xg.fit(X_train, y_train, eval_set=[(X_test,y_test)],eval_metric=['logloss'], early_stopping_rounds=5)

[0]	validation_0-logloss:0.685558
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.674423
[2]	validation_0-logloss:0.665858
[3]	validation_0-logloss:0.657676
[4]	validation_0-logloss:0.640784
[5]	validation_0-logloss:0.62804
[6]	validation_0-logloss:0.618129
[7]	validation_0-logloss:0.610511
[8]	validation_0-logloss:0.597794
[9]	validation_0-logloss:0.59686
[10]	validation_0-logloss:0.583394
[11]	validation_0-logloss:0.578165
[12]	validation_0-logloss:0.573741
[13]	validation_0-logloss:0.56663
[14]	validation_0-logloss:0.560218
[15]	validation_0-logloss:0.550279
[16]	validation_0-logloss:0.550061
[17]	validation_0-logloss:0.54497
[18]	validation_0-logloss:0.533877
[19]	validation_0-logloss:0.535371
[20]	validation_0-logloss:0.53321
[21]	validation_0-logloss:0.527865
[22]	validation_0-logloss:0.519955
[23]	validation_0-logloss:0.515059
[24]	validation_0-logloss:0.510747
[25]	validation_0-logloss:0.507419
[26]	validation_0-logloss:0.505019
[27

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=None, n_estimators=1000, n_gpus=1,
              n_jobs=4, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=1, silent=None, subsample=0.5, tree_method='gpu_hist',
              verbosity=1)

In [60]:
y_preds = xg.predict_proba(X_test)[:,1]
print(log_loss(y_test, y_preds))

0.22467343643721607


In [63]:
clf = xgb.XGBClassifier(n_estimators=100,
                        tree_method='gpu_hist',
                        n_jobs=4,
                        n_gpus=1,
                        max_depth=9,
                        seed=1,
                        learning_rate=0.05,
                        subsample=0.5,
                        colsample_bytree=0.5)

scores = cross_val_score(clf, X, y, cv=5, scoring='neg_log_loss')
scores

array([-0.49877289, -0.29392205, -0.2862945 , -0.5246355 , -0.45706416])

In [64]:
scores.mean()

-0.41213781942302985

In [126]:
clf = xgb.XGBClassifier(n_estimators=60,
                        tree_method='gpu_hist',
                        n_jobs=4,
                        n_gpus=1,
                        max_depth=9,
                        seed=1,
                        learning_rate=0.05,
                        subsample=0.5,
                        colsample_bytree=0.5)
scores = cross_val_score(clf, numeric_interaction, y, cv=5, scoring='neg_log_loss')
scores

array([-0.48532688, -0.34416411, -0.3662541 , -0.50650258, -0.41351266])

In [69]:
clf = xgb.XGBClassifier(n_estimators=30,
                        tree_method='gpu_hist',
                        n_jobs=4,
                        n_gpus=1,
                        max_depth=12,
                        seed=1,
                        learning_rate=0.05,
                        subsample=0.9,
                        colsample_bytree=0.5)
scores = cross_val_score(clf, X_interaction, y, cv=5, scoring='neg_log_loss')
scores

array([-0.51710579, -0.40306679, -0.40058918, -0.51540546, -0.48604108])

## Current top model

In [None]:
xg2 = xgb.XGBClassifier(n_estimators=100,
                        tree_method='gpu_hist',
                        n_jobs=4,
                        n_gpus=1,
                        max_depth=9,
                        seed=1,
                        learning_rate=0.05,
                        subsample=0.5,
                        colsample_bytree=0.5)

# use new_cat_cols and new_num_cols, onehot, minmaxscaler
# fit on X and Y

In [74]:
xg2 = xgb.XGBClassifier(n_estimators=100,
                        tree_method='gpu_hist',
                        n_jobs=4,
                        n_gpus=1,
                        max_depth=9,
                        seed=1,
                        learning_rate=0.05,
                        subsample=0.5,
                        colsample_bytree=0.5)

In [75]:
xg2.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=None, n_estimators=100, n_gpus=1,
              n_jobs=4, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=1, silent=None, subsample=0.5, tree_method='gpu_hist',
              verbosity=1)

In [76]:
predicts = xg2.predict_proba(X_pred_scaled)[:,1]

In [77]:
predicts

array([0.42444733, 0.09744233, 0.92069745, 0.1841056 , 0.9534574 ,
       0.04508975, 0.17429844, 0.7543481 , 0.55736756, 0.15916334,
       0.17148606, 0.48447058, 0.3689239 , 0.89207685, 0.1436992 ,
       0.06137694, 0.03895934, 0.17027284, 0.8144459 , 0.03826162,
       0.97219527, 0.10471484, 0.11924604, 0.07997429, 0.43903077,
       0.89895415, 0.3677992 , 0.22430696, 0.5659143 , 0.03754222,
       0.9137204 , 0.4376304 , 0.64825314, 0.26243648, 0.36007968,
       0.10022993, 0.517107  , 0.20768835, 0.33744097, 0.13466638,
       0.93112403, 0.0807114 , 0.9427654 , 0.1000771 , 0.9527496 ,
       0.11667621, 0.07427834, 0.2659645 , 0.26788476, 0.71760595,
       0.525817  , 0.03322913, 0.95922226, 0.10370567, 0.40682447,
       0.05726923, 0.8089874 , 0.1360619 , 0.15968548, 0.64035773,
       0.06006669, 0.9350121 , 0.23840338, 0.927897  , 0.22295949,
       0.89878863, 0.877822  , 0.59225506, 0.92804736, 0.7729987 ,
       0.10267598, 0.96606463, 0.9627954 , 0.96691954, 0.94688

In [78]:
import csv
with open('test.csv', 'w', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow(["patient_id","heart_disease_present"])
    for i in range(len(predicts)):
        patient_id = df_test.loc[i, "patient_id"]
        pred = predicts[i]
        writer.writerow([patient_id, pred])

## One Hot Encode the categories

In [49]:
from sklearn.preprocessing import OneHotEncoder

In [50]:
df_train_onehot = df_train.copy()

In [51]:
X_preds_onehot = X_preds.copy()

In [52]:
for col in category_cols:
    #encoder = OneHotEncoder(sparse=False, drop='first', categories='auto')
    encoder = OneHotEncoder(sparse=False, categories='auto')
    encoder.fit(np.array(df_train_onehot[col]).reshape(-1,1))

    feature_names = []
    for val in encoder.categories_[0]:
        feature_name = col + "_" + str(val)
        feature_names.append(feature_name)
    #feature_names.pop(0)    

    onehot_cols = pd.DataFrame(encoder.transform(np.array(df_train_onehot[col]).reshape(-1,1)), columns=feature_names, index=df_train_onehot.index)
    df_train_onehot.drop([col], axis=1, inplace=True)
    df_train_onehot = pd.concat([df_train_onehot, onehot_cols], axis=1)

    onehot_cols_test = pd.DataFrame(encoder.transform(np.array(X_preds_onehot[col]).reshape(-1,1)), columns=feature_names, index=X_preds_onehot.index)
    X_preds_onehot.drop([col], axis=1, inplace=True)
    X_preds_onehot = pd.concat([X_preds_onehot, onehot_cols_test], axis=1)

### treat slope and num vessels as categorical (onehot)

In [53]:
new_cat_cols = ['slope_of_peak_exercise_st_segment', 'num_major_vessels']
for col in new_cat_cols:
    #encoder = OneHotEncoder(sparse=False, drop='first', categories='auto')
    encoder = OneHotEncoder(sparse=False, categories='auto')
    encoder.fit(np.array(df_train_onehot[col]).reshape(-1,1))

    feature_names = []
    for val in encoder.categories_[0]:
        feature_name = col + "_" + str(val)
        feature_names.append(feature_name)
    #feature_names.pop(0)    

    onehot_cols = pd.DataFrame(encoder.transform(np.array(df_train_onehot[col]).reshape(-1,1)), columns=feature_names, index=df_train_onehot.index)
    df_train_onehot.drop([col], axis=1, inplace=True)
    df_train_onehot = pd.concat([df_train_onehot, onehot_cols], axis=1)

    onehot_cols_test = pd.DataFrame(encoder.transform(np.array(X_preds_onehot[col]).reshape(-1,1)), columns=feature_names, index=X_preds_onehot.index)
    X_preds_onehot.drop([col], axis=1, inplace=True)
    X_preds_onehot = pd.concat([X_preds_onehot, onehot_cols_test], axis=1)

In [54]:
X = df_train_onehot.drop(['patient_id','heart_disease_present'], axis=1)
y = df_train_onehot['heart_disease_present']

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

## Scale the numeric

In [56]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler

In [49]:
numerical_cols

['slope_of_peak_exercise_st_segment',
 'resting_blood_pressure',
 'num_major_vessels',
 'serum_cholesterol_mg_per_dl',
 'oldpeak_eq_st_depression',
 'age',
 'max_heart_rate_achieved']

In [57]:
new_num_cols = [
 'resting_blood_pressure',
 'serum_cholesterol_mg_per_dl',
 'oldpeak_eq_st_depression',
 'age',
 'max_heart_rate_achieved']

In [58]:
df_scaled = df_train_onehot.copy()
X_pred_scaled = X_preds_onehot.copy()

In [107]:
for col in numerical_cols:
    scaler = StandardScaler()
    df_scaled[col] = scaler.fit_transform(np.array(df_scaled[col]).reshape(-1,1))
    X_pred_scaled[col] = scaler.transform(np.array(X_pred_scaled[col]).reshape(-1,1))

KeyError: 'slope_of_peak_exercise_st_segment'

In [59]:
for col in new_num_cols:
    scaler = MinMaxScaler()
    df_scaled[col] = scaler.fit_transform(np.array(df_scaled[col]).reshape(-1,1))
    X_pred_scaled[col] = scaler.transform(np.array(X_pred_scaled[col]).reshape(-1,1))

In [60]:
X = df_scaled.drop(['patient_id','heart_disease_present'], axis=1)
y = df_scaled['heart_disease_present']

## add interaction

In [54]:
from sklearn.preprocessing import PolynomialFeatures

In [55]:
interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias = False)

In [92]:
X[new_num_cols].shape

(180, 5)

In [90]:
X_interaction = interaction.fit_transform(X[new_num_cols])

In [104]:
numeric_interaction = pd.concat([X.drop(new_num_cols,axis=1), pd.DataFrame(X_interaction)], axis=1)

In [80]:
df_train_onehot[new_num_cols]

Unnamed: 0,resting_blood_pressure,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,age,max_heart_rate_achieved
0,128,308,0.0,45,170
1,110,214,1.6,54,158
2,125,304,0.0,77,162
3,152,223,0.0,40,181
4,178,270,4.2,59,145
5,130,180,0.0,42,150
6,150,258,2.6,60,157
7,150,276,0.6,57,112
8,170,326,3.4,59,140
9,120,219,1.6,50,158


In [128]:
X_preds.shape

(90, 13)