# Modeling & Inference

### Importing Libraries

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import os
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import WoEEncoder
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import average_precision_score
from scipy.stats import ks_2samp

### Reading the data

In [2]:
# Read CSVs
real_world_df = pd.read_csv('./data/real-world-data/healthcare-dataset-stroke-data.csv')
train_synthetic_df = pd.read_csv('./data/synthetic-data/train.csv')

In [3]:
df = pd.concat([train_synthetic_df, real_world_df], ignore_index=True, axis=0)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [4]:
# Features
categorical = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
binary = ["hypertension", "heart_disease"] # Basically, categorical values with only 2 values. 1 or 0
continous_numerical = ["age", "avg_glucose_level", "bmi"]

### Using a Decision Tree to predict the missing BMI

In [5]:
DT_bmi_pipe = Pipeline( steps=[ 
                               ('scale',StandardScaler()),
                               ('lr',DecisionTreeRegressor(random_state=42))
                              ])
dt_df = df.copy()
X = dt_df[['age','gender','bmi']].copy()
X.gender = X.gender.replace({'Male':0,'Female':1,'Other':-1}).astype(np.uint8)

missing = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
Y = X.pop('bmi')
DT_bmi_pipe.fit(X,Y)
predicted_bmi = pd.Series(DT_bmi_pipe.predict(missing[['age','gender']]),index=missing.index)
dt_df.loc[missing.index,'bmi'] = predicted_bmi

print('Missing values after decision tree regressor: ',sum(dt_df.isnull().sum()))

Missing values after decision tree regressor:  0


### Using a Simple Imputer

In [6]:
mean_imputer = SimpleImputer(strategy='mean')
imputed_df = df.copy()
imputed_df['bmi'] = mean_imputer.fit_transform(imputed_df['bmi'].values.reshape(-1,1))
print('Missing values after imputing: ',sum(imputed_df.isnull().sum()))

Missing values after imputing:  0


### Baseline
Logistic Regression

In [7]:
@ignore_warnings(category=ConvergenceWarning)
def baseline(train_df, cv, oof_preds, target, train_auc, val_auc, pipelines, model, encoder, scaler):
    # cast categorical features as categorical type
    train_df[categorical] = (train_df[categorical].astype('category'))
    for fold, (tr_ix, vl_ix) in enumerate(cv.split(train_df, target)):
        X_train, Y_train = train_df.iloc[tr_ix], target.iloc[tr_ix]
        X_val, Y_val = train_df.iloc[vl_ix], target.iloc[vl_ix]
        
        X_train = X_train.copy()
        X_val = X_val.copy()
        

        
        X_train = encoder.fit_transform(X_train)
        X_train = scaler.fit_transform(X_train)
        
        X_val = encoder.transform(X_val)
        X_val = scaler.transform(X_val)

        print('_'*50)
        
        model.fit(X_train, Y_train)
        
        oof_preds.iloc[vl_ix] = model.predict_proba(X_val)[:, 1]
        train_auc.append(roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1]))
        val_auc.append(roc_auc_score(Y_val, model.predict_proba(X_val)[:, 1]))
        pipelines.append([encoder, scaler, model])

        print(f'Val AUC: {val_auc[-1]}')
    return pipelines, train_auc, val_auc, oof_preds

In [8]:
cv = StratifiedKFold(shuffle=True, random_state=42)
features = categorical + binary + continous_numerical
model = LogisticRegression()
encoder = OneHotEncoder(drop_last=True, variables=categorical)
scaler = SklearnTransformerWrapper(StandardScaler(), variables=binary+continous_numerical)

In [9]:
train_auc = []
val_auc = []
pipelines_synth = []
oof_preds = pd.Series(0, index=train_synthetic_df.index)

train_df = train_synthetic_df[features].copy()
target = train_synthetic_df['stroke']

pipelines_synth, train_auc, val_auc, oof_preds_synth = baseline(train_df, cv, oof_preds, target, train_auc, val_auc, pipelines_synth, model, encoder, scaler)
print()
print(f'Synthetic Train AUC: {np.mean(train_auc)}')
print(f'Synthetic Val AUC: {np.mean(val_auc)}')
print(f"Desicion Tree OOF AUC score: {roc_auc_score(target, oof_preds_synth)}")

__________________________________________________
Val AUC: 0.8822341202238988
__________________________________________________
Val AUC: 0.8836537681512128
__________________________________________________
Val AUC: 0.8949701839417312
__________________________________________________
Val AUC: 0.8765518573981933
__________________________________________________
Val AUC: 0.8813662479306652

Synthetic Train AUC: 0.8864338487351759
Synthetic Val AUC: 0.8837552355291403
Desicion Tree OOF AUC score: 0.8835476685117956


In [10]:
train_auc = []
val_auc = []
pipelines_imputed = []
oof_preds = pd.Series(0, index=imputed_df.index)

imputed_df_train = imputed_df[features].copy()
target = imputed_df['stroke']

pipelines_imputed, train_auc, val_auc, oof_preds_imputed = baseline(imputed_df_train, cv, oof_preds, target, train_auc, val_auc, pipelines_imputed, model, encoder, scaler)
print()
print(f'Imputed with mean Train AUC: {np.mean(train_auc)}')
print(f'Imputed with Val AUC: {np.mean(val_auc)}')
print(f"Desicion Tree OOF AUC score: {roc_auc_score(target, oof_preds_imputed)}")

__________________________________________________
Val AUC: 0.8723808665099937
__________________________________________________
Val AUC: 0.8739892849663773
__________________________________________________
Val AUC: 0.8699696930916537
__________________________________________________
Val AUC: 0.8829947263517521
__________________________________________________
Val AUC: 0.8683290276032211

Imputed with mean Train AUC: 0.8753984950971601
Imputed with Val AUC: 0.8735327197045996
Desicion Tree OOF AUC score: 0.8735038634522456


In [11]:
train_auc = []
val_auc = []
pipelines_dt = []
oof_preds = pd.Series(0, index=dt_df.index)

dt_df_train = dt_df[features].copy()
target = dt_df['stroke']

pipelines_dt, train_auc, val_auc, oof_preds_dt = baseline(dt_df_train, cv, oof_preds, target, train_auc, val_auc, pipelines_dt, model, encoder, scaler)
print()
print(f'Desicion Tree Train AUC: {np.mean(train_auc)}')
print(f'Desicion Tree Val AUC: {np.mean(val_auc)}')
print(f"Desicion Tree OOF AUC score: {roc_auc_score(target, oof_preds_dt)}")

__________________________________________________
Val AUC: 0.8724084975684668
__________________________________________________
Val AUC: 0.8739471112455498
__________________________________________________
Val AUC: 0.8700874886567234
__________________________________________________
Val AUC: 0.8831277970151672
__________________________________________________
Val AUC: 0.8684861285667738

Desicion Tree Train AUC: 0.875469115744238
Desicion Tree Val AUC: 0.8736114046105362
Desicion Tree OOF AUC score: 0.8735876007847949


Our synthetic dataframe seems to be performing the best among these dataframes. Eventhough it has way less data points.
We are going to try other encoders from feature-engine to see if we can improve the results.

In [12]:
test_df = pd.read_csv('./data/synthetic-data/test.csv', index_col="id")
preds = pd.Series(0, index=test_df.index)
X_test = test_df[features].copy()

for pipeline in pipelines_synth:
    X_test = test_df[features].copy()
    encoder, scaler, model = pipeline
    X_test = encoder.transform(X_test)
    X_test = scaler.transform(X_test)
    preds += model.predict_proba(X_test)[:, 1]

preds /= len(pipelines_synth)
preds.rename('stroke', inplace=True)
preds.to_csv('submission1.csv')

In [None]:
test_df = pd.read_csv('./data/synthetic-data/test.csv', index_col="id")
preds = pd.Series(0, index=test_df.index)
X_test = test_df[features].copy()

for pipeline in pipelines_dt:
    X_test = test_df[features].copy()
    encoder, scaler, model = pipeline
    X_test = encoder.transform(X_test)
    X_test = scaler.transform(X_test)
    preds += model.predict_proba(X_test)[:, 1]

preds /= len(pipelines_dt)
preds.rename('stroke', inplace=True)
preds.to_csv('submission2.csv')

In [None]:
test_df = pd.read_csv('./data/synthetic-data/test.csv', index_col="id")
preds = pd.Series(0, index=test_df.index)
X_test = test_df[features].copy()

for pipeline in pipelines_imputed:
    X_test = test_df[features].copy()
    encoder, scaler, model = pipeline
    X_test = encoder.transform(X_test)
    X_test = scaler.transform(X_test)
    preds += model.predict_proba(X_test)[:, 1]

preds /= len(pipelines_imputed)
preds.rename('stroke', inplace=True)
preds.to_csv('submission3.csv')

In [13]:
encoder = CountFrequencyEncoder(variables=categorical)
train_auc = []
val_auc = []
pipelines = []
oof_preds = pd.Series(0, index=imputed_df.index)
target = imputed_df['stroke']
pipelines, train_auc, val_auc, oof_preds_imputed = baseline(imputed_df, cv, oof_preds, target, train_auc, val_auc, pipelines, model, encoder, scaler)
print()
print(f'Imputed with mean Train AUC: {np.mean(train_auc)}')
print(f'Imputed with Val AUC: {np.mean(val_auc)}')

__________________________________________________
Val AUC: 0.9900629988133187
__________________________________________________
Val AUC: 0.9876649137910976
__________________________________________________
Val AUC: 0.9369037508434743
__________________________________________________
Val AUC: 0.9607340293507598
__________________________________________________
Val AUC: 0.9121921984825211

Imputed with mean Train AUC: 0.9586418238587173
Imputed with Val AUC: 0.9575115782562342


In [14]:
encoder = WoEEncoder(variables=categorical)
train_auc = []
val_auc = []
pipelines = []
oof_preds = pd.Series(0, index=imputed_df.index)
target = imputed_df['stroke']
pipelines, train_auc, val_auc, oof_preds_imputed = baseline(imputed_df, cv, oof_preds, target, train_auc, val_auc, pipelines, model, encoder, scaler)
print()
print(f'Imputed with mean Train AUC: {np.mean(train_auc)}')
print(f'Imputed with Val AUC: {np.mean(val_auc)}')

TypeError: WoEEncoder.fit() missing 1 required positional argument: 'y'

In [None]:
encoder = MeanEncoder(variables=categorical)
train_auc = []
val_auc = []
pipelines = []
oof_preds = pd.Series(0, index=imputed_df.index)
target = imputed_df['stroke']
pipelines, train_auc, val_auc, oof_preds_imputed = baseline(imputed_df, cv, oof_preds, target, train_auc, val_auc, pipelines, model, encoder, scaler)
print()
print(f'Imputed with mean Train AUC: {np.mean(train_auc)}')
print(f'Imputed with Val AUC: {np.mean(val_auc)}')

TypeError: MeanEncoder.fit() missing 1 required positional argument: 'y'

The Kolmogorov–Smirnov test is a nonparametric goodness-of-fit test and is used to determine wether two distributions differ, or whether an underlying probability distribution differes from a hypothesized distribution. It is used when we have two samples coming from two populations that can be different.

In [None]:
ks_2samp(oof_preds_dt[target==0], oof_preds_dt[target==1])

KstestResult(statistic=0.9120434332352834, pvalue=0.0, statistic_location=0.10622474607800855, statistic_sign=1)

In [None]:
ks_2samp(oof_preds_imputed[target==0], oof_preds_imputed[target==1])

KstestResult(statistic=0.9297365330640721, pvalue=0.0, statistic_location=0.07480233752247004, statistic_sign=1)

In [None]:
ks_2samp(oof_preds_synth[target==0], oof_preds_synth[target==1])

KstestResult(statistic=0.6111572201592976, pvalue=6.144495479016284e-218, statistic_location=0.042823373877734515, statistic_sign=1)

In [None]:
preds = pd.Series(0, index=test_df.index)
X_test = test_df[all_features].copy()

for pipeline in pipelines:
    X_test = test_df[all_features].copy()
    encoder, scaler, model = pipeline
    X_test = encoder.transform(X_test)
    X_test = scaler.transform(X_test)
    preds += model.predict_proba(X_test)[:, 1]

preds /= len(pipelines)
preds.rename('stroke', inplace=True)
preds.to_csv('submission.csv')