In [7]:

import warnings
warnings.filterwarnings('ignore')
import math
import numpy as np
import pandas as pd
# import scipy.stats as stats
import statsmodels.api as sm

import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline

import seaborn as sns
sns.set(rc={'figure.figsize':(16,10)}, font_scale=1.3)

from scipy import stats

from sklearn.linear_model import LogisticRegression as lr

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics



In [8]:
df = pd.read_csv('./healthcare-dataset-stroke-data.csv')

In [9]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [10]:
# delete row with NA value
df_na = df.dropna()

In [11]:
df_na.reset_index(inplace=True)

In [12]:
pd.isnull(df_na).any()

index                False
id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [13]:
df_na

Unnamed: 0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
4905,5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
4906,5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
4907,5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [14]:
df_na['ever_married'] = df_na.ever_married.map({'Yes':1, 'No':0})
# For gender, 1 means Male, 0 means Female
df_na['gender'] = df_na.gender.map({'Male':1, 'Female':0})
# For Residence_type, 1 means Urban, 0 means Rural
df_na['Residence_type'] = df_na.Residence_type.map({'Urban':1, 'Rural':0})

In [15]:
pd.isnull(df_na).any()
# delete the row with 'gender' value of 'Other'
df_na = df_na.dropna()
df_na.reset_index(drop=True,inplace=True)
pd.isnull(df_na).any()

index                False
id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [16]:
df_data = df_na.drop(columns=['index','id'])
df_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
1,1.0,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
2,0.0,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
3,0.0,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1
4,1.0,81.0,0,0,1,Private,1,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4903,0.0,13.0,0,0,0,children,0,103.08,18.6,Unknown,0
4904,0.0,81.0,0,0,1,Self-employed,1,125.20,40.0,never smoked,0
4905,0.0,35.0,0,0,1,Self-employed,0,82.99,30.6,never smoked,0
4906,1.0,51.0,0,0,1,Private,0,166.29,25.6,formerly smoked,0


In [17]:
df_encoded = pd.get_dummies(df_data, columns = ['work_type','smoking_status'], \
                           prefix = {'work_type':'work_type', 'smoking_status' : 'smoking_status'}, drop_first=False)
df_encoded

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.0,67.0,0,1,1,1,228.69,36.6,1,0,0,1,0,0,0,1,0,0
1,1.0,80.0,0,1,1,0,105.92,32.5,1,0,0,1,0,0,0,0,1,0
2,0.0,49.0,0,0,1,1,171.23,34.4,1,0,0,1,0,0,0,0,0,1
3,0.0,79.0,1,0,1,0,174.12,24.0,1,0,0,0,1,0,0,0,1,0
4,1.0,81.0,0,0,1,1,186.21,29.0,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,0.0,13.0,0,0,0,0,103.08,18.6,0,0,0,0,0,1,1,0,0,0
4904,0.0,81.0,0,0,1,1,125.20,40.0,0,0,0,0,1,0,0,0,1,0
4905,0.0,35.0,0,0,1,0,82.99,30.6,0,0,0,0,1,0,0,0,1,0
4906,1.0,51.0,0,0,1,0,166.29,25.6,0,0,0,1,0,0,0,1,0,0


In [18]:
df_treatment = df_encoded[df_encoded['hypertension']==1]
df_control = df_encoded[df_encoded['hypertension']==0]

In [19]:
treat_mean = df_treatment.mean()
treat_std = df_treatment.std()
control_mean = df_control.mean()
control_std = df_control.std()

In [20]:
X_list = ['gender','age','heart_disease','ever_married','Residence_type','avg_glucose_level','bmi','work_type_Govt_job',
          'work_type_Never_worked','work_type_Private','work_type_Self-employed','work_type_children','smoking_status_Unknown',
          'smoking_status_formerly smoked','smoking_status_never smoked','smoking_status_smokes']
X_descrip = ['0 to Female and 1 to Male','age of the sample','observation of heart_disease',
             'Marriage states','0 to Rural and 1 to Urban','average glucose level','bmi','Whether is government job',
             'Whether is never worked','Whether job is private','Whether job is self-employed','Whether job is children',
             'Whether smoking status is unknown','Whether is formerly smoked','Whether is never smoked','Whether is smokes']

In [21]:
summary_list = []

for i in range(0, len(X_list)):
    summary_list.append([X_list[i],X_descrip[i],control_mean[X_list[i]],control_std[X_list[i]],treat_mean[X_list[i]],treat_std[X_list[i]],
                    stats.ttest_ind(df_control[X_list[i]], df_treatment[X_list[i]], equal_var=False)[0]])

summary_df = pd.DataFrame(summary_list,columns=
                          ['Label', 'Variable Description','Controls Mean','Controls STD','Treated Mean','Treated STD','t-Statistics'])
summary_df

Unnamed: 0,Label,Variable Description,Controls Mean,Controls STD,Treated Mean,Treated STD,t-Statistics
0,gender,0 to Female and 1 to Male,0.406327,0.491202,0.443459,0.497345,-1.512645
1,age,age of the sample,40.900184,22.330268,62.323725,13.877298,-29.183933
2,heart_disease,observation of heart_disease,0.041508,0.199484,0.128603,0.335132,-5.422738
3,ever_married,Marriage states,0.628225,0.483333,0.895787,0.305876,-16.597811
4,Residence_type,0 to Rural and 1 to Urban,0.507516,0.5,0.505543,0.500524,0.079782
5,avg_glucose_level,average glucose level,102.745255,41.622091,130.518958,60.536651,-9.518253
6,bmi,bmi,28.475432,7.639211,33.036585,8.711778,-10.709907
7,work_type_Govt_job,Whether is government job,0.126318,0.332245,0.148559,0.356048,-1.271715
8,work_type_Never_worked,Whether is never worked,0.004936,0.070091,0.0,0.0,4.701507
9,work_type_Private,Whether job is private,0.573256,0.49466,0.56541,0.496254,0.320033


In [22]:
df = df_encoded
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.0,67.0,0,1,1,1,228.69,36.6,1,0,0,1,0,0,0,1,0,0
1,1.0,80.0,0,1,1,0,105.92,32.5,1,0,0,1,0,0,0,0,1,0
2,0.0,49.0,0,0,1,1,171.23,34.4,1,0,0,1,0,0,0,0,0,1
3,0.0,79.0,1,0,1,0,174.12,24.0,1,0,0,0,1,0,0,0,1,0
4,1.0,81.0,0,0,1,1,186.21,29.0,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,0.0,13.0,0,0,0,0,103.08,18.6,0,0,0,0,0,1,1,0,0,0
4904,0.0,81.0,0,0,1,1,125.20,40.0,0,0,0,0,1,0,0,0,1,0
4905,0.0,35.0,0,0,1,0,82.99,30.6,0,0,0,0,1,0,0,0,1,0
4906,1.0,51.0,0,0,1,0,166.29,25.6,0,0,0,1,0,0,0,1,0,0


In [23]:
Y = df.stroke

df_data = df.loc[:,df.columns !='stroke']
T = df_data.hypertension
X = df_data.loc[:,df_data.columns !='hypertension']

In [24]:
X_h = ['age','heart_disease','avg_glucose_level','bmi']

In [25]:
T.mean()

0.09189079054604726

In [52]:
# weighted based on size ratio of treatment group and control group
model_initial = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:4.75}))])
model_initial.fit(X.loc[:,X_h], T)

predictions_binary = model_initial.predict(X.loc[:,X_h])
print('Accuracy: {:.4f}\n'.format(metrics.accuracy_score(T, predictions_binary)))
print('Confusion matrix:\n{}\n'.format(metrics.confusion_matrix(T, predictions_binary)))
print('F1 score is: {:.4f}'.format(metrics.f1_score(T, predictions_binary)))

Accuracy: 0.8138

Confusion matrix:
[[3756  701]
 [ 213  238]]

F1 score is: 0.3424


In [53]:
likelihood_ratio_table = []
X_l = X_h.copy()

while not likelihood_ratio_table or max([x for x in likelihood_ratio_table[-1].values() if x != '-']) >= 1:
    if likelihood_ratio_table:
        max_likelihood_ratio = max([x for x in likelihood_ratio_table[-1].values() if x != '-'])
        next_label = [key for key in likelihood_ratio_table[-1] if likelihood_ratio_table[-1][key] == max_likelihood_ratio][0]
        X_l.append(next_label)
    model = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:4.75}))])
    model.fit(X.loc[:,X_l], T)
    step = {}
    for x in X.columns:
        if x in X_l:
            step[x] = '-'
        else:
            X_new = X_l.copy()
            X_new.append(x)
            model_new = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:4.75}))])
            model_new.fit(X.loc[:,X_new], T)
            
            step[x] = -2*((-metrics.log_loss(T, model.predict_proba(X.loc[:,X_l]))*len(T))
                      -(-metrics.log_loss(T, model_new.predict_proba(X.loc[:,X_new]))*len(T)))
    likelihood_ratio_table.append(step)

In [54]:
linear_terms_list = []
for x in X.columns:
    row = [step[x] for step in likelihood_ratio_table]
    row.insert(0, x)
    linear_terms_list.append(row)
    
columns = ['Label']
for i in range(0,len(likelihood_ratio_table)):
    step = 'Step ' + str(i+1)
    columns.append(step)
    
linear_terms_df = pd.DataFrame(linear_terms_list,columns = columns)
linear_terms_df

Unnamed: 0,Label,Step 1,Step 2,Step 3,Step 4,Step 5,Step 6
0,gender,2.760703,2.816642,3.69661,3.751032,-,-
1,age,-,-,-,-,-,-
2,heart_disease,-,-,-,-,-,-
3,ever_married,1.077945,0.441671,0.655993,0.357304,0.297107,0.319278
4,Residence_type,0.108128,0.14971,0.083488,0.083278,0.051117,0.033348
5,avg_glucose_level,-,-,-,-,-,-
6,bmi,-,-,-,-,-,-
7,work_type_Govt_job,-0.263682,-0.088568,-0.089424,-0.038571,-0.070145,-0.146768
8,work_type_Never_worked,0.619679,0.64797,0.620592,0.676138,0.682143,0.685405
9,work_type_Private,0.247748,0.223716,0.21701,0.469573,0.522797,-0.121361


In [55]:
X_l

['age',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'smoking_status_Unknown',
 'smoking_status_formerly smoked',
 'work_type_children',
 'gender',
 'work_type_Self-employed']

In [56]:
model_linear = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:4.75}))])
model_linear.fit(X.loc[:,X_l], T)

predictions_binary = model_linear.predict(X.loc[:,X_l])
print('Accuracy: {:.4f}\n'.format(metrics.accuracy_score(T, predictions_binary)))
print('Confusion matrix:\n{}\n'.format(metrics.confusion_matrix(T, predictions_binary)))
print('F1 score is: {:.4f}'.format(metrics.f1_score(T, predictions_binary)))

Accuracy: 0.8162

Confusion matrix:
[[3768  689]
 [ 213  238]]

F1 score is: 0.3454


In [57]:
X_copy = X.loc[:,X_l]

for i in range(0, len(X_l)):
    j = i
    while j < len(X_l):
        X_copy[X_l[i]+'*'+X_l[j]] = X_copy[X_l[i]] * X_copy[X_l[j]]
        j += 1

X_copy

Unnamed: 0,age,heart_disease,avg_glucose_level,bmi,smoking_status_Unknown,smoking_status_formerly smoked,work_type_children,gender,work_type_Self-employed,age*age,...,smoking_status_formerly smoked*smoking_status_formerly smoked,smoking_status_formerly smoked*work_type_children,smoking_status_formerly smoked*gender,smoking_status_formerly smoked*work_type_Self-employed,work_type_children*work_type_children,work_type_children*gender,work_type_children*work_type_Self-employed,gender*gender,gender*work_type_Self-employed,work_type_Self-employed*work_type_Self-employed
0,67.0,1,228.69,36.6,0,1,0,1.0,0,4489.0,...,1,0,1.0,0,0,0.0,0,1.0,0.0,0
1,80.0,1,105.92,32.5,0,0,0,1.0,0,6400.0,...,0,0,0.0,0,0,0.0,0,1.0,0.0,0
2,49.0,0,171.23,34.4,0,0,0,0.0,0,2401.0,...,0,0,0.0,0,0,0.0,0,0.0,0.0,0
3,79.0,0,174.12,24.0,0,0,0,0.0,1,6241.0,...,0,0,0.0,0,0,0.0,0,0.0,0.0,1
4,81.0,0,186.21,29.0,0,1,0,1.0,0,6561.0,...,1,0,1.0,0,0,0.0,0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,13.0,0,103.08,18.6,1,0,1,0.0,0,169.0,...,0,0,0.0,0,1,0.0,0,0.0,0.0,0
4904,81.0,0,125.20,40.0,0,0,0,0.0,1,6561.0,...,0,0,0.0,0,0,0.0,0,0.0,0.0,1
4905,35.0,0,82.99,30.6,0,0,0,0.0,1,1225.0,...,0,0,0.0,0,0,0.0,0,0.0,0.0,1
4906,51.0,0,166.29,25.6,0,1,0,1.0,0,2601.0,...,1,0,1.0,0,0,0.0,0,1.0,0.0,0


In [58]:
likelihood_ratio_table = []
X_2 = X_l.copy()

while not likelihood_ratio_table or max([x for x in likelihood_ratio_table[-1].values() if x != '-']) >= 2.71:
    if likelihood_ratio_table:
        max_likelihood_ratio = max([x for x in likelihood_ratio_table[-1].values() if x != '-'])
        next_label = [key for key in likelihood_ratio_table[-1] if likelihood_ratio_table[-1][key] == max_likelihood_ratio][0]
        X_2.append(next_label)
    model = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:4.75}))])
    model.fit(X_copy.loc[:,X_2], T)
    step = {}
    for x in X_copy.columns:
        if x in X_2:
            step[x] = '-'
        else:
            X_new = X_2.copy()
            X_new.append(x)
            model_new = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:4.75}))])
            model_new.fit(X_copy.loc[:,X_new], T)
            
            step[x] = -2*((-metrics.log_loss(T, model.predict_proba(X_copy.loc[:,X_2]))*len(T))
                      -(-metrics.log_loss(T, model_new.predict_proba(X_copy.loc[:,X_new]))*len(T)))
    likelihood_ratio_table.append(step)

In [59]:
final_terms_list = []
for x in X_copy.columns:
    row = [step[x] for step in likelihood_ratio_table]
    row.insert(0, x)
    final_terms_list.append(row)
    
final_columns = ['Label']
for i in range(0,len(likelihood_ratio_table)):
    step = 'Step ' + str(i+1)
    final_columns.append(step)
    
final_terms_df = pd.DataFrame(final_terms_list,columns = final_columns)
final_terms_df

Unnamed: 0,Label,Step 1,Step 2,Step 3,Step 4,Step 5,Step 6,Step 7,Step 8,Step 9,Step 10
0,age,-,-,-,-,-,-,-,-,-,-
1,heart_disease,-,-,-,-,-,-,-,-,-,-
2,avg_glucose_level,-,-,-,-,-,-,-,-,-,-
3,bmi,-,-,-,-,-,-,-,-,-,-
4,smoking_status_Unknown,-,-,-,-,-,-,-,-,-,-
5,smoking_status_formerly smoked,-,-,-,-,-,-,-,-,-,-
6,work_type_children,-,-,-,-,-,-,-,-,-,-
7,gender,-,-,-,-,-,-,-,-,-,-
8,work_type_Self-employed,-,-,-,-,-,-,-,-,-,-
9,age*age,15.952247,-,-,-,-,-,-,-,-,-


In [60]:
X_2

['age',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'smoking_status_Unknown',
 'smoking_status_formerly smoked',
 'work_type_children',
 'gender',
 'work_type_Self-employed',
 'age*age',
 'age*bmi',
 'age*gender',
 'gender*work_type_Self-employed',
 'heart_disease*work_type_Self-employed',
 'smoking_status_Unknown*work_type_Self-employed',
 'avg_glucose_level*smoking_status_Unknown',
 'age*smoking_status_formerly smoked',
 'avg_glucose_level*bmi']

In [61]:
model_final = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:4.75}))])
model_final.fit(X_copy.loc[:,X_2], T)

predictions_binary = model_final.predict(X_copy.loc[:,X_2])
print('Accuracy: {:.4f}\n'.format(metrics.accuracy_score(T, predictions_binary)))
print('Confusion matrix:\n{}\n'.format(metrics.confusion_matrix(T, predictions_binary)))
print('F1 score is: {:.4f}'.format(metrics.f1_score(T, predictions_binary)))

Accuracy: 0.8115

Confusion matrix:
[[3725  732]
 [ 193  258]]

F1 score is: 0.3581


In [62]:
treat_index = []
control_index = []
for i in range(0,len(T)):
    if T[i] == 1:
        treat_index.append(i)
    elif T[i] == 0:
        control_index.append(i)

In [63]:
predictions = model_final.predict_proba(X_copy.loc[:,X_2])

ate_ipw = sum([T[i]*Y[i]/predictions[i][1] for i in treat_index]) / sum([T[i]/predictions[i][1] for i in treat_index]) 
- sum([(1-T[i])*Y[i]/(1-predictions[i][1]) for i in control_index]) / sum([(1-T[i])/(1-predictions[i][1]) for i in control_index])

-0.05317933928430953