In [106]:
from sklearn.linear_model import LogisticRegression as lr

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics

In [107]:
# Enabled to remove warnings for demo purposes.
import warnings
warnings.filterwarnings('ignore')

In [108]:
import math
import numpy as np
import pandas as pd
# import scipy.stats as stats
import statsmodels.api as sm

# Plotting setup: matplotlib with the 'classic' style sheet; the magic below
# renders figures inline in the notebook.
import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline

# Seaborn defaults: (16, 10) figure size and 1.3x font scaling.
# NOTE(review): sns.set runs after plt.style.use, so it may override parts of
# the 'classic' style — confirm this is the intended look.
import seaborn as sns
sns.set(rc={'figure.figsize':(16,10)}, font_scale=1.3)

In [109]:
from scipy import stats

In [110]:
# Read the processed stroke data (path is relative to the notebook's working
# directory) and show the frame as the cell output.
df = pd.read_csv(filepath_or_buffer='./stroke_data_processed.csv')
df

Unnamed: 0.1,Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke,treatment_bmi
0,0,1.0,67.0,0,1,1,Private,1,228.69,formerly smoked,1,1
1,1,1.0,80.0,0,1,1,Private,0,105.92,never smoked,1,1
2,2,0.0,49.0,0,0,1,Private,1,171.23,smokes,1,1
3,3,0.0,79.0,1,0,1,Self-employed,0,174.12,never smoked,1,0
4,4,1.0,81.0,0,0,1,Private,1,186.21,formerly smoked,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4903,4904,0.0,13.0,0,0,0,children,0,103.08,Unknown,0,0
4904,4905,0.0,81.0,0,0,1,Self-employed,1,125.20,never smoked,0,1
4905,4906,0.0,35.0,0,0,1,Self-employed,0,82.99,never smoked,0,1
4906,4907,1.0,51.0,0,0,1,Private,0,166.29,formerly smoked,0,0


In [111]:
# Remove the 'Unnamed: 0' column (presumably a leftover saved index — confirm).
# Re-assigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# One-hot encode the two categorical columns, keeping every level
# (drop_first=False). The dtype is pinned to uint8 so the indicator columns are
# 0/1 integers on every pandas version (pandas >= 2.0 defaults to booleans).
df_encoded = pd.get_dummies(
    df,
    columns=['work_type', 'smoking_status'],
    prefix={'work_type': 'work_type', 'smoking_status': 'smoking_status'},
    drop_first=False,
    dtype='uint8',
)

# Persist the encoded frame; the index is written as the first CSV column.
df_encoded.to_csv('final_dataset.csv')

In [112]:
# Sum of the treatment_bmi column; it appears to be a 0/1 flag, in which case
# this is the number of treated rows.
df_encoded.loc[:, 'treatment_bmi'].sum()

1920

In [113]:
# Treated group: rows whose treatment_bmi value equals 1.
df_treatment = df_encoded.loc[df_encoded['treatment_bmi'].eq(1)]
df_treatment

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,stroke,treatment_bmi,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.0,67.0,0,1,1,1,228.69,1,1,0,0,1,0,0,0,1,0,0
1,1.0,80.0,0,1,1,0,105.92,1,1,0,0,1,0,0,0,0,1,0
2,0.0,49.0,0,0,1,1,171.23,1,1,0,0,1,0,0,0,0,0,1
9,0.0,61.0,0,1,1,0,120.46,1,1,1,0,0,0,0,0,0,0,1
12,0.0,50.0,1,0,1,0,167.41,1,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,1.0,76.0,0,0,1,1,82.35,0,1,1,0,0,0,0,0,0,1,0
4897,1.0,68.0,0,0,1,1,91.68,0,1,0,0,0,1,0,1,0,0,0
4902,0.0,18.0,0,0,0,1,82.85,0,1,0,0,1,0,0,1,0,0,0
4904,0.0,81.0,0,0,1,1,125.20,0,1,0,0,0,1,0,0,0,1,0


In [114]:
# Control group: rows whose treatment_bmi value equals 0.
df_control = df_encoded.loc[df_encoded['treatment_bmi'].eq(0)]
df_control

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,stroke,treatment_bmi,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
3,0.0,79.0,1,0,1,0,174.12,1,0,0,0,0,1,0,0,0,1,0
4,1.0,81.0,0,0,1,1,186.21,1,0,0,0,1,0,0,0,1,0,0
5,1.0,74.0,1,1,1,0,70.09,1,0,0,0,1,0,0,0,0,1,0
6,0.0,69.0,0,0,0,1,94.39,1,0,0,0,1,0,0,0,0,1,0
7,0.0,78.0,0,0,1,1,58.57,1,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4900,0.0,45.0,0,0,1,1,97.95,0,0,0,0,1,0,0,1,0,0,0
4901,0.0,57.0,0,0,1,0,77.93,0,0,0,0,1,0,0,0,0,1,0
4903,0.0,13.0,0,0,0,0,103.08,0,0,0,0,0,0,1,1,0,0,0
4906,1.0,51.0,0,0,1,0,166.29,0,0,0,0,1,0,0,0,1,0,0


In [115]:
# Per-column mean and standard deviation of the treated group.
treat_mean, treat_std = df_treatment.mean(), df_treatment.std()

In [116]:
# Per-column mean and standard deviation of the control group.
control_mean, control_std = df_control.mean(), df_control.std()

In [117]:
# Covariates compared between the control and treated groups, each paired with
# the description shown in the summary table. Keeping label and description side
# by side prevents the two lists from drifting out of alignment (the three
# smoking-status descriptions were previously a copy of the gender text).
X_info = [
    ('gender', '0 to Female and 1 to Male'),
    ('age', 'age of the sample'),
    ('hypertension', 'observation of hypertension'),
    ('heart_disease', 'observation of heart_disease'),
    ('ever_married', 'Marriage states'),
    ('Residence_type', '0 to Rural and 1 to Urban'),
    ('avg_glucose_level', 'average glucose level'),
    ('work_type_Govt_job', 'Whether is government job'),
    ('work_type_Never_worked', 'Whether is never worked job'),
    ('work_type_Private', 'Whether is private job'),
    ('work_type_Self-employed', 'Whether is self-employed job'),
    ('work_type_children', 'Whether is children job'),
    ('smoking_status_Unknown', 'Whether is unknown smoking'),
    ('smoking_status_formerly smoked', 'Whether formerly smoked'),
    ('smoking_status_never smoked', 'Whether never smoked'),
    ('smoking_status_smokes', 'Whether currently smokes'),
]

# Column labels and their descriptions, in matching order.
X_list = [label for label, _ in X_info]
X_descrip = [description for _, description in X_info]

In [118]:
# Sanity check: the number of descriptions should equal the number of labels in X_list.
len(X_descrip)

16

In [119]:
# Build the covariate comparison table: one row per covariate holding the mean
# and standard deviation in each group plus the t-statistic of the
# control-vs-treated comparison (equal_var=False, i.e. Welch's t-test).
summary_list = [
    [
        label,
        description,
        control_mean[label],
        control_std[label],
        treat_mean[label],
        treat_std[label],
        stats.ttest_ind(df_control[label], df_treatment[label], equal_var=False)[0],
    ]
    for label, description in zip(X_list, X_descrip)
]

summary_df = pd.DataFrame(
    summary_list,
    columns=[
        'Label',
        'Variable Description',
        'Controls Mean',
        'Controls STD',
        'Treated Mean',
        'Treated STD',
        't-Statistics',
    ],
)

In [120]:
# Display the covariate summary table (one row per entry of X_list).
summary_df

Unnamed: 0,Label,Variable Description,Controls Mean,Controls STD,Treated Mean,Treated STD,t-Statistics
0,gender,0 to Female and 1 to Male,0.403614,0.490704,0.419271,0.493568,-1.086971
1,age,age of the sample,38.435181,24.417835,49.768646,17.166306,-19.074881
2,hypertension,observation of hypertension,0.058568,0.234853,0.14375,0.350927,-9.372606
3,heart_disease,observation of heart_disease,0.041165,0.198704,0.0625,0.242125,-3.225694
4,ever_married,Marriage states,0.557229,0.496797,0.801562,0.398927,-18.993364
5,Residence_type,0 to Rural and 1 to Urban,0.508367,0.500014,0.505729,0.500097,0.180334
6,avg_glucose_level,average glucose level,99.631493,37.844651,114.114974,51.861464,-10.562708
7,work_type_Govt_job,Whether is government job,0.111446,0.314736,0.154688,0.361701,-4.296516
8,work_type_Never_worked,Whether is never worked job,0.005689,0.075226,0.002604,0.050978,1.712078
9,work_type_Private,Whether is private job,0.524766,0.49947,0.646875,0.478066,-8.580426
