In [109]:
from sklearn.linear_model import LogisticRegression as lr

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics

In [110]:
# Enabled to remove warnings for demo purposes.
import warnings
warnings.filterwarnings('ignore')

In [111]:
import math
import numpy as np
import pandas as pd
# import scipy.stats as stats
import statsmodels.api as sm

import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline

import seaborn as sns
sns.set(rc={'figure.figsize':(16,10)}, font_scale=1.3)

from scipy import stats

In [112]:
df = pd.read_csv('./healthcare-dataset-stroke-data.csv')

In [113]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [114]:
# delete row with NA value
df_na = df.dropna()

In [115]:
df_na.reset_index(inplace=True)

In [116]:
pd.isnull(df_na).any()

index                False
id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [117]:
df_na

Unnamed: 0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
4905,5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
4906,5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
4907,5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [118]:
df_na['ever_married'] = df_na.ever_married.map({'Yes':1, 'No':0})
# For gender, 1 means Male, 0 means Female
df_na['gender'] = df_na.gender.map({'Male':1, 'Female':0})
# For Residence_type, 1 means Urban, 0 means Rural
df_na['Residence_type'] = df_na.Residence_type.map({'Urban':1, 'Rural':0})

In [119]:
pd.isnull(df_na).any()
# there is one sample with 'Other' gender'

index                False
id                   False
gender                True
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [120]:
# delete the row with 'gender' value of 'Other'
df_na = df_na.dropna()
df_na.reset_index(drop=True,inplace=True)
pd.isnull(df_na).any()

index                False
id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [121]:
df_na

Unnamed: 0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,9046,1.0,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
1,2,31112,1.0,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
2,3,60182,0.0,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
3,4,1665,0.0,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1
4,5,56669,1.0,81.0,0,0,1,Private,1,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,5104,14180,0.0,13.0,0,0,0,children,0,103.08,18.6,Unknown,0
4904,5106,44873,0.0,81.0,0,0,1,Self-employed,1,125.20,40.0,never smoked,0
4905,5107,19723,0.0,35.0,0,0,1,Self-employed,0,82.99,30.6,never smoked,0
4906,5108,37544,1.0,51.0,0,0,1,Private,0,166.29,25.6,formerly smoked,0


In [122]:
def BMI_check(x):
    if x >= 30:
        return 1
    else:
        return 0
    
# Use bmi as treatment. The bound is 30. 1 means higher or equal. 0 means lower.
df_na['treatment_bmi'] = df_na.bmi.apply(BMI_check)

In [123]:
df_data = df_na.drop(columns=['index','id','bmi'])
df_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke,treatment_bmi
0,1.0,67.0,0,1,1,Private,1,228.69,formerly smoked,1,1
1,1.0,80.0,0,1,1,Private,0,105.92,never smoked,1,1
2,0.0,49.0,0,0,1,Private,1,171.23,smokes,1,1
3,0.0,79.0,1,0,1,Self-employed,0,174.12,never smoked,1,0
4,1.0,81.0,0,0,1,Private,1,186.21,formerly smoked,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4903,0.0,13.0,0,0,0,children,0,103.08,Unknown,0,0
4904,0.0,81.0,0,0,1,Self-employed,1,125.20,never smoked,0,1
4905,0.0,35.0,0,0,1,Self-employed,0,82.99,never smoked,0,1
4906,1.0,51.0,0,0,1,Private,0,166.29,formerly smoked,0,0


In [124]:
df_encoded = pd.get_dummies(df_data, columns = ['work_type','smoking_status'], \
                           prefix = {'work_type':'work_type', 'smoking_status' : 'smoking_status'}, drop_first=False)
df_encoded

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,stroke,treatment_bmi,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.0,67.0,0,1,1,1,228.69,1,1,0,0,1,0,0,0,1,0,0
1,1.0,80.0,0,1,1,0,105.92,1,1,0,0,1,0,0,0,0,1,0
2,0.0,49.0,0,0,1,1,171.23,1,1,0,0,1,0,0,0,0,0,1
3,0.0,79.0,1,0,1,0,174.12,1,0,0,0,0,1,0,0,0,1,0
4,1.0,81.0,0,0,1,1,186.21,1,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,0.0,13.0,0,0,0,0,103.08,0,0,0,0,0,0,1,1,0,0,0
4904,0.0,81.0,0,0,1,1,125.20,0,1,0,0,0,1,0,0,0,1,0
4905,0.0,35.0,0,0,1,0,82.99,0,1,0,0,0,1,0,0,0,1,0
4906,1.0,51.0,0,0,1,0,166.29,0,0,0,0,1,0,0,0,1,0,0


In [125]:
df_treatment = df_encoded[df_encoded['treatment_bmi']==1]
df_control = df_encoded[df_encoded['treatment_bmi']==0]

In [126]:
treat_mean = df_treatment.mean()
treat_std = df_treatment.std()
control_mean = df_control.mean()
control_std = df_control.std()

In [127]:
X_list = ['gender','age','hypertension','heart_disease','ever_married','Residence_type','avg_glucose_level','work_type_Govt_job',
          'work_type_Never_worked','work_type_Private','work_type_Self-employed','work_type_children','smoking_status_Unknown',
          'smoking_status_formerly smoked','smoking_status_never smoked','smoking_status_smokes']
X_descrip = ['0 to Female and 1 to Male','age of the sample','observation of hypertension','observation of heart_disease',
             'Marriage states','0 to Rural and 1 to Urban','average glucose level','Whether is government job',
             'Whether is never worked','Whether job is private','Whether job is self-employed','Whether job is children',
             'Whether smoking status is unknown','Whether is formerly smoked','Whether is never smoked','Whether is smokes']

In [128]:
summary_list = []

for i in range(0, len(X_list)):
    summary_list.append([X_list[i],X_descrip[i],control_mean[X_list[i]],control_std[X_list[i]],treat_mean[X_list[i]],treat_std[X_list[i]],
                    stats.ttest_ind(df_control[X_list[i]], df_treatment[X_list[i]], equal_var=False)[0]])

summary_df = pd.DataFrame(summary_list,columns=
                          ['Label', 'Variable Description','Controls Mean','Controls STD','Treated Mean','Treated STD','t-Statistics'])
summary_df

Unnamed: 0,Label,Variable Description,Controls Mean,Controls STD,Treated Mean,Treated STD,t-Statistics
0,gender,0 to Female and 1 to Male,0.403614,0.490704,0.419271,0.493568,-1.086971
1,age,age of the sample,38.435181,24.417835,49.768646,17.166306,-19.074881
2,hypertension,observation of hypertension,0.058568,0.234853,0.14375,0.350927,-9.372606
3,heart_disease,observation of heart_disease,0.041165,0.198704,0.0625,0.242125,-3.225694
4,ever_married,Marriage states,0.557229,0.496797,0.801562,0.398927,-18.993364
5,Residence_type,0 to Rural and 1 to Urban,0.508367,0.500014,0.505729,0.500097,0.180334
6,avg_glucose_level,average glucose level,99.631493,37.844651,114.114974,51.861464,-10.562708
7,work_type_Govt_job,Whether is government job,0.111446,0.314736,0.154688,0.361701,-4.296516
8,work_type_Never_worked,Whether is never worked,0.005689,0.075226,0.002604,0.050978,1.712078
9,work_type_Private,Whether job is private,0.524766,0.49947,0.646875,0.478066,-8.580426


In [129]:
df = df_encoded
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,stroke,treatment_bmi,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.0,67.0,0,1,1,1,228.69,1,1,0,0,1,0,0,0,1,0,0
1,1.0,80.0,0,1,1,0,105.92,1,1,0,0,1,0,0,0,0,1,0
2,0.0,49.0,0,0,1,1,171.23,1,1,0,0,1,0,0,0,0,0,1
3,0.0,79.0,1,0,1,0,174.12,1,0,0,0,0,1,0,0,0,1,0
4,1.0,81.0,0,0,1,1,186.21,1,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,0.0,13.0,0,0,0,0,103.08,0,0,0,0,0,0,1,1,0,0,0
4904,0.0,81.0,0,0,1,1,125.20,0,1,0,0,0,1,0,0,0,1,0
4905,0.0,35.0,0,0,1,0,82.99,0,1,0,0,0,1,0,0,0,1,0
4906,1.0,51.0,0,0,1,0,166.29,0,0,0,0,1,0,0,0,1,0,0


In [130]:
Y = df.stroke

df_data = df.loc[:,df.columns !='stroke']
T = df_data.treatment_bmi
X = df_data.loc[:,df_data.columns !='treatment_bmi']

In [131]:
# Choose age,hypertension,heart_disease,avg_glucose_level as X_b
X_b = ['age','hypertension','heart_disease','avg_glucose_level']

In [132]:
T.mean()
# Since the sample size is unbalanced, we will weighted them in the regression model

0.39119804400978

In [133]:
# weighted based on size ratio of treatment group and control group
model_initial = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.5}))])
model_initial.fit(X.loc[:,X_b], T)

predictions_binary = model_initial.predict(X.loc[:,X_b])
print('Accuracy: {:.4f}\n'.format(metrics.accuracy_score(T, predictions_binary)))
print('Confusion matrix:\n{}\n'.format(metrics.confusion_matrix(T, predictions_binary)))
print('F1 score is: {:.4f}'.format(metrics.f1_score(T, predictions_binary)))

Accuracy: 0.6055

Confusion matrix:
[[1895 1093]
 [ 843 1077]]

F1 score is: 0.5267


In [134]:
likelihood_ratio_table = []
X_l = X_b.copy()

while not likelihood_ratio_table or max([x for x in likelihood_ratio_table[-1].values() if x != '-']) >= 1:
    if likelihood_ratio_table:
        max_likelihood_ratio = max([x for x in likelihood_ratio_table[-1].values() if x != '-'])
        next_label = [key for key in likelihood_ratio_table[-1] if likelihood_ratio_table[-1][key] == max_likelihood_ratio][0]
        X_l.append(next_label)
    model = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.5}))])
    model.fit(X.loc[:,X_l], T)
    step = {}
    for x in X.columns:
        if x in X_l:
            step[x] = '-'
        else:
            X_new = X_l.copy()
            X_new.append(x)
            model_new = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.5}))])
            model_new.fit(X.loc[:,X_new], T)
            
            step[x] = -2*((-metrics.log_loss(T, model.predict_proba(X.loc[:,X_l]))*len(T))
                      -(-metrics.log_loss(T, model_new.predict_proba(X.loc[:,X_new]))*len(T)))
    likelihood_ratio_table.append(step)
    

In [135]:
linear_terms_list = []
for x in X.columns:
    row = [step[x] for step in likelihood_ratio_table]
    row.insert(0, x)
    linear_terms_list.append(row)
    
columns = ['Label']
for i in range(0,len(likelihood_ratio_table)):
    step = 'Step ' + str(i+1)
    columns.append(step)
    
linear_terms_df = pd.DataFrame(linear_terms_list,columns = columns)
linear_terms_df

Unnamed: 0,Label,Step 1,Step 2,Step 3,Step 4,Step 5,Step 6,Step 7
0,gender,1.569423,5.786234,5.734279,5.162963,-,-,-
1,age,-,-,-,-,-,-,-
2,hypertension,-,-,-,-,-,-,-
3,heart_disease,-,-,-,-,-,-,-
4,ever_married,78.561411,35.931638,-,-,-,-,-
5,Residence_type,0.100047,0.064964,0.052424,0.054262,0.046287,0.039798,0.029011
6,avg_glucose_level,-,-,-,-,-,-,-
7,work_type_Govt_job,5.97007,1.604171,0.986685,0.974741,1.011634,0.443847,0.412827
8,work_type_Never_worked,0.076288,2.512238,1.338486,1.273616,1.379935,1.378694,-
9,work_type_Private,55.49972,0.527418,0.484006,0.508618,0.479834,0.179588,0.404223


In [136]:
# The linear terms we need to include
X_l

['age',
 'hypertension',
 'heart_disease',
 'avg_glucose_level',
 'work_type_children',
 'ever_married',
 'smoking_status_formerly smoked',
 'gender',
 'work_type_Self-employed',
 'work_type_Never_worked']

In [137]:
model_linear = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.5}))])
model_linear.fit(X.loc[:,X_l], T)

predictions_binary = model_linear.predict(X.loc[:,X_l])
print('Accuracy: {:.4f}\n'.format(metrics.accuracy_score(T, predictions_binary)))
print('Confusion matrix:\n{}\n'.format(metrics.confusion_matrix(T, predictions_binary)))
print('F1 score is: {:.4f}'.format(metrics.f1_score(T, predictions_binary)))

Accuracy: 0.6037

Confusion matrix:
[[1506 1482]
 [ 463 1457]]

F1 score is: 0.5997


In [138]:
X_copy = X.loc[:,X_l]

for i in range(0, len(X_l)):
    j = i
    while j < len(X_l):
        X_copy[X_l[i]+'*'+X_l[j]] = X_copy[X_l[i]] * X_copy[X_l[j]]
        j += 1

X_copy

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,work_type_children,ever_married,smoking_status_formerly smoked,gender,work_type_Self-employed,work_type_Never_worked,...,smoking_status_formerly smoked*smoking_status_formerly smoked,smoking_status_formerly smoked*gender,smoking_status_formerly smoked*work_type_Self-employed,smoking_status_formerly smoked*work_type_Never_worked,gender*gender,gender*work_type_Self-employed,gender*work_type_Never_worked,work_type_Self-employed*work_type_Self-employed,work_type_Self-employed*work_type_Never_worked,work_type_Never_worked*work_type_Never_worked
0,67.0,0,1,228.69,0,1,1,1.0,0,0,...,1,1.0,0,0,1.0,0.0,0.0,0,0,0
1,80.0,0,1,105.92,0,1,0,1.0,0,0,...,0,0.0,0,0,1.0,0.0,0.0,0,0,0
2,49.0,0,0,171.23,0,1,0,0.0,0,0,...,0,0.0,0,0,0.0,0.0,0.0,0,0,0
3,79.0,1,0,174.12,0,1,0,0.0,1,0,...,0,0.0,0,0,0.0,0.0,0.0,1,0,0
4,81.0,0,0,186.21,0,1,1,1.0,0,0,...,1,1.0,0,0,1.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,13.0,0,0,103.08,1,0,0,0.0,0,0,...,0,0.0,0,0,0.0,0.0,0.0,0,0,0
4904,81.0,0,0,125.20,0,1,0,0.0,1,0,...,0,0.0,0,0,0.0,0.0,0.0,1,0,0
4905,35.0,0,0,82.99,0,1,0,0.0,1,0,...,0,0.0,0,0,0.0,0.0,0.0,1,0,0
4906,51.0,0,0,166.29,0,1,1,1.0,0,0,...,1,1.0,0,0,1.0,0.0,0.0,0,0,0


In [139]:
likelihood_ratio_table = []
X_2 = X_l.copy()

while not likelihood_ratio_table or max([x for x in likelihood_ratio_table[-1].values() if x != '-']) >= 2.71:
    if likelihood_ratio_table:
        max_likelihood_ratio = max([x for x in likelihood_ratio_table[-1].values() if x != '-'])
        next_label = [key for key in likelihood_ratio_table[-1] if likelihood_ratio_table[-1][key] == max_likelihood_ratio][0]
        X_2.append(next_label)
    model = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.5}))])
    model.fit(X_copy.loc[:,X_2], T)
    step = {}
    for x in X_copy.columns:
        if x in X_2:
            step[x] = '-'
        else:
            X_new = X_2.copy()
            X_new.append(x)
            model_new = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.5}))])
            model_new.fit(X_copy.loc[:,X_new], T)
            
            step[x] = -2*((-metrics.log_loss(T, model.predict_proba(X_copy.loc[:,X_2]))*len(T))
                      -(-metrics.log_loss(T, model_new.predict_proba(X_copy.loc[:,X_new]))*len(T)))
    likelihood_ratio_table.append(step)

In [140]:
final_terms_list = []
for x in X_copy.columns:
    row = [step[x] for step in likelihood_ratio_table]
    row.insert(0, x)
    final_terms_list.append(row)
    
final_columns = ['Label']
for i in range(0,len(likelihood_ratio_table)):
    step = 'Step ' + str(i+1)
    final_columns.append(step)
    
final_terms_df = pd.DataFrame(final_terms_list,columns = final_columns)
final_terms_df

Unnamed: 0,Label,Step 1,Step 2,Step 3,Step 4,Step 5,Step 6,Step 7,Step 8,Step 9,Step 10,Step 11,Step 12
0,age,-,-,-,-,-,-,-,-,-,-,-,-
1,hypertension,-,-,-,-,-,-,-,-,-,-,-,-
2,heart_disease,-,-,-,-,-,-,-,-,-,-,-,-
3,avg_glucose_level,-,-,-,-,-,-,-,-,-,-,-,-
4,work_type_children,-,-,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,gender*work_type_Self-employed,0.047222,0.030021,0.051126,0.068857,0.087996,0.089939,0.059366,0.003143,-0.016522,-0.006677,0.010198,0.005247
61,gender*work_type_Never_worked,0.095687,0.145792,0.129843,0.155005,0.19042,0.172197,0.049688,0.063308,0.039769,0.066014,0.091861,0.091554
62,work_type_Self-employed*work_type_Self-employed,0.003233,0.006437,0.000343,0.000102,-0.016273,0.001892,-0.001782,0.01273,-0.019186,-0.038871,0.000656,0.009972
63,work_type_Self-employed*work_type_Never_worked,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0


In [141]:
# This is the final labela that we need to consider
X_2

['age',
 'hypertension',
 'heart_disease',
 'avg_glucose_level',
 'work_type_children',
 'ever_married',
 'smoking_status_formerly smoked',
 'gender',
 'work_type_Self-employed',
 'work_type_Never_worked',
 'age*age',
 'age*hypertension',
 'age*work_type_children',
 'age*avg_glucose_level',
 'age*ever_married',
 'age*gender',
 'avg_glucose_level*work_type_Self-employed',
 'ever_married*work_type_Self-employed',
 'avg_glucose_level*ever_married',
 'work_type_children*smoking_status_formerly smoked',
 'age*smoking_status_formerly smoked']

In [142]:
model_final = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.5}))])
model_final.fit(X_copy.loc[:,X_2], T)

predictions_binary = model_final.predict(X_copy.loc[:,X_2])
print('Accuracy: {:.4f}\n'.format(metrics.accuracy_score(T, predictions_binary)))
print('Confusion matrix:\n{}\n'.format(metrics.confusion_matrix(T, predictions_binary)))
print('F1 score is: {:.4f}'.format(metrics.f1_score(T, predictions_binary)))

Accuracy: 0.6318

Confusion matrix:
[[1633 1355]
 [ 452 1468]]

F1 score is: 0.6190


In [144]:
treat_index = []
control_index = []
for i in range(0,len(T)):
    if T[i] == 1:
        treat_index.append(i)
    elif T[i] == 0:
        control_index.append(i)

In [145]:
predictions = model_final.predict_proba(X_copy.loc[:,X_2])

ate_ipw = sum([T[i]*Y[i]/predictions[i][1] for i in treat_index]) / sum([T[i]/predictions[i][1] for i in treat_index]) 
- sum([(1-T[i])*Y[i]/(1-predictions[i][1]) for i in control_index]) / sum([(1-T[i])/(1-predictions[i][1]) for i in control_index])

-0.044172488914728664