In [1]:
from sklearn.linear_model import LogisticRegression as lr

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics

In [2]:
# Enabled to remove warnings for demo purposes.
import warnings
warnings.filterwarnings('ignore')

In [3]:
import math
import numpy as np
import pandas as pd
# import scipy.stats as stats
import statsmodels.api as sm

import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline

import seaborn as sns
sns.set(rc={'figure.figsize':(16,10)}, font_scale=1.3)

from scipy import stats

In [4]:
df = pd.read_csv('./healthcare-dataset-stroke-data.csv')

In [5]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [6]:
# delete row with NA value
df_na = df.dropna()

In [7]:
df_na.reset_index(inplace=True)

In [8]:
pd.isnull(df_na).any()

index                False
id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [9]:
df_na

Unnamed: 0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
4905,5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
4906,5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
4907,5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [10]:
df_na['ever_married'] = df_na.ever_married.map({'Yes':1, 'No':0})
# For gender, 1 means Male, 0 means Female
df_na['gender'] = df_na.gender.map({'Male':1, 'Female':0})
# For Residence_type, 1 means Urban, 0 means Rural
df_na['Residence_type'] = df_na.Residence_type.map({'Urban':1, 'Rural':0})

In [11]:
pd.isnull(df_na).any()

index                False
id                   False
gender                True
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [12]:
df_na = df_na.dropna()
df_na.reset_index(drop=True,inplace=True)
pd.isnull(df_na).any()

index                False
id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [13]:
df_na

Unnamed: 0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,9046,1.0,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
1,2,31112,1.0,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
2,3,60182,0.0,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
3,4,1665,0.0,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1
4,5,56669,1.0,81.0,0,0,1,Private,1,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,5104,14180,0.0,13.0,0,0,0,children,0,103.08,18.6,Unknown,0
4904,5106,44873,0.0,81.0,0,0,1,Self-employed,1,125.20,40.0,never smoked,0
4905,5107,19723,0.0,35.0,0,0,1,Self-employed,0,82.99,30.6,never smoked,0
4906,5108,37544,1.0,51.0,0,0,1,Private,0,166.29,25.6,formerly smoked,0


In [14]:
def AGE_check(x):
    if x >= 45:
        return 1
    else:
        return 0
    
df_na['treatment_age'] = df_na.age.apply(AGE_check)

In [15]:
df_data = df_na.drop(columns = ['index','id','age'])
df_data

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,treatment_age
0,1.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1,1
1,1.0,0,1,1,Private,0,105.92,32.5,never smoked,1,1
2,0.0,0,0,1,Private,1,171.23,34.4,smokes,1,1
3,0.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1,1
4,1.0,0,0,1,Private,1,186.21,29.0,formerly smoked,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4903,0.0,0,0,0,children,0,103.08,18.6,Unknown,0,0
4904,0.0,0,0,1,Self-employed,1,125.20,40.0,never smoked,0,1
4905,0.0,0,0,1,Self-employed,0,82.99,30.6,never smoked,0,0
4906,1.0,0,0,1,Private,0,166.29,25.6,formerly smoked,0,1


In [16]:
df_encoded = pd.get_dummies(df_data, columns = ['work_type','smoking_status'], \
                           prefix = {'work_type':'work_type', 'smoking_status' : 'smoking_status'}, drop_first=False)
df_encoded

Unnamed: 0,gender,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,treatment_age,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.0,0,1,1,1,228.69,36.6,1,1,0,0,1,0,0,0,1,0,0
1,1.0,0,1,1,0,105.92,32.5,1,1,0,0,1,0,0,0,0,1,0
2,0.0,0,0,1,1,171.23,34.4,1,1,0,0,1,0,0,0,0,0,1
3,0.0,1,0,1,0,174.12,24.0,1,1,0,0,0,1,0,0,0,1,0
4,1.0,0,0,1,1,186.21,29.0,1,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,0.0,0,0,0,0,103.08,18.6,0,0,0,0,0,0,1,1,0,0,0
4904,0.0,0,0,1,1,125.20,40.0,0,1,0,0,0,1,0,0,0,1,0
4905,0.0,0,0,1,0,82.99,30.6,0,0,0,0,0,1,0,0,0,1,0
4906,1.0,0,0,1,0,166.29,25.6,0,1,0,0,1,0,0,0,1,0,0


In [17]:
df_treatment = df_encoded[df_encoded['treatment_age']==1]
df_control = df_encoded[df_encoded['treatment_age']==0]

In [18]:
treat_mean = df_treatment.mean()
treat_std = df_treatment.std()
control_mean = df_control.mean()
control_std = df_control.std()

In [19]:
X_list = ['gender','hypertension','heart_disease','ever_married','Residence_type','avg_glucose_level','bmi','work_type_Govt_job',
          'work_type_Never_worked','work_type_Private','work_type_Self-employed','work_type_children','smoking_status_Unknown',
          'smoking_status_formerly smoked','smoking_status_never smoked','smoking_status_smokes']
X_descrip = ['0 to Female and 1 to Male','observation of hypertension','observation of heart_disease',
             'Marriage states','0 to Rural and 1 to Urban','average glucose level','bmi','Whether is government job',
             'Whether is never worked','Whether job is private','Whether job is self-employed','Whether job is children',
             'Whether smoking status is unknown','Whether is formerly smoked','Whether is never smoked','Whether is smokes']

In [20]:
summary_list = []

for i in range(0, len(X_list)):
    summary_list.append([X_list[i],X_descrip[i],control_mean[X_list[i]],control_std[X_list[i]],treat_mean[X_list[i]],treat_std[X_list[i]],
                    stats.ttest_ind(df_control[X_list[i]], df_treatment[X_list[i]], equal_var=False)[0]])

summary_df = pd.DataFrame(summary_list,columns=
                          ['Label', 'Variable Description','Controls Mean','Controls STD','Treated Mean','Treated STD','t-Statistics'])
summary_df

Unnamed: 0,Label,Variable Description,Controls Mean,Controls STD,Treated Mean,Treated STD,t-Statistics
0,gender,0 to Female and 1 to Male,0.403815,0.490761,0.415712,0.492945,-0.847259
1,hypertension,observation of hypertension,0.019886,0.139638,0.164484,0.37079,-18.051112
2,heart_disease,observation of heart_disease,0.002435,0.049296,0.096972,0.29598,-15.577435
3,ever_married,Marriage states,0.385552,0.486824,0.922259,0.267819,-47.902598
4,Residence_type,0 to Rural and 1 to Urban,0.497565,0.500096,0.517185,0.499807,-1.374646
5,avg_glucose_level,average glucose level,95.613389,30.794491,115.060663,53.086851,-15.68136
6,bmi,bmi,27.026177,8.367229,30.778232,6.801232,-17.244675
7,work_type_Govt_job,Whether is government job,0.090503,0.28696,0.16653,0.372632,-8.003549
8,work_type_Never_worked,Whether is never worked,0.008929,0.094087,0.0,0.0,4.71054
9,work_type_Private,Whether job is private,0.566558,0.495651,0.57856,0.493891,-0.84966


In [21]:
df = df_encoded
df

Unnamed: 0,gender,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,treatment_age,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.0,0,1,1,1,228.69,36.6,1,1,0,0,1,0,0,0,1,0,0
1,1.0,0,1,1,0,105.92,32.5,1,1,0,0,1,0,0,0,0,1,0
2,0.0,0,0,1,1,171.23,34.4,1,1,0,0,1,0,0,0,0,0,1
3,0.0,1,0,1,0,174.12,24.0,1,1,0,0,0,1,0,0,0,1,0
4,1.0,0,0,1,1,186.21,29.0,1,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,0.0,0,0,0,0,103.08,18.6,0,0,0,0,0,0,1,1,0,0,0
4904,0.0,0,0,1,1,125.20,40.0,0,1,0,0,0,1,0,0,0,1,0
4905,0.0,0,0,1,0,82.99,30.6,0,0,0,0,0,1,0,0,0,1,0
4906,1.0,0,0,1,0,166.29,25.6,0,1,0,0,1,0,0,0,1,0,0


In [22]:
Y = df.stroke

df_data = df.loc[:,df.columns !='stroke']
T = df_data.treatment_age
X = df_data.loc[:,df_data.columns !='treatment_age']

In [23]:
X_a = ['hypertension','heart_disease','avg_glucose_level','bmi']

In [24]:
T.mean()

0.4979625101874491

In [25]:
# weighted based on size ratio of treatment group and control group
model_initial = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.8}))])
model_initial.fit(X.loc[:,X_a], T)

predictions_binary = model_initial.predict(X.loc[:,X_a])
print('Accuracy: {:.4f}\n'.format(metrics.accuracy_score(T, predictions_binary)))
print('Confusion matrix:\n{}\n'.format(metrics.confusion_matrix(T, predictions_binary)))
print('F1 score is: {:.4f}'.format(metrics.f1_score(T, predictions_binary)))

Accuracy: 0.6483

Confusion matrix:
[[1053 1411]
 [ 315 2129]]

F1 score is: 0.7116


In [26]:
likelihood_ratio_table = []
X_l = X_a.copy()

while not likelihood_ratio_table or max([x for x in likelihood_ratio_table[-1].values() if x != '-']) >= 1:
    if likelihood_ratio_table:
        max_likelihood_ratio = max([x for x in likelihood_ratio_table[-1].values() if x != '-'])
        next_label = [key for key in likelihood_ratio_table[-1] if likelihood_ratio_table[-1][key] == max_likelihood_ratio][0]
        X_l.append(next_label)
    model = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.8}))])
    model.fit(X.loc[:,X_l], T)
    step = {}
    for x in X.columns:
        if x in X_l:
            step[x] = '-'
        else:
            X_new = X_l.copy()
            X_new.append(x)
            model_new = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.8}))])
            model_new.fit(X.loc[:,X_new], T)
            
            step[x] = -2*((-metrics.log_loss(T, model.predict_proba(X.loc[:,X_l]))*len(T))
                      -(-metrics.log_loss(T, model_new.predict_proba(X.loc[:,X_new]))*len(T)))
    likelihood_ratio_table.append(step)
    

In [27]:
linear_terms_list = []
for x in X.columns:
    row = [step[x] for step in likelihood_ratio_table]
    row.insert(0, x)
    linear_terms_list.append(row)
    
columns = ['Label']
for i in range(0,len(likelihood_ratio_table)):
    step = 'Step ' + str(i+1)
    columns.append(step)
    
linear_terms_df = pd.DataFrame(linear_terms_list,columns = columns)
linear_terms_df

Unnamed: 0,Label,Step 1,Step 2,Step 3,Step 4,Step 5,Step 6,Step 7,Step 8,Step 9,Step 10
0,gender,0.426998,1.399362,3.895933,5.558384,4.212808,4.370331,4.516161,-,-,-
1,hypertension,-,-,-,-,-,-,-,-,-,-
2,heart_disease,-,-,-,-,-,-,-,-,-,-
3,ever_married,1371.404411,-,-,-,-,-,-,-,-,-
4,Residence_type,2.87283,2.67131,2.708461,2.15928,2.471772,2.229507,2.337096,2.382211,-,-
5,avg_glucose_level,-,-,-,-,-,-,-,-,-,-
6,bmi,-,-,-,-,-,-,-,-,-,-
7,work_type_Govt_job,52.248799,8.963068,5.294081,20.801019,20.456227,-,-,-,-,-
8,work_type_Never_worked,24.194315,4.398252,7.760698,6.658812,6.094552,5.719204,-,-,-,-
9,work_type_Private,4.211225,43.742427,110.161818,17.689519,17.578046,5.133531,0.107287,0.122075,0.13468,0.166491


In [28]:
X_l

['hypertension',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'ever_married',
 'work_type_children',
 'work_type_Self-employed',
 'smoking_status_formerly smoked',
 'work_type_Govt_job',
 'work_type_Never_worked',
 'gender',
 'Residence_type',
 'smoking_status_Unknown']

In [29]:
model_linear = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.8}))])
model_linear.fit(X.loc[:,X_l], T)

predictions_binary = model_linear.predict(X.loc[:,X_l])
print('Accuracy: {:.4f}\n'.format(metrics.accuracy_score(T, predictions_binary)))
print('Confusion matrix:\n{}\n'.format(metrics.confusion_matrix(T, predictions_binary)))
print('F1 score is: {:.4f}'.format(metrics.f1_score(T, predictions_binary)))

Accuracy: 0.7765

Confusion matrix:
[[1490  974]
 [ 123 2321]]

F1 score is: 0.8089


In [30]:
X_copy = X.loc[:,X_l]

for i in range(0, len(X_l)):
    j = i
    while j < len(X_l):
        X_copy[X_l[i]+'*'+X_l[j]] = X_copy[X_l[i]] * X_copy[X_l[j]]
        j += 1

X_copy

Unnamed: 0,hypertension,heart_disease,avg_glucose_level,bmi,ever_married,work_type_children,work_type_Self-employed,smoking_status_formerly smoked,work_type_Govt_job,work_type_Never_worked,...,work_type_Never_worked*work_type_Never_worked,work_type_Never_worked*gender,work_type_Never_worked*Residence_type,work_type_Never_worked*smoking_status_Unknown,gender*gender,gender*Residence_type,gender*smoking_status_Unknown,Residence_type*Residence_type,Residence_type*smoking_status_Unknown,smoking_status_Unknown*smoking_status_Unknown
0,0,1,228.69,36.6,1,0,0,1,0,0,...,0,0.0,0,0,1.0,1.0,0.0,1,0,0
1,0,1,105.92,32.5,1,0,0,0,0,0,...,0,0.0,0,0,1.0,0.0,0.0,0,0,0
2,0,0,171.23,34.4,1,0,0,0,0,0,...,0,0.0,0,0,0.0,0.0,0.0,1,0,0
3,1,0,174.12,24.0,1,0,1,0,0,0,...,0,0.0,0,0,0.0,0.0,0.0,0,0,0
4,0,0,186.21,29.0,1,0,0,1,0,0,...,0,0.0,0,0,1.0,1.0,0.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,0,0,103.08,18.6,0,1,0,0,0,0,...,0,0.0,0,0,0.0,0.0,0.0,0,0,1
4904,0,0,125.20,40.0,1,0,1,0,0,0,...,0,0.0,0,0,0.0,0.0,0.0,1,0,0
4905,0,0,82.99,30.6,1,0,1,0,0,0,...,0,0.0,0,0,0.0,0.0,0.0,0,0,0
4906,0,0,166.29,25.6,1,0,0,1,0,0,...,0,0.0,0,0,1.0,0.0,0.0,0,0,0


In [31]:
likelihood_ratio_table = []
X_2 = X_l.copy()

while not likelihood_ratio_table or max([x for x in likelihood_ratio_table[-1].values() if x != '-']) >= 2.71:
    if likelihood_ratio_table:
        max_likelihood_ratio = max([x for x in likelihood_ratio_table[-1].values() if x != '-'])
        next_label = [key for key in likelihood_ratio_table[-1] if likelihood_ratio_table[-1][key] == max_likelihood_ratio][0]
        X_2.append(next_label)
    model = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.8}))])
    model.fit(X_copy.loc[:,X_2], T)
    step = {}
    for x in X_copy.columns:
        if x in X_2:
            step[x] = '-'
        else:
            X_new = X_2.copy()
            X_new.append(x)
            model_new = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.8}))])
            model_new.fit(X_copy.loc[:,X_new], T)
            
            step[x] = -2*((-metrics.log_loss(T, model.predict_proba(X_copy.loc[:,X_2]))*len(T))
                      -(-metrics.log_loss(T, model_new.predict_proba(X_copy.loc[:,X_new]))*len(T)))
    likelihood_ratio_table.append(step)

In [32]:
final_terms_list = []
for x in X_copy.columns:
    row = [step[x] for step in likelihood_ratio_table]
    row.insert(0, x)
    final_terms_list.append(row)
    
final_columns = ['Label']
for i in range(0,len(likelihood_ratio_table)):
    step = 'Step ' + str(i+1)
    final_columns.append(step)
    
final_terms_df = pd.DataFrame(final_terms_list,columns = final_columns)
final_terms_df

Unnamed: 0,Label,Step 1,Step 2,Step 3,Step 4,Step 5,Step 6,Step 7,Step 8,Step 9,Step 10,Step 11,Step 12,Step 13
0,hypertension,-,-,-,-,-,-,-,-,-,-,-,-,-
1,heart_disease,-,-,-,-,-,-,-,-,-,-,-,-,-
2,avg_glucose_level,-,-,-,-,-,-,-,-,-,-,-,-,-
3,bmi,-,-,-,-,-,-,-,-,-,-,-,-,-
4,ever_married,-,-,-,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,gender*Residence_type,0.429994,0.403544,0.558156,0.683972,0.720772,0.76443,0.718604,0.538403,0.493732,0.495043,0.439343,0.43719,0.429173
100,gender*smoking_status_Unknown,1.820233,1.945364,1.914713,1.94973,2.135697,2.224161,2.244701,2.374423,2.25699,2.237716,2.245748,2.249313,2.589337
101,Residence_type*Residence_type,-0.008313,0.029036,-0.002622,-0.004469,-0.010611,0.0068,-0.023514,-0.006852,0.028871,0.036367,-0.016753,-0.004798,0.016673
102,Residence_type*smoking_status_Unknown,1.050397,0.907173,0.996454,0.948782,1.020441,0.964801,1.02766,1.092424,1.196983,1.245719,1.460514,1.446446,1.440634


In [33]:
X_2

['hypertension',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'ever_married',
 'work_type_children',
 'work_type_Self-employed',
 'smoking_status_formerly smoked',
 'work_type_Govt_job',
 'work_type_Never_worked',
 'gender',
 'Residence_type',
 'smoking_status_Unknown',
 'avg_glucose_level*avg_glucose_level',
 'hypertension*bmi',
 'hypertension*ever_married',
 'bmi*bmi',
 'hypertension*smoking_status_formerly smoked',
 'ever_married*work_type_Govt_job',
 'ever_married*work_type_Self-employed',
 'bmi*ever_married',
 'work_type_Self-employed*smoking_status_formerly smoked',
 'avg_glucose_level*bmi',
 'heart_disease*work_type_Govt_job',
 'hypertension*gender']

In [34]:
model_final = Pipeline([('scaler', StandardScaler()),('logistic_classifier', lr(class_weight={0:1,1:1.8}))])
model_final.fit(X_copy.loc[:,X_2], T)

predictions_binary = model_final.predict(X_copy.loc[:,X_2])
print('Accuracy: {:.4f}\n'.format(metrics.accuracy_score(T, predictions_binary)))
print('Confusion matrix:\n{}\n'.format(metrics.confusion_matrix(T, predictions_binary)))
print('F1 score is: {:.4f}'.format(metrics.f1_score(T, predictions_binary)))

Accuracy: 0.7765

Confusion matrix:
[[1466  998]
 [  99 2345]]

F1 score is: 0.8104


In [35]:
treat_index = []
control_index = []
for i in range(0,len(T)):
    if T[i] == 1:
        treat_index.append(i)
    elif T[i] == 0:
        control_index.append(i)

In [41]:
predictions = model_final.predict(X_copy.loc[:,X_2])

ate_ipw = sum([T[i]*Y[i]/predictions[i][1] for i in treat_index]) / sum([T[i]/predictions[i][1] for i in treat_index]) 
- sum([(1-T[i])*Y[i]/(1-predictions[i][1]) for i in control_index]) / sum([(1-T[i])/(1-predictions[i][1]) for i in control_index])

IndexError: invalid index to scalar variable.

In [38]:
predictions

array([[9.08888255e-04, 9.99091112e-01],
       [9.28331536e-03, 9.90716685e-01],
       [2.14626746e-01, 7.85373254e-01],
       ...,
       [1.38072490e-01, 8.61927510e-01],
       [1.35273167e-01, 8.64726833e-01],
       [2.31072282e-01, 7.68927718e-01]])