In [165]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt
from scipy import stats

In [166]:
# Read the nhanes CSV (the path is relative to the notebook's working
# directory) and preview its first five rows.
nhanes = pd.read_csv('../data/nhanes.csv')
nhanes.head(n=5)

Unnamed: 0,id,heart_attack,relative_heart_attack,gender,age,race,edu,annual_income,weight,bmi,...,blood_press,blood_press2,hyper_med,hbq_med,high_chol,meadial_access,cover_hc,health_diet,year_smoke,year_hyper
0,93705,0,0,2,66,4,2,3,8614.571172,31.7,...,1,1,1,1,0,1,1,2,50,16
1,93708,0,0,2,66,5,1,6,13329.450589,23.7,...,1,1,1,1,1,1,1,4,0,16
2,93709,0,0,2,75,4,4,2,12043.388271,38.9,...,1,1,1,1,0,1,1,2,60,4
3,93711,0,1,1,56,5,5,15,11178.260106,21.3,...,0,0,0,0,1,1,1,3,0,0
4,93713,0,0,1,67,3,3,6,174806.575152,23.5,...,0,0,0,0,0,1,1,1,53,0


In [167]:
# List the column names available in the loaded table.
nhanes.columns

Index(['id', 'heart_attack', 'relative_heart_attack', 'gender', 'age', 'race',
       'edu', 'annual_income', 'weight', 'bmi', 'diabete', 'smoke_life',
       'phy_vigorous', 'phy_moderate', 'blood_press', 'blood_press2',
       'hyper_med', 'hbq_med', 'high_chol', 'meadial_access', 'cover_hc',
       'health_diet', 'year_smoke', 'year_hyper'],
      dtype='object')

In [168]:
# Split the table into model inputs, the treatment indicator and per-row weights.
# Excluded from the covariates:
#   'id'           - presumably a respondent identifier (TODO confirm)
#   'heart_attack' - the outcome compared later in this notebook
#   'diabete'      - the treatment indicator itself
#   'weight'       - kept separately; it is passed as sample_weight when the
#                    propensity model is fitted
nhanes_X = nhanes.drop(['id', 'heart_attack', 'diabete', 'weight'], axis='columns')
nhanes_diab = nhanes.loc[:, 'diabete']
weight = nhanes.loc[:, 'weight']

## Estimate propensity score by fitting a logistic regression model.

In [179]:
# Propensity model: logistic regression of the diabetes indicator on the
# covariates, with each row weighted by `weight`. fit() returns the fitted
# estimator, so construction and fitting are chained.
clf = LogisticRegression(max_iter=1000, random_state=0).fit(
    nhanes_X, nhanes_diab, sample_weight=weight
)
# Keep the second probability column, i.e. the probability of the larger class
# label in clf.classes_ (diabete == 1, assuming the column is coded 0/1).
class_probs = clf.predict_proba(nhanes_X)
prop_score = class_probs[:, 1]

In [180]:
# Row positions of the diabetic (== 1) and non-diabetic (== 0) groups.
# np.nonzero returns a 1-tuple of index arrays, the same structure np.where
# gives for a single condition; later cells rely on that (e.g. non_dia_idx[0]).
diabete_flag = nhanes['diabete'].values
dia_idx = np.nonzero(diabete_flag == 1)
non_dia_idx = np.nonzero(diabete_flag == 0)

## Use nearest-neighbor matching to pair diabetic and non-diabetic patients on the estimated propensity scores.

In [181]:
# 1-nearest-neighbour matching on the propensity score.
# The index is built on the non-diabetic scores only, then queried with each
# diabetic score, so `idx` holds positions *within the non-diabetic subset*.
# Every query is answered independently, so the same non-diabetic row can be
# selected for several diabetic rows (matching with replacement).
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
# scikit-learn expects a 2-D (n_samples, n_features) array; reshape(-1, 1)
# turns the 1-D score vector into a single-feature column and infers the row
# count, instead of hard-coding the group sizes (which breaks as soon as the
# data changes).
nbrs.fit(prop_score[non_dia_idx].reshape(-1, 1))
distances, idx = nbrs.kneighbors(prop_score[dia_idx].reshape(-1, 1))

In [190]:
# Translate each neighbour position (relative to the non-diabetic subset) back
# into a row position in the full table. ravel() flattens the (n_queries, 1)
# array returned by kneighbors without hard-coding the number of diabetic rows.
matched_idx = non_dia_idx[0][idx.ravel()]

# heart_attack outcome for three groups:
#   heart_matched - non-diabetic rows selected by propensity-score matching
#   heart_non_dia - all non-diabetic rows
#   heart_dia     - all diabetic rows
heart_outcome = nhanes['heart_attack'].values
heart_matched = heart_outcome[matched_idx]
heart_non_dia = heart_outcome[non_dia_idx]
heart_dia = heart_outcome[dia_idx]

## Use a t-test to estimate the effect of diabetes on heart attack

In [191]:
# Proportion of heart attacks among people with diabetes
# (sum of the heart_attack values divided by the group size).
heart_dia.sum() / heart_dia.size

0.13516746411483255

In [192]:
# Proportion of heart attacks among all people who do not have diabetes
# (sum of the heart_attack values divided by the group size).
heart_non_dia.sum() / heart_non_dia.size

0.03183562881011515

In [193]:
# Proportion of heart attacks among the non-diabetic people selected by
# propensity-score matching (one matched row per diabetic row; a row matched
# more than once is counted each time).
heart_matched.sum() / heart_matched.size

0.07535885167464115

In [194]:
# Two-sample t-test on the heart_attack outcome: matched non-diabetic rows
# (first sample) versus diabetic rows (second sample). A negative statistic
# means the first sample's mean is lower than the second's.
# equal_var=True is SciPy's default (pooled-variance test), spelled out here.
# NOTE(review): the two samples are 1:1 matched and the outcome looks binary;
# a paired test (e.g. stats.ttest_rel or McNemar's test) may be more
# appropriate than an independent-samples t-test - confirm with the analyst.
stats.ttest_ind(heart_matched, heart_dia, equal_var=True)

Ttest_indResult(statistic=-4.001084992191622, pvalue=6.580948559370417e-05)