In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [3]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [4]:
import imblearn
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [5]:
def get_ci(X):
    mean = np.mean(X)
    bound = 1.96*np.std(X)/np.sqrt(len(X))
    print(f'{mean:.3f}, [{mean-bound:.3f}, {mean+bound:.3f}]')
    #return mean, mean-bound, mean+bound

In [6]:
outcome_cols = [
    'death',
    'cvd_death',
    'time_death',
    'anyhosp',
    'time_anyhosp',
    'hfhosp',
    'time_hfhosp',
    'abortedca',
    'time_abortedca',
    'mi',
    'time_mi',
    'stroke',
    'time_stroke',
    'primary_ep',
    'time_primary_ep'
]

In [7]:
late_drops = [
     'mr_mod',
     'symp_bur_score',
     'tot_symp_score',
     'self_eff_score',
     'qol_score',
     'soc_limit_score',
     'overall_sum_score',
     'clin_sum_score'
]

In [8]:
con_cat_cols = [
    'GLUCOSE_FAST',
    'GLUCOSE_RAND',
    'CO2_mmolL',
    'GLUCOSE_mgdL',
    'WBC_kuL',
    'HCT_p',
    'HB_gdL',
    'PLT_kuL',
    'ALP_UL',
    'TBILI_mgdL',
    'ALB_gdL'
]

In [9]:
contin_cols = [
 'BNP_VAL',
 'age_entry',
 'EF',
 'visit_dt1_hf',
 'chfdc_dt3',
 'mi_dt3',
 'stroke_dt3',
 'cabg_dt3',
 'pci_dt3',
 'DM_AGE_YR',
 'DM_DUR_YR',
 'cigs',
 'SMOKE_YRS',
 'QUIT_YRS',
 'HEAVY_MIN',
 'HEAVY_WK',
 'MED_WK',
 'MED_MIN',
 'LIGHT_WK',
 'LIGHT_MIN',
 'metsperweek',
 'cooking_salt_score',
 'height',
 'weight',
 'waistc',
 'HR',
 'SBP',
 'DBP',
 'CR_mgdl',
 'gfr',
 'labs_dt1',
 'NA_mmolL',
 'K_mmolL',
 'CL_mmolL',
 'BUN_mgdL',
 'ALT_UL',
 'AST_UL',
 'urine_val_mgg',
 'QRS_DUR',
 'CR_mgdL',
 'BMI'
]

In [10]:
base_data = pd.read_csv(
    '/data/datasets/topcat/py_cleaned_data/TOPCAT_final_2_25_2020.csv',
    index_col=0
)

In [11]:
skf = StratifiedKFold(n_splits=5, shuffle=False)
oversample = SMOTE()
ran_undersample = RandomUnderSampler()

## Base Test 

In [12]:
xgb_auc = []
rf_auc = []
#Modes:
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization

#Cutoff: Years before censoring

#for mode in range(1,4):
#    for cutoff in [3, np.inf]:
for mode in [2]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()
        
        #remove people
        if cutoff==3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
        
        #remove variables
        df.drop(columns=df.columns[114:173], inplace=True)
        df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf',], inplace=True)
        df.drop(columns=late_drops, inplace=True)
        
        highly_miss = []
        for col in df.columns:
            if df[col].count()/df.shape[0] < 0.5:
                highly_miss.append(col)
        df.drop(columns=highly_miss, inplace=True)
        
        #create labels
        if mode == 1:
            outcome = 'primary_ep'
            outcome_time = 'time_primary_ep'
        elif mode == 2:
            outcome = 'death'
            outcome_time = 'time_death'
        elif mode == 3:
            outcome = 'hfhosp'
            outcome_time = 'time_hfhosp'
        
        labels = df[outcome].copy()
        complete_labels = labels.copy()
        
        labels.loc[df[outcome_time] > cutoff] = 0
        counter = Counter(labels)
        print(counter)
        
        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()
            
            train_labels=labels.iloc[train].copy()
            test_labels=labels.iloc[test].copy()
            
            weights = len(train_labels)/test_labels.sum()
            glm_weights = pd.Series(data=1, index=train_labels.index)
            glm_weights.loc[train_labels==1] = weights
            
            #remove outcomes
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            
            train_data.drop(columns=outcome_cols+['ID'], inplace=True)
            test_data.drop(columns= outcome_cols+['ID'], inplace=True)
            
            #print(f'Fold {i} Imputation')
            #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
            #train_data.values = imp.fit_transform(train_data)
            #test_data.values  = imp.transform(test_data)
            test_data = test_data.fillna(train_data.mean())
            train_data = train_data.fillna(train_data.mean())
            
            sd_0_cols = train_data.columns[(train_data.std() == 0)]
            train_data.drop(columns=sd_0_cols, inplace=True)
            test_data.drop(columns=sd_0_cols, inplace=True)
            
            cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
            scaler = StandardScaler()
            train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
            test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
            
            xgb = XGBClassifier(use_label_encoder=False, verbosity = 0)
            xgb.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
            print(f'XGB AUC={auc}')
            xgb_auc.append(auc)
            
            rf = RandomForestClassifier()
            rf.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
            print(f'\tRF AUC={auc}')
            rf_auc.append(auc)
            
            
            #print('\tSuccess!\n')
            #break
        print()
        print(f'XGB AUC={sum(xgb_auc)/5}')
        print(f'Avg RF AUC={sum(rf_auc)/5}')
            

Mode=2, Cut Off=3
Fold 0: XGB AUC=0.693089430894309
	RF AUC=0.7066960252935863
Fold 1: XGB AUC=0.725609756097561
	RF AUC=0.7440718157181572
Fold 2: XGB AUC=0.7172538392050587
	RF AUC=0.764284101174345
Fold 3: XGB AUC=0.6824666359871145
	RF AUC=0.6852277956741831
Fold 4: XGB AUC=0.6323055683387022
	RF AUC=0.6283364012885414

XGB AUC=0.6901450461045491
Avg RF AUC=0.7057232278297627


## SMOTE on entire dataset

In [13]:
xgb_auc = []
rf_auc = []
#Modes:
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization

#Cutoff: Years before censoring

#for mode in range(1,4):
#    for cutoff in [3, np.inf]:
for mode in [2]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()
        
        #remove people
        if cutoff==3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
        
        #remove variables
        df.drop(columns=df.columns[114:173], inplace=True)
        df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf',], inplace=True)
        df.drop(columns=late_drops, inplace=True)
        
        highly_miss = []
        for col in df.columns:
            if df[col].count()/df.shape[0] < 0.5:
                highly_miss.append(col)
        df.drop(columns=highly_miss, inplace=True)
        
        #create labels
        if mode == 1:
            outcome = 'primary_ep'
            outcome_time = 'time_primary_ep'
        elif mode == 2:
            outcome = 'death'
            outcome_time = 'time_death'
        elif mode == 3:
            outcome = 'hfhosp'
            outcome_time = 'time_hfhosp'
        
        labels = df[outcome].copy()
        complete_labels = labels.copy()
        
        labels.loc[df[outcome_time] > cutoff] = 0
        #---------------------------------------------------------------------------
        counter = Counter(labels)
        print(counter)
        df = df.fillna(df.mean()) # oversample cant take NaN values
        # Oversampling the minority class in the entire dataset
        df, labels = oversample.fit_resample(df, labels)
        counter = Counter(labels)
        print(counter)
        #---------------------------------------------------------------------------
        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()
            
            train_labels=labels.iloc[train].copy()
            test_labels=labels.iloc[test].copy()
            
            weights = len(train_labels)/test_labels.sum()
            glm_weights = pd.Series(data=1, index=train_labels.index)
            glm_weights.loc[train_labels==1] = weights
            
            #remove outcomes
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            
            train_data.drop(columns=outcome_cols+['ID'], inplace=True)
            test_data.drop(columns= outcome_cols+['ID'], inplace=True)
            
            #print(f'Fold {i} Imputation')
            #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
            #train_data.values = imp.fit_transform(train_data)
            #test_data.values  = imp.transform(test_data)
            test_data = test_data.fillna(train_data.mean())
            train_data = train_data.fillna(train_data.mean())
            
            sd_0_cols = train_data.columns[(train_data.std() == 0)]
            train_data.drop(columns=sd_0_cols, inplace=True)
            test_data.drop(columns=sd_0_cols, inplace=True)
            
            cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
            scaler = StandardScaler()
            train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
            test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
            
            xgb = XGBClassifier(use_label_encoder=False, verbosity = 0)
            xgb.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
            print(f'XGB AUC={auc}')
            xgb_auc.append(auc)
            
            rf = RandomForestClassifier()
            rf.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
            print(f'\tRF AUC={auc}')
            rf_auc.append(auc)
            
            
            #print('\tSuccess!\n')
            #break
        print()
        print(f'Avg XGB AUC={sum(xgb_auc)/5}')
        print(f'Avg RF AUC={sum(rf_auc)/5}')
            

Mode=2, Cut Off=3
Counter({0.0: 820, 1.0: 268})
Counter({1.0: 820, 0.0: 820})
Fold 0: XGB AUC=0.7817519333729922
	RF AUC=0.8502193634741225
Fold 1: XGB AUC=0.8196014277215943
	RF AUC=0.8437871802498513
Fold 2: XGB AUC=0.9991448542534206
	RF AUC=1.0
Fold 3: XGB AUC=0.9979550862581796
	RF AUC=0.9993679357525282
Fold 4: XGB AUC=0.9958729922665079
	RF AUC=0.9985871505056515

XGB AUC=0.9188652587745387
Avg RF AUC=0.9383923259964307


## SMOTE and Random Under Sampling in combo on entire Dataset

In [14]:
xgb_auc = []
rf_auc = []
#Modes:
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization

#Cutoff: Years before censoring

#for mode in range(1,4):
#    for cutoff in [3, np.inf]:
for mode in [2]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()
        
        #remove people
        if cutoff==3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
        
        #remove variables
        df.drop(columns=df.columns[114:173], inplace=True)
        df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf',], inplace=True)
        df.drop(columns=late_drops, inplace=True)
        
        highly_miss = []
        for col in df.columns:
            if df[col].count()/df.shape[0] < 0.5:
                highly_miss.append(col)
        df.drop(columns=highly_miss, inplace=True)
        
        #create labels
        if mode == 1:
            outcome = 'primary_ep'
            outcome_time = 'time_primary_ep'
        elif mode == 2:
            outcome = 'death'
            outcome_time = 'time_death'
        elif mode == 3:
            outcome = 'hfhosp'
            outcome_time = 'time_hfhosp'
        
        labels = df[outcome].copy()
        complete_labels = labels.copy()
        
        labels.loc[df[outcome_time] > cutoff] = 0
        #---------------------------------------------------------------------------
        counter = Counter(labels)
        print(counter)
        df = df.fillna(df.mean()) # oversample cant take NaN values
        # Oversampling the minority class
        df, labels = oversample.fit_resample(df, labels)
        # Undersampling the majority class 
        df, labels = ran_undersample.fit_resample(df, labels)
        counter = Counter(labels)
        print(counter)
        #---------------------------------------------------------------------------
        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()
            
            train_labels=labels.iloc[train].copy()
            test_labels=labels.iloc[test].copy()
            
            weights = len(train_labels)/test_labels.sum()
            glm_weights = pd.Series(data=1, index=train_labels.index)
            glm_weights.loc[train_labels==1] = weights
            
            #remove outcomes
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            
            train_data.drop(columns=outcome_cols+['ID'], inplace=True)
            test_data.drop(columns= outcome_cols+['ID'], inplace=True)
            
            #print(f'Fold {i} Imputation')
            #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
            #train_data.values = imp.fit_transform(train_data)
            #test_data.values  = imp.transform(test_data)
            test_data = test_data.fillna(train_data.mean())
            train_data = train_data.fillna(train_data.mean())
            
            sd_0_cols = train_data.columns[(train_data.std() == 0)]
            train_data.drop(columns=sd_0_cols, inplace=True)
            test_data.drop(columns=sd_0_cols, inplace=True)
            
            cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
            scaler = StandardScaler()
            train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
            test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
            
            xgb = XGBClassifier(use_label_encoder=False, verbosity = 0)
            xgb.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
            print(f'XGB AUC={auc}')
            xgb_auc.append(auc)
            
            rf = RandomForestClassifier()
            rf.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
            print(f'\tRF AUC={auc}')
            rf_auc.append(auc)
            
            
            #print('\tSuccess!\n')
            #break
        print()
        print(f'Avg XGB AUC={sum(xgb_auc)/5}')
        print(f'Avg RF AUC={sum(rf_auc)/5}')
            

Mode=2, Cut Off=3
Counter({0.0: 820, 1.0: 268})
Counter({0.0: 820, 1.0: 820})
Fold 0: XGB AUC=0.9140020820939917
	RF AUC=0.9316627007733492
Fold 1: XGB AUC=0.9390243902439025
	RF AUC=0.9440809042236764
Fold 2: XGB AUC=0.940288518738846
	RF AUC=0.961220999405116
Fold 3: XGB AUC=0.9312908982748365
	RF AUC=0.9292273944080904
Fold 4: XGB AUC=0.8782718619869125
	RF AUC=0.8943709101725164

XGB AUC=0.9205755502676979
Avg RF AUC=0.9321125817965497


## SMOTE on K-Fold Groups training data

In [15]:
xgb_auc = []
rf_auc = []
#Modes:
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization

#Cutoff: Years before censoring

#for mode in range(1,4):
#    for cutoff in [3, np.inf]:
for mode in [2]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()
        
        #remove people
        if cutoff==3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
        
        #remove variables
        df.drop(columns=df.columns[114:173], inplace=True)
        df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf',], inplace=True)
        df.drop(columns=late_drops, inplace=True)
        
        highly_miss = []
        for col in df.columns:
            if df[col].count()/df.shape[0] < 0.5:
                highly_miss.append(col)
        df.drop(columns=highly_miss, inplace=True)
        
        #create labels
        if mode == 1:
            outcome = 'primary_ep'
            outcome_time = 'time_primary_ep'
        elif mode == 2:
            outcome = 'death'
            outcome_time = 'time_death'
        elif mode == 3:
            outcome = 'hfhosp'
            outcome_time = 'time_hfhosp'
        
        labels = df[outcome].copy()
        complete_labels = labels.copy()
        
        labels.loc[df[outcome_time] > cutoff] = 0
        #---------------------------------------------------------------------------
        counter = Counter(labels)
        print(counter)
        #---------------------------------------------------------------------------
        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()
            
            train_labels=labels.iloc[train].copy()
            test_labels=labels.iloc[test].copy()
            
            weights = len(train_labels)/test_labels.sum()
            glm_weights = pd.Series(data=1, index=train_labels.index)
            glm_weights.loc[train_labels==1] = weights
            
            #remove outcomes
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            
            train_data.drop(columns=outcome_cols+['ID'], inplace=True)
            test_data.drop(columns= outcome_cols+['ID'], inplace=True)
            
            #print(f'Fold {i} Imputation')
            #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
            #train_data.values = imp.fit_transform(train_data)
            #test_data.values  = imp.transform(test_data)
            test_data = test_data.fillna(train_data.mean())
            train_data = train_data.fillna(train_data.mean())
            
            #---------------------------------------------------------------------------
            train_data, train_labels = oversample.fit_resample(train_data, train_labels)
            counter = Counter(train_labels)
            print(counter)
            #---------------------------------------------------------------------------
            
            sd_0_cols = train_data.columns[(train_data.std() == 0)]
            train_data.drop(columns=sd_0_cols, inplace=True)
            test_data.drop(columns=sd_0_cols, inplace=True)
            
            cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
            scaler = StandardScaler()
            train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
            test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
            
            xgb = XGBClassifier(use_label_encoder=False, verbosity = 0)
            xgb.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
            print(f'XGB AUC={auc}')
            xgb_auc.append(auc)
            
            rf = RandomForestClassifier()
            rf.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
            print(f'\tRF AUC={auc}')
            rf_auc.append(auc)
            
            
            #print('\tSuccess!\n')
            #break
        print()
        print(f'Avg XGB AUC={sum(xgb_auc)/5}')
        print(f'Avg RF AUC={sum(rf_auc)/5}')
            

Mode=2, Cut Off=3
Counter({0.0: 820, 1.0: 268})
Fold 0: Counter({1.0: 656, 0.0: 656})
XGB AUC=0.7203026196928637
	RF AUC=0.7162375790424571
Fold 1: Counter({1.0: 656, 0.0: 656})
XGB AUC=0.6988482384823849
	RF AUC=0.7571702800361337
Fold 2: Counter({1.0: 656, 0.0: 656})
XGB AUC=0.6659891598915988
	RF AUC=0.6892502258355917
Fold 3: Counter({1.0: 656, 0.0: 656})
XGB AUC=0.6764841233317993
	RF AUC=0.63466405890474
Fold 4: Counter({1.0: 656, 0.0: 656})
XGB AUC=0.6852277956741832
	RF AUC=0.6547975149562817

XGB AUC=0.689370387414566
Avg RF AUC=0.6904239317550409


## SMOTE and Random Under Sampling in combo on k-fold groups training data

In [16]:
xgb_auc = []
rf_auc = []
#Modes:
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization

#Cutoff: Years before censoring

#for mode in range(1,4):
#    for cutoff in [3, np.inf]:
for mode in [2]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()
        
        #remove people
        if cutoff==3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
        
        #remove variables
        df.drop(columns=df.columns[114:173], inplace=True)
        df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf',], inplace=True)
        df.drop(columns=late_drops, inplace=True)
        
        highly_miss = []
        for col in df.columns:
            if df[col].count()/df.shape[0] < 0.5:
                highly_miss.append(col)
        df.drop(columns=highly_miss, inplace=True)
        
        #create labels
        if mode == 1:
            outcome = 'primary_ep'
            outcome_time = 'time_primary_ep'
        elif mode == 2:
            outcome = 'death'
            outcome_time = 'time_death'
        elif mode == 3:
            outcome = 'hfhosp'
            outcome_time = 'time_hfhosp'
        
        labels = df[outcome].copy()
        complete_labels = labels.copy()
        
        labels.loc[df[outcome_time] > cutoff] = 0
        #---------------------------------------------------------------------------
        counter = Counter(labels)
        print(counter)
        #---------------------------------------------------------------------------
        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()
            
            train_labels=labels.iloc[train].copy()
            test_labels=labels.iloc[test].copy()
            
            weights = len(train_labels)/test_labels.sum()
            glm_weights = pd.Series(data=1, index=train_labels.index)
            glm_weights.loc[train_labels==1] = weights
            
            #remove outcomes
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            
            train_data.drop(columns=outcome_cols+['ID'], inplace=True)
            test_data.drop(columns= outcome_cols+['ID'], inplace=True)
            
            #print(f'Fold {i} Imputation')
            #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
            #train_data.values = imp.fit_transform(train_data)
            #test_data.values  = imp.transform(test_data)
            test_data = test_data.fillna(train_data.mean())
            train_data = train_data.fillna(train_data.mean())
            
            #---------------------------------------------------------------------------
            train_data, train_labels = oversample.fit_resample(train_data, train_labels)
            train_data, train_labels = ran_undersample.fit_resample(train_data, train_labels)
            counter = Counter(train_labels)
            print(counter)
            #---------------------------------------------------------------------------
            
            sd_0_cols = train_data.columns[(train_data.std() == 0)]
            train_data.drop(columns=sd_0_cols, inplace=True)
            test_data.drop(columns=sd_0_cols, inplace=True)
            
            cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
            scaler = StandardScaler()
            train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
            test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
            
            xgb = XGBClassifier(use_label_encoder=False, verbosity = 0)
            xgb.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
            print(f'XGB AUC={auc}')
            xgb_auc.append(auc)
            
            rf = RandomForestClassifier()
            rf.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
            print(f'\tRF AUC={auc}')
            rf_auc.append(auc)
            
            
            #print('\tSuccess!\n')
            #break
        print()
        print(f'Avg XGB AUC={sum(xgb_auc)/5}')
        print(f'Avg RF AUC={sum(rf_auc)/5}')
            

Mode=2, Cut Off=3
Counter({0.0: 820, 1.0: 268})
Fold 0: Counter({0.0: 656, 1.0: 656})
XGB AUC=0.6666666666666666
	RF AUC=0.7015018066847335
Fold 1: Counter({0.0: 656, 1.0: 656})
XGB AUC=0.7125112917795844
	RF AUC=0.7406842818428183
Fold 2: Counter({0.0: 656, 1.0: 656})
XGB AUC=0.6903794037940378
	RF AUC=0.7169150858175249
Fold 3: Counter({0.0: 656, 1.0: 656})
XGB AUC=0.6505982512655315
	RF AUC=0.6410492406810862
Fold 4: Counter({0.0: 656, 1.0: 656})
XGB AUC=0.6587666820064427
	RF AUC=0.6663598711458814

XGB AUC=0.6757844591024527
Avg RF AUC=0.693302057234409


## SMOTE on K-Fold Groups training and testing data

In [17]:
xgb_auc = []
rf_auc = []
#Modes:
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization

#Cutoff: Years before censoring

#for mode in range(1,4):
#    for cutoff in [3, np.inf]:
for mode in [2]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()
        
        #remove people
        if cutoff==3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
        
        #remove variables
        df.drop(columns=df.columns[114:173], inplace=True)
        df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf',], inplace=True)
        df.drop(columns=late_drops, inplace=True)
        
        highly_miss = []
        for col in df.columns:
            if df[col].count()/df.shape[0] < 0.5:
                highly_miss.append(col)
        df.drop(columns=highly_miss, inplace=True)
        
        #create labels
        if mode == 1:
            outcome = 'primary_ep'
            outcome_time = 'time_primary_ep'
        elif mode == 2:
            outcome = 'death'
            outcome_time = 'time_death'
        elif mode == 3:
            outcome = 'hfhosp'
            outcome_time = 'time_hfhosp'
        
        labels = df[outcome].copy()
        complete_labels = labels.copy()
        
        labels.loc[df[outcome_time] > cutoff] = 0
        #---------------------------------------------------------------------------
        counter = Counter(labels)
        print(counter)
        #---------------------------------------------------------------------------
        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()
            
            train_labels=labels.iloc[train].copy()
            test_labels=labels.iloc[test].copy()
            
            weights = len(train_labels)/test_labels.sum()
            glm_weights = pd.Series(data=1, index=train_labels.index)
            glm_weights.loc[train_labels==1] = weights
            
            #remove outcomes
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            
            train_data.drop(columns=outcome_cols+['ID'], inplace=True)
            test_data.drop(columns= outcome_cols+['ID'], inplace=True)
            
            #print(f'Fold {i} Imputation')
            #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
            #train_data.values = imp.fit_transform(train_data)
            #test_data.values  = imp.transform(test_data)
            test_data = test_data.fillna(train_data.mean())
            train_data = train_data.fillna(train_data.mean())
            
            #---------------------------------------------------------------------------
            train_data, train_labels = oversample.fit_resample(train_data, train_labels)
            test_data, test_labels = oversample.fit_resample(test_data, test_labels)
            #---------------------------------------------------------------------------
            
            sd_0_cols = train_data.columns[(train_data.std() == 0)]
            train_data.drop(columns=sd_0_cols, inplace=True)
            test_data.drop(columns=sd_0_cols, inplace=True)
            
            cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
            scaler = StandardScaler()
            train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
            test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
            
            xgb = XGBClassifier(use_label_encoder=False, verbosity = 0)
            xgb.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
            print(f'XGB AUC={auc}')
            xgb_auc.append(auc)
            
            rf = RandomForestClassifier()
            rf.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
            print(f'\tRF AUC={auc}')
            rf_auc.append(auc)
            
            
            #print('\tSuccess!\n')
            #break
        print()
        print(f'Avg XGB AUC={sum(xgb_auc)/5}')
        print(f'Avg RF AUC={sum(rf_auc)/5}')
            

Mode=2, Cut Off=3
Counter({0.0: 820, 1.0: 268})
Fold 0: XGB AUC=0.878457763236169
	RF AUC=0.8989069006543724
Fold 1: XGB AUC=0.9048929208804283
	RF AUC=0.9014351576442593
Fold 2: XGB AUC=0.8782718619869124
	RF AUC=0.8897233789411064
Fold 3: XGB AUC=0.8645523497917905
	RF AUC=0.869125520523498
Fold 4: XGB AUC=0.8838117192147531
	RF AUC=0.8827149018441404

XGB AUC=0.8819973230220105
Avg RF AUC=0.8883811719214754


## SMOTE and Random Under Sampling in combo on k-fold groups training and testing data

In [18]:
xgb_auc = []
rf_auc = []
#Modes:
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization

#Cutoff: Years before censoring

#for mode in range(1,4):
#    for cutoff in [3, np.inf]:
for mode in [2]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()
        
        #remove people
        if cutoff==3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
        
        #remove variables
        df.drop(columns=df.columns[114:173], inplace=True)
        df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf',], inplace=True)
        df.drop(columns=late_drops, inplace=True)
        
        highly_miss = []
        for col in df.columns:
            if df[col].count()/df.shape[0] < 0.5:
                highly_miss.append(col)
        df.drop(columns=highly_miss, inplace=True)
        
        #create labels
        if mode == 1:
            outcome = 'primary_ep'
            outcome_time = 'time_primary_ep'
        elif mode == 2:
            outcome = 'death'
            outcome_time = 'time_death'
        elif mode == 3:
            outcome = 'hfhosp'
            outcome_time = 'time_hfhosp'
        
        labels = df[outcome].copy()
        complete_labels = labels.copy()
        
        labels.loc[df[outcome_time] > cutoff] = 0
        #---------------------------------------------------------------------------
        counter = Counter(labels)
        print(counter)
        #---------------------------------------------------------------------------
        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()
            
            train_labels=labels.iloc[train].copy()
            test_labels=labels.iloc[test].copy()
            
            weights = len(train_labels)/test_labels.sum()
            glm_weights = pd.Series(data=1, index=train_labels.index)
            glm_weights.loc[train_labels==1] = weights
            
            #remove outcomes
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            
            train_data.drop(columns=outcome_cols+['ID'], inplace=True)
            test_data.drop(columns= outcome_cols+['ID'], inplace=True)
            
            #print(f'Fold {i} Imputation')
            #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
            #train_data.values = imp.fit_transform(train_data)
            #test_data.values  = imp.transform(test_data)
            test_data = test_data.fillna(train_data.mean())
            train_data = train_data.fillna(train_data.mean())
            
            #---------------------------------------------------------------------------
            train_data, train_labels = oversample.fit_resample(train_data, train_labels)
            train_data, train_labels = ran_undersample.fit_resample(train_data, train_labels)
            test_data, test_labels = oversample.fit_resample(test_data, test_labels)
            test_data, test_labels = ran_undersample.fit_resample(test_data, test_labels)

            counter = Counter(train_labels)
            print(counter)
            #---------------------------------------------------------------------------
            
            sd_0_cols = train_data.columns[(train_data.std() == 0)]
            train_data.drop(columns=sd_0_cols, inplace=True)
            test_data.drop(columns=sd_0_cols, inplace=True)
            
            cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
            scaler = StandardScaler()
            train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
            test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
            
            xgb = XGBClassifier(use_label_encoder=False, verbosity = 0)
            xgb.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
            print(f'XGB AUC={auc}')
            xgb_auc.append(auc)
            
            rf = RandomForestClassifier()
            rf.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
            print(f'\tRF AUC={auc}')
            rf_auc.append(auc)
            
            
            #print('\tSuccess!\n')
            #break
        print()
        print(f'Avg XGB AUC={sum(xgb_auc)/5}')
        print(f'Avg RF AUC={sum(rf_auc)/5}')
            

Mode=2, Cut Off=3
Counter({0.0: 820, 1.0: 268})
Fold 0: Counter({0.0: 656, 1.0: 656})
XGB AUC=0.8853361094586555
	RF AUC=0.8998735871505057
Fold 1: Counter({0.0: 656, 1.0: 656})
XGB AUC=0.8844437834622249
	RF AUC=0.9135745092207019
Fold 2: Counter({0.0: 656, 1.0: 656})
XGB AUC=0.8885707911957168
	RF AUC=0.8806699881023201
Fold 3: Counter({0.0: 656, 1.0: 656})
XGB AUC=0.8576368233194527
	RF AUC=0.8641247769185009
Fold 4: Counter({0.0: 656, 1.0: 656})
XGB AUC=0.8816924449732302
	RF AUC=0.8898349196906603

XGB AUC=0.8795359904818559
Avg RF AUC=0.8896155562165378
