In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [3]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
def brierskillscore(y_pred, y_true):
    """Brier skill of eleven constant forecasts relative to the base-rate forecast.

    For every constant probability p in 0.0, 0.1, ..., 1.0 the Brier score of
    predicting p for all samples is compared with the Brier score of
    predicting the positive-class prevalence for all samples:

        skill(p) = 1 - BS(p) / BS(prevalence)

    The Brier score is computed directly with NumPy as the mean squared
    difference between the 0/1 label and the forecast, so the function does
    not depend on ``sklearn.metrics.brier_score_loss`` (which this notebook
    never imports).

    Parameters
    ----------
    y_pred : array-like
        Kept for interface compatibility; it does not enter the computation.
        NOTE(review): a skill score for a fitted model would normally be
        computed from y_pred -- confirm whether that was the intent.
    y_true : array-like
        Binary labels of any shape. The larger of the two distinct values is
        treated as the positive class.

    Returns
    -------
    numpy.ndarray of shape (11,)
        Skill value for each constant forecast, in the order 0.0 ... 1.0.

    Raises
    ------
    IndexError
        If y_true holds fewer than two distinct values.
    ValueError
        If y_true holds more than two distinct values.
    """
    # np.asarray lets lists and pandas Series through; reshape(-1) flattens.
    y_true = np.asarray(y_true).reshape(-1)

    classes, cls_num_list = np.unique(y_true, return_counts=True)
    if len(classes) > 2:
        raise ValueError('Only binary classification is supported.')
    # Prevalence of the positive (second sorted) class.
    ratio = float(cls_num_list[1]) / np.sum(cls_num_list)
    is_positive = (y_true == classes[1]).astype(float)

    predictions = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    BS = np.array([np.mean((is_positive - p) ** 2) for p in predictions])
    # The reference forecast is the same for every p, so score it once.
    BS_ref = np.mean((is_positive - ratio) ** 2)
    BS_skill = 1 - BS / BS_ref

    return BS_skill

In [4]:
def get_ci(X):
    """Print the mean of X and the interval mean +/- 1.96 * std / sqrt(n).

    Output format is ``mean, [lower, upper]`` with three decimals each.
    Nothing is returned.

    NOTE(review): np.std defaults to ddof=0 (population standard deviation);
    confirm that is intended for small samples such as five CV folds.
    """
    center = np.mean(X)
    half_width = 1.96 * np.std(X) / np.sqrt(len(X))
    lower = center - half_width
    upper = center + half_width
    print(f'{center:.3f}, [{lower:.3f}, {upper:.3f}]')

In [5]:
# Outcome columns. The CV cells drop these (together with 'ID') from the
# feature matrices before fitting; one of them is selected as the label.
# Presumably each `time_*` entry is the follow-up time of the event named
# after the prefix -- verify against the data dictionary.
outcome_cols = [
    'death', 'cvd_death', 'time_death',
    'anyhosp', 'time_anyhosp',
    'hfhosp', 'time_hfhosp',
    'abortedca', 'time_abortedca',
    'mi', 'time_mi',
    'stroke', 'time_stroke',
    'primary_ep', 'time_primary_ep',
]

In [6]:
# Columns removed from the working frame in every CV cell
# (df.drop(columns=late_drops)) before labels and folds are built.
late_drops = [
    'mr_mod',
    'symp_bur_score', 'tot_symp_score', 'self_eff_score',
    'qol_score', 'soc_limit_score',
    'overall_sum_score', 'clin_sum_score',
]

In [7]:
# First half of the column list that the CV cells standardise
# (con_cat_cols + contin_cols, filtered to columns still present).
con_cat_cols = [
    'GLUCOSE_FAST', 'GLUCOSE_RAND',
    'CO2_mmolL', 'GLUCOSE_mgdL',
    'WBC_kuL', 'HCT_p', 'HB_gdL', 'PLT_kuL',
    'ALP_UL', 'TBILI_mgdL', 'ALB_gdL',
]

In [8]:
# Second half of the column list that the CV cells standardise
# (con_cat_cols + contin_cols, filtered to columns still present).
# 'BNP_VAL' and 'visit_dt1_hf' are dropped by the CV cells before scaling,
# so the presence filter skips them.
# NOTE(review): both 'CR_mgdl' and 'CR_mgdL' are listed -- confirm whether
# the dataset really has two such columns or one spelling is a typo.
contin_cols = [
    'BNP_VAL', 'age_entry', 'EF', 'visit_dt1_hf',
    'chfdc_dt3', 'mi_dt3', 'stroke_dt3', 'cabg_dt3', 'pci_dt3',
    'DM_AGE_YR', 'DM_DUR_YR',
    'cigs', 'SMOKE_YRS', 'QUIT_YRS',
    'HEAVY_MIN', 'HEAVY_WK', 'MED_WK', 'MED_MIN', 'LIGHT_WK', 'LIGHT_MIN',
    'metsperweek', 'cooking_salt_score',
    'height', 'weight', 'waistc',
    'HR', 'SBP', 'DBP',
    'CR_mgdl', 'gfr', 'labs_dt1',
    'NA_mmolL', 'K_mmolL', 'CL_mmolL', 'BUN_mgdL', 'ALT_UL', 'AST_UL',
    'urine_val_mgg', 'QRS_DUR',
    'CR_mgdL', 'BMI',
]

In [9]:
# Load the cleaned dataset from CSV; index_col=0 turns the file's first
# column into the row index. Every CV cell starts from base_data.copy().
# NOTE(review): absolute path -- the notebook only runs where /data/datasets
# is available.
base_data = pd.read_csv(
    '/data/datasets/topcat/py_cleaned_data/TOPCAT_final_2_25_2020.csv',
    index_col=0
)

In [10]:
# 5-fold stratified splitter without shuffling; each CV cell iterates over
# skf.split(df, labels) to obtain its train/test row positions.
skf = StratifiedKFold(n_splits=5, shuffle=False)

In [11]:
# Baseline comparison: default XGBoost vs. default random forest.
# For each fold of `skf` both models are fitted on the training rows and the
# ROC AUC on the held-out rows is appended to xgb_auc / rf_auc.
xgb_auc = []
rf_auc = []

# mode -> (event indicator column, time column)
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization
outcome_by_mode = {
    1: ('primary_ep', 'time_primary_ep'),
    2: ('death', 'time_death'),
    3: ('hfhosp', 'time_hfhosp'),
}

#Cutoff: Years before censoring
# To sweep everything: `for mode in range(1, 4)` / `for cutoff in [3, np.inf]`.
for mode in [3]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()

        # Keep only rows whose ID appears in the listed Pt_ID.csv columns.
        # Mode 1 is not filtered.
        if cutoff == 3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]

        # Remove variables.
        # NOTE(review): the first drop is positional (columns 114..172) and
        # therefore depends on the column order of the CSV.
        df = df.drop(columns=df.columns[114:173])
        df = df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf'])
        df = df.drop(columns=late_drops)

        # Drop columns that are non-null in fewer than half of the rows.
        highly_miss = [
            col for col in df.columns
            if df[col].count() / df.shape[0] < 0.5
        ]
        df = df.drop(columns=highly_miss)

        # Create labels: events recorded after `cutoff` count as non-events.
        outcome, outcome_time = outcome_by_mode[mode]
        labels = df[outcome].copy()
        complete_labels = labels.copy()  # untruncated copy; unused in this cell
        labels.loc[df[outcome_time] > cutoff] = 0

        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()

            train_labels = labels.iloc[train].copy()
            test_labels = labels.iloc[test].copy()

            # Weight for positive training rows, derived from the training
            # fold only so no test-fold information is involved. The series
            # starts as float so assigning the ratio needs no dtype upcast.
            # (glm_weights is not consumed by the models in this cell.)
            weights = len(train_labels) / train_labels.sum()
            glm_weights = pd.Series(data=1.0, index=train_labels.index)
            glm_weights.loc[train_labels == 1] = weights

            # Remove outcomes and the identifier from the features.
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            train_data = train_data.drop(columns=outcome_cols + ['ID'])
            test_data = test_data.drop(columns=outcome_cols + ['ID'])

            # Mean imputation; the means come from the training fold only.
            train_means = train_data.mean()
            test_data = test_data.fillna(train_means)
            train_data = train_data.fillna(train_means)

            # Drop features that are constant within the training fold.
            sd_0_cols = train_data.columns[train_data.std() == 0]
            train_data = train_data.drop(columns=sd_0_cols)
            test_data = test_data.drop(columns=sd_0_cols)

            # Standardise the listed columns (scaler fitted on train only).
            # Whole-column assignment replaces the columns, so integer
            # columns become float instead of being written into in place.
            cols_to_scale = [
                col for col in con_cat_cols + contin_cols
                if col in train_data.columns
            ]
            scaler = StandardScaler()
            train_data[cols_to_scale] = scaler.fit_transform(train_data[cols_to_scale])
            test_data[cols_to_scale] = scaler.transform(test_data[cols_to_scale])

            xgb = XGBClassifier()
            xgb.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:, 1])
            print(f'XGB AUC={auc}')
            xgb_auc.append(auc)

            # Seeded so that re-running the cell rebuilds the same forest.
            rf = RandomForestClassifier(random_state=0)
            rf.fit(train_data, train_labels)
            auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:, 1])
            print(f'\tRF AUC={auc}')
            rf_auc.append(auc)
            
            

Mode=3, Cut Off=3
Fold 0: XGB AUC=0.7262759924385633
	RF AUC=0.7366099558916194
Fold 1: XGB AUC=0.7456836798991809
	RF AUC=0.7601764335223692
Fold 2: XGB AUC=0.7710399186371727
	RF AUC=0.7623951182303585
Fold 3: XGB AUC=0.7482097186700768
	RF AUC=0.7599104859335039
Fold 4: XGB AUC=0.7358056265984656
	RF AUC=0.7381074168797953


In [12]:
# XGBoost with an inner grid search: for each outer fold of `skf`,
# GridSearchCV picks hyper-parameters on the training rows (scoring by ROC
# AUC) and the refitted best model is scored on the held-out rows.
clf_auc = []

# mode -> (event indicator column, time column)
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization
outcome_by_mode = {
    1: ('primary_ep', 'time_primary_ep'),
    2: ('death', 'time_death'),
    3: ('hfhosp', 'time_hfhosp'),
}

# Hyper-parameter grid; identical for every fold, so it is built once.
parameters = {
    'max_depth': range(2, 10, 2),
    'n_estimators': range(40, 220, 40),
    'learning_rate': [0.2, 0.15, 0.1],
    'gamma': [0, 1, 2, 5]
}

#Cutoff: Years before censoring
# To sweep everything: `for mode in range(1, 4)` / `for cutoff in [3, np.inf]`.
for mode in [3]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()

        # Keep only rows whose ID appears in the listed Pt_ID.csv columns.
        # Mode 1 is not filtered.
        if cutoff == 3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]

        # Remove variables.
        # NOTE(review): the first drop is positional (columns 114..172) and
        # therefore depends on the column order of the CSV.
        df = df.drop(columns=df.columns[114:173])
        df = df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf'])
        df = df.drop(columns=late_drops)

        # Drop columns that are non-null in fewer than half of the rows.
        highly_miss = [
            col for col in df.columns
            if df[col].count() / df.shape[0] < 0.5
        ]
        df = df.drop(columns=highly_miss)

        # Create labels: events recorded after `cutoff` count as non-events.
        outcome, outcome_time = outcome_by_mode[mode]
        labels = df[outcome].copy()
        complete_labels = labels.copy()  # untruncated copy; unused in this cell
        labels.loc[df[outcome_time] > cutoff] = 0

        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()

            train_labels = labels.iloc[train].copy()
            test_labels = labels.iloc[test].copy()

            # Weight for positive training rows, derived from the training
            # fold only so no test-fold information is involved. The series
            # starts as float so assigning the ratio needs no dtype upcast.
            # (glm_weights is not consumed by the model in this cell.)
            weights = len(train_labels) / train_labels.sum()
            glm_weights = pd.Series(data=1.0, index=train_labels.index)
            glm_weights.loc[train_labels == 1] = weights

            # Remove outcomes and the identifier from the features.
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            train_data = train_data.drop(columns=outcome_cols + ['ID'])
            test_data = test_data.drop(columns=outcome_cols + ['ID'])

            # Mean imputation; the means come from the training fold only.
            train_means = train_data.mean()
            test_data = test_data.fillna(train_means)
            train_data = train_data.fillna(train_means)

            # Drop features that are constant within the training fold.
            sd_0_cols = train_data.columns[train_data.std() == 0]
            train_data = train_data.drop(columns=sd_0_cols)
            test_data = test_data.drop(columns=sd_0_cols)

            # Standardise the listed columns (scaler fitted on train only).
            # Whole-column assignment replaces the columns, so integer
            # columns become float instead of being written into in place.
            cols_to_scale = [
                col for col in con_cat_cols + contin_cols
                if col in train_data.columns
            ]
            scaler = StandardScaler()
            train_data[cols_to_scale] = scaler.fit_transform(train_data[cols_to_scale])
            test_data[cols_to_scale] = scaler.transform(test_data[cols_to_scale])

            # Inner grid search on the training fold.
            clf = GridSearchCV(
                XGBClassifier(),
                parameters,
                scoring='roc_auc',
                n_jobs=12
            )

            clf.fit(train_data, train_labels)
            print(f'Best parameters: {clf.best_params_}')
            auc = roc_auc_score(test_labels, clf.predict_proba(test_data)[:, 1])
            print(f'\tWith score{auc}')
            clf_auc.append(auc)
            
            

Mode=3, Cut Off=3
Fold 0: Best parameters: {'gamma': 0, 'learning_rate': 0.15, 'max_depth': 8, 'n_estimators': 200}
	With score0.724007561436673
Fold 1: Best parameters: {'gamma': 1, 'learning_rate': 0.15, 'max_depth': 4, 'n_estimators': 40}
	With score0.7282923755513547
Fold 2: Best parameters: {'gamma': 0, 'learning_rate': 0.15, 'max_depth': 8, 'n_estimators': 160}
	With score0.7524790236460717
Fold 3: Best parameters: {'gamma': 1, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 80}
	With score0.7485933503836316
Fold 4: Best parameters: {'gamma': 0, 'learning_rate': 0.15, 'max_depth': 2, 'n_estimators': 120}
	With score0.7493606138107417


In [13]:
# Scratch cell: five bare float literals. As plain expression statements they
# have no effect; only the last one is echoed as the cell's output.
# NOTE(review): these look like per-fold AUC values pasted by hand -- confirm
# their origin or delete the cell.
0.724007561436673
0.737618147448015
0.7552758708365116
0.7400255754475704
0.7493606138107417

0.7493606138107417

In [14]:
# XGB
# XGBoost with an inner grid search over tree count and depth: for each outer
# fold of `skf`, GridSearchCV picks hyper-parameters on the training rows
# (scoring by ROC AUC) and the refitted best model is scored on the held-out
# rows.

clf_auc = []

# mode -> (event indicator column, time column)
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization
outcome_by_mode = {
    1: ('primary_ep', 'time_primary_ep'),
    2: ('death', 'time_death'),
    3: ('hfhosp', 'time_hfhosp'),
}

# Hyper-parameter grid; identical for every fold, so it is built once.
parameters = {
    'n_estimators': [10, 100, 1000, 10000],
    'max_depth': [2, 4, 6],
}

#Cutoff: Years before censoring
# To sweep everything: `for mode in range(1, 4)` / `for cutoff in [3, np.inf]`.
for mode in [3]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()

        # Keep only rows whose ID appears in the listed Pt_ID.csv columns.
        # Mode 1 is not filtered.
        if cutoff == 3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]

        # Remove variables.
        # NOTE(review): the first drop is positional (columns 114..172) and
        # therefore depends on the column order of the CSV.
        df = df.drop(columns=df.columns[114:173])
        df = df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf'])
        df = df.drop(columns=late_drops)

        # Drop columns that are non-null in fewer than half of the rows.
        highly_miss = [
            col for col in df.columns
            if df[col].count() / df.shape[0] < 0.5
        ]
        df = df.drop(columns=highly_miss)

        # Create labels: events recorded after `cutoff` count as non-events.
        outcome, outcome_time = outcome_by_mode[mode]
        labels = df[outcome].copy()
        complete_labels = labels.copy()  # untruncated copy; unused in this cell
        labels.loc[df[outcome_time] > cutoff] = 0

        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()

            train_labels = labels.iloc[train].copy()
            test_labels = labels.iloc[test].copy()

            # Weight for positive training rows, derived from the training
            # fold only so no test-fold information is involved. The series
            # starts as float so assigning the ratio needs no dtype upcast.
            # (glm_weights is not consumed by the model in this cell.)
            weights = len(train_labels) / train_labels.sum()
            glm_weights = pd.Series(data=1.0, index=train_labels.index)
            glm_weights.loc[train_labels == 1] = weights

            # Remove outcomes and the identifier from the features.
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            train_data = train_data.drop(columns=outcome_cols + ['ID'])
            test_data = test_data.drop(columns=outcome_cols + ['ID'])

            # Mean imputation; the means come from the training fold only.
            train_means = train_data.mean()
            test_data = test_data.fillna(train_means)
            train_data = train_data.fillna(train_means)

            # Drop features that are constant within the training fold.
            sd_0_cols = train_data.columns[train_data.std() == 0]
            train_data = train_data.drop(columns=sd_0_cols)
            test_data = test_data.drop(columns=sd_0_cols)

            # Standardise the listed columns (scaler fitted on train only).
            # Whole-column assignment replaces the columns, so integer
            # columns become float instead of being written into in place.
            cols_to_scale = [
                col for col in con_cat_cols + contin_cols
                if col in train_data.columns
            ]
            scaler = StandardScaler()
            train_data[cols_to_scale] = scaler.fit_transform(train_data[cols_to_scale])
            test_data[cols_to_scale] = scaler.transform(test_data[cols_to_scale])

            # Inner grid search on the training fold.
            clf = GridSearchCV(
                XGBClassifier(),
                parameters,
                scoring='roc_auc',
                n_jobs=12
            )

            clf.fit(train_data, train_labels)
            print(f'Best parameters: {clf.best_params_}')
            auc = roc_auc_score(test_labels, clf.predict_proba(test_data)[:, 1])
            print(f'\tWith score{auc}')
            clf_auc.append(auc)
            
            

Mode=3, Cut Off=3
Fold 0: Best parameters: {'max_depth': 6, 'n_estimators': 10000}
	With score0.7134215500945179
Fold 1: Best parameters: {'max_depth': 6, 'n_estimators': 1000}
	With score0.7512287334593573
Fold 2: Best parameters: {'max_depth': 4, 'n_estimators': 10000}
	With score0.7396389524535977
Fold 3: Best parameters: {'max_depth': 2, 'n_estimators': 100}
	With score0.7402813299232737
Fold 4: Best parameters: {'max_depth': 6, 'n_estimators': 100}
	With score0.740920716112532


In [15]:
# Print mean and mean +/- 1.96*std/sqrt(n) of the per-fold test AUCs currently
# held in clf_auc (filled by whichever grid-search cell ran last).
get_ci(clf_auc)

0.737, [0.726, 0.748]


In [16]:
# RF
# Random forest with an inner grid search over tree count and depth: for each
# outer fold of `skf`, GridSearchCV picks hyper-parameters on the training
# rows (scoring by ROC AUC) and the refitted best model is scored on the
# held-out rows.

clf_auc = []

# mode -> (event indicator column, time column)
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization
outcome_by_mode = {
    1: ('primary_ep', 'time_primary_ep'),
    2: ('death', 'time_death'),
    3: ('hfhosp', 'time_hfhosp'),
}

# Hyper-parameter grid; identical for every fold, so it is built once.
parameters = {
    'n_estimators': [1000, 5000, 10000],
    'max_depth': [6, 8, 10],
}

#Cutoff: Years before censoring
# To sweep everything: `for mode in range(1, 4)` / `for cutoff in [3, np.inf]`.
for mode in [3]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()

        # Keep only rows whose ID appears in the listed Pt_ID.csv columns.
        # Mode 1 is not filtered.
        if cutoff == 3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]

        # Remove variables.
        # NOTE(review): the first drop is positional (columns 114..172) and
        # therefore depends on the column order of the CSV.
        df = df.drop(columns=df.columns[114:173])
        df = df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf'])
        df = df.drop(columns=late_drops)

        # Drop columns that are non-null in fewer than half of the rows.
        highly_miss = [
            col for col in df.columns
            if df[col].count() / df.shape[0] < 0.5
        ]
        df = df.drop(columns=highly_miss)

        # Create labels: events recorded after `cutoff` count as non-events.
        outcome, outcome_time = outcome_by_mode[mode]
        labels = df[outcome].copy()
        complete_labels = labels.copy()  # untruncated copy; unused in this cell
        labels.loc[df[outcome_time] > cutoff] = 0

        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()

            train_labels = labels.iloc[train].copy()
            test_labels = labels.iloc[test].copy()

            # Weight for positive training rows, derived from the training
            # fold only so no test-fold information is involved. The series
            # starts as float so assigning the ratio needs no dtype upcast.
            # (glm_weights is not consumed by the model in this cell.)
            weights = len(train_labels) / train_labels.sum()
            glm_weights = pd.Series(data=1.0, index=train_labels.index)
            glm_weights.loc[train_labels == 1] = weights

            # Remove outcomes and the identifier from the features.
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            train_data = train_data.drop(columns=outcome_cols + ['ID'])
            test_data = test_data.drop(columns=outcome_cols + ['ID'])

            # Mean imputation; the means come from the training fold only.
            train_means = train_data.mean()
            test_data = test_data.fillna(train_means)
            train_data = train_data.fillna(train_means)

            # Drop features that are constant within the training fold.
            sd_0_cols = train_data.columns[train_data.std() == 0]
            train_data = train_data.drop(columns=sd_0_cols)
            test_data = test_data.drop(columns=sd_0_cols)

            # Standardise the listed columns (scaler fitted on train only).
            # Whole-column assignment replaces the columns, so integer
            # columns become float instead of being written into in place.
            cols_to_scale = [
                col for col in con_cat_cols + contin_cols
                if col in train_data.columns
            ]
            scaler = StandardScaler()
            train_data[cols_to_scale] = scaler.fit_transform(train_data[cols_to_scale])
            test_data[cols_to_scale] = scaler.transform(test_data[cols_to_scale])

            # Inner grid search on the training fold. The forest is seeded so
            # that re-running the cell reproduces the same candidates.
            clf = GridSearchCV(
                RandomForestClassifier(random_state=0),
                parameters,
                scoring='roc_auc',
                n_jobs=12
            )

            clf.fit(train_data, train_labels)
            print(f'Best parameters: {clf.best_params_}')
            auc = roc_auc_score(test_labels, clf.predict_proba(test_data)[:, 1])
            print(f'\tWith score{auc}')
            clf_auc.append(auc)
            
            

Mode=3, Cut Off=3
Best parameters: {'max_depth': 10, 'n_estimators': 10000}
	With score0.7587901701323253
Fold 1: Best parameters: {'max_depth': 6, 'n_estimators': 1000}
	With score0.7681159420289855
Fold 2: Best parameters: {'max_depth': 10, 'n_estimators': 1000}
	With score0.7962115433511315
Fold 3: Best parameters: {'max_depth': 10, 'n_estimators': 5000}
	With score0.7606138107416879
Fold 4: Best parameters: {'max_depth': 10, 'n_estimators': 10000}
	With score0.758695652173913


In [17]:
# Print mean and mean +/- 1.96*std/sqrt(n) of the per-fold test AUCs currently
# held in clf_auc (filled by whichever grid-search cell ran last).
get_ci(clf_auc)

0.768, [0.756, 0.781]


In [18]:
# XGB
# XGBoost with an inner grid search over tree count, depth and learning rate:
# for each outer fold of `skf`, GridSearchCV picks hyper-parameters on the
# training rows (scoring by ROC AUC) and the refitted best model is scored on
# the held-out rows.

clf_auc = []

# mode -> (event indicator column, time column)
#     1 : Primary End Point
#     2 : Death
#     3 : HF Hospitalization
outcome_by_mode = {
    1: ('primary_ep', 'time_primary_ep'),
    2: ('death', 'time_death'),
    3: ('hfhosp', 'time_hfhosp'),
}

# Hyper-parameter grid; identical for every fold, so it is built once.
parameters = {
    'n_estimators': [1000, 5000, 10000],
    'max_depth': [6, 8, 10],
    'learning_rate': [0.1, 0.15]
}

#Cutoff: Years before censoring
# To sweep everything: `for mode in range(1, 4)` / `for cutoff in [3, np.inf]`.
for mode in [3]:
    for cutoff in [3]:
        print(f'Mode={mode}, Cut Off={cutoff}')
        df = base_data.copy()

        # Keep only rows whose ID appears in the listed Pt_ID.csv columns.
        # Mode 1 is not filtered.
        if cutoff == 3:
            ids = pd.read_csv('/data/datasets/topcat/nch/Pt_ID.csv')
            if mode == 2:
                df = df[
                    df['ID'].isin(ids['Died_Americas_3years']) |
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]
            elif mode == 3:
                df = df[
                    df['ID'].isin(ids['hosp_americas_3years']) |
                    df['ID'].isin(ids['FU_alive_3y_Americas'])
                ]

        # Remove variables.
        # NOTE(review): the first drop is positional (columns 114..172) and
        # therefore depends on the column order of the CSV.
        df = df.drop(columns=df.columns[114:173])
        df = df.drop(columns=['BNP_VAL', 'BNP_YN', 'visit_dt1_hf'])
        df = df.drop(columns=late_drops)

        # Drop columns that are non-null in fewer than half of the rows.
        highly_miss = [
            col for col in df.columns
            if df[col].count() / df.shape[0] < 0.5
        ]
        df = df.drop(columns=highly_miss)

        # Create labels: events recorded after `cutoff` count as non-events.
        outcome, outcome_time = outcome_by_mode[mode]
        labels = df[outcome].copy()
        complete_labels = labels.copy()  # untruncated copy; unused in this cell
        labels.loc[df[outcome_time] > cutoff] = 0

        for i, (train, test) in enumerate(skf.split(df, labels)):
            print(f'Fold {i}: ', end='')
            train_data = df.iloc[train].copy()
            test_data = df.iloc[test].copy()

            train_labels = labels.iloc[train].copy()
            test_labels = labels.iloc[test].copy()

            # Weight for positive training rows, derived from the training
            # fold only so no test-fold information is involved. The series
            # starts as float so assigning the ratio needs no dtype upcast.
            # (glm_weights is not consumed by the model in this cell.)
            weights = len(train_labels) / train_labels.sum()
            glm_weights = pd.Series(data=1.0, index=train_labels.index)
            glm_weights.loc[train_labels == 1] = weights

            # Remove outcomes and the identifier from the features.
            train_id = train_data['ID'].copy()
            test_id = test_data['ID'].copy()
            train_data = train_data.drop(columns=outcome_cols + ['ID'])
            test_data = test_data.drop(columns=outcome_cols + ['ID'])

            # Mean imputation; the means come from the training fold only.
            train_means = train_data.mean()
            test_data = test_data.fillna(train_means)
            train_data = train_data.fillna(train_means)

            # Drop features that are constant within the training fold.
            sd_0_cols = train_data.columns[train_data.std() == 0]
            train_data = train_data.drop(columns=sd_0_cols)
            test_data = test_data.drop(columns=sd_0_cols)

            # Standardise the listed columns (scaler fitted on train only).
            # Whole-column assignment replaces the columns, so integer
            # columns become float instead of being written into in place.
            cols_to_scale = [
                col for col in con_cat_cols + contin_cols
                if col in train_data.columns
            ]
            scaler = StandardScaler()
            train_data[cols_to_scale] = scaler.fit_transform(train_data[cols_to_scale])
            test_data[cols_to_scale] = scaler.transform(test_data[cols_to_scale])

            # Inner grid search on the training fold.
            clf = GridSearchCV(
                XGBClassifier(),
                parameters,
                scoring='roc_auc',
                n_jobs=12
            )

            clf.fit(train_data, train_labels)
            print(f'Best parameters: {clf.best_params_}')
            auc = roc_auc_score(test_labels, clf.predict_proba(test_data)[:, 1])
            print(f'\tWith score{auc}')
            clf_auc.append(auc)
            
            

Mode=3, Cut Off=3
Best parameters: {'learning_rate': 0.15, 'max_depth': 8, 'n_estimators': 10000}
	With score0.722621298046629
Fold 1: Best parameters: {'learning_rate': 0.15, 'max_depth': 10, 'n_estimators': 10000}
	With score0.75526149968494
Fold 2: Best parameters: {'learning_rate': 0.15, 'max_depth': 8, 'n_estimators': 10000}
	With score0.7481566234426646
Fold 3: Best parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 10000}
	With score0.7561381074168798
Fold 4: Best parameters: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 10000}
	With score0.7179028132992327


In [19]:
# Print mean and mean +/- 1.96*std/sqrt(n) of the per-fold test AUCs currently
# held in clf_auc (filled by whichever grid-search cell ran last).
get_ci(clf_auc)

0.740, [0.726, 0.754]
