In [1]:
def pds(x_train, y_train, x_test, y_test, model):
    """Fit the classifier named by ``model`` and score it on the test split.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed to the classifier's ``fit``.
    x_test, y_test
        Held-out features and labels used to compute the metrics.
    model : str
        One of 'Decision Tree', 'Logistic Regression', 'Random Forest',
        'Gradient Boosting', 'Easy Ensemble', 'Balance Bagging',
        'Balance Forest' or 'RUS Boost'.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'Model', 'Sensitivity', 'Specificity',
        'Balanced Accuracy' and 'Accuracy'.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # The estimator imports stay lazy so that imblearn is only required when
    # one of its classifiers is actually requested.
    if model == 'Decision Tree':
        from sklearn.tree import DecisionTreeClassifier
        m = DecisionTreeClassifier(max_depth=3)

    elif model == 'Logistic Regression':
        from sklearn.linear_model import LogisticRegression
        m = LogisticRegression()

    elif model == 'Random Forest':
        from sklearn.ensemble import RandomForestClassifier
        m = RandomForestClassifier()

    elif model == 'Gradient Boosting':
        from sklearn.ensemble import GradientBoostingClassifier
        m = GradientBoostingClassifier()

    elif model == 'Easy Ensemble':
        from imblearn.ensemble import EasyEnsembleClassifier
        m = EasyEnsembleClassifier()

    elif model == 'Balance Bagging':
        from imblearn.ensemble import BalancedBaggingClassifier
        m = BalancedBaggingClassifier()

    elif model == 'Balance Forest':
        from imblearn.ensemble import BalancedRandomForestClassifier
        m = BalancedRandomForestClassifier()

    elif model == 'RUS Boost':
        from imblearn.ensemble import RUSBoostClassifier
        m = RUSBoostClassifier()

    else:
        # Fail with a clear message instead of an unbound-local error at fit().
        supported = ('Decision Tree', 'Logistic Regression', 'Random Forest',
                     'Gradient Boosting', 'Easy Ensemble', 'Balance Bagging',
                     'Balance Forest', 'RUS Boost')
        raise ValueError(
            f"Unknown model {model!r}; expected one of {', '.join(supported)}"
        )

    import pandas as pd
    from sklearn import metrics

    m.fit(x_train, y_train)
    y_pred = m.predict(x_test)

    # Unpacking four cells requires a 2x2 confusion matrix, i.e. exactly two
    # distinct labels across y_test and y_pred.
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()

    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)

    d = pd.DataFrame({'Model': model,
                      'Sensitivity': [sensitivity],
                      'Specificity': [specificity],
                      'Balanced Accuracy': [.5*(sensitivity + specificity)],
                      'Accuracy': [(tp+tn)/(tp+tn+fp+fn)]})

    return d

def res(X, y, method):
    """Resample ``(X, y)`` with the strategy named by ``method``.

    Parameters
    ----------
    X, y
        Features and labels to resample.
    method : str
        'None' returns the inputs untouched; 'RUS', 'ROS' and 'SMOTE' apply
        imblearn's RandomUnderSampler, RandomOverSampler and SMOTE with their
        default settings.

    Returns
    -------
    tuple
        ``(X_res, y_res)``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'None':
        # No resampling requested: hand back the very same objects.
        return X, y

    # Samplers are imported lazily so imblearn is only needed when used.
    if method == 'RUS':
        from imblearn.under_sampling import RandomUnderSampler
        sampler = RandomUnderSampler()

    elif method == 'ROS':
        from imblearn.over_sampling import RandomOverSampler
        sampler = RandomOverSampler()

    elif method == 'SMOTE':
        from imblearn.over_sampling import SMOTE
        sampler = SMOTE()

    else:
        # Fail with a clear message instead of an unbound-local error on return.
        raise ValueError(
            f"Unknown resampling method {method!r}; "
            "expected one of 'None', 'RUS', 'ROS', 'SMOTE'"
        )

    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res

def baseline(x_train, y_train, x_test, y_test, model_list, resampling_list):
    """Benchmark every model in ``model_list`` and collect one row per run.

    Models named 'Easy Ensemble', 'Balance Bagging', 'Balance Forest' or
    'RUS Boost' are fitted once on the training data as given and labelled
    'Ensemble' in the 'Resampling' column (presumably because these imblearn
    estimators resample internally -- confirm). Every other model is fitted
    once per entry of ``resampling_list`` on the output of ``res``.

    Parameters
    ----------
    x_train, y_train, x_test, y_test
        Train/test split forwarded to ``res`` and ``pds``.
    model_list : iterable of str
        Model names understood by ``pds``.
    resampling_list : iterable of str
        Resampling method names understood by ``res``.

    Returns
    -------
    pandas.DataFrame
        The rows returned by ``pds`` with 'Resampling' and 'Run Time'
        (elapsed seconds, including resampling) inserted after 'Model'.
        Empty when no run was performed.
    """
    import pandas as pd
    import time

    ensemble_models = {'Easy Ensemble', 'Balance Bagging', 'Balance Forest', 'RUS Boost'}

    # Collect rows and concatenate once instead of growing a frame in the loop.
    rows = []

    for i in model_list:
        if i in ensemble_models:
            start_time = time.perf_counter()
            a = pds(x_train, y_train, x_test, y_test, i)
            run_time = time.perf_counter() - start_time

            a.insert(1, "Resampling", ['Ensemble'])
            a.insert(2, "Run Time", [run_time])
            rows.append(a)
            continue

        for j in resampling_list:
            # The timer covers resampling as well as fit/predict/scoring.
            start_time = time.perf_counter()
            x_res, y_res = res(x_train, y_train, j)
            a = pds(x_res, y_res, x_test, y_test, i)
            run_time = time.perf_counter() - start_time

            a.insert(1, "Resampling", [j])
            a.insert(2, "Run Time", [run_time])
            rows.append(a)

    if not rows:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame({})

    return pd.concat(rows, ignore_index=True)

def baseline2(x_train, y_train, x_test, y_test, model_list, resampling_list, n_simu=20):
    """Repeat ``baseline`` ``n_simu`` times and stack the results.

    Parameters
    ----------
    x_train, y_train, x_test, y_test, model_list, resampling_list
        Forwarded unchanged to ``baseline`` on every repetition.
    n_simu : int, default 20
        Number of repetitions.

    Returns
    -------
    pandas.DataFrame
        All rows from every repetition with a fresh integer index; empty when
        no repetition produced any row.
    """
    # Imported locally, as in ``baseline``, so the function does not depend on
    # another cell having imported pandas first.
    import pandas as pd

    runs = []
    for i in range(n_simu):
        print('Simulation: ',i)
        runs.append(baseline(x_train, y_train, x_test, y_test, model_list, resampling_list))

    if not runs:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame({})

    return pd.concat(runs, ignore_index=True)

In [2]:
import pandas as pd

# Read the cleaned CSV and discard every row that contains a missing value.
df = pd.read_csv('df_clean.csv').dropna()


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Display the column labels of df.
df.columns

Index(['yod', 'age', 'sex', 'b_wt', 'ethnic', 'pt_state', 'raceethn', 'campus',
       'admtype', 'payer', 'yoa', 'pay_ub92', 'provider', 'asource', 'moa',
       'service', 'diag_adm', 'los', 'los_binary'],
      dtype='object')

In [4]:
# Remove the 'yod', 'yoa' and 'b_wt' columns from df.
df = df.drop(columns=['yod','yoa','b_wt'])

In [5]:
# Features: every column except 'los' and 'los_binary', passed through get_dummies.
X = pd.get_dummies(df.drop(columns=['los','los_binary']))
# Target: the 'los_binary' column.
y = df['los_binary']

In [6]:
# Display the (rows, columns) shape of the encoded feature matrix.
X.shape

(849982, 1827)

In [7]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 50/50 train/test split is identical on every run;
# without it each kernel restart benchmarks the models on a different split.
SPLIT_SEED = 42

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=SPLIT_SEED
)

In [None]:
import warnings

# Classifiers benchmarked in this run.
model_list = [
    'Random Forest',
    'Gradient Boosting',
    'Easy Ensemble',
    'Balance Bagging',
    'Balance Forest',
    'RUS Boost',
]

# Resampling methods passed to baseline() alongside the models.
resampling_list = ['None', 'RUS']

# Alternative configurations, currently disabled:
# model_list = ['Decision Tree']
# model_list = ['Decision Tree',
#              'Easy Ensemble', 'Balance Bagging','Balance Forest', 'RUS Boost']
# resampling_list = ['RUS']

# Suppress every warning from here on, then run the benchmark.
warnings.simplefilter('ignore')
bs_lmc = baseline(x_train, y_train, x_test, y_test, model_list, resampling_list)

In [None]:
# Display the result table returned by baseline().
bs_lmc

In [None]:
# Display bs_lmc again (this cell repeats the previous display cell).
bs_lmc

In [None]:
# Repeat the benchmark with baseline2's default n_simu=20 and stack all rows.
bs_lmc2 = baseline2(x_train, y_train, x_test, y_test, model_list, resampling_list)

In [None]:
import warnings

# Full set of classifiers for this run.
model_list = [
    'Decision Tree',
    'Logistic Regression',
    'Random Forest',
    'Gradient Boosting',
    'Easy Ensemble',
    'Balance Bagging',
    'Balance Forest',
    'RUS Boost',
]

# Full set of resampling methods passed to baseline().
resampling_list = ['None', 'RUS', 'ROS', 'SMOTE']

# Alternative configuration, currently disabled:
# model_list = ['Decision Tree',
#              'Easy Ensemble', 'Balance Bagging','Balance Forest', 'RUS Boost']
# resampling_list = ['RUS']

# Suppress every warning from here on, run the benchmark, and display the table.
warnings.simplefilter('ignore')
bs2 = baseline(x_train, y_train, x_test, y_test, model_list, resampling_list)
bs2