In [23]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import optuna
from category_encoders import OneHotEncoder, MEstimateEncoder, CatBoostEncoder, OrdinalEncoder
from sklearn import set_config
import category_encoders
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer, f1_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.preprocessing import FunctionTransformer,StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import auc, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier

In [None]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('training.csv', 'test.csv'))

In [None]:
# Split the feature columns into numeric and categorical lists.
# `target` is defined first so the label name is written exactly once.
target = 'DiagPeriodL90D'

# Numeric features: every number-typed column except the label. Filtering
# (rather than list.remove) also works if the label is not numeric.
num_cols = [c for c in train.select_dtypes('number').columns if c != target]

# Categorical features: everything that is neither numeric nor the label.
cat_cols = [c for c in train.columns if c not in num_cols and c != target]

In [None]:
# patient_id and patient_zip3 have a numeric dtype but are apparently
# categorical, so move them from num_cols to cat_cols.
# The membership checks make this cell safe to re-run: a plain extend/remove
# pair duplicates the cat_cols entries and then raises ValueError the second
# time it is executed.
for col in ('patient_id', 'patient_zip3'):
    if col not in train.columns:
        raise ValueError(f'{col!r} is not a column of train')
    if col in num_cols:
        num_cols.remove(col)
    if col not in cat_cols:
        cat_cols.append(col)

# Make sure the remaining numeric columns do not hide numeric-coded
# categories. Kept as the last expression so the notebook displays it.
train[num_cols].head()

In [24]:
# Drop columns by name; with no arguments it drops 'bmi'.
class DropColumns(TransformerMixin, BaseEstimator):
    """Transformer that removes the given columns from a DataFrame.

    Parameters
    ----------
    columns : str or iterable of str, default=('bmi',)
        Column label(s) to drop. The default keeps ``DropColumns()`` dropping
        the ``'bmi'`` column.
    """

    def __init__(self, columns=('bmi',)):
        # Stored untouched so get_params()/clone() can round-trip the value.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step fits in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without ``self.columns``.

        ``X`` itself is not modified. Raises ``KeyError`` if a requested
        column is absent from ``X``.
        """
        if isinstance(self.columns, str):
            to_drop = [self.columns]
        else:
            to_drop = list(self.columns)
        # DataFrame.drop already returns a new object, so no explicit copy.
        return X.drop(columns=to_drop)
    
# Add columns here; the example adds an all-one column.
class AddColumns(TransformerMixin, BaseEstimator):
    """Extension point for engineered features.

    As written, ``transform`` returns an unmodified copy of its input; new
    columns are meant to be added on the copy inside ``transform``.
    """

    def __init__(self):
        """No hyper-parameters."""

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step fits in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` (the caller's frame is never modified)."""
        x_copy = X.copy()
        # x_copy['all_one'] = np.ones(len(x_copy)) # add column here
        return x_copy
    
# Self-defined missing-value handler (currently a pass-through).
class InputCol(TransformerMixin, BaseEstimator):
    """Placeholder transformer for custom missing-value handling.

    ``transform`` returns an unmodified copy of its input. ``BaseEstimator``
    is included so the class has ``get_params``/``set_params`` and can be
    cloned when used inside pipelines or cross-validation utilities.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X``."""
        return X.copy()

In [30]:

def ImputeCatCols(d):
    """Return a copy of ``d`` with missing categorical values set to 'None'.

    Only the columns named in the module-level ``cat_cols`` list are filled;
    every one of them must be present in ``d``. The input frame is left
    unchanged.
    """
    filled = d.copy()
    for col_name in cat_cols:
        filled[col_name] = filled[col_name].fillna('None')
    return filled

# Numeric features: mean-impute, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: only replace missing values with the 'None' label.
cat_transformer = Pipeline(steps=[
    ('imputer', FunctionTransformer(ImputeCatCols)),
])

# Column-wise preprocessing that outputs a pandas DataFrame; columns not
# listed in num_cols/cat_cols are passed through unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# Preprocessing chain for the CatBoost model: preprocess, then DropColumns().
preprocess_catboost = Pipeline(steps=[
    ('preprocess', preprocess),
    ('drop', DropColumns()),
])



In [35]:
# Preprocessing used by the non-CatBoost models.
# Categorical features: fill gaps with the most frequent value, then encode
# with CatBoostEncoder.
cat_transformer_2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('cat', CatBoostEncoder()),
])

# Numeric features: fill gaps with the most frequent value; no scaling.
num_transformer_2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

preprocess_othermodels = ColumnTransformer(transformers=[
    ('cat', cat_transformer_2, cat_cols),
    ('num', num_transformer_2, num_cols),
])

In [36]:
def score_model(model, label):
    """Cross-validate ``model`` on the global ``train`` frame using ROC AUC.

    Runs a shuffled 5-fold ``StratifiedKFold`` (seeded with the global
    ``SEED``) over ``train``, using the global ``target`` column as the
    label. ``model`` is fitted in place on every fold, so after the call it
    holds the fit from the last fold.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
    label : name printed in the progress output

    Returns
    -------
    score_list : list of float
        Validation AUC of each fold, in fold order.
    val_predictions : numpy.ndarray
        Out-of-fold probabilities (column 1 of ``predict_proba``), positioned
        by row of ``train``.
    """
    print(f'model: {label}')

    features = train.copy()
    labels = features.pop(target)

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

    val_predictions = np.zeros(len(train))
    score_list = []
    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(features, labels)):
        X_fit, y_fit = features.iloc[fit_rows], labels.iloc[fit_rows]
        X_holdout, y_holdout = features.iloc[holdout_rows], labels.iloc[holdout_rows]

        model.fit(X_fit, y_fit)

        # Score the rows the model was fitted on first, then the held-out rows.
        proba_fit = model.predict_proba(X_fit)[:, 1]
        proba_holdout = model.predict_proba(X_holdout)[:, 1]
        auc_train = roc_auc_score(y_fit, proba_fit)
        auc_val = roc_auc_score(y_holdout, proba_holdout)

        # Each row is held out exactly once, so this fills the whole array.
        val_predictions[holdout_rows] = proba_holdout
        score_list.append(auc_val)
        print(f'fold: {fold} - AUC Train: {auc_train} - AUC Val {auc_val}')

    print(f'AUC MEAN {np.mean(score_list)} - Std: {np.std(score_list)}')

    return score_list, val_predictions

In [37]:
# Global random seed shared by the CV splitter and the models.
SEED = 0

# Keyword arguments for CatBoostClassifier.
params_cat = dict(
    learning_rate=0.004,
    iterations=1000,
    max_depth=5,
    subsample=0.7401131867566202,
    colsample_bylevel=0.29684187768021997,
    min_data_in_leaf=47,
    logging_level='Silent',
)

# Keyword arguments for LGBMClassifier.
params_lgb = dict(
    learning_rate=0.0016,
    subsample=0.6710494933148675,
    colsample_bytree=0.7929648706646588,
    num_leaves=29,
    verbose=-1,
)

# Keyword arguments for XGBClassifier.
params_xbg = dict(
    learning_rate=0.001,
    max_depth=6,
    subsample=0.5281085467708261,
    min_child_weight=9,
)

In [38]:
# Per-model CV results: one column per model label.
#   score_list -> validation AUC of each fold
#   oof_list   -> out-of-fold predicted probabilities
score_list, oof_list = pd.DataFrame(), pd.DataFrame()

# Candidate models as (label, pipeline) pairs. Every pipeline gets its own
# clone of the preprocessor: passing the same preprocess_othermodels instance
# to all of them would make the pipelines share one fitted preprocessor, so
# fitting any pipeline would overwrite the state used by the others.
models = [
    # CatBoost candidate, currently disabled:
    # ('catBoost', make_pipeline(preprocess_catboost,
    #                            CatBoostClassifier(cat_features=cat_cols,
    #                                               **params_cat,
    #                                               random_state=SEED))),
    ('RF', make_pipeline(clone(preprocess_othermodels),
                         RandomForestClassifier(n_estimators=200,
                                                random_state=SEED,
                                                min_samples_leaf=92,
                                                max_features=1.0))),
    ('Extratrees', make_pipeline(clone(preprocess_othermodels),
                                 ExtraTreesClassifier(n_estimators=300,
                                                      random_state=SEED,
                                                      min_samples_leaf=46,
                                                      max_features=1.0))),
    ('XGB', make_pipeline(clone(preprocess_othermodels),
                          XGBClassifier(**params_xbg, random_state=SEED))),
]

# Cross-validate each candidate and collect its fold scores / OOF predictions.
for label, model in models:
    score_list[label], oof_list[label] = score_model(model, label)


model: RF
fold: 0 - AUC Train: 0.8502600865669502 - AUC Val 0.8010395615359034
fold: 1 - AUC Train: 0.8535540329460948 - AUC Val 0.7977654689928734
fold: 2 - AUC Train: 0.8528011997009787 - AUC Val 0.8078609986504722
fold: 3 - AUC Train: 0.8538171445651039 - AUC Val 0.7887176158173861


In [None]:
# Soft-voting ensemble over all models.
# The voting weights are the coefficients of a RidgeClassifier fitted on the
# out-of-fold predictions (one oof_list column per model, in `models` order).
# NOTE(review): ridge coefficients can be negative; a negative voting weight
# can push the averaged "probabilities" outside [0, 1] - confirm this is
# acceptable or constrain the weights.
w = RidgeClassifier().fit(oof_list, train[target]).coef_[0]
voter = VotingClassifier(models, weights=w, voting='soft')

# Refit the ensemble on the full training set.
X = train.copy()
y = X.pop(target)

voter.fit(X, y)

# Column 1 of predict_proba for the test set; left as the last expression so
# the notebook displays it.
voter.predict_proba(test)[:, 1]
