In [71]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import optuna
from category_encoders import OneHotEncoder, MEstimateEncoder, CatBoostEncoder, OrdinalEncoder
from sklearn import set_config
import category_encoders
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer, f1_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone, OneToOneFeatureMixin
from sklearn.preprocessing import FunctionTransformer,StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import auc, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from collections import Counter

In [72]:
# load data here
train = pd.read_csv('training.csv')
test = pd.read_csv('test.csv')

In [73]:
#Define cat and numeric columns
num_cols = train.select_dtypes('number').columns.tolist()
target = 'DiagPeriodL90D'
cat_cols = [c for c in train.columns if c not in num_cols and c != 'DiagPeriodL90D']
num_cols.remove('DiagPeriodL90D')

In [74]:
#make sure numeric cols doesn't have numeric categories
# Note: when adding or removing columns in the self-defined class, edit num_cols and cat_cols

# apparently patient_id, patient_zip3 are cat cols
cat_cols.extend(['patient_id', 'patient_zip3'])
num_cols.remove('patient_id')
num_cols.remove('patient_zip3')

#drop bmi
num_cols.remove('bmi')

# add code_counts, tumor_loc, common_code
num_cols.append('code_counts')
cat_cols.extend(['tumor_loc', 'common_code'])



In [75]:
# drop column here, example of dropping  bmi
class DropColumns(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    def __init__(self):
        pass
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        x_copy = X.copy()        
        x_copy = x_copy.drop('bmi',axis=1) # drop column here
        return x_copy


# add column here, example of adding an all-one colum 
class AddColumns(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    def __init__(self):
        pass
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        x_copy = X.copy()
        tumor_loc_list = []

        for i in x_copy['breast_cancer_diagnosis_desc']:
            if 'left' in i:
                tumor_loc_list.append('Left')
            elif 'right' in i:
                tumor_loc_list.append('Right')
            else:
                tumor_loc_list.append('Unspecified')

        x_copy['tumor_loc'] = tumor_loc_list
    
        # Add the common vs. uncommon code cols
        # Create a dictionary mapping diagnosis code to the frequency
        code_counts = Counter(x_copy['breast_cancer_diagnosis_code'])

        # Loop through the codes in the dataset to map the frequency then attach to the df
        code_freq_list = [code_counts[i] for i in x_copy['breast_cancer_diagnosis_code']]
        x_copy['code_counts'] = code_freq_list

        # Create another variable classifying if the code is common or uncommon
        x_copy['common_code'] = np.where(
            x_copy['code_counts']>=np.mean(x_copy['code_counts']),
            'Common', 'Uncommon')
        return x_copy

    
# self-define missing value class   
class InputCol(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        x_copy = X.copy()   
        x_copy['patient_race'] = np.where((x_copy['patient_race'] == 'None') & (x_copy['race_white'] > 90.00), 'White', x_copy['patient_race'])
        x_copy['patient_race'] = np.where((x_copy['patient_race'] == 'None') & (x_copy['hispanic'] > 90.00), 'Hispanic', x_copy['patient_race'])
        x_copy['payer_type'] = np.where((x_copy['payer_type'] == 'None') & (x_copy['patient_age'] >= 75), 'MEDICARE ADVANTAGE', x_copy['payer_type'])

        return x_copy
    

In [76]:
# transformer design

def ImputeCatCols(d):
    df = d.copy()
    for c in cat_cols:
        df[c] = df[c].fillna('None')
    return df

cat_transformer = Pipeline([('imputer',FunctionTransformer(ImputeCatCols))])

num_transformer = Pipeline([('imputer',SimpleImputer(strategy='mean')),
                             ('scaler',StandardScaler())])



cat_transformer_2= Pipeline([('imputer',SimpleImputer(strategy='most_frequent')),
                           ('cat',CatBoostEncoder())])
num_transformer_2 = Pipeline([('imputer',SimpleImputer(strategy='most_frequent'))])



In [77]:
#other model preprocess
preprocess_1 = Pipeline([('drop', DropColumns()),
                                ('add', AddColumns()),
                                ('general_input', InputCol())]).set_output(transform='pandas')

preprocess_2 = ColumnTransformer([('num',num_transformer_2,num_cols),
                                  ('cat',cat_transformer_2,cat_cols)],
                                  remainder='passthrough',
                                  verbose_feature_names_out=False)

preprocess_3 = Pipeline([('p1', preprocess_1), ('p2', preprocess_2)])


In [78]:
#catboost
preprocess_cat_2 = ColumnTransformer([('num',num_transformer,num_cols),
                                  ('cat',cat_transformer,cat_cols)],
                                  remainder='passthrough',
                                  verbose_feature_names_out=False).set_output(transform='pandas')
preprocess_cat_3 = Pipeline([('p1', preprocess_1), ('p2', preprocess_cat_2)])




In [79]:
def score_model(model, label):
    
    print('model: {}'.format(label))
    X = train.copy()
    y = X.pop(target)
              
    skf = StratifiedKFold(n_splits=5,random_state=SEED, shuffle=True)
    
    val_predictions = np.zeros(len(train))
    score_list = []
    for fold, (trx_idx, val_idx) in enumerate(skf.split(X,y)):
        X_train = X.iloc[trx_idx]
        y_train = y.iloc[trx_idx]
        X_val   = X.iloc[val_idx]
        y_val   = y.iloc[val_idx]
        
        model.fit(X_train,y_train)
        y_pred_train = model.predict_proba(X_train)[:,1]
        y_pred_val   = model.predict_proba(X_val)[:,1]
        auc_train = roc_auc_score(y_train,y_pred_train)
        auc_val   = roc_auc_score(y_val,y_pred_val)
        
        val_predictions[val_idx] = y_pred_val
        score_list.append(auc_val)
        print(f'fold: {fold} - AUC Train: {auc_train} - AUC Val {auc_val}') 

    print(f'AUC MEAN {np.mean(score_list)} - Std: {np.std(score_list)}')  
    
    return score_list, val_predictions

In [80]:
SEED = 0


params_cat = {'learning_rate': 0.004, 
              'iterations': 1000,
              'max_depth': 5, 
              'subsample': 0.7401131867566202, 
              'colsample_bylevel': 0.29684187768021997, 
              'min_data_in_leaf': 47,
              'logging_level': 'Silent'}

params_lgb= {'learning_rate': 0.0016,
             'subsample': 0.6710494933148675, 
             'colsample_bytree': 0.7929648706646588, 
             'num_leaves': 29,
             'verbose':-1}


params_xbg = {'learning_rate': 0.001, 
              'max_depth': 6, 
              'subsample': 0.5281085467708261, 
              'min_child_weight': 9}

In [81]:
cat_cols

['patient_race',
 'payer_type',
 'patient_state',
 'patient_gender',
 'breast_cancer_diagnosis_code',
 'breast_cancer_diagnosis_desc',
 'metastatic_cancer_diagnosis_code',
 'metastatic_first_novel_treatment',
 'metastatic_first_novel_treatment_type',
 'Region',
 'Division',
 'patient_id',
 'patient_zip3',
 'tumor_loc',
 'common_code']

In [82]:
score_list, oof_list= pd.DataFrame(), pd.DataFrame()
models = [          
           ('catBoost',make_pipeline(preprocess_cat_3,
                                    CatBoostClassifier(cat_features=cat_cols,
                                                       **params_cat,
                                                       random_state=SEED))),                                       
                            
           ('RF',make_pipeline(preprocess_3,
                               RandomForestClassifier(n_estimators=200,
                                                      random_state=SEED,
                                                      min_samples_leaf=92,
                                                      max_features=1.0))),
           ('Extratrees',make_pipeline(preprocess_3,
                                       ExtraTreesClassifier(n_estimators=300,
                                                           random_state=SEED,
                                                           min_samples_leaf=46,
                                                           max_features=1.0))),
           ('XGB',make_pipeline(preprocess_3,
                                 XGBClassifier(**params_xbg,random_state=SEED)))                                        

            
        ]
for label, model in models:
    score_list[label], oof_list[label] = score_model(model,label)


model: catBoost


fold: 0 - AUC Train: 0.8443537023664106 - AUC Val 0.809982796551636
fold: 1 - AUC Train: 0.838400143750348 - AUC Val 0.7939323110725288
fold: 2 - AUC Train: 0.8380597687193867 - AUC Val 0.8132491863142017
fold: 3 - AUC Train: 0.8386950567737071 - AUC Val 0.7942117554870975
fold: 4 - AUC Train: 0.8434086249248767 - AUC Val 0.7882160242966196
AUC MEAN 0.7999184147444167 - Std: 0.00984222120305882
model: RF
fold: 0 - AUC Train: 0.8522306218262412 - AUC Val 0.8025232790156303
fold: 1 - AUC Train: 0.8535817941720307 - AUC Val 0.7962341264049044
fold: 2 - AUC Train: 0.8537655222854641 - AUC Val 0.8082736032900818
fold: 3 - AUC Train: 0.8535282918093663 - AUC Val 0.7888440540118358
fold: 4 - AUC Train: 0.8598641115991683 - AUC Val 0.7786348900275795
AUC MEAN 0.7949019905500063 - Std: 0.010392072031986194
model: Extratrees
fold: 0 - AUC Train: 0.888370498960326 - AUC Val 0.801517932516436
fold: 1 - AUC Train: 0.8859882652417931 - AUC Val 0.8001207404732822
fold: 2 - AUC Train: 0.88624915676276

In [83]:
# voting from all models
w = RidgeClassifier().fit(oof_list,train.DiagPeriodL90D).coef_[0]
voter = VotingClassifier(models, weights = w, voting = 'soft')

X = train.copy()
y = X.pop('DiagPeriodL90D')      
                       
voter.fit(X,y)
probs = voter.predict_proba(test)[:,1]


In [84]:
# generate submission file
import time
timestr = time.strftime("%m%d%Y-%H%M%S")
result = pd.DataFrame({'patient_id': test['patient_id'],'DiagPeriodL90D': probs})
result.to_csv(f'submission_{timestr}.csv',index=False)