In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
from IPython.display import display
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

from xgboost import XGBClassifier
from sklearn.preprocessing import OrdinalEncoder

from eda import get_missing_summary, plot_distributions_by_target

# Locations of the raw input files, expressed relative to the working directory.
data_path = pathlib.Path("data")
raw_path = data_path.joinpath("raw")
training_data_path = raw_path.joinpath("training.csv")
test_data_path = raw_path.joinpath("test.csv")
sample_submission_path = raw_path.joinpath("sample_submission.csv")

# Suppress every warning category for the remainder of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings.
import warnings
warnings.filterwarnings("ignore")

In [50]:
# Read both raw tables and discard the `patient_id` column right away.
train = pd.read_csv(training_data_path).drop(columns=['patient_id'])
test = pd.read_csv(test_data_path).drop(columns=['patient_id'])

# Feature subset used by every model in this notebook.
cols = [
    'breast_cancer_diagnosis_code',
    'metastatic_cancer_diagnosis_code',
    'patient_zip3',
    'patient_age',
    'payer_type',
    'patient_state',
]

# Restrict both frames to the selected features; only `train` keeps the target.
train = train[[*cols, 'DiagPeriodL90D']]
test = test[cols]

# Column names grouped by dtype. Both groups are derived from `train`, so the
# target column `DiagPeriodL90D` lands in whichever group matches its dtype.
numerical_cols = train.select_dtypes(exclude=['object']).columns
categorical_columns = train.select_dtypes(include=['object']).columns

In [51]:
# Fill missing values with statistics computed on the training set only, and
# apply the same fill value to the test set.
#
# The filled Series is assigned back to the column instead of calling
# `frame[col].fillna(..., inplace=True)`: the in-place form mutates a temporary
# Series (chained assignment), which is deprecated in pandas 2.x and leaves the
# frame unchanged once Copy-on-Write is enabled.

# Impute categorical columns using the most frequent training value (mode)
for col in categorical_columns:
    if col == 'DiagPeriodL90D':
        continue  # never impute the target
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    test[col] = test[col].fillna(mode)

# Impute numerical columns using the training median
for col in numerical_cols:
    if col == 'DiagPeriodL90D':
        continue  # never impute the target
    median = train[col].median()
    train[col] = train[col].fillna(median)
    test[col] = test[col].fillna(median)



In [52]:
# Stack train and test so that each categorical column is converted with one
# shared set of categories. Test rows are tagged with the sentinel target
# value 2 so they can be separated again afterwards.
df = pd.concat([train, test.assign(DiagPeriodL90D=2)])

# Encode categorical columns
df = df.astype({name: 'category' for name in categorical_columns})

# Undo the stacking: sentinel rows go back to `test` (without the target),
# everything else is the training data.
is_test_row = df['DiagPeriodL90D'] == 2
train = df[~is_test_row]
test = df[is_test_row].drop(columns=['DiagPeriodL90D'])

In [53]:
# Feature matrix and target used by every model-fitting cell.
X = train[cols]
y = train['DiagPeriodL90D']

# Shared 5-fold stratified splitter (also reused by the grid searches).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

params = {
    'depth':2,
    'random_state': 42,
    'eval_metric': 'AUC',
    'verbose': False,
    'loss_function': 'Logloss',
    'learning_rate':0.3,
    'iterations':1000
}

# Names of the categorical features, computed once instead of on every fold.
cat_feature_names = categorical_columns.tolist()

auc_scores = []   # validation AUC of each fold
test_preds = []   # positive-class probabilities on `test`, one array per fold
for train_idx, test_idx in cv.split(X, y):
    # `X_test` / `y_test` are the held-out validation fold, not the `test` frame.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model = CatBoostClassifier(**params)

    # NOTE(review): with use_best_model=True the iteration count is selected on
    # the same fold that is scored below, so the reported AUC is optimistic.
    model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, cat_features=cat_feature_names)

    preds = model.predict_proba(X_test)[:, 1]
    preds_test = model.predict_proba(test[cols])[:, 1]
    test_preds.append(preds_test)

    auc_score = roc_auc_score(y_test, preds)
    auc_scores.append(auc_score)
    print(f"AUC Score: {auc_score}")

print(f"Average AUC Score: {np.mean(auc_scores)}")

# Class balance of the fold-averaged test predictions at a 0.5 threshold.
mean_test_proba = np.mean(test_preds, axis=0)
print(pd.Series((mean_test_proba >= 0.5).astype(int), name='test_preds').value_counts())

AUC Score: 0.81607243355248
AUC Score: 0.8084983111698382
AUC Score: 0.8105888626836395
AUC Score: 0.8102950139178042
AUC Score: 0.8108273347212726
Average AUC Score: 0.8112563912090069
test_preds
1    4400
0    1392
Name: count, dtype: int64


In [59]:
# Write the CatBoost submission from the average of the per-fold test
# probabilities gathered in `test_preds` during cross-validation.
# Previously this used `model`, the variable left over from the CV loop, i.e.
# only the last fold's model, and ignored the other four.
predictions = np.mean(test_preds, axis=0)
submission = pd.read_csv(sample_submission_path)
submission['DiagPeriodL90D'] = predictions
submission.to_csv('submission.csv',index=False)

In [45]:
from sklearn.model_selection import GridSearchCV

# CatBoost hyper-parameter grid: 2 x 2 x 2 = 8 candidate settings.
catboost_param_grid = dict(
    depth=[4, 8],
    learning_rate=[0.1, 0.3],
    iterations=[100, 500],
)

# Base estimator; the categorical feature names are stored on the estimator
# itself in addition to being passed to `fit` below.
catboost_model = CatBoostClassifier(
    random_state=42,
    eval_metric='AUC',
    verbose=False,
    loss_function='Logloss',
    cat_features=categorical_columns.tolist(),
)

# Exhaustive search scored by ROC AUC on the shared stratified folds.
catboost_grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=catboost_param_grid,
    cv=cv,
    scoring='roc_auc',
    verbose=1,
)
catboost_grid_search.fit(X, y, cat_features=categorical_columns.tolist())

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [43]:
# XGBoost hyper-parameter grid: 2 x 2 x 2 = 8 candidate settings.
xgboost_param_grid = dict(
    max_depth=[4, 8],
    learning_rate=[0.1, 0.3],
    n_estimators=[100, 500],
)

# Base estimator; enable_categorical=True is set so the pandas `category`
# columns in X are passed to XGBoost as-is.
xgboost_model = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    enable_categorical=True,
)

# Exhaustive search scored by ROC AUC on the shared stratified folds.
xgboost_grid_search = GridSearchCV(
    estimator=xgboost_model,
    param_grid=xgboost_param_grid,
    cv=cv,
    scoring='roc_auc',
    verbose=1,
)
xgboost_grid_search.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [46]:
from sklearn.ensemble import VotingClassifier

# Best configuration found by each grid search.
best_catboost = catboost_grid_search.best_estimator_
best_xgboost = xgboost_grid_search.best_estimator_

# Members of the ensemble as (name, estimator) pairs.
ensemble_members = [
    ('catboost', best_catboost),
    ('xgboost', best_xgboost),
]

# Soft voting averages the members' predicted probabilities
# ('hard' would take a majority vote over predicted labels instead).
voting_classifier = VotingClassifier(estimators=ensemble_members, voting='soft')

# Train the ensemble on the full training set.
voting_classifier.fit(X, y)

In [47]:
# Cross-validated ROC AUC of the ensemble on the shared stratified folds.
roc_auc_scores = cross_val_score(
    estimator=voting_classifier,
    X=X,
    y=y,
    cv=cv,
    scoring='roc_auc',
)

# Mean over the folds.
average_roc_auc = np.mean(roc_auc_scores)
print(f"Average ROC AUC Score: {average_roc_auc}")

Average ROC AUC Score: 0.8069710697135225


In [62]:
# Score the held-out `test` frame with the soft-voting ensemble and write the
# submission file.

# Hard class labels (kept for inspection; not written to the submission).
predictions = voting_classifier.predict(test)

# Class probabilities: one row per sample, one column per class.
probabilities = voting_classifier.predict_proba(test)

submission = pd.read_csv(sample_submission_path)
# A single DataFrame column needs a 1-D array, so take the positive-class
# column only. Assigning the full 2-D `predict_proba` output fails.
# Assumes the target is coded 0/1 so that column 1 is the positive class.
submission['DiagPeriodL90D'] = probabilities[:, 1]
submission.to_csv('submission.csv',index=False)