In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
from IPython.display import display
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

from xgboost import XGBClassifier
from sklearn.preprocessing import OrdinalEncoder

from eda import get_missing_summary, plot_distributions_by_target

# Locations of the input files, all relative to the notebook's working directory.
data_path = pathlib.Path("data")
raw_path = data_path / "raw"
training_data_path = raw_path / "training.csv"
test_data_path = raw_path / "test.csv"
sample_submission_path = raw_path / "sample_submission.csv"

# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides pandas deprecation warnings raised by later cells.
import warnings
warnings.filterwarnings("ignore")

In [36]:
# Read both raw CSV files and discard the patient identifier column.
train = pd.read_csv(training_data_path).drop(columns=['patient_id'])
test = pd.read_csv(test_data_path).drop(columns=['patient_id'])

# Partition the training columns by dtype: object columns are treated as
# categorical, everything else as numerical.
categorical_columns = train.select_dtypes(include=['object']).columns
numerical_cols = train.select_dtypes(exclude=['object']).columns

In [37]:
# Fill missing values with statistics computed on the training frame only and
# apply the same fill value to the test frame. The target column
# 'DiagPeriodL90D' is skipped in both loops.
#
# The filled column is assigned back explicitly: calling
# `frame[col].fillna(..., inplace=True)` operates on a column selection, which
# pandas deprecates and which stops updating the parent frame under
# Copy-on-Write.

# Impute categorical columns using the training mode
for col in categorical_columns:
    if col != 'DiagPeriodL90D':
        mode = train[col].mode()[0]
        train[col] = train[col].fillna(mode)
        test[col] = test[col].fillna(mode)

# Impute numerical columns using the training median
for col in numerical_cols:
    if col != 'DiagPeriodL90D':
        median = train[col].median()
        train[col] = train[col].fillna(median)
        test[col] = test[col].fillna(median)

In [38]:
# Mark the test rows with a target value of 2 so they can be separated again
# after being stacked underneath the training rows.
test['DiagPeriodL90D'] = 2
df = pd.concat([train, test])

# Features used for modelling; every entry except the last ('patient_age')
# is converted to the pandas 'category' dtype.
cols = [
    'breast_cancer_diagnosis_code',
    'metastatic_cancer_diagnosis_code',
    'patient_zip3',
    'payer_type',
    'patient_age',
]

# The cast is done on the stacked frame, so both splits share one set of
# categories per column.
df = df.astype({name: 'category' for name in cols[:-1]})

# Undo the stacking using the marker value, dropping the placeholder target
# from the test rows.
is_test_row = df['DiagPeriodL90D'] == 2
train = df[~is_test_row]
test = df[is_test_row].drop(columns=['DiagPeriodL90D'])

In [39]:
# Correlation-matrix heatmap of the selected features (currently disabled).
# NOTE(review): presumably disabled because most of `cols` is categorical at
# this point - confirm before re-enabling.
#plt.figure(figsize=(20,10))
#sns.heatmap(train[cols].corr(), annot=True, vmin=-1, vmax=1, center= 0, cmap= 'coolwarm',linewidths=3, linecolor='black')
#plt.title('Correlation matrix')

# Show the selected feature columns of the training rows.
train.loc[:, cols]

Unnamed: 0,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3,payer_type,patient_age
0,C50919,C7989,924,MEDICAID,84
1,C50411,C773,928,COMMERCIAL,62
2,C50112,C773,760,COMMERCIAL,43
3,C50212,C773,926,COMMERCIAL,45
4,1749,C773,836,COMMERCIAL,55
...,...,...,...,...,...
12901,C50411,C773,436,COMMERCIAL,50
12902,C50912,C773,945,COMMERCIAL,50
12903,C50912,C7931,926,COMMERCIAL,61
12904,1749,C773,112,COMMERCIAL,37


In [40]:
# Feature matrix and target for cross-validation. Selecting `cols` directly is
# enough; there is no need to add the target column and drop it again.
X = train[cols]
y = train['DiagPeriodL90D']

# 5-fold stratified split with a fixed seed so the folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# CatBoost hyper-parameters shared by every fold.
params = {
    'depth':2,
    'random_state': 42,
    'eval_metric': 'AUC',
    'verbose': False,
    'loss_function': 'Logloss',
    'learning_rate':0.3,
    'iterations':1000
}

auc_scores = []  # held-out AUC of each fold
test_preds = []  # positive-class probabilities on `test`, one array per fold
for train_idx, test_idx in cv.split(X, y):
    # X_test / y_test are the held-out fold here, not the competition test set.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model = CatBoostClassifier(**params)

    # NOTE(review): the held-out fold is passed as eval_set with
    # use_best_model=True and is also the data the fold AUC is computed on
    # below, so the reported AUC is likely optimistic.
    model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, cat_features=cols[:-1])

    # Column 1 of predict_proba is taken as the positive-class probability.
    preds = model.predict_proba(X_test)[:, 1]
    preds_test = model.predict_proba(test[cols])[:, 1]
    test_preds.append(preds_test)

    auc_score = roc_auc_score(y_test, preds)
    auc_scores.append(auc_score)
    print(f"AUC Score: {auc_score}")

print(f"Average AUC Score: {np.mean(auc_scores)}")

# Class balance of the fold-averaged test predictions at a 0.5 threshold,
# computed with a vectorized comparison instead of a per-element Python loop.
test_labels = pd.Series((np.mean(test_preds, axis=0) >= 0.5).astype(int), name='test_preds')
print(test_labels.value_counts())

AUC Score: 0.8120753498247678
AUC Score: 0.8080959496244625
AUC Score: 0.8086638011610546
AUC Score: 0.807626047676482
AUC Score: 0.8121995892519212
Average AUC Score: 0.8097321475077376
test_preds
1    4397
0    1395
Name: count, dtype: int64


In [41]:
# Load the sample submission file, set its 'DiagPeriodL90D' column to the
# probabilities averaged across folds, and save the result without the index.
# NOTE(review): assumes the sample submission rows are in the same order as
# the rows of `test` - confirm.
submission = pd.read_csv(sample_submission_path)
submission = submission.assign(DiagPeriodL90D=np.mean(test_preds, axis=0))
submission.to_csv('submission.csv', index=False)