# In [None]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Load the train/test tables and discard the patient_id column from both.
df_train = pd.read_csv("training.csv").drop(columns="patient_id")
df_test = pd.read_csv("test.csv").drop(columns="patient_id")

# Partition the training columns by dtype: object-typed columns are treated as
# categorical, every other dtype as numerical.
categorical_columns = df_train.select_dtypes(include=['object']).columns
numerical_cols = df_train.select_dtypes(exclude=['object']).columns

# Impute categorical columns with the training-set mode.
# The filled column is assigned back rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary selection
# (chained assignment), which emits a FutureWarning in pandas 2.x and is a
# silent no-op once Copy-on-Write is active, leaving the NaNs in place.
for col in categorical_columns:
    if col != 'DiagPeriodL90D':
        mode = df_train[col].mode()[0]
        df_train[col] = df_train[col].fillna(mode)
        df_test[col] = df_test[col].fillna(mode)

# Impute numerical columns with the training-set median (the statistic is the
# median, not the mean). The same training statistic is applied to df_test.
# 'DiagPeriodL90D' is skipped; it is used as the target further down and is
# presumably absent from df_test -- TODO confirm.
for col in numerical_cols:
    if col != 'DiagPeriodL90D':
        median = df_train[col].median()
        df_train[col] = df_train[col].fillna(median)
        df_test[col] = df_test[col].fillna(median)

# Encoding categorical columns
from sklearn.preprocessing import OrdinalEncoder

# A single OrdinalEncoder instance is refit for each column in turn; any value
# not seen while fitting on the training column is encoded as -1.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
for col in [*categorical_columns, 'patient_zip3']:
    # Learn the mapping from the training column and encode it in one step,
    # then apply that same mapping to the test column.
    df_train[col] = encoder.fit_transform(df_train[[col]])
    df_test[col] = encoder.transform(df_test[[col]])

# Feature columns fed to the model.
cols = [
    'breast_cancer_diagnosis_code',
    'metastatic_cancer_diagnosis_code',
    'patient_zip3',
    'patient_age',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_desc',
]

# Target vector and feature matrix taken from the training frame.
y_train = df_train['DiagPeriodL90D']
X_train = df_train[cols]

# Keyword arguments passed to every CatBoostClassifier built in the CV loop.
params = dict(
    depth=2,
    random_state=42,
    eval_metric='AUC',
    verbose=False,
    loss_function='Logloss',
    learning_rate=0.3,
    iterations=1000,
)

# 5-fold stratified splitter, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accumulators: held-out AUC values and test-set probability vectors.
auc_scores, test_preds = [], []

# The test feature frame is the same for every fold, so select it once
# instead of re-slicing df_test on each iteration.
X_test = df_test[cols]

# Train one model per fold, score it on the held-out split, and collect its
# test-set probabilities for later averaging.
# NOTE(review): the held-out split is passed as eval_set with
# use_best_model=True and is then also used to compute the reported AUC, so the
# printed scores are likely optimistic -- confirm whether that is acceptable.
for train_idx, test_idx in cv.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Initialize CatBoost classifier
    model = CatBoostClassifier(**params)

    # Train the model
    model.fit(X_train_fold, y_train_fold, eval_set=(X_test_fold, y_test_fold), use_best_model=True)

    # Positive-class probabilities (column 1) for the held-out fold and the test set
    preds = model.predict_proba(X_test_fold)[:, 1]
    preds_test = model.predict_proba(X_test)[:, 1]
    test_preds.append(preds_test)

    # Calculate AUC score on the held-out fold
    auc_score = roc_auc_score(y_test_fold, preds)
    auc_scores.append(auc_score)
    print(f"AUC Score: {auc_score}")

# Print average AUC score across folds
print(f"Average AUC Score: {np.mean(auc_scores)}")

# Average the per-fold probability vectors element-wise to get one prediction
# per test row.
test_predictions = np.mean(test_preds, axis=0)

# # Convert predictions to binary using classification threshold
# test_predictions_binary = [1 if prob >= 0.5 else 0 for prob in test_predictions]

# # Print the distribution of predictions
# print(pd.Series(test_predictions_binary).value_counts())

# Create submission dataframe from an existing result file.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where that file exists -- consider a path relative to the project.
result = pd.read_csv('/Users/yujata/widsdatathon2024-challenge1/result.csv')
# Reuse the averaged predictions computed above instead of recomputing the mean.
result['DiagPeriodL90D'] = test_predictions

# Save submission file to the current working directory
result.to_csv('result.csv', index=False)
