In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
import mlflow
from mlflow.models.signature import infer_signature
import os

In [40]:
# Execute the shared utils notebook in this kernel before anything else runs.
# NOTE(review): the names used below that this notebook never defines
# (plots_artefact_path, processed_dataset_path, select_training_columns,
# mlflow_tracking_uri, mlflow_experiment_name, mlflow_model_path and the
# plot_* helpers) are presumably defined there — confirm against utils.ipynb.
%run utils.ipynb

In [41]:
# Create the plots directory up front (no error if it already exists); the
# training cell uploads this directory with mlflow.log_artifacts.
# NOTE(review): the plot_* helpers presumably save their figures here — confirm.
os.makedirs(plots_artefact_path, exist_ok=True)

In [42]:
# Load the processed dataset and keep only the columns selected for training.
data = pd.read_parquet(processed_dataset_path)[select_training_columns]

# These columns are handed to CatBoost as cat_features in the training cell,
# so store them with the pandas 'category' dtype.
train_categorical_columns = ['course', 'gender']
data = data.astype({column: 'category' for column in train_categorical_columns})

# Separate the target (cast to int) from the feature matrix.
target_column = 'unsuccessful_outcome'
X = data.drop(columns=target_column)
y = data[target_column].astype(int)

# Train model

In [44]:
# Train CatBoost with GroupKFold (grouped by course), select the best fold
# model on its *validation* fold, report metrics on the 20% hold-out set, and
# log parameters, metrics, plots and the selected model to MLflow.
groups = data['course']

# Split the dataset into a train-test set (80%, 20%)
X_train_test, X_test, y_train_test, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
groups_train_test = groups.loc[X_train_test.index]

# Initialize GroupKFold
gkf = GroupKFold(n_splits=5)

# Number of folds to actually train. 1 keeps the previous behaviour (the loop
# used to end with an unconditional `break`); set to None to train every fold.
MAX_FOLDS = 1

# Hyperparameters are identical for every fold, so define them once and log
# them with the run.
catboost_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=200,
    cat_features=train_categorical_columns,
)

# Placeholders for the selected model and its scores
best_model = None
best_score = -np.inf        # hold-out ROC AUC of the selected model (reporting only)
best_valid_score = -np.inf  # validation-fold ROC AUC used for model selection
best_X_train = None         # training frame of the selected fold (for the signature)

mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_experiment(mlflow_experiment_name)

# Start MLflow run
with mlflow.start_run() as run:
    mlflow.log_params(catboost_params)

    # Iterate over each fold
    fold_number = 0
    for train_idx, valid_idx in gkf.split(X_train_test, y_train_test, groups=groups_train_test):
        fold_number += 1
        # Split the data
        X_train, X_valid = X_train_test.iloc[train_idx], X_train_test.iloc[valid_idx]
        y_train, y_valid = y_train_test.iloc[train_idx], y_train_test.iloc[valid_idx]

        # Fit a fresh model; use_best_model=True shrinks it to the iteration
        # with the best validation AUC.
        catboost_model = CatBoostClassifier(**catboost_params)
        catboost_model.fit(X_train, y_train, eval_set=(X_valid, y_valid), use_best_model=True)

        # Validation AUC at the best iteration, as recorded by CatBoost.
        valid_roc_auc = catboost_model.get_best_score()['validation']['AUC']

        # Evaluate on the hold-out set (reported, never used for selection)
        y_pred = catboost_model.predict(X_test)
        y_pred_proba = catboost_model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        # Select on the validation score so the hold-out estimate is not
        # biased by having been used to pick the model.
        if valid_roc_auc > best_valid_score:
            best_model = catboost_model
            best_valid_score = valid_roc_auc
            best_score = roc_auc
            best_X_train = X_train
            plot_roc_curve(y_test, y_pred_proba)
            plot_precision_recall_curve(y_test, y_pred_proba)
            plot_confusion_matrix(y_test, y_pred)
            plot_feature_importance(best_model, X_train)
            plot_learning_curve(best_model)

        # Log metrics to MLflow
        fold_metrics = {
            "roc_auc": roc_auc,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "valid_roc_auc": valid_roc_auc,
        }
        for metric_name, metric_value in fold_metrics.items():
            mlflow.log_metric(f"fold_{fold_number}_{metric_name}", metric_value)

        if MAX_FOLDS is not None and fold_number >= MAX_FOLDS:
            break

    # Log the best model to MLflow
    if best_model is not None:
        # Infer the model signature from the frame the selected model was
        # trained on; category columns are cast to str before inference.
        X_train_converted = best_X_train.copy()
        for col in X_train_converted.select_dtypes(include=['category']).columns:
            X_train_converted[col] = X_train_converted[col].astype(str)
        signature = infer_signature(X_train_converted, best_model.predict(X_train_converted))

        # Log the best model to MLflow with the model signature
        mlflow.catboost.log_model(best_model, mlflow_model_path, signature=signature)

    # Log plots as artefacts
    mlflow.log_artifacts(plots_artefact_path)

    # Print the run_id
    print(f"MLflow Run ID: {run.info.run_id}")


0:	test: 0.9333895	best: 0.9333895 (0)	total: 27.3ms	remaining: 27.3s
200:	test: 0.9658510	best: 0.9663207 (176)	total: 5.68s	remaining: 22.6s
400:	test: 0.9646909	best: 0.9663207 (176)	total: 11.7s	remaining: 17.5s
600:	test: 0.9628728	best: 0.9663207 (176)	total: 17.8s	remaining: 11.8s
800:	test: 0.9624902	best: 0.9663207 (176)	total: 23.8s	remaining: 5.92s
999:	test: 0.9625367	best: 0.9663207 (176)	total: 29.9s	remaining: 0us

bestTest = 0.9663207295
bestIteration = 176

Shrink model to first 177 iterations.
MLflow Run ID: 1c98ca6f99e64daba617bdaaf272bb38
