In [None]:
# CONFIGS
# Paths use forward slashes: the previous backslash form ('data\case_...',
# 'data\processed_...') relied on the invalid escape sequences '\c' and '\p'
# and only resolved on Windows. Forward slashes work on every platform.
dataset_path = 'data/case_study_example_data.csv'
processed_dataset_path = 'data/processed_dataset.parquet'
store_model_path = 'model/best_model.cbm'

# Columns treated as categorical during imputation.
categorical_columns = ['course', 'gender', 'pre_existing_medical_condition']
# Default column list offered to preprocess_data via its select_cols argument.
select_training_columns = ['combined_to_assessment_age', 'course', 'gender', 'combined_age_medical_condition', 'mass_score', 'unsuccessful_outcome']

# Per-course weights used by create_mass_score: outer key is the course name,
# inner keys are the metric columns. The weights of every course sum to 1.
composite_score_ratio_dict = {
    'sleep': {
        'diary_sleep_avg': 0.5,
        'diary_stress_avg': 0.2,
        'diary_mood_avg': 0.2,
        'alcohol_scale': 0.1,
    },
    'stress': {
        'diary_sleep_avg': 0.2,
        'diary_stress_avg': 0.5,
        'diary_mood_avg': 0.2,
        'alcohol_scale': 0.1,
    },
    'depression': {
        'diary_sleep_avg': 0.2,
        'diary_stress_avg': 0.2,
        'diary_mood_avg': 0.5,
        'alcohol_scale': 0.1,
    },
    'burnout': {
        'diary_sleep_avg': 0.2,
        'diary_stress_avg': 0.5,
        'diary_mood_avg': 0.2,
        'alcohol_scale': 0.1,
    },
    'chronic_pain': {
        'diary_sleep_avg': 0.3,
        'diary_stress_avg': 0.3,
        'diary_mood_avg': 0.3,
        'alcohol_scale': 0.1,
    },
}


# MLflow configuration variables
use_mlflow = False
mlflow_tracking_uri = 'http://localhost:5000'
mlflow_experiment_name = 'HelloBetter-Treatment_Outcome_preds'
# NOTE(review): 'Teatment' looks like a typo of 'Treatment'. Left unchanged because
# renaming would point at a different entry in the model registry - confirm before fixing.
mlflow_register_model_name = 'HelloBetter-Teatment_outcome_predictor'
mlflow_model_path = 'model'
plots_artefact_path = 'plots'

In [None]:
import os

import mlflow
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.metrics import auc, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [None]:
def preprocess_data(df, select_cols=select_training_columns):
    """
    Impute, round and feature-engineer the health behavior dataset.

    Args:
    df (DataFrame): The health behavior DataFrame to preprocess.
    select_cols (List): Columns intended for the output dataframe.
        NOTE(review): this argument is accepted but not applied here - the
        returned frame contains every column. Confirm the intended behaviour.

    Returns:
    DataFrame: The imputed frame with the engineered, log-transformed columns
    'mass_score', 'combined_age_medical_condition' and
    'combined_to_assessment_age' added.
    """
    diary_scale_columns = ['alcohol_scale', 'diary_mood_avg', 'diary_sleep_avg', 'diary_stress_avg']
    engineered_columns = ('mass_score', 'combined_age_medical_condition', 'combined_to_assessment_age')

    # Fill the gaps first, then snap the diary/alcohol scales back onto their allowed values.
    imputed, _ = knn_impute_missing_data(df)
    features, _ = round_columns_to_allowed_values(imputed, columns_to_round=diary_scale_columns)

    # Course-weighted composite of the diary metrics.
    features['mass_score'] = features.apply(create_mass_score, axis=1)

    # Age is scaled by 1.5 when a pre-existing medical condition is reported ('yes'), by 1 otherwise.
    has_condition = features['pre_existing_medical_condition'].eq('yes').astype(int)
    features['combined_age_medical_condition'] = features['age'] * (1 + 0.5 * has_condition)

    # 'to_assessment' is discounted by 1% per unit of age.
    features['combined_to_assessment_age'] = features['to_assessment'] * (1 - 0.01 * features['age'])

    # log(x + 1) transform of every engineered column.
    for engineered in engineered_columns:
        features[engineered] = np.log(features[engineered] + 1)

    return features


In [None]:
def knn_impute_missing_data(df, categorical_cols=categorical_columns):
    """
    Perform KNN imputation on a DataFrame with categorical features.

    The categorical columns are encoded with an OrdinalEncoder, the whole frame is
    passed through a KNNImputer (5 neighbours), and the categorical columns are
    decoded back to their original labels afterwards. The input frame is not modified.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with missing values.
    - categorical_cols (list of str): The names of the categorical columns in the DataFrame.
      Defaults to the notebook-level `categorical_columns`.

    Returns:
    - pd.DataFrame: The DataFrame with imputed values, carrying the same index as `df`.
    - pd.Series: A Series with the count of remaining missing values per column after imputation.
    """
    # Honour the argument: the previous body read the global list and ignored this parameter.
    categorical_cols = list(categorical_cols)

    data_encoded = df.copy()
    encoder = None
    if categorical_cols:
        encoder = OrdinalEncoder()
        # NOTE(review): astype(str) presumably turns missing categorical entries into the
        # literal label 'nan', which is then encoded as a category of its own - so missing
        # categoricals are not actually imputed. Confirm whether that is intended.
        data_encoded[categorical_cols] = encoder.fit_transform(df[categorical_cols].astype(str))

    imputer = KNNImputer(n_neighbors=5)
    imputed_values = imputer.fit_transform(data_encoded)

    # Rebuild the frame on the caller's index so rows stay aligned with the input.
    data_imputed = pd.DataFrame(imputed_values, columns=data_encoded.columns, index=df.index)
    if encoder is not None:
        data_imputed[categorical_cols] = encoder.inverse_transform(data_imputed[categorical_cols])

    # Report whatever is still missing so the caller can verify the imputation.
    missing_values_after_imputation = data_imputed.isnull().sum()

    return data_imputed, missing_values_after_imputation


In [None]:
def round_columns_to_allowed_values(df, columns_to_round, allowed_values=(0, 1, 2, 3, 4, 5)):
    """
    Rounds the values in the specified columns of a DataFrame to the nearest allowed values.

    Each value is replaced by the entry of `allowed_values` with the smallest absolute
    distance to it; on a tie the entry listed first wins. Missing values are left
    missing (previously they were silently replaced by the first allowed value).
    The columns are overwritten on `df` itself, and that same object is returned.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the columns to round (modified in place).
    - columns_to_round (list of str): The names of the columns in the DataFrame to apply rounding to.
    - allowed_values (sequence of numeric): The allowed values that the DataFrame values can be rounded to.

    Returns:
    - pd.DataFrame: The DataFrame with rounded values.
    - dict: A dictionary with column names as keys and arrays of unique values in those columns after rounding.

    Raises:
    - ValueError: If `allowed_values` is empty.
    """
    candidates = list(allowed_values)
    if not candidates:
        raise ValueError("allowed_values must contain at least one value")

    def round_to_nearest_allowed_value(x):
        # Comparisons against a NaN distance are always False, so min() would return the
        # first candidate for a missing value; keep the value missing instead.
        if pd.isna(x):
            return x
        return min(candidates, key=lambda allowed_value: abs(allowed_value - x))

    # Applying the custom rounding to the specified numerical columns
    for column in columns_to_round:
        df[column] = df[column].map(round_to_nearest_allowed_value)

    # Check the unique values after rounding to ensure they are within the specified range
    unique_values_after_rounding = {column: df[column].unique() for column in columns_to_round}

    return df, unique_values_after_rounding

In [None]:
def create_mass_score(row, ratio_dict=composite_score_ratio_dict):
    """
    Compute the course-weighted composite score for one row.

    The weights for the row's course are looked up in `ratio_dict`, and each of the
    four metrics is multiplied by its weight before the products are summed.

    Parameters:
    - row (pd.Series): A single DataFrame row providing 'diary_sleep_avg',
                       'diary_stress_avg', 'diary_mood_avg', 'alcohol_scale'
                       and a 'course' identifier.
    - ratio_dict (dict, optional): Maps each course identifier to a dictionary of
                                   metric name -> weight. Defaults to
                                   `composite_score_ratio_dict`.

    Returns:
    float: The weighted sum of the four metrics for the given row.
    """
    # Resolve the course-specific weights once instead of once per metric.
    weights = ratio_dict[row['course']]

    sleep_part = row['diary_sleep_avg'] * weights['diary_sleep_avg']
    stress_part = row['diary_stress_avg'] * weights['diary_stress_avg']
    mood_part = row['diary_mood_avg'] * weights['diary_mood_avg']
    alcohol_part = row['alcohol_scale'] * weights['alcohol_scale']

    return sleep_part + stress_part + mood_part + alcohol_part

In [None]:
def plot_roc_curve(y_test, y_pred_proba, folder_path=plots_artefact_path, file_name='roc_curve.png'):
    """
    Plot the ROC curve and save it as an image file.

    Parameters:
    - y_test: True labels, passed to `roc_curve` (expected to be the scikit-learn metric).
    - y_pred_proba: Predicted scores/probabilities, passed to `roc_curve`.
    - folder_path (str): Folder the image is written to; created if it does not exist.
      Defaults to `plots_artefact_path`.
    - file_name (str): Name of the image file inside `folder_path`.

    Returns:
    None. The figure is written to disk and then closed.
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # savefig does not create missing folders, so make sure the target exists first.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, file_name)

    fig, ax = plt.subplots(figsize=(7, 7))
    try:
        ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
        # Diagonal reference line from (0, 0) to (1, 1).
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic')
        ax.legend(loc="lower right")
        fig.savefig(file_path)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)

In [None]:
def plot_precision_recall_curve(y_test, y_pred_proba, folder_path=plots_artefact_path, file_name='precision_recall_curve.png'):
    """
    Plot the precision-recall curve and save it as an image file.

    Parameters:
    - y_test: True labels, passed to `precision_recall_curve` (expected to be the scikit-learn metric).
    - y_pred_proba: Predicted scores/probabilities, passed to `precision_recall_curve`.
    - folder_path (str): Folder the image is written to; created if it does not exist.
      Defaults to `plots_artefact_path`.
    - file_name (str): Name of the image file inside `folder_path`.

    Returns:
    None. The figure is written to disk and then closed.
    """
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)

    # savefig does not create missing folders, so make sure the target exists first.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, file_name)

    fig, ax = plt.subplots(figsize=(7, 7))
    try:
        ax.plot(recall, precision, color='darkorange', lw=2, label='Precision-Recall curve')
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_title('Precision-Recall Curve')
        ax.legend(loc="lower left")
        fig.savefig(file_path)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)

In [None]:
def plot_confusion_matrix(y_test, y_pred, folder_path=plots_artefact_path, file_name='confusion_matrix.png'):
    """
    Plot the confusion matrix as an annotated heatmap and save it as an image file.

    Parameters:
    - y_test: True labels, passed to `confusion_matrix` (expected to be the scikit-learn metric).
    - y_pred: Predicted labels, passed to `confusion_matrix`.
    - folder_path (str): Folder the image is written to; created if it does not exist.
      Defaults to `plots_artefact_path`.
    - file_name (str): Name of the image file inside `folder_path`.

    Returns:
    None. The figure is written to disk and then closed.
    """
    cm = confusion_matrix(y_test, y_pred)

    # savefig does not create missing folders, so make sure the target exists first.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, file_name)

    fig, ax = plt.subplots(figsize=(7, 7))
    try:
        # fmt="d" renders the cell annotations as integers.
        sns.heatmap(cm, annot=True, fmt="d", ax=ax)
        ax.set_title('Confusion Matrix')
        ax.set_ylabel('Actual label')
        ax.set_xlabel('Predicted label')
        fig.savefig(file_path)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)

In [None]:
def plot_feature_importance(best_model, X_train, folder_path=plots_artefact_path, file_name='feature_importance.png'):
    """
    Plot the model's feature importances as a horizontal bar chart, largest first,
    and save the chart as an image file.

    Parameters:
    - best_model: Fitted model exposing `get_feature_importance()` (presumably a
      CatBoost model - confirm against the training cell).
    - X_train (pd.DataFrame): Training features; its column names label the bars.
    - folder_path (str): Folder the image is written to; created if it does not exist.
      Defaults to `plots_artefact_path`.
    - file_name (str): Name of the image file inside `folder_path`.

    Returns:
    None. The figure is written to disk and then closed.
    """
    feature_importances = best_model.get_feature_importance()
    # Indices that order the importances from highest to lowest.
    sorted_indices = feature_importances.argsort()[::-1]

    # savefig does not create missing folders, so make sure the target exists first.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, file_name)

    fig, ax = plt.subplots(figsize=(10, 12))
    try:
        ax.set_title("Feature Importances")
        sns.barplot(y=X_train.columns[sorted_indices], x=feature_importances[sorted_indices], ax=ax)
        fig.savefig(file_path)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)

In [None]:
def plot_learning_curve(best_model, folder_path=plots_artefact_path, file_name='learning_curve.png'):
    """
    Plot the validation AUC per training iteration and save it as an image file.

    Parameters:
    - best_model: Fitted model exposing `get_evals_result()`; the result must contain
      `['validation']['AUC']` (presumably a CatBoost model trained with an eval set
      and the AUC metric - confirm against the training cell).
    - folder_path (str): Folder the image is written to; created if it does not exist.
      Defaults to `plots_artefact_path`.
    - file_name (str): Name of the image file inside `folder_path`.

    Returns:
    None. The figure is written to disk and then closed.
    """
    validation_auc = best_model.get_evals_result()['validation']['AUC']
    x_axis = range(0, len(validation_auc))

    # savefig does not create missing folders, so make sure the target exists first.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, file_name)

    fig, ax = plt.subplots(figsize=(7, 7))
    try:
        ax.plot(x_axis, validation_auc, label='Test')
        ax.set_title('CatBoost Learning Curve')
        ax.set_ylabel('AUC')
        ax.set_xlabel('Epoch')
        ax.legend(loc="best")
        fig.savefig(file_path)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)