In [None]:
# CONFIGS
# Paths are written with forward slashes: the previous backslash form relied on
# the invalid escape sequences "\c" and "\p" (a SyntaxWarning on recent Python
# versions) and only resolved on Windows. Forward slashes work on Windows and POSIX.
dataset_path = 'data/case_study_example_data.csv'
processed_dataset_path = 'data/processed_dataset.parquet'

# Columns treated as categorical (ordinal-encoded before KNN imputation).
categorical_columns = ['course', 'gender', 'pre_existing_medical_condition']

# Per-course weights used to build the composite score.
# Outer key: value of the 'course' column; inner key: feature column name.
# The weights of each course sum to 1.
composite_score_ratio_dict = {
    'sleep': {
        'diary_sleep_avg': 0.5,
        'diary_stress_avg': 0.2,
        'diary_mood_avg': 0.2,
        'alcohol_scale': 0.1,
    },
    'stress': {
        'diary_sleep_avg': 0.2,
        'diary_stress_avg': 0.5,
        'diary_mood_avg': 0.2,
        'alcohol_scale': 0.1,
    },
    'depression': {
        'diary_sleep_avg': 0.2,
        'diary_stress_avg': 0.2,
        'diary_mood_avg': 0.5,
        'alcohol_scale': 0.1,
    },
    'burnout': {
        'diary_sleep_avg': 0.2,
        'diary_stress_avg': 0.5,
        'diary_mood_avg': 0.2,
        'alcohol_scale': 0.1,
    },
    'chronic_pain': {
        'diary_sleep_avg': 0.3,
        'diary_stress_avg': 0.3,
        'diary_mood_avg': 0.3,
        'alcohol_scale': 0.1,
    },
}

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
def knn_impute_missing_data(df, categorical_cols=categorical_columns):
    """
    Perform KNN imputation on a DataFrame with categorical features.

    The categorical columns are encoded with an OrdinalEncoder so the whole frame is numeric,
    a KNNImputer (5 neighbours) fills the missing values, and the categorical columns are then
    decoded back to their original labels.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with missing values. It is copied, not modified.
    - categorical_cols (list of str): The names of the categorical columns in the DataFrame.
      Defaults to the notebook-level `categorical_columns` config.

    Returns:
    - pd.DataFrame: The DataFrame with imputed values, carrying the same index as `df`.
    - pd.Series: A Series with the count of remaining missing values per column after imputation.
    """

    has_categorical = len(categorical_cols) > 0

    # Work on a copy so the caller's frame is left untouched
    data_encoded = df.copy()

    # Encode the categorical columns passed by the caller (not the module-level config)
    encoder = OrdinalEncoder()
    if has_categorical:
        # NOTE(review): astype(str) presumably turns missing categorical entries into the
        # literal string 'nan', so they are encoded as their own category and never imputed
        # -- confirm whether missing categoricals should be imputed as well.
        data_encoded[categorical_cols] = encoder.fit_transform(df[categorical_cols].astype(str))

    # Perform imputation on the fully numeric frame
    imputer = KNNImputer(n_neighbors=5)
    imputed_values = imputer.fit_transform(data_encoded)

    # Rebuild the DataFrame, keeping the original index so rows stay aligned with `df`
    data_imputed = pd.DataFrame(imputed_values, columns=data_encoded.columns, index=df.index)

    # Decode the categorical columns back to their original labels
    if has_categorical:
        data_imputed[categorical_cols] = encoder.inverse_transform(data_imputed[categorical_cols])

    # Check if there are any missing values left
    missing_values_after_imputation = data_imputed.isnull().sum()

    return data_imputed, missing_values_after_imputation


In [None]:
def round_columns_to_allowed_values(df, columns_to_round, allowed_values= [0, 1, 2, 3, 4, 5]):
    """
    Rounds the values in the specified columns of a DataFrame to the nearest allowed values.

    Each value in the specified columns is replaced by the entry of `allowed_values` with the
    smallest absolute distance to it. On a tie, the entry that appears first in
    `allowed_values` wins. Missing values (NaN/None) are left missing instead of being
    mapped to an allowed value.

    Note: the columns are overwritten on the DataFrame that is passed in; the same object
    is returned.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the columns to round (modified in place).
    - columns_to_round (list of str): The names of the columns in the DataFrame to apply rounding to.
    - allowed_values (list of numeric): The allowed values that the DataFrame values can be rounded to.

    Returns:
    - pd.DataFrame: The DataFrame with rounded values.
    - dict: A dictionary with column names as keys and arrays of unique values in those columns after rounding.

    Raises:
    - ValueError: If `allowed_values` is empty.
    """

    candidates = list(allowed_values)
    if not candidates:
        raise ValueError("allowed_values must contain at least one value")

    def round_to_nearest_allowed_value(x):
        # With a NaN input every distance is NaN, and min() would then silently
        # return the first allowed value; keep missing values missing instead.
        if pd.isna(x):
            return x
        return min(candidates, key=lambda allowed_value: abs(allowed_value - x))

    # Applying the custom rounding to the specified numerical columns
    for column in columns_to_round:
        df[column] = df[column].apply(round_to_nearest_allowed_value)

    # Collect the unique values after rounding so the caller can verify the result
    unique_values_after_rounding = {column: df[column].unique() for column in columns_to_round}

    return df, unique_values_after_rounding

In [None]:
def create_mass_score(row, ratio_dict=composite_score_ratio_dict):
    """
    Compute the weighted composite score for a single record.

    The weights are looked up in `ratio_dict` under the record's 'course' value and applied
    to the record's 'diary_sleep_avg', 'diary_stress_avg', 'diary_mood_avg' and
    'alcohol_scale' values.

    Parameters:
    - row (mapping-like, e.g. a DataFrame row): Must provide 'course' and the four feature values.
    - ratio_dict (dict): Maps each course to a dict of per-feature weights.

    Returns:
    - The sum of each feature value multiplied by its course-specific weight.
    """
    # Weights for this record's course
    weights = ratio_dict[row['course']]

    sleep_term = row['diary_sleep_avg'] * weights['diary_sleep_avg']
    stress_term = row['diary_stress_avg'] * weights['diary_stress_avg']
    mood_term = row['diary_mood_avg'] * weights['diary_mood_avg']
    alcohol_term = row['alcohol_scale'] * weights['alcohol_scale']

    return sleep_term + stress_term + mood_term + alcohol_term