In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import datetime
from utils.data_exploration_utils import drop_unnamedcolumn,  investigate_data, plot_hist, scatterplot, missing_from_df

In [None]:
# Date stamp, used only to name the fallback working directory below.
today = datetime.date.today()

# Root locations are taken from the project's config module.
base_dir = config.RAW_DATA_PATH
proc_dir = config.PROC_DATA_PATH

# Sub-folder of proc_dir to work in; set to None to use a folder named after today's date.
folder = "2025-08-11_data_exploration"
df_filename = "inmodi_data_questionnaire_kl_woSC.csv"

# Names of the oversampling variants; smote_type_id selects which one's CSV is loaded below.
smote_types = ['SMOTE', 'DBSMOTE', 'Borderline_SMOTE1', 'Borderline_SMOTE2']
smote_type_id = 0
smote_type = smote_types[smote_type_id]

if folder is None:
    # No explicit folder: work in a dated directory, with save_dir2 fixed to the 2025-07-14 folder.
    save_dir = os.path.join(proc_dir, f"{today}_data_exploration")
    save_dir2 = os.path.join(proc_dir, '2025-07-14_data_exploration')
else:
    # Explicit folder: both directory variables refer to the same location.
    save_dir = save_dir2 = os.path.join(proc_dir, folder)

os.makedirs(save_dir, exist_ok=True)

# Both the original table and the selected oversampled table are read from save_dir.
df = pd.read_csv(os.path.join(save_dir, df_filename))
smote = pd.read_csv(os.path.join(save_dir, f'smote_oversampled_data_{smote_type}.csv'))

In [None]:
# Preprocessing of the original table.
# Binary indicator derived from 'gender': 1 where the value equals 'male', 0 for anything else
# (a missing gender therefore also becomes 0).
df['is_male'] = df['gender'].map(lambda g: int(g == 'male'))

# Columns that must all be present in a row; the same list drives the boxplot loop further down.
cols = [
    # single-value variables
    'pain', 'age', 'ce_bmi', 'ce_fm',
    # koos_s items
    'koos_s1', 'koos_s2', 'koos_s3', 'koos_s4', 'koos_s5', 'koos_s6', 'koos_s7',
    # koos_p items
    'koos_p1', 'koos_p2', 'koos_p3', 'koos_p4', 'koos_p5', 'koos_p6', 'koos_p7', 'koos_p8', 'koos_p9',
    # koos_a items
    'koos_a1', 'koos_a2', 'koos_a3', 'koos_a4', 'koos_a5', 'koos_a6',
    'koos_a7', 'koos_a8', 'koos_a9', 'koos_a10', 'koos_a11', 'koos_a12',
    'koos_a13', 'koos_a14', 'koos_a15', 'koos_a16', 'koos_a17',
    # koos_sp items
    'koos_sp1', 'koos_sp2', 'koos_sp3', 'koos_sp4', 'koos_sp5',
    # koos_q items
    'koos_q1', 'koos_q2', 'koos_q3', 'koos_q4',
    # oks_q items
    'oks_q1', 'oks_q2', 'oks_q3', 'oks_q4', 'oks_q5', 'oks_q6',
    'oks_q7', 'oks_q8', 'oks_q9', 'oks_q10', 'oks_q11', 'oks_q12',
    # derived above
    'is_male',
]

# Keep only rows that have a value in every listed column.
df = df.dropna(subset=cols, how='any', axis=0)

In [None]:
# (rows, columns) of the original table, then of the oversampled table, one per line.
print(df.shape, smote.shape, sep='\n')

# KL-Score Overall Distribution

In [None]:
# Number of rows per KL-Score value in the original data.
kl_counts_org = df['KL-Score'].value_counts().reset_index()
display(kl_counts_org)

# Same tally for the oversampled data.
kl_counts_smote = smote['KL-Score'].value_counts().reset_index()
display(kl_counts_smote)

# Histograms

## PI

In [None]:
# For each listed column, draw the histogram of the original data followed by the one for
# the oversampled data.
lcols = ['pain', 'age', 'ce_bmi', 'ce_fm', 'is_male', 'KL-Score']
for col in lcols:
    for label, frame in (("Org.", df), ("SMOTE", smote)):
        plot_hist(frame, col, title=f"{label} Data {col}")

## Questionnaire

# Boxplots

In [None]:
# Drop 'is_male' from the column list before the boxplot loop.
# The list is rebuilt with a filter instead of calling cols.remove('is_male'): remove() raises
# ValueError once the element is gone, so the cell failed when executed a second time, whereas
# this filter simply leaves the list unchanged on a re-run.
cols = [c for c in cols if c != 'is_male']

In [None]:
# One figure per column in cols: original data in the left panel, oversampled data in the
# right panel, each grouped by KL-Score.
for col in cols:
    fig, (ax_org, ax_smote) = plt.subplots(1, 2, figsize=(12, 5))

    sns.boxplot(data=df, x='KL-Score', y=col, ax=ax_org)
    ax_org.set_title(f'Org. Data {col} by KL-Score')

    sns.boxplot(data=smote, x='KL-Score', y=col, ax=ax_smote)
    ax_smote.set_title(f'SMOTE Data {col} by KL-Score')

    plt.show()

# Pairplots

## KL-Score

In [None]:
# Pair plots of the numeric variables, coloured by KL-Score, for the original and the
# oversampled data.
col_cat = ['KL-Score']                        # column used as hue
col_num = ['pain', 'age', 'ce_bmi', 'ce_fm']  # columns plotted against each other

# A dedicated name is used here instead of rebinding `cols`: the boxplot loop earlier in the
# notebook iterates over `cols`, so overwriting it here made a re-run of that loop plot a
# different set of columns (including 'KL-Score' against itself).
pairplot_cols = col_cat + col_num

sns.pairplot(df[pairplot_cols], hue=col_cat[0])
sns.pairplot(smote[pairplot_cols], hue=col_cat[0])
