In [None]:
import config
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import datetime

# Functions

In [None]:
def drop_unnamedcolumn(df):
    """Return ``df`` without the columns whose label contains 'Unnamed'.

    Column labels are converted to strings before matching, so frames whose
    labels are not all strings (e.g. integer labels) are filtered instead of
    raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, in their original order.
    """
    # Match on the string form of each label: the .str accessor is unavailable on
    # non-string indexes and yields NaN (which cannot be negated) for non-string
    # labels inside a mixed index.
    is_unnamed = df.columns.astype(str).str.contains('Unnamed')
    return df.loc[:, ~is_unnamed]

def get_nan_values(df):
    """Print a missing-value summary for ``df`` and return the affected columns.

    Two things are printed: the list of columns containing at least one NaN,
    and the NaN count of each such column, largest count first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Labels of the columns that contain at least one NaN, in column order.
    """
    missing_mask = df.isna()
    columns_with_nan = df.columns[missing_mask.any()].tolist()
    missing_counts = missing_mask.sum()

    print("Columns with NaN values: ", columns_with_nan)
    print()
    print("NaN values per column:")
    # Only columns that actually have gaps are listed.
    affected_counts = missing_counts.loc[missing_counts.gt(0)]
    print(affected_counts.sort_values(ascending=False))
    return columns_with_nan

def check_duplicate_rows(df, column = 'record_id'):
    """Report the rows of ``df`` whose value in ``column`` occurs more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column : str, default 'record_id'
        Column whose repeated values identify duplicate rows.

    Returns
    -------
    pandas.DataFrame or None
        All rows sharing a repeated ``column`` value, sorted by ``column``;
        None when every non-missing value is unique. The result is also
        printed / displayed.
    """
    #TODO: add logic to delete duplicates, where the entire row is the same and
    # call out the duplicates that are not the same

    keys = df[column]
    # duplicated(keep=False) flags every occurrence of a repeated key. It replaces a
    # value_counts().reset_index() lookup that relied on a 'count' column, a name that
    # only exists in pandas >= 2.0. Missing keys stay excluded, because value_counts()
    # ignores NaN by default as well.
    is_duplicate = keys.notna() & keys.duplicated(keep=False)

    if is_duplicate.any():
        dupl_rows = df[is_duplicate].sort_values(by=column)

        print(f"Duplicate rows based on {column}:")
        display(dupl_rows)
        return dupl_rows

    print(f"No duplicate rows based on {column}.")
    return None
    
def investigate_data(df, save_path = None, save_name = None, id_column = 'record_id'):
    """Print a data-quality overview of ``df``: summary statistics, missing values, duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Must contain ``id_column``.
    save_path : str, optional
        Directory in which the ``describe`` table is stored as
        ``<save_name>_description.csv``. Nothing is written when omitted.
    save_name : str, optional
        File-name prefix for the stored table; required when ``save_path`` is given.
    id_column : str, default 'record_id'
        Column holding the record identifier. It is used to collect the ids of
        incomplete rows and is the key for the duplicate check.

    Returns
    -------
    array or None
        Unique ``id_column`` values of the rows that contain at least one missing
        value. None when no column has missing values, or when ``save_path`` was
        given without ``save_name`` (the function then stops before any check).
    """
    df_descr = df.describe(include='all')

    if save_path:
        if save_name is None:
            print("No save name provided!")
            return None
        save_path = os.path.join(save_path, f"{save_name}_description.csv")
        # mode='x' is exclusive creation, so an existing description file is never overwritten.
        df_descr.to_csv(save_path, index=True, mode='x')
        print(f"Data description saved to {save_path}")
        print()

    # A column whose non-null count is below the largest count has missing entries.
    max_count = df_descr.loc['count'].max()

    for col in df_descr.columns:
        col_count = df_descr[col]['count']
        if col_count < max_count:
            print(f"Column '{col}' has missing values: {col_count} out of {max_count}")
    print()
    nan_columns = get_nan_values(df)

    nan_patient_id = None
    if len(nan_columns) > 0:
        # Show every row with at least one gap and remember which records they belong to.
        nan_df = df[df.isnull().any(axis=1)]
        nan_patient_id = nan_df[id_column].unique()
        display(nan_df)
        print()

    # The duplicate report is printed regardless of whether missing values were found.
    check_duplicate_rows(df, column=id_column)
    return nan_patient_id
        
def plot_hist(df, column, title = None, xlabel = None, y_label = "Frequency", stat = 'frequency', figsize=(10, 6), hue= None, multiple=None, bins = 30, kde=False):
    """Draw a histogram of ``df[column]`` in a new figure and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    column : str
        Column to plot on the x axis.
    title : str, optional
        Figure title; defaults to ``"Distribution of <column>"``.
    xlabel : str, optional
        X-axis label; defaults to ``column``.
    y_label : str, default "Frequency"
        Y-axis label. It is not derived from ``stat``, so pass a matching label
        when changing ``stat``.
    stat : str, default 'frequency'
        Forwarded to ``sns.histplot``.
    figsize : tuple, default (10, 6)
        Figure size passed to ``plt.figure``.
    hue : str, optional
        Column used to colour-split the histogram.
    multiple : str, optional
        Forwarded to ``sns.histplot`` when ``hue`` is given; None selects
        seaborn's default, 'layer'. Ignored without ``hue``.
    bins : int, default 30
        Forwarded to ``sns.histplot``.
    kde : bool, default False
        Forwarded to ``sns.histplot``.
    """
    plt.figure(figsize=figsize)
    if hue is None:
        sns.histplot(df[column], bins=bins, stat=stat, kde=kde)
    else:
        # seaborn documents `multiple` as one of 'layer', 'dodge', 'stack', 'fill'; None is
        # not among them, so the signature default is mapped to seaborn's default 'layer'.
        resolved_multiple = 'layer' if multiple is None else multiple
        sns.histplot(data=df, x = column, bins=bins, stat=stat, hue = hue, multiple=resolved_multiple, kde=kde)
    plt.title(title if title else f"Distribution of {column}")
    plt.xlabel(xlabel if xlabel else column)
    plt.ylabel(y_label)
    plt.show()

def plot_violin(df, column, title = None, xlabel = None, y_label = "Frequency", figsize=(10, 6), hue= 'gender'):
    """Draw a violin plot of ``df[column]`` in a new figure and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    column : str
        Column plotted along the x axis.
    title : str, optional
        Figure title; defaults to ``"Distribution of <column>"``.
    xlabel : str, optional
        X-axis label; defaults to ``column``.
    y_label : str, default "Frequency"
        Y-axis label.
    figsize : tuple, default (10, 6)
        Figure size passed to ``plt.figure``.
    hue : str, default 'gender'
        Column used to split the violins by colour.
    """
    plt.figure(figsize=figsize)
    sns.violinplot(data=df, x=column, hue=hue)

    # Fall back to column-based text when no (non-empty) label was supplied.
    plt.title(title or f"Distribution of {column}")
    plt.xlabel(xlabel or column)
    plt.ylabel(y_label)
    plt.show()

In [None]:
# Run date; it stamps the name of the output folder.
today = datetime.date.today()

# Input and output locations come from the project config module.
base_dir = config.RAW_DATA_PATH
proc_dir = config.PROC_DATA_PATH

# Dated folder for exploration artefacts, created up front if it does not exist yet.
save_dir = os.path.join(proc_dir, f"{today}_data_exploration")
os.makedirs(save_dir, exist_ok=True)

# One CSV export per visit.
first_leg = "inmodi_data_first_visit.csv"
second_leg = "inmodi_data_second_visit.csv"

first_leg_path = os.path.join(base_dir, first_leg)
second_leg_path = os.path.join(base_dir, second_leg)

df_first = pd.read_csv(first_leg_path, sep=",")
df_second = pd.read_csv(second_leg_path, sep=",")

#TODO: import newest KL scores

# Data Cleaning

In [None]:
# Remove the columns whose label contains 'Unnamed' from both visit frames.
# NOTE(review): a failure is only printed, not re-raised, so later cells keep running
# with whichever frame(s) had already been reassigned before the error occurred.
try:
    df_first = drop_unnamedcolumn(df_first)
    df_second = drop_unnamedcolumn(df_second)
except Exception as e:
    print(f"Error dropping unnamed columns: {e}")

In [None]:
# List the column labels of both visit frames so their schemas can be compared.
for heading, visit_df in (
    ("Columns in first visit data:", df_first),
    ("\nColumns in second visit data:", df_second),
):
    print(heading)
    print(visit_df.columns.tolist())

## Missing Values & Duplicates

In [None]:
# Missing-value and duplicate report for the first visit; keeps the record ids of incomplete rows.
# The trailing comment holds the arguments that would additionally save the describe table.
df_first_nanids = investigate_data(df_first) #save_path=save_dir, save_name="inmodi_data_first_visit"

In [None]:
# Missing-value and duplicate report for the second visit; keeps the record ids of incomplete rows.
# The trailing comment holds the arguments that would additionally save the describe table.
df_second_nanids = investigate_data(df_second) #save_path=save_dir, save_name="inmodi_data_second_visit"

For the first leg, `ce_fm`, `ce_pain_r`, `ce_pain_l`, `COMI_score`, `UCLA_score`, the KOOS scores and `rx_ap_kl_left` appear to have missing values.

We also have 121 unique `record_id` values but 122 rows, so there is probably a duplicated `record_id`.

## Quick Fix: Removing duplicates

In [None]:
# Keep only the first row of every record_id and report the row count before and after.
n_rows_before = df_first.shape[0]
print("Number of rows: ", n_rows_before)

df_first = df_first.drop_duplicates(subset='record_id', keep='first')

n_rows_after = df_first.shape[0]
print("Number of rows after dropping duplicates: ", n_rows_after)

## Quick Fix: Fix unique case (malformed `ce_fm` entry)

In [None]:
# Locate the second-visit row(s) whose ce_fm holds the text '35.4, 26.4' instead of one number.
malformed_fm_rows = df_second[df_second['ce_fm']=='35.4, 26.4']
display(malformed_fm_rows)

# Index labels of those rows, reused by the following cells.
outl_index = malformed_fm_rows.index

print(f"Location: {outl_index}")

In [None]:
# Overwrite the malformed '35.4, 26.4' entry with its first number; the second number (26.4) is discarded.
df_second.loc[outl_index, 'ce_fm'] = 35.4

In [None]:
# Re-display the corrected row(s). outl_index holds index labels (it was taken from .index),
# so select by label with .loc; positional .iloc only matches while labels equal positions.
display(df_second.loc[outl_index])

# Patient-Related Information

This includes patient information, such as age, bmi, body fat, pain and sex. Additionally, we will look at the aggregated questionnaire scores.

*Height and weight were purposefully omitted in this part, since they are already covered by the BMI, which takes height into account when evaluating weight.*

**Columns Patient-Related Data**
* `record_id`: id column
* `age`: patient age (at time of study?)
* `ce_height`
* `ce_weight`
* `ce_bmi`
* `ce_fm`: body fat percentage (%)
* `ce_pain_r`: patient-reported pain (right leg)
* `ce_pain_l`: patient-reported pain (left leg)
* `gender`
* `COMI_score`: Core Outcome Measures Index
* `OKS_score`: Oxford Knee Score (12 Questions)
* `UCLA_score`: UCLA-Activity Index (1 Question)
* `FJS_score`: Forgotten Joint Score (12 Questions)
* `KOOS_pain`: Knee Injury and OA Outcome Score - Pain Section (9 Questions)
* `KOOS_symptoms`: Symptoms Section (7 Questions)
* `KOOS_sport`: Sport Section (5 Questions)
* `KOOS_adl`: Daily Activities Section (17 Questions)
* `KOOS_qol`: Quality of Life Section (4 Questions)
* `rx_ap_kl_left`: KL-Score (left leg)
* `rx_ap_kl_right`: KL-Score (right leg)

The various scores are aggregated in this dataset already.

In [None]:
# Stack both visits into one patient-information frame with a fresh 0..n-1 index.
pi = pd.concat([df_first, df_second], ignore_index=True)
print("Number of rows after concatenation: ", len(pi))

# ce_fm must be numeric for plotting; the KL scores are dropped from this frame.
pi = pi.astype({'ce_fm': 'float64'})
pi = pi.drop(columns=['rx_ap_kl_left', 'rx_ap_kl_right'])

# Columns that we will analyze
l_columns = [
    'age', 'ce_bmi', 'ce_fm',
    'gender',
    'COMI_score', 'OKS_score', 'UCLA_score', 'FJS_score',
    'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport', 'KOOS_adl', 'KOOS_qol',
]

In [None]:
# One density histogram per analysed column: gender gets two bins and no KDE,
# every other column gets ten bins with a KDE overlay.
hist_style = dict(figsize=(10, 6), stat='density', y_label='Density')
for col in l_columns:
    if col == 'gender':
        plot_hist(pi, col, bins=2, kde=False, **hist_style)
    else:
        plot_hist(pi, col, bins=10, kde=True, **hist_style)

In [None]:
# Every analysed column except the grouping variable itself.
l_columns2 = [name for name in l_columns if name != 'gender']

# One figure per column, with gender on the x axis.
for col in l_columns2:
    plt.figure(figsize=(8, 6))
    sns.violinplot(x='gender', y=col, data=pi, split=True)
    plt.title(f"Distribution of {col} by Gender")
    plt.show()

In [None]:
# Long format: one row per (gender, variable, value), so all variables share a single axis.
melted_df = pd.melt(
    pi,
    id_vars='gender',
    value_vars=l_columns2,
    var_name='Variable',
    value_name='Value',
)

# One split violin per variable, coloured by gender.
plt.figure(figsize=(12, 6))
sns.violinplot(x='Variable', y='Value', hue='gender', split=True, data=melted_df)
plt.title("Split Violin Plots of Variables by Gender")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Histogram of the melted 'Value' column (all variables pooled), dodged by gender.
# NOTE(review): the pooled variables may be on different scales - confirm this plot is intended.
sns.histplot(data=melted_df, x='Value', hue='gender', stat='probability', multiple='dodge')


In [None]:
# Density histograms again, now with side-by-side bars per gender for every column
# except gender itself, which is drawn as a plain two-bin histogram.
by_gender_style = dict(figsize=(10, 6), stat='density', y_label='Density')
for col in l_columns:
    if col == 'gender':
        plot_hist(pi, col, bins=2, **by_gender_style)
    else:
        plot_hist(pi, col, hue='gender', multiple='dodge', bins=10, **by_gender_style)

Pain for the left and right leg needs to be concatenated, since we are interested in the overall pain distribution rather than the distribution for each leg separately.

In [None]:
 # NOTE(review): scratch cell - two bare one-element tuple expressions that assign nothing.
 # Presumably a reminder of the pain columns that still need to be combined; TODO confirm.
 'ce_pain_r',
 'ce_pain_l',

In [None]:
# Restrict to the analysed columns, then count the non-missing entries of each column per gender.
pi_cols = pi.loc[:, l_columns]
pi_grouped = pi_cols.groupby(by=['gender']).count()

In [None]:
# Show the per-gender counts computed in the previous cell.
display(pi_grouped)

# End of Code