In [None]:
import config
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import datetime
import math
from utils.data_exploration_utils import drop_unnamedcolumn, get_nan_values, check_duplicate_rows, investigate_data, plot_hist, plot_violin, scatterplot

# Functions

In [None]:
# File names of the two raw per-visit exports.
first_leg = "inmodi_data_first_visit.csv"
second_leg = "inmodi_data_second_visit.csv"

# Input and output roots are taken from the project config module.
base_dir = config.RAW_DATA_PATH
proc_dir = config.PROC_DATA_PATH

# Everything this run produces goes into a date-stamped folder below proc_dir.
today = datetime.date.today()
save_dir = os.path.join(proc_dir, f"{today}_data_exploration")
os.makedirs(save_dir, exist_ok=True)

# Read both visits with identical (comma-separated) parsing settings.
df_first, df_second = (
    pd.read_csv(os.path.join(base_dir, leg_file), sep=",")
    for leg_file in (first_leg, second_leg)
)

#TODO: import newest KL scores

# Data Cleaning

In [None]:
# remove unnamed columns
# drop_unnamedcolumn is a project helper (utils.data_exploration_utils); presumably it
# strips pandas' auto-generated "Unnamed: N" columns — TODO confirm against its source.
# NOTE(review): any failure is only printed, so the notebook keeps running with whichever
# frame(s) were not reassigned; if the first call succeeds and the second raises, the two
# frames end up in different states. Consider re-raising if later cells rely on this step.
try:
    df_first = drop_unnamedcolumn(df_first)
    df_second = drop_unnamedcolumn(df_second)
except Exception as e:
    print(f"Error dropping unnamed columns: {e}")

In [None]:
# Print the column names of each visit so the two schemas can be compared by eye.
for header, frame in (
    ("Columns in first visit data:", df_first),
    ("\nColumns in second visit data:", df_second),
):
    print(header)
    print(list(frame.columns))

## Missing Values & Duplicates

In [None]:
# Run the project's data-investigation helper on the first-visit frame and keep its result.
# Judging by the variable name, the result presumably identifies rows with NaNs — TODO confirm.
# The trailing comment lists the optional arguments for persisting the report.
df_first_nanids = investigate_data(df_first) #save_path=save_dir, save_name="inmodi_data_first_visit"

In [None]:
# Run the project's data-investigation helper on the second-visit frame and keep its result.
# Judging by the variable name, the result presumably identifies rows with NaNs — TODO confirm.
# The trailing comment lists the optional arguments for persisting the report.
df_second_nanids = investigate_data(df_second) #save_path=save_dir, save_name="inmodi_data_second_visit"

For the first leg, ce_fm, ce_pain_r, ce_pain_l, COMI_score, UCLA_score, the KOOS scores and rx_ap_kl_left appear to have missing values.

Also, there are 121 unique record_ids but 122 rows, so we might have duplicates.

## Quick Fix: Removing duplicates

In [None]:
# Report the row count, keep only the first row seen for each record_id, report again.
print("Number of rows: ", len(df_first))
is_repeat = df_first.duplicated(subset='record_id', keep='first')
df_first = df_first[~is_repeat]
print("Number of rows after dropping duplicates: ", len(df_first))

## Quick Fix: Fix Unique case

In [None]:
# Find the second-visit row(s) whose body-fat field holds two values in a single string.
outl_mask = df_second['ce_fm'] == '35.4, 26.4'
outl_rows = df_second[outl_mask]

display(outl_rows)
outl_index = outl_rows.index

print(f"Location: {outl_index}")

In [None]:
# Overwrite the malformed '35.4, 26.4' entry with its first value (35.4) so that
# ce_fm holds a single number for that row and can be cast to float further down.
df_second.loc[outl_index, 'ce_fm'] = 35.4

In [None]:
# Re-display the corrected row(s). `outl_index` holds index *labels* (it was taken
# from `.index`), so it has to be used with label-based `.loc`. Positional `.iloc`
# only returns the same rows while the labels happen to equal the row positions,
# and would show the wrong rows (or raise) once rows have been dropped or reordered.
display(df_second.loc[outl_index])

# Patient-Related Information

This includes patient information, such as age, bmi, body fat, pain and sex. Additionally, we will look at the aggregated questionnaire scores.

*Height and weight were purposefully omitted in this part, since both are already covered by the BMI, which takes the height into account when evaluating the weight.*

**Columns Patient-Related Data**
* `record_id`: id column
* `age`: patient age (at time of study?)
* `ce_height`
* `ce_weight`
* `ce_bmi`
* `ce_fm`: body fat percentage (%)
* `ce_pain_r`: patient-reported pain (right leg)
* `ce_pain_l`: patient-reported pain (left leg)
* `gender`
* `COMI_score`: Core Outcome Measures Index
* `OKS_score`: Oxford Knee Score (12 Questions)
* `UCLA_score`: UCLA-Activity Index (1 Question)
* `FJS_score`: Forgotten Joint Score (12 Questions)
* `KOOS_pain`: Knee Injury and OA Outcome Score - Pain Section (9 Questions)
* `KOOS_symptoms`: Symptoms Section (7 Questions)
* `KOOS_sport`: Sport Section (5 Questions)
* `KOOS_adl`: Daily Activities Section (17 Questions)
* `KOOS_qol`: Quality of Life Section (4 Questions)
* `rx_ap_kl_left`: KL-Score (left leg)
* `rx_ap_kl_right`: KL-Score (right leg)

The various scores are aggregated in this dataset already.

In [None]:
# Tag every row with the visit it belongs to before the two frames are stacked.
df_first['visit'] = 1
df_second['visit'] = 2

# Stack both visits into one patient-information frame with a fresh 0..n-1 index.
pi = pd.concat([df_first, df_second], ignore_index=True)
print("Number of rows after concatenation: ", len(pi))

# Body fat must be numeric for plotting; the KL scores are not analysed in this part.
pi = (
    pi.astype({'ce_fm': 'float64'})
      .drop(columns=['rx_ap_kl_left', 'rx_ap_kl_right'])
)

# Columns that we will analyze
l_columns = [
    'age', 'ce_bmi', 'ce_fm', 'gender',
    'COMI_score', 'OKS_score', 'UCLA_score', 'FJS_score',
    'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport', 'KOOS_adl', 'KOOS_qol',
]

In [None]:
# One density histogram per analysed column. 'gender' gets two bins and no KDE
# curve; every other column gets ten bins with a KDE overlay.
for col in l_columns:
    is_gender = col == 'gender'
    plot_hist(
        pi,
        col,
        figsize=(10, 6),
        stat='density',
        y_label='Density',
        bins=2 if is_gender else 10,
        kde=not is_gender,
    )

In [None]:
# Numeric columns to compare between genders (same set as above, minus 'gender').
l_columns2 = [
    'age', 'ce_bmi', 'ce_fm',
    'COMI_score', 'OKS_score', 'UCLA_score', 'FJS_score',
    'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport', 'KOOS_adl', 'KOOS_qol',
]

# Draw one split violin per column, with the two halves coloured by gender.
for value_col in l_columns2:
    # Long format: a constant 'Variable' label plus the column's values in 'Count'.
    long_df = pi.melt(
        id_vars='gender',
        value_vars=value_col,
        var_name='Variable',
        value_name='Count',
    )
    plt.figure(figsize=(12, 6))
    sns.violinplot(
        data=long_df,
        x='Variable',
        y='Count',
        hue='gender',
        split=True,
        inner='quartile',
    )
    plt.title(f"Distribution of {value_col} by Gender")
    plt.xticks(rotation=45)
    plt.show()

The pain values for the left and right leg need to be combined into a single column, since we are interested in the overall pain distribution rather than in the separate distributions for the left and right leg.

### Patient-reported Pain

In [None]:
# Unpivot the two pain columns: one row per (record_id, visit, leg side).
pain_df = pd.melt(
    pi,
    id_vars=['record_id', 'visit'],
    value_vars=['ce_pain_l', 'ce_pain_r'],
    var_name='side',
    value_name='pain',
)

# Blank line, then the row counts before and after unpivoting.
print(f"\nLength of pivoted df: {len(pi)}")
print(f"Length of unpivoted df: {len(pain_df)}")

# Reduce 'ce_pain_l' / 'ce_pain_r' to the bare side letter.
pain_df['side'] = pain_df['side'].str.replace('ce_pain_', '')

display(pain_df.head())


In [None]:
# Patient attributes without the per-leg pain columns, one row per (record_id, visit).
pi2 = (
    pi.drop(columns=['ce_pain_l', 'ce_pain_r'])
      .drop_duplicates(subset=['record_id', 'visit'], keep='first')
)

print(f"PI2 DF shape: {pi2.shape}")

In [None]:
# Attach the per-(record_id, visit) patient attributes to the long-format pain table.
#
# The right-hand frame is rebuilt from `pi` inside this cell instead of being read
# from the current `pi2`. The cell overwrites `pi2`, so reading `pi2` as its own
# input made it non-idempotent: running it a second time merged the already merged
# frame back onto `pain_df`, multiplying rows and producing `side_x`/`pain_x`
# style columns. On a first run the result is identical to the previous behaviour.
print(f"PI DF shape: {pi.shape}")
pi_attributes = (
    pi.drop(columns=['ce_pain_l', 'ce_pain_r'])
      .drop_duplicates(subset=['record_id', 'visit'], keep='first')
)

# validate='many_to_one' makes pandas raise a MergeError if the attribute frame
# ever holds more than one row per (record_id, visit), instead of silently
# duplicating pain rows.
pi2 = pain_df.merge(
    pi_attributes,
    on=['record_id', 'visit'],
    how='left',
    validate='many_to_one',
)

print()
print(f"PI DF shape after merging pain data: {pi2.shape}")

In [None]:
# Show the first rows of the merged long-format frame as a visual check of the merge.
display(pi2.head())

In [None]:
# Count rows per record_id. With 2 visits and 2 leg sides a record contributes at
# most 4 rows to the long-format frame, so a higher count signals duplicated records.
#
# rename_axis(...) plus reset_index(name=...) pins the resulting columns to
# ['record_id', 'count'] on every pandas version. A bare value_counts().reset_index()
# yields ['index', 'record_id'] before pandas 2.0, which makes the 'count' lookup
# below fail with a KeyError.
count_recordid = (
    pi2['record_id']
    .value_counts()
    .rename_axis('record_id')
    .reset_index(name='count')
)

print("These are the record ids with too many rows:")
display(count_recordid[count_recordid['count'] > 4])

In [None]:
# Highest reported pain value across both legs and visits (displayed as the cell output).
pain_df['pain'].max()

*9 bins, because highest score is 9 and it is an int value.*

In [None]:
# Overall pain distribution (both legs pooled), shown as a percentage per bin.
# NOTE(review): if the pain scale starts at 0 and reaches 9 there are 10 distinct
# integer values, so 9 bins would put two of them in one bin — confirm the minimum.
plot_hist(
    pain_df,
    column='pain',
    y_label='Percentage',
    stat='percent',
    kde=True,
    bins=9,
)

In [None]:
# Same pain histogram, split by gender with the bars drawn side by side.
plot_hist(
    pi2,
    column='pain',
    y_label='Percentage',
    stat='percent',
    hue='gender',
    multiple='dodge',
    kde=True,
    bins=9,
)

In [None]:
# Preview the first rows of the long-format frame before it is written to disk.
pi2.head()

In [None]:
# Write both frames to the date-stamped output folder, without the index column.
#TODO: save cleaned pi version
#TODO: save cleaned unpivoted pi version
for frame, file_name in (
    (pi, "inmodi_data_personalinformation.csv"),
    (pi2, "inmodi_data_personalinformation_unpivoted.csv"),
):
    frame.to_csv(os.path.join(save_dir, file_name), index=False)

# Correlation Analysis

## Heatmap of Correlation

**Pearson Correlation Coefficient**

What it measures:
- The strength and direction of a linear relationship between two continuous variables.

Key Characteristics:
- Sensitive to linear relationships only.
- Values range from -1 to +1.
    - +1: Perfect positive linear relationship
    - 0: No linear relationship
    - -1: Perfect negative linear relationship
- Assumes both variables are normally distributed.
- Affected by outliers.

Use Case:
When you suspect or want to test for a straight-line relationship.

**Spearman Rank Correlation Coefficient**

What it measures:
The strength and direction of a monotonic relationship (doesn't have to be linear) by comparing ranks of the data.

Key Characteristics:
- Non-parametric (does not assume normality).
- Converts raw data to ranks, then applies Pearson's formula to the ranks.
- Captures any monotonic relationship (e.g., curved but consistently increasing or decreasing).
- Less sensitive to outliers than Pearson.

Use Case:
- When the relationship is non-linear but consistently increasing or decreasing.
- Data contains outliers or isn't normally distributed.

**Kendall Tau Correlation Coefficient**

What it measures:
The strength and direction of a monotonic relationship based on the number of concordant and discordant pairs.

Key Characteristics:
- Also non-parametric.
- Compares all possible pairs of observations:
    - Concordant pair: Both values increase or decrease together.
    - Discordant pair: One increases while the other decreases.
- More robust to small sample sizes than Spearman.
- Slightly more conservative (produces smaller absolute values) than Spearman.

Use Case:
- Small datasets with ordinal or continuous data.
- When you want a measure based on the ordering of pairs rather than ranks.

In [None]:
# Numeric columns that enter the correlation matrices.
columns_corr = [
    'pain', 'age', 'ce_bmi', 'ce_fm',
    'COMI_score', 'OKS_score', 'UCLA_score', 'FJS_score',
    'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport', 'KOOS_adl', 'KOOS_qol',
]
corr_types = ['pearson', 'kendall', 'spearman']

# Same styling for every heatmap: annotated cells, two decimals, square tiles.
heatmap_style = dict(annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar_kws={"shrink": .8})

# One annotated heatmap per correlation method.
for method in corr_types:
    print(f"Calculating {method} correlation...")

    pi2_corr = pi2[columns_corr].corr(method=method)
    plt.figure(figsize=(12, 8))
    sns.heatmap(pi2_corr, **heatmap_style)
    plt.title(f"{method.capitalize()} Correlation Heatmap")
    plt.show()

## Pairplot

### Personal Information w/ respect to gender

In [None]:
# Pairwise scatter/distribution grid of the basic patient attributes, coloured by gender.
col_cat = ['gender']
col_num = ['pain', 'age', 'ce_bmi', 'ce_fm']
cols = [*col_cat, *col_num]

pairplot_input = pi2[cols]
sns.pairplot(pairplot_input, hue='gender')
plt.savefig(os.path.join(save_dir, "pairplot_personalinformation.png"), bbox_inches='tight')
plt.show()

### Scores w/ respect to gender

In [None]:
# Aggregated questionnaire scores that are plotted against the numeric attributes.
score_values = [
    'COMI_score', 'OKS_score', 'UCLA_score', 'FJS_score',
    'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport', 'KOOS_adl', 'KOOS_qol',
]

# For every score, call the project scatterplot helper with the columns in
# `col_num` as x variables, coloured by gender; figures are saved via `savepath`.
for score in score_values:
    scatterplot(
        pi2,
        x_list=col_num,
        y=score,
        hue='gender',
        figsize=(6, 6),
        savepath=save_dir,
    )

### Personal Information w/respect to pain

In [None]:
# List the distinct pain values (including NaN, if present) before pain is used as a hue.
pi2['pain'].unique()

In [None]:
# Copy of pi2 restricted to rows with a pain value; pain is then converted
# float -> int -> str so that it can serve as a discrete hue.
pi3 = (
    pi2.dropna(subset=['pain'], axis=0)
       .assign(pain=lambda frame: frame['pain'].astype(int).astype('str'))
)

In [None]:
# Pairwise grid of the numeric patient attributes, coloured by pain level.
col_num = ['age',
       'ce_bmi', 'ce_fm']
cols = col_num

# Pain levels are stored as digit strings in pi3; sort them numerically so the
# legend reads in ascending order instead of order of first appearance.
pain_levels = sorted(pi3['pain'].unique(), key=int)

sns.pairplot(pi3, vars=cols, hue='pain', hue_order=pain_levels, plot_kws={'alpha': 0.5, 's': 30})
# Saved under its own file name: this cell previously wrote to
# "pairplot_personalinformation.png", the same file as the gender-coloured
# pairplot, and so silently overwrote that figure.
plt.savefig(os.path.join(save_dir, "pairplot_personalinformation_by_pain.png"), bbox_inches='tight')
plt.show()

### Scores w/ respect to pain

In [None]:
# For every questionnaire score, call the project scatterplot helper with the
# columns in `col_num` as x variables, coloured by the discrete pain level.
for score in score_values:
    scatterplot(
        pi3,
        x_list=col_num,
        y=score,
        hue='pain',
        figsize=(6, 6),
        savepath=save_dir,
    )