In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Image
import datetime

from utils.data_exploration_utils import *

In [None]:
# Date on which the notebook is executed (not referenced by the cells below).
today = datetime.date.today()

# Data locations, all taken from the project-level `config` module.
base_dir, proc_dir, img_dir = (
    config.RAW_DATA_PATH,
    config.PROC_DATA_PATH,
    config.SCHULTHESS_DATAPATH,
)

# Folder (inside the processed-data directory) that this notebook reads its
# input from and writes its result back to.
folder_name = "2025-07-03_data_exploration"
save_dir = os.path.join(proc_dir, folder_name)

# kl: table of names with missing KL scores, read from the exploration folder.
kl_csv = os.path.join(save_dir, 'missing_kl_scores.csv')
kl = pd.read_csv(kl_csv)

# df: the raw REDCap export, read from the raw-data directory.
redcap_csv = os.path.join(base_dir, '241027_RedcapData.csv')
df = pd.read_csv(redcap_csv)

In [None]:
# Remove the unnamed column (presumably an index column saved by an earlier
# export -- TODO confirm). errors='ignore' keeps this cell re-runnable: `df`
# is overwritten here, so on a second run the column is already gone and a
# plain drop would raise KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Reshape wide -> long: one row per (id, reader, side) carrying a single KL
# score, instead of separate left/right columns.
dfu = df.melt(
    id_vars=['id', 'rx_reader'],
    value_vars=['rx_ap_kl_left', 'rx_ap_kl_right'],
    var_name='side',
    value_name='kl',
)

# 'rx_ap_kl_left' -> 'left', 'rx_ap_kl_right' -> 'right'
dfu['side'] = dfu['side'].str.split('_').str[-1]

# Key used to match against the `kl` table: '<id>_<side>'.
dfu['name'] = dfu['id'] + '_' + dfu['side']

display(dfu.head())

## Check for missing values

In [None]:
# `missing_from_df` is not defined in this notebook; presumably it comes from
# the star import of utils.data_exploration_utils. Its contract is not visible
# here -- it appears to compare dfu['name'] with kl['name'] and return two
# results, of which only the first is kept. TODO confirm against the helper.
missing_from_kl, _ = missing_from_df(dfu, kl, 'name', 'name')

In [None]:
# Show the entries reported as missing, ordered by record.
# NOTE(review): `dfu` has no 'record_id' column in this notebook, so this
# assumes the helper's result provides one -- confirm.
missing_from_kl.sort_values(by='record_id')

## Get DataFrame with Missing KL scores

In [None]:
# Unique names listed in the `kl` table.
missing_names = list(set(kl['name']))

# Keep only the long-format rows whose name appears in that list.
name_is_listed = dfu['name'].isin(missing_names)
dfu_filtered = dfu[name_is_listed]

In [None]:
# Inspect the filtered rows ordered by id (display only, nothing is stored).
dfu_filtered.sort_values('id')

### Duplicated Rows

In [None]:
# Flag every row that has an exact twin: same id, side, name AND KL score.
# keep=False marks all members of a duplicate group, not just the repeats.
exact_dup_cols = ['id', 'side', 'kl', 'name']
duplicate_kl = dfu_filtered.duplicated(subset=exact_dup_cols, keep=False)

display(dfu_filtered.loc[duplicate_kl])

In [None]:
# Collapse exact duplicates (same id, side, name and KL score) to their first
# occurrence; 'first' is the default `keep` policy of drop_duplicates.
dfu_filtered = dfu_filtered.drop_duplicates(['id', 'side', 'kl', 'name'])

Exact duplicates (same id, side and KL value) have now been removed. Next we check for entries that still appear more than once, i.e. the same id and side with differing KL values.

In [None]:
# Rows that share id/side/name with another row. Exact duplicates are already
# gone, so whatever remains here differs in another column (e.g. the KL score).
shares_key = dfu_filtered.duplicated(subset=['id', 'side', 'name'], keep=False)
display(dfu_filtered.loc[shares_key])

In [None]:
def find_png_file(root_dir, filename):
    """Search `root_dir` recursively for a file called `filename`.

    The directory tree is traversed with os.walk and the search stops at the
    first directory that contains the file.

    Args:
        root_dir: Directory at which the recursive search starts.
        filename: Exact file name (not a path or pattern) to look for.

    Returns:
        The full path of the first match, or None when no directory below
        `root_dir` contains the file. The outcome is also printed.
    """
    # Lazy generator: os.walk is only advanced until the first hit.
    first_hit = next(
        (
            os.path.join(folder, filename)
            for folder, _subdirs, files in os.walk(root_dir)
            if filename in files
        ),
        None,
    )

    if first_hit is None:
        print("File not found.")
    else:
        print(f"Found: {first_hit}")
    return first_hit

In [None]:
# Names ('<id>_<side>') that still occur more than once after the exact
# duplicates were dropped.
shares_key = dfu_filtered.duplicated(subset=['id', 'side', 'name'], keep=False)
tbc = list(dfu_filtered.loc[shares_key, 'name'])

# Try to show the image belonging to each of these names so they can be
# checked by eye.
for name in tbc:
    file_path = find_png_file(img_dir, name + '.png')
    print(file_path)

    # find_png_file returns None when nothing was found; handle that case
    # explicitly instead of relying on a bare `except`, which would also hide
    # unrelated errors (and even KeyboardInterrupt).
    if file_path is None:
        print(f'Image for {name} does not exist!')
        continue

    try:
        display(Image(filename=file_path))
    except (OSError, ValueError) as err:
        # The file exists but could not be read or rendered.
        print(f'Image for {name} could not be displayed: {err}')

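None of these entries appears to have an image, so the conflicting scores cannot be checked against the radiograph. We therefore keep the entries, but need to keep this in mind going forward. For each of them we will take the higher KL score.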

In [None]:
# Rows whose (id, side, name) key occurs more than once.
key_cols = ['id', 'side', 'name']
has_conflict = dfu_filtered.duplicated(subset=key_cols, keep=False)
duplicates = dfu_filtered[has_conflict]

# Within each conflicting key keep the row with the highest KL score:
# sort descending by 'kl', then keep the first row per key.
best_rows = (
    duplicates
    .sort_values('kl', ascending=False)
    .drop_duplicates(subset=key_cols, keep='first')
)
display(best_rows)

# Everything that never had a conflict is carried over unchanged.
df_no_dupes = dfu_filtered[~has_conflict]

# Final table = unambiguous rows + the chosen row of each conflicting key.
df_final = pd.concat([df_no_dupes, best_rows], ignore_index=True)
display(df_final.sort_values(by='id'))

In [None]:
# dfu_filtered['record_id'] = dfu_filtered['id'].str.split('_').str[0]
# tbc = dfu_filtered[dfu_filtered.duplicated(subset=['id', 'side', 'name'], keep=False)]
# #tbc['record_id'] = tbc['id'].str.split('_').str[0]
# tbc_side = list(tbc['side'])
# tbc = list(tbc['record_id'])

# for i in range(len(tbc)):
#     # print(tbc[i])
#     # print(tbc[i] + '_' + '1' + '_' + tbc_side[i] + '.png')
#     file_path = find_png_file(img_dir, tbc[i] + '_' + '2' + '_' + tbc_side[i] + '.png')

#     #print(file_path)
#     try:
#         #display(Image(filename=file_path))
#         img = plt.imread(file_path)
#         plt.imshow(img, 'gray')
#         plt.axis('off')
#         plt.show()
#         display(dfu_filtered[dfu_filtered['record_id']==tbc[i]])
#     except:
#         print(f'Image for {i} does not exist!')

### NaN Values

In [None]:
# Rows of the final table that have no KL score at all.
kl_is_missing = df_final['kl'].isna()
df_final.loc[kl_is_missing]

In [None]:
# ids of the rows that have no KL score.
tbc = list(df_final.loc[df_final['kl'].isna(), 'id'])

# Visit numbers that are tried when building the image file name.
visit = ['1', '2']

for record_id in tbc:
    for visit_no in visit:
        # NOTE(review): only the 'left' image is looked up, regardless of the
        # row's own side -- confirm that this is intended.
        image_name = record_id + '_' + visit_no + '_' + 'left' + '.png'
        file_path = find_png_file(img_dir, image_name)
        print(file_path)

        # Report the missing image by its file name. The message used to
        # print the integer loop index, which identified nothing.
        if file_path is None:
            print(f'Image for {image_name} does not exist!')
            continue

        # Narrow except instead of a bare one, so unrelated errors (and
        # KeyboardInterrupt) are no longer swallowed.
        try:
            display(Image(filename=file_path))
        except (OSError, ValueError) as err:
            print(f'Image for {image_name} could not be displayed: {err}')

In [None]:
# Discard the rows that have no KL score.
df_final = df_final[df_final['kl'].notna()]

# Final Data

In [None]:
# Final table ordered by name (display only, nothing is stored).
df_final.sort_values('name')

In [None]:
# Write the cleaned table next to its input file, without the index column.
edited_csv = os.path.join(save_dir, 'missing_kl_scores_edited.csv')
df_final.to_csv(edited_csv, index=False)