In [1]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import datetime
from utils.data_exploration_utils import drop_unnamedcolumn,  investigate_data

In [2]:
# Date stamp; used further down to name the exported CSV.
today = datetime.date.today()

# Directory and file name of the raw MRI export to read.
base_dir = config.RAW_DATA_PATH
filename = "2025-09-25_mri.csv"

# Read the export into a DataFrame and preview the first rows.
mri_path = os.path.join(base_dir, filename)
mri = pd.read_csv(mri_path)
display(mri.head())

Unnamed: 0,record_id,redcap_event_name,redcap_repeat_instrument,redcap_repeat_instance,redcap_survey_identifier,mri_operator,mri_date,mri_eval_date,mri_side,mri_show_instr,...,mri_periarticular___2,mri_periarticular___3,mri_periarticular___4,mri_periarticular___5,mri_periarticular___6,mri_periarticular___7,mri_periarticular___8,mri_periarticular___9,mri_remarks,mri_complete
0,IM0001,first_visit_arm_1,,,,,,,,,...,,,,,,,,,,
1,IM0001,second_visit_arm_1,,,,,,,,,...,,,,,,,,,,
2,IM0001,first_visit_arm_1,mri,1.0,,fisc,2023-01-30,2023-11-05,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tendinosis quad tendon,2.0
3,IM0001,first_visit_arm_1,mri,2.0,,rosand,2023-01-30,2023-10-23,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,2.0
4,IM0001,second_visit_arm_1,mri,1.0,,fisc,2023-11-17,2024-05-06,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2.0


In [3]:
# Show the column labels of the loaded table (pandas abbreviates long lists with '...').
mri.columns

Index(['record_id', 'redcap_event_name', 'redcap_repeat_instrument',
       'redcap_repeat_instance', 'redcap_survey_identifier', 'mri_operator',
       'mri_date', 'mri_eval_date', 'mri_side', 'mri_show_instr',
       ...
       'mri_periarticular___2', 'mri_periarticular___3',
       'mri_periarticular___4', 'mri_periarticular___5',
       'mri_periarticular___6', 'mri_periarticular___7',
       'mri_periarticular___8', 'mri_periarticular___9', 'mri_remarks',
       'mri_complete'],
      dtype='object', length=225)

'mri_bml_yn': bool, bone marrow lesions (BML)

'mri_cart_yn': bool, visible cartilage loss

'mri_osteo_yn': bool, presence of osteophytes

'mri_syn_yn': bool, Hoffa's synovitis (inflammation of the infrapatellar fat pad) and/or effusion synovitis (excess fluid in the knee joint due to inflammation of the synovial membrane / joint swelling), 0 = all normal / 1 = pathologic findings

'mri_mnsc_yn': bool, changes in position of meniscus

('mri_lig_yn': bool, tears of ligaments

'mri_lig_acl_repair': bool, ACL repair)

In [8]:
# Columns kept for the reduced MRI table.
cols = [
    # row identification
    'record_id', 'redcap_event_name',
    # reading metadata
    'mri_operator', 'mri_side',
    # yes/no findings
    'mri_bml_yn', 'mri_cart_yn', 'mri_osteo_yn', 'mri_syn_yn', 'mri_mnsc_yn',
    'mri_lig_yn', 'mri_lig_acl_repair',
]

In [9]:
# Reduce the table to the columns listed in `cols` (all rows kept).
mrismall = mri.loc[:, cols]

In [21]:
# Drop rows that carry no finding at all, i.e. rows where every column in
# `finding_cols` is NaN. The earlier, wider filters (which additionally listed
# 'mri_operator', 'mri_side' and 'mri_bml_yn') were redundant: any row that is
# all-NaN over the wider set is also all-NaN over this one, so this single pass
# removes exactly the same rows.
# NOTE(review): 'mri_bml_yn' is not part of the subset, so a row whose only
# filled finding is 'mri_bml_yn' is dropped — confirm that this is intended.
finding_cols = ['mri_cart_yn', 'mri_osteo_yn', 'mri_syn_yn', 'mri_mnsc_yn', 'mri_lig_yn', 'mri_lig_acl_repair']

# Lookup tables for the recoded columns; values not listed here become NaN.
visit_by_event = {'first_visit_arm_1': 1, 'second_visit_arm_1': 2}
side_by_code = {1: 'right', 2: 'left'}

mrismall = (
    mrismall
    .dropna(subset=finding_cols, how='all')
    .assign(
        # visit number derived from the event name
        visit=lambda df: df['redcap_event_name'].map(visit_by_event),
        # knee side as text derived from the numeric 'mri_side' code
        side=lambda df: df['mri_side'].map(side_by_code),
    )
)

In [22]:
# Composite row key "<record_id>_<visit>_<side>", e.g. "IM0001_1_right".
# 'visit' is cast to the nullable integer dtype before being turned into text:
# if a single event name was not recognised, the column contains a missing value
# and is stored as float64, and a plain astype(str) would then render every
# visit as "1.0"/"2.0" and silently change all ids. With 'Int64' the valid rows
# keep the "1"/"2" form and a missing visit shows up as "<NA>".
visit_label = mrismall['visit'].astype('Int64').astype(str)
mrismall['id'] = mrismall['record_id'].astype(str) + '_' + visit_label + '_' + mrismall['side'].astype(str)

In [23]:
# Run the project helper on the reduced table, keyed on the composite 'id'.
# Judging from the output below, it reports missing values per column and checks
# for duplicate ids. The helper lives in utils.data_exploration_utils and is not
# visible here, so what it returns into `mrismall_nan` is an assumption
# (presumably the rows containing NaN) — TODO confirm.
mrismall_nan = investigate_data(mrismall, id_col='id')

Column 'mri_lig_acl_repair' has 386 missing values (53 out of 439 non-null).

Columns with NaN values:  ['mri_lig_acl_repair']

NaN values per column:
mri_lig_acl_repair    386
dtype: int64


Unnamed: 0,record_id,redcap_event_name,mri_operator,mri_side,mri_bml_yn,mri_cart_yn,mri_osteo_yn,mri_syn_yn,mri_mnsc_yn,mri_lig_yn,mri_lig_acl_repair,visit,side,id
2,IM0001,first_visit_arm_1,fisc,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,1,right,IM0001_1_right
3,IM0001,first_visit_arm_1,rosand,2.0,0.0,1.0,0.0,0.0,0.0,0.0,,1,left,IM0001_1_left
4,IM0001,second_visit_arm_1,fisc,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,2,right,IM0001_2_right
5,IM0001,second_visit_arm_1,rosand,2.0,0.0,1.0,0.0,0.0,0.0,0.0,,2,left,IM0001_2_left
8,IM0002,first_visit_arm_1,fisc,1.0,0.0,0.0,0.0,0.0,1.0,0.0,,1,right,IM0002_1_right
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,IM3022,first_visit_arm_1,rosand,2.0,1.0,1.0,1.0,1.0,1.0,0.0,,1,left,IM3022_1_left
710,IM3022,second_visit_arm_1,rosand,2.0,1.0,1.0,1.0,1.0,1.0,0.0,,2,left,IM3022_2_left
713,IM3023,first_visit_arm_1,fisc,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1,right,IM3023_1_right
714,IM3023,first_visit_arm_1,rosand,2.0,1.0,1.0,1.0,1.0,1.0,0.0,,1,left,IM3023_1_left



No duplicate rows based on id.


In [24]:
# 'mri_lig_acl_repair' is missing in 386 of the rows (see the report above), so
# the column is removed. The result is re-assigned instead of dropped in place,
# and errors='ignore' is passed, so that running this cell a second time does not
# raise a KeyError once the column is already gone.
mrismall = mrismall.drop(columns=['mri_lig_acl_repair'], errors='ignore')

In [25]:
# Write the reduced table to "<today>_mrismall.csv" inside `base_dir`; the
# DataFrame index is not written to the file.
out_path = os.path.join(base_dir, f"{today}_mrismall.csv")
mrismall.to_csv(out_path, index=False)