In [5]:
#!pip install nibabel

In [6]:
import os
import nibabel as nib
import numpy as np
import pandas as pd
import h5py
import re

In [7]:
def get_patient_images_from_id(filenames, subject_id, last_visit_right=12, last_visit_left=12):
    """Return the image files that exist for one subject, split by knee side.

    Candidate names have the form ``<subject_id>-<Side>-V<visit>.nii.gz`` with a
    two-digit, zero-padded visit number. Only candidates that are present in
    ``filenames`` are returned.

    Parameters
    ----------
    filenames : iterable of str
        All available image file names.
    subject_id : int or str
        Subject identifier; it is converted with ``str()`` to build the names.
    last_visit_right, last_visit_left : int
        Exclusive upper bound on the visit number for each side: visits
        ``0 .. last_visit - 1`` are considered. A bound of 0 or a negative
        bound yields no images for that side.

    Returns
    -------
    (list of str, list of str)
        Right-knee and left-knee file names, each sorted by name. Because the
        visit number is zero-padded, this is also ascending visit order.
    """
    available = set(filenames)
    prefix = str(subject_id)

    def _existing(side, last_visit):
        # Build every candidate name for this side, keep the ones that exist.
        candidates = {f"{prefix}-{side}-V{visit:02}.nii.gz" for visit in range(last_visit)}
        # Sort: a bare set intersection has no stable order between runs,
        # which would make anything written from this list non-reproducible.
        return sorted(candidates & available)

    return _existing("Right", last_visit_right), _existing("Left", last_visit_left)

# Creating subjects dataset
### Steps:
- Keep only subjects that have images
- Find all subjects with a left or right TKR
- For each TKR subject
    - Select candidate controls among the subjects without TKR
    - Keep candidates whose BMI is within ±2 of the TKR subject's BMI
    - Keep candidates whose age is within ±4 of the TKR subject's age
    - Take a random sample (max 2)
    - If fewer than 4, discard the TKR patient
    - For the TKR patient, take the last 3 visits before TKR
- Exclude all subjects without medical insurance
- Keep only subjects with no MRI problems
- Exclude all subjects who could not walk 400m at baseline


In [8]:
# Directory holding the OAI text exports; the trailing slash matters because
# file names are appended to this string directly when the tables are read.
path = "../OAIdata21/"

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# the discovered tables (and of anything built from them) stable across runs.
files = sorted(os.listdir(path))

# Match "AllClinical*.txt" only: the dot is escaped and the end is anchored so
# that names such as "AllClinical00.txt.bak" or "AllClinical_txt" are ignored.
reg = re.compile(r'^AllClinical.*\.txt$')
all_clinical = [file for file in files if reg.match(file)]

In [9]:
# Read every discovered AllClinical table; each one is '|'-separated and
# indexed by its "ID" column.
all_clinical_df = []
for file in all_clinical:
    all_clinical_df.append(pd.read_csv(f"{path}{file}", sep='|', index_col="ID"))

# The outcomes table names its key column "id"; rename the index to "ID" so it
# carries the same index name as the clinical tables.
outcomes_df = pd.read_csv(f"{path}Outcomes99.txt", sep='|', index_col="id")
outcomes_df.index.name = 'ID'

# Place all tables side by side, aligned on the shared index.
subjects_df = pd.concat(all_clinical_df + [outcomes_df], axis=1)

In [10]:
# Formatting columns and cleaning.
# Work on a copy so the merged table stays available unchanged.
df = subjects_df.copy()
print(f"Initial subject count: {df.shape[0]}")

# Keep subjects with medical insurance
has_insurance = df["V00MEDINS"] == "1: Yes"
df = df[has_insurance]
print(f"Subject count with insurance: {df.shape[0]}")

# Keep subjects without MRI problems
no_mri_problem = df["P01MRPRBCV"] == "0: No"
df = df[no_mri_problem]
print(f"Subject count without MRI problems: {df.shape[0]}")

# Keep subjects that completed 400m walk
completed_labels = [
    "1: Completed test without stopping",
    "2: Completed test with one or more rests",
]
completed_walk = df["V00400MCMP"].isin(completed_labels)
df = df[completed_walk]
print(f"Subject count that could walk 400m: {df.shape[0]}")

# Flag a knee as replaced (1) whenever its "days" column holds anything other
# than the missing-form label. Note this includes NaN, should any be present.
missing_label = ".: Missing Form/Incomplete Workbook"
for side, days_column in (("Right", "V99ERKDAYS"), ("Left", "V99ELKDAYS")):
    df[f"{side} TKR"] = np.where(df[days_column] != missing_label, 1, 0)
df["TKR"] = df[["Right TKR", "Left TKR"]].any(axis=1)

# Map the visit labels to their integer visit codes; the missing-form label
# becomes -1.
last_visit_dict = {
    ".: Missing Form/Incomplete Workbook": -1,
    "0: Baseline": 0,
    "1: 12-month": 1,
    "2: 18-month": 2,
    "3: 24-month": 3,
    "4: 30-month": 4, 
    "5: 36-month": 5,
    "6: 48-month": 6,
    "7: 60-month": 7,
    "8: 72-month": 8,
    "9: 84-month": 9,
    "10: 96-month": 10,
}
df = df.replace(dict.fromkeys(["V99ERKVSPR", "V99ELKVSPR"], last_visit_dict))

# Columns carried forward into the matched dataset.
columns = ["V00AGE", "P01BMI", "V99ERKVSPR", "V99ELKVSPR", "Right TKR", "Left TKR", "TKR"]

print(f"Went from {subjects_df.shape[0]} subjects, to {df.shape[0]} subjects.")

Initial subject count: 4796
Subject count with insurance: 4145
Subject count without MRI problems: 3871
Subject count that could walk 400m: 3655
Went from 4796 subjects, to 3655 subjects.


In [11]:
# Split the cleaned subjects into those with a knee replacement and those
# without, keeping only the columns needed for matching.
is_tkr = df["TKR"] == True
TKR_df = df.loc[is_tkr, columns]
non_TKR_df = df.loc[df["TKR"] == False, columns]

print(f"{len(TKR_df)} TKR subjects\n{len(non_TKR_df)} non TKR subjects ")

336 TKR subjects
3319 non TKR subjects 


In [12]:
# Match every TKR subject with up to two non-TKR controls of similar age
# (within 4) and BMI (within 2). Each control is used at most once.
potential_matches_df = non_TKR_df.copy()
exclude_TKR_indices = []
matched_controls = []

# Seeded random state: without it the sampled controls, and every file written
# from them, change on each run. One shared state object is used so successive
# draws differ from each other while the whole sequence stays reproducible.
rng = np.random.RandomState(42)

for i, p in TKR_df.iterrows():
    age = p["V00AGE"]
    bmi = p["P01BMI"]

    age_ok = potential_matches_df["V00AGE"].between(age - 4, age + 4, inclusive="both")
    bmi_ok = potential_matches_df["P01BMI"].between(bmi - 2, bmi + 2, inclusive="both")
    candidates_df = potential_matches_df[age_ok & bmi_ok]

    n = min(candidates_df.shape[0], 2)
    if n == 0:
        # No control left in range: this TKR subject is dropped from the dataset.
        exclude_TKR_indices.append(i)
        continue

    matches = candidates_df.sample(n=n, random_state=rng)
    # Remove the chosen controls from the pool so they cannot be reused.
    potential_matches_df = potential_matches_df.drop(matches.index)
    matched_controls.append(matches)

# Concatenate once (controls first, then the retained TKR subjects) instead of
# growing a frame inside the loop, which is quadratic and starts from an empty
# frame that newer pandas versions warn about.
dataset_df = pd.concat([*matched_controls, TKR_df.drop(exclude_TKR_indices)], axis=0)
dataset_df.index.name = "ID"

In [13]:
# Count how many rows of the matched dataset are TKR (True) vs. non-TKR (False).
dataset_df.loc[:, "TKR"].value_counts()

False    670
True     335
Name: TKR, dtype: int64

In [14]:
# Write the matched dataset to disk (relative to the working directory).
dataset_df.to_csv(path_or_buf="subjects.csv")

# Get images from subjects

In [15]:
# All available image file names, read as strings from the listing file.
image_filenames = list(np.loadtxt("oai-files.txt", dtype=str))
image_files = []

for subject_id, subject_info in dataset_df.iterrows():
    # Per-side visit limit: 12 when that knee has no TKR, otherwise the visit
    # code stored in the corresponding V99E?KVSPR column.
    # NOTE(review): the limit is passed on as an exclusive bound, so the visit
    # named in V99E?KVSPR is itself not included. Confirm this is intended.
    right_limit = subject_info["V99ERKVSPR"] if subject_info["Right TKR"] == 1 else 12
    left_limit = subject_info["V99ELKVSPR"] if subject_info["Left TKR"] == 1 else 12

    right, left = get_patient_images_from_id(
        image_filenames,
        subject_id,
        last_visit_right=right_limit,
        last_visit_left=left_limit,
    )

    # Right-knee images first, then left-knee images, for each subject.
    image_files.extend(right)
    image_files.extend(left)

In [16]:
# Report how many images were collected and write one file name per line.
print(f"Found {len(image_files)} images")
with open("subject_images.txt", "w") as f:
    f.writelines(file + "\n" for file in image_files)

Found 3803 images


In [17]:
# Preview the first rows of the TKR subjects in the matched dataset.
tkr_rows = dataset_df[dataset_df["TKR"]]
tkr_rows.head()

Unnamed: 0_level_0,V00AGE,P01BMI,V99ERKVSPR,V99ELKVSPR,Right TKR,Left TKR,TKR
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9011918,62,23.1,-1,1,0,1,True
9014797,67,26.4,9,-1,1,0,True
9014883,74,31.1,-1,6,0,1,True
9015363,64,23.0,5,-1,1,0,True
9028418,64,29.8,5,-1,1,0,True
