In [5]:
#!pip install nibabel

In [None]:
import os
import re
from typing import List, Set

import h5py
import nibabel as nib
import numpy as np
import pandas as pd

In [41]:
def get_patient_images_from_id(filenames: Set[str], subject_id: int, last_visit_right=12, last_visit_left: int=12):
    """Return the image filenames that exist for one subject, split by knee.

    Candidate names have the form ``<subject_id>-<Right|Left>-V<visit:02>.nii.gz``
    and are generated for visits ``0 .. last_visit - 1`` (the upper bound is
    exclusive, as in ``range``).

    Args:
        filenames: Collection of available image filenames. Any iterable of
            strings is accepted (e.g. the list returned by ``os.listdir``).
        subject_id: Subject identifier; converted with ``str`` formatting, so
            both ints and strings work.
        last_visit_right: Exclusive upper visit number for the right knee.
        last_visit_left: Exclusive upper visit number for the left knee.

    Returns:
        A ``(right_images, left_images)`` tuple of sorted filename lists
        containing only names that are present in ``filenames``.
    """
    # Use the argument, not a notebook-level global, and make sure set
    # intersection works even when a list is passed in.
    available = set(filenames)

    def _existing(side, last_visit):
        # Candidate names for one knee, restricted to files that exist.
        candidates = {f"{subject_id}-{side}-V{visit:02}.nii.gz" for visit in range(last_visit)}
        # Sorted so the output order is deterministic across runs.
        return sorted(candidates & available)

    return _existing("Right", last_visit_right), _existing("Left", last_visit_left)

# Creating subjects dataset
### Steps:
- Keep only subjects that have images
- Find all subjects with a left or right TKR
- For each TKR knee subject
    - Select candidate controls: subjects without TKR
    - Keep candidates whose BMI is within ±2 of the TKR subject's BMI
    - Keep candidates whose age is within ±4 of the TKR subject's age
    - Take a random sample of the candidates (max 2)
    - If fewer than 4, discard the TKR patient
    - For the TKR patient, take the last 3 visits before TKR
- Exclude subjects without medical insurance
- Keep only subjects with no MRI problems
- Exclude subjects who could not walk 400 m at baseline


In [42]:
# os.listdir does not expand "~", so resolve the home directory up front.
# The trailing slash is kept so string concatenation with `path` keeps working.
path = os.path.expanduser("~/OAIdata21/")
files = os.listdir(path)

# Clinical tables are the text files whose names start with "AllClinical".
reg = re.compile(r'^AllClinical.*txt')
all_clinical = [file for file in files if reg.match(file)]

In [43]:
def _read_pipe_table(filename, index_col):
    """Read one pipe-separated table located under `path`, indexed by `index_col`."""
    return pd.read_csv(path + filename, sep='|', index_col=index_col)


# One frame per clinical file, all indexed by subject ID.
all_clinical_df = list(map(lambda name: _read_pipe_table(name, "ID"), all_clinical))

# The outcomes table names its index "id"; rename it so it lines up with the clinical frames.
outcomes_df = _read_pipe_table("Outcomes99.txt", "id")
outcomes_df = outcomes_df.rename_axis("ID")

# Join everything side by side on the shared index, then expose ID as a regular column.
subjects_df = pd.concat(all_clinical_df + [outcomes_df], axis=1).reset_index()

In [44]:
# Formatting columns and cleaning
# Value that marks "no recorded TKR" in the V99E?KDAYS / V99E?KVSPR columns.
NO_TKR_VALUE = ".: Missing Form/Incomplete Workbook"

df = subjects_df.copy()
print(f"Initial subject count: {df.shape[0]}")

# Keep subjects with medical insurance
df = df[df["V00MEDINS"] == "1: Yes"]
print(f"Subject count with insurance: {df.shape[0]}")

# Keep subjects without MRI problems
df = df[df["P01MRPRBCV"] == "0: No"]
print(f"Subject count without MRI problems: {df.shape[0]}")

# Keep subjects that completed 400m walk
df = df[df["V00400MCMP"].isin(["1: Completed test without stopping", "2: Completed test with one or more rests"])]
print(f"Subject count that could walk 400m: {df.shape[0]}")

# Flag each knee as TKR (1) unless its days-to-TKR field holds the missing marker (0).
# `assign` returns a new frame, which avoids writing into a filtered slice
# (SettingWithCopyWarning).
df = df.assign(**{
    "TKR-R": np.where(df["V99ERKDAYS"] == NO_TKR_VALUE, 0, 1),
    "TKR-L": np.where(df["V99ELKDAYS"] == NO_TKR_VALUE, 0, 1),
})

# Map the "last visit prior to TKR" labels to integer visit numbers; -1 means no value.
last_visit_dict = {
    NO_TKR_VALUE: -1,
    "0: Baseline": 0,
    "1: 12-month": 1,
    "2: 18-month": 2,
    "3: 24-month": 3,
    "4: 30-month": 4,
    "5: 36-month": 5,
    "6: 48-month": 6,
    "7: 60-month": 7,
    "8: 72-month": 8,
    "9: 84-month": 9,
    "10: 96-month": 10,
}
df = df.replace({"V99ERKVSPR": last_visit_dict, "V99ELKVSPR": last_visit_dict})

# Columns: Subject id, Age at baseline, BMI at baseline, last visit before right tkr, last visit before left tkr, right tkr, left tkr
columns = ["ID", "V00AGE", "P01BMI", "V99ERKVSPR", "V99ELKVSPR", "TKR-R", "TKR-L"]
df = df[columns]
print(f"Went from {subjects_df.shape[0]} subjects, to {df.shape[0]} subjects.")

Initial subject count: 4796
Subject count with insurance: 4145
Subject count without MRI problems: 3871
Subject count that could walk 400m: 3655
Went from 4796 subjects, to 3655 subjects.


In [65]:
# Split each subject into left and right knee
# NOTE: this cell overwrites `df` with the per-knee frame, so it must be run
# exactly once after the cleaning cell (re-running it alone will fail).
new_cols = ["subject_id_and_knee", "age", "BMI", "last_visit_before_tkr", "TKR", "is_right"]
shared_renames = {"V00AGE": "age", "P01BMI": "BMI"}

# One frame per side: the side-specific columns are mapped onto common names.
right_df = df.rename(
    columns={**shared_renames, "V99ERKVSPR": "last_visit_before_tkr", "TKR-R": "TKR"}
).assign(is_right=True)
left_df = df.rename(
    columns={**shared_renames, "V99ELKVSPR": "last_visit_before_tkr", "TKR-L": "TKR"}
).assign(is_right=False)

split_df = pd.concat([right_df, left_df], ignore_index=True)

# Build "<ID>-R" / "<ID>-L" keys column-wise instead of with a row-wise apply.
knee_suffix = split_df["is_right"].map({True: "-R", False: "-L"})
split_df["subject_id_and_knee"] = split_df["ID"].astype(str) + knee_suffix

# Keep the key as a regular column (later cells read it by column name);
# selecting `new_cols` also drops the other side's leftover columns.
df = split_df[new_cols]
df.head()


Unnamed: 0,subject_id_and_knee,age,BMI,last_visit_before_tkr,TKR,is_right
0,9000296-R,69,29.8,-1,0,True
1,9000798-R,56,32.4,-1,0,True
2,9001695-R,52,28.6,-1,0,True
3,9001897-R,72,25.9,-1,0,True
4,9002316-R,76,25.1,-1,0,True


In [66]:
# Partition the knees: rows flagged as TKR versus everything else.
is_tkr = df["TKR"] == True
TKR_df = df[is_tkr]
non_TKR_df = df.drop(index=TKR_df.index)

tkr_count, non_tkr_count = TKR_df.shape[0], non_TKR_df.shape[0]
print(f"{tkr_count} TKR samples\n{non_tkr_count} non TKR samples ")

426 TKR samples
6884 non TKR samples 


In [67]:
# Match every TKR knee with up to MAX_CONTROLS_PER_TKR non-TKR knees of
# similar age and BMI. Each control is used at most once.
AGE_TOLERANCE = 4
BMI_TOLERANCE = 2
MAX_CONTROLS_PER_TKR = 2
MATCHING_SEED = 42  # fixed seed so Restart & Run All reproduces the same dataset

rng = np.random.RandomState(MATCHING_SEED)

potential_matches_df = non_TKR_df.copy()
matched_controls = []
exclude_TKR_indices = []

# NOTE(review): candidates are knees without TKR, so the other knee of a TKR
# subject can be drawn as a control - confirm this is intended.
for i, p in TKR_df.iterrows():
    age = p["age"]
    bmi = p["BMI"]

    age_ok = potential_matches_df["age"].between(age - AGE_TOLERANCE, age + AGE_TOLERANCE, inclusive="both")
    bmi_ok = potential_matches_df["BMI"].between(bmi - BMI_TOLERANCE, bmi + BMI_TOLERANCE, inclusive="both")
    candidates_df = potential_matches_df[age_ok & bmi_ok]

    # A TKR knee without any available control is left out of the dataset.
    if candidates_df.empty:
        exclude_TKR_indices.append(i)
        continue

    n = min(candidates_df.shape[0], MAX_CONTROLS_PER_TKR)
    matches = candidates_df.sample(n=n, random_state=rng)

    # Remove the chosen controls from the pool (no in-place mutation).
    potential_matches_df = potential_matches_df.drop(matches.index)
    matched_controls.append(matches)

# Concatenate once: controls first, then the TKR knees that found a match.
dataset_df = pd.concat([*matched_controls, TKR_df.drop(exclude_TKR_indices)], axis=0)

In [68]:
# Preview the first five rows of the matched dataset.
dataset_df.head(5)

Unnamed: 0,age,BMI,last_visit_before_tkr,TKR,is_right,subject_id_and_knee
2958,64,27.2,-1,0,True,9800154-R
6863,65,27.5,-1,0,False,9873103-L
1566,61,21.1,-1,0,True,9460287-R
1550,61,22.4,-1,0,True,9457718-R
31,63,30.7,-1,0,True,9008884-R


In [48]:
# Persist the matched dataset; the DataFrame index is written as the first column.
dataset_df.to_csv("subjects.csv", index=True)

# Get images from subjects

In [54]:
# os.listdir does not expand "~", so resolve the home directory explicitly.
image_dir = os.path.expanduser("~/ucph-erda-home/Osteoarthritis-initiative/NIFTY")
# A set gives fast membership / intersection when looking up candidate names.
image_filenames = set(os.listdir(image_dir))
image_files = []

# Exclusive upper bound used when a knee has no TKR cut-off.
ALL_VISITS = 12

# NOTE(review): every row is one knee, yet both the right and left images of
# the subject are collected for it, so a subject with both knees in the
# dataset contributes duplicates - confirm this is intended.
for subject_idx, subject_info in dataset_df.iterrows():
    last_visit_right = ALL_VISITS
    last_visit_left = ALL_VISITS

    # Strip the trailing "-R" / "-L" to recover the subject id.
    subject_id = subject_info["subject_id_and_knee"][:-2]

    if subject_info["TKR"]:
        # range() inside the helper needs a plain integer.
        cutoff = int(subject_info["last_visit_before_tkr"])
        if subject_info["is_right"]:
            last_visit_right = cutoff
        else:
            last_visit_left = cutoff

    right, left = get_patient_images_from_id(
        image_filenames,
        subject_id,
        last_visit_right=last_visit_right,
        last_visit_left=last_visit_left,
    )

    image_files.extend(right + left)

In [55]:
# Report how many images were collected, then write one filename per line.
print(f"Found {len(image_files)} images")
with open("subject_images.txt", "w") as out:
    out.writelines(name + "\n" for name in image_files)

Found 4925 images


In [78]:
# `subject_id_and_knee` is a regular column, not the index, so index by it
# before the label lookup (a plain .loc on the integer index raises KeyError).
row = dataset_df.set_index("subject_id_and_knee").loc["9800154-R"]
row["age"]

64