In [6]:
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import numpy as np
import pandas as pd
from os.path import join
# from preprocessing_ukbb.Utils import check_or_save

# Directory holding the raw tabular export; 'col67.txt' is read from it further down.
# NOTE(review): the value looks like a placeholder - point it at a real directory before running.
TABULAR_BASE = "Raw tabular data you have"
# Output directory that receives the generated labels_<target>.csv files.
DATAFILE_BASE = "./datasets/data_files/tabular_files"
# CSV file of cleaned tabular features; loaded with pd.read_csv into data_df.
# NOTE(review): the value looks like a placeholder - point it at a real CSV file before running.
CLEANED_FEATURES_PATH = "Clean tabular data"

In [None]:
from pathlib import Path
import pickle
from sklearn.model_selection import train_test_split

# Root directory with one sub-folder per subject id (used to build image paths later).
dir_path = Path("imaging data path")

# Subject ids that have an image of the required size.
# NOTE(review): pickle.load can execute arbitrary code - only load files you produced yourself.
with open("datasets/data_files/image_files/subj_ids_with_required_size.pkl", "rb") as f:
    all_image_ids = pickle.load(f)

# Cleaned tabular features, one row per subject ('eid').
data_df = pd.read_csv(CLEANED_FEATURES_PATH)

# Date each subject attended the imaging centre (raw field '53-2.0'), renamed to a readable column.
date_attended_imaging = pd.read_csv(join(TABULAR_BASE, 'col67.txt')).rename(
    columns={'53-2.0': 'Date of attending imaging centre-2.0'}
)

# Later cells look up data_df_extended using row labels taken from data_df, so the
# merge must keep exactly one output row per data_df row, in the same order.
# validate='many_to_one' rejects duplicated eids in the dates file; together with the
# length check this rules out the "one row dropped, another duplicated" case that a
# bare length comparison would let through.
data_df_extended = data_df.merge(
    date_attended_imaging, on='eid', how='inner', validate='many_to_one'
)
assert len(data_df_extended) == len(data_df), (
    "some eids in the cleaned features have no imaging-centre date"
)

# ICD-10 codes whose presence in any diagnosis column marks a subject as positive
# for the given target. Iteration order matches the order labels/splits are written.
icd10_codes_by_target = {
    'CAD': ['I200', 'I201', 'I208', 'I209', 'I220', 'I221', 'I228', 'I229', 'I210', 'I211', 'I212', 'I213', 'I214',
            'I219', 'I240', 'I248', 'I249', 'I250', 'I251', 'I252', 'I253', 'I254', 'I255', 'I256', 'I258', 'I259'],
    'Stroke': ['I630', 'I631', 'I632', 'I633', 'I634', 'I635', 'I636', 'I638', 'I639'],
    'Hypertension': ['I10', 'I110', 'I119', 'I120', 'I129', 'I130', 'I131', 'I132', 'I139', 'I150', 'I151', 'I152',
                     'I158', 'I159'],
    'Infarct': ['I210', 'I211', 'I212', 'I213', 'I214', 'I219', 'I252'],  # change according to the paper
    'Diabetes': ['E100', 'E101', 'E102', 'E103', 'E104', 'E105', 'E106', 'E107', 'E108', 'E109',
                 'E110', 'E111', 'E112', 'E113', 'E114', 'E115', 'E116', 'E117', 'E118', 'E119',
                 'E121', 'E123', 'E125', 'E128', 'E129',
                 'E130', 'E131', 'E132', 'E133', 'E134', 'E135', 'E136', 'E137', 'E138', 'E139',
                 'E140', 'E141', 'E142', 'E143', 'E144', 'E145', 'E146', 'E147', 'E148', 'E149'],
}

# Loop-invariant settings.
array_length = 243  # diagnosis columns '<diag_name>0' .. '<diag_name>242' are scanned
diag_name = 'Diagnoses - ICD10-0.'
date_name = 'Date of first in-patient diagnosis - ICD10-0.'
imaging_date_col = 'Date of attending imaging centre-2.0'
# Which diagnoses count as positive relative to the imaging visit:
# 'future' (diagnosed after imaging), 'past' (diagnosed before), anything else = all.
time_window = 'all'

for target_name, target in icd10_codes_by_target.items():
    label_col = f"Diagnosed_{target_name}"

    # Collect every (row label, eid, diagnosis date) where a diagnosis column holds a target code.
    all_target_dates = []
    all_target_indices = []
    all_target_ids = []
    for i in range(array_length):
        matched = data_df[data_df[f'{diag_name}{i}'].isin(target)]  # filter once per column
        all_target_dates.extend(matched[f'{date_name}{i}'].tolist())
        all_target_indices.extend(matched.index.tolist())
        all_target_ids.extend(matched['eid'].tolist())

    # Imaging date for each hit; data_df_extended is looked up by data_df row label.
    date_attending_centre = (
        data_df_extended.loc[all_target_indices, imaging_date_col]
        .reset_index(drop=True)
        .astype('datetime64[ns]')
    )

    target_df = pd.DataFrame({
        'eid': all_target_ids,
        'target date': all_target_dates,
        'imaging date': date_attending_centre,
    })

    # Select the eids that count as positive for the chosen time window.
    if time_window == 'future':
        # Diagnosis dates come straight from read_csv, so parse them before comparing.
        target_ids = target_df.loc[pd.to_datetime(target_df['target date']) > target_df['imaging date'], 'eid']
    elif time_window == 'past':
        target_ids = target_df.loc[pd.to_datetime(target_df['target date']) < target_df['imaging date'], 'eid']
    else:
        target_ids = target_df['eid']

    # One 0/1 label per subject; isin avoids rebuilding a Python list for every eid.
    is_positive = data_df["eid"].isin(target_ids).astype('int64')
    print(int(is_positive.sum()))
    labels_df = pd.DataFrame({"eid": data_df["eid"], label_col: is_positive})
    labels_df.to_csv(join(DATAFILE_BASE, f'labels_{target_name}.csv'), index=False)

    # Split into train, val, and test image paths according to the Diagnosed label.
    # 2000 subjects are held out, then halved into 1000 val / 1000 test, both stratified.
    labels_with_image_df = labels_df[labels_df["eid"].isin(all_image_ids)]
    df_train, df_temp = train_test_split(labels_with_image_df, test_size=2000,
                                         stratify=labels_with_image_df[label_col],
                                         random_state=42)
    df_val, df_test = train_test_split(df_temp, test_size=1000, stratify=df_temp[label_col], random_state=42)

    # Explicit mapping instead of eval(); the loop variable no longer shadows the builtin `set`.
    split_frames = {"train": df_train, "val": df_val, "test": df_test}
    image_paths = {}
    for split_name, split_df in split_frames.items():
        positive_num = int(split_df[label_col].sum())
        set_paths = [dir_path / str(eid) / "processed_seg_allax.npz" for eid in split_df["eid"]]
        print(f"{target_name}_{split_name}: {positive_num}/", len(set_paths))
        image_paths[split_name] = set_paths

    with open(f"datasets/data_files/image_files/recon_cmr_subject_paths_50k_{target_name}.pkl", "wb") as f:
        pickle.dump(image_paths, f)

In [None]:
# Label: subject is positive when an age of high-blood-pressure diagnosis is recorded.
target_name = 'High_blood_pressure'
target = 'Age high blood pressure diagnosed-2.0'
label_col = f"Diagnosed_{target_name}"

# Vectorised non-null check: one 0/1 value per row of data_df. This replaces a per-eid
# filter of the whole frame (quadratic) that also called int() on a one-element Series.
is_positive = data_df[target].notna().astype('int64')
print(int(is_positive.sum()))
labels_df = pd.DataFrame({"eid": data_df["eid"], label_col: is_positive})
labels_df.to_csv(join(DATAFILE_BASE, f'labels_{target_name}.csv'), index=False)

# Split into train, val, and test image paths according to the Diagnosed label.
# 2000 subjects are held out, then halved into 1000 val / 1000 test, both stratified.
labels_with_image_df = labels_df[labels_df["eid"].isin(all_image_ids)]
df_train, df_temp = train_test_split(labels_with_image_df, test_size=2000,
                                     stratify=labels_with_image_df[label_col],
                                     random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=1000, stratify=df_temp[label_col], random_state=42)

# Explicit mapping instead of eval(); the loop variable no longer shadows the builtin `set`.
split_frames = {"train": df_train, "val": df_val, "test": df_test}
image_paths = {}
for split_name, split_df in split_frames.items():
    positive_num = int(split_df[label_col].sum())
    set_paths = [dir_path / str(eid) / "processed_seg_allax.npz" for eid in split_df["eid"]]
    print(f"{target_name}_{split_name}: {positive_num}/", len(set_paths))
    image_paths[split_name] = set_paths

with open(f"datasets/data_files/image_files/recon_cmr_subject_paths_50k_{target_name}.pkl", "wb") as f:
    pickle.dump(image_paths, f)