In [None]:
import pandas as pd
import tqdm
import os
import numpy as np
import sys
import copy

from collections import Counter

from sklearn.model_selection import train_test_split

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# Input here

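Set `filepath` to the folder that contains the Swedish MS Registry (SMSREG) CSV files. File names are appended to it directly, so a non-empty path must end with a path separator.  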

In [None]:
# Folder that holds the registry CSV files. File names are appended to this
# string directly (e.g. filepath + "basdata.csv"), so a non-empty value must
# end with a path separator.
filepath = ""

# Demographic data

In [None]:
# Load the patient-level base table and the visit-level table
basdata_df = pd.read_csv(filepath + "basdata.csv")
besoksdata_df = pd.read_csv(filepath + "besoksdata.csv")

# Sanity check: patient and visit counts
known_patient_codes = basdata_df["patient code"].unique()
visits_of_known_patients = besoksdata_df[besoksdata_df["patient code"].isin(known_patient_codes)]
is_primary_progressive = basdata_df["progress"] == "PP - Primary Progressive"

print ("Total number of patients in the data =",len(known_patient_codes))
print ("Total number of visits in the data =",len(visits_of_known_patients))
print ("Total number of PPMS patients",len(basdata_df[is_primary_progressive]))
print ("Total number of RRMS and SPMS patients",len(basdata_df[~is_primary_progressive]))

In [None]:
# Build the per-visit demographic table.
#
# Step 1: patient-level table (one row per base-table entry with a patient code):
# disease course ("progress"), sex, birthday, and the debut / diagnosis /
# secondary-progressive ("to sp") dates parsed to datetimes.
# Note: "to sp" is parsed with format '%Y', i.e. it carries a year only.
PatientCode_Progress_df = basdata_df[basdata_df["patient code"].notna()][["patient code","progress","to sp","birthday","sex","debut date","diagnosis date"]]
PatientCode_Progress_df["debut_date"] =  pd.to_datetime(PatientCode_Progress_df['debut date'], format='%Y-%m-%d')
PatientCode_Progress_df["diagnosis_date"] =  pd.to_datetime(PatientCode_Progress_df['diagnosis date'], format='%Y-%m-%d')
PatientCode_Progress_df["to_sp"] =  pd.to_datetime(PatientCode_Progress_df['to sp'], format='%Y')
PatientCode_Progress_df["birthday"] =  pd.to_datetime(PatientCode_Progress_df['birthday'], format='%Y-%m-%d')
# Patients without a recorded course or sex are discarded
PatientCode_Progress_df = PatientCode_Progress_df[PatientCode_Progress_df["progress"].notna()]
PatientCode_Progress_df = PatientCode_Progress_df[PatientCode_Progress_df["sex"].notna()]
# The raw (string) date columns are no longer needed once parsed
PatientCode_Progress_df = PatientCode_Progress_df.drop(["diagnosis date"],axis=1)
PatientCode_Progress_df = PatientCode_Progress_df.drop(["debut date"],axis=1)
PatientCode_Progress_df = PatientCode_Progress_df.drop(["to sp"],axis=1)

# Step 2: visit-level table with the EDSS value and the age at the visit.
# The birthday is joined in only to compute the age and is dropped again afterwards.
PatientCode_VisitDate_EDSS_Age_df = besoksdata_df[["patient code","date","edss value/score"]].merge(basdata_df[["patient code","birthday"]], on='patient code', how='left')
PatientCode_VisitDate_EDSS_Age_df["visit_date"] =  pd.to_datetime(PatientCode_VisitDate_EDSS_Age_df['date'], format='%Y-%m-%d')
PatientCode_VisitDate_EDSS_Age_df = PatientCode_VisitDate_EDSS_Age_df.drop(["date"],axis=1)
PatientCode_VisitDate_EDSS_Age_df["birthday"] =  pd.to_datetime(PatientCode_VisitDate_EDSS_Age_df['birthday'], format='%Y-%m-%d')
# Ages are expressed in years (numpy 'Y' timedelta unit), rounded to 3 decimals
PatientCode_VisitDate_EDSS_Age_df["age_at_visit"] = np.round((PatientCode_VisitDate_EDSS_Age_df["visit_date"]-PatientCode_VisitDate_EDSS_Age_df["birthday"]) / np.timedelta64(1, 'Y'),3)
PatientCode_VisitDate_EDSS_Age_df = PatientCode_VisitDate_EDSS_Age_df[["patient code","visit_date","edss value/score","age_at_visit"]]
# Visits lacking a patient code, date, EDSS value or age are removed here
PatientCode_VisitDate_EDSS_Age_df = PatientCode_VisitDate_EDSS_Age_df.dropna()

# When a patient has several rows for the same visit date, only the last one is kept
PatientCode_VisitDate_EDSS_Age_df  = PatientCode_VisitDate_EDSS_Age_df.drop_duplicates(subset=['patient code', 'visit_date'],keep="last")
# Step 3: attach the patient-level attributes to every visit
demographic_df = PatientCode_VisitDate_EDSS_Age_df.merge(PatientCode_Progress_df, on='patient code', how='left')

# Ages (in years) at debut, diagnosis and transition to SP
demographic_df["age_at_debut"] = np.round((demographic_df["debut_date"]-demographic_df["birthday"]) / np.timedelta64(1, 'Y'),3)
demographic_df["age_at_diagnosis"] = np.round((demographic_df["diagnosis_date"]-demographic_df["birthday"]) / np.timedelta64(1, 'Y'),3)
demographic_df["age_at_to_sp"] = np.round((demographic_df["to_sp"]-demographic_df["birthday"]) / np.timedelta64(1, 'Y'),3)
# Visits whose patient was filtered out of the patient-level table have NaN
# progress / birthday / sex after the left merge and are removed here
demographic_df = demographic_df.dropna(subset=['patient code', 'visit_date', 'edss value/score', 'age_at_visit', 'progress', 'birthday', 'sex'])
demographic_df= demographic_df.sort_values(by=["patient code",'visit_date'])

demographic_df = demographic_df[~(demographic_df["debut_date"].isna() & demographic_df["diagnosis_date"].isna())] # Remove if both debut and diagnosis dates are missing
print ("Number of person in demographic data = ",len(set(demographic_df["patient code"].to_list())))
print ("Total number of entries = ",len(demographic_df["patient code"].to_list()))

In [None]:
# Label every visit with the disease course at that time (RR / SP / PP).
# Visits are first split by their position relative to the "to_sp" date; the
# patient's overall course then overrides that for PP and RR patients.
course_col = "progress during visit"
demographic_df["days_diff"] = (demographic_df["visit_date"] - demographic_df["to_sp"]) / np.timedelta64(1, 'D')
demographic_df.loc[demographic_df["days_diff"] < 0.1, course_col] = "RR"
demographic_df.loc[demographic_df["days_diff"] > 0.1, course_col] = "SP"
demographic_df.loc[demographic_df["progress"] == "PP - Primary Progressive", course_col] = "PP"
demographic_df.loc[demographic_df["progress"] == "RR - Intermittent progressive MS", course_col] = "RR"
demographic_df = demographic_df.drop(columns=["days_diff"])

# Summary statistics; visits that received no label are dropped
print ("Total number of datapoints in the df = ", len(demographic_df))
demographic_df = demographic_df[demographic_df[course_col].notna()]
visit_course = demographic_df[course_col]
pp_snapshots = demographic_df[visit_course == "PP"]
rr_snapshots = demographic_df[visit_course == "RR"]
sp_snapshots = demographic_df[visit_course == "SP"]
print ("Total number of datapoints in the df after removing no sp/pp/rr info= ", len(demographic_df))

# A patient with at least one SP visit counts as SP, not RR
sp_patient_list = list(set(sp_snapshots["patient code"].to_list()))
rr_patient_list = list(set(rr_snapshots["patient code"].to_list()) - set(sp_patient_list))
pp_patient_list = list(set(pp_snapshots["patient code"].to_list()))

patient_codes = demographic_df["patient code"]
print ("Total number of people in PP snapshots = ", len(pp_patient_list))
print ("Total number of people in RR snapshots = ", len(rr_patient_list))
print ("Total number of people in SP snapshots = ", len(sp_patient_list))
# These counts are all visits of the patients in each group
print ("Total number of PP snapshots = ", len(demographic_df[patient_codes.isin(pp_patient_list)]))
print ("Total number of RR snapshots = ", len(demographic_df[patient_codes.isin(rr_patient_list)]))
print ("Total number of SP snapshots = ", len(demographic_df[patient_codes.isin(sp_patient_list)]))

# EQ5D

In [None]:
# All additional features are attached to a copy of the demographic table
demographic_addinfo_df = copy.deepcopy(demographic_df)

# Load the EQ-5D records; unparseable dates become NaT and are discarded,
# and the remaining records are ordered by date
eq5d_df = pd.read_csv(filepath + "eq5d.csv")
eq5d_df["date"] = pd.to_datetime(eq5d_df["date"], format="%Y-%m-%d", errors="coerce")
eq5d_df = eq5d_df.dropna(subset=["date"]).sort_values("date")

def get_last_eq5d_using_date(visit_date,patient_code):
    """Return the latest EQ-5D record of a patient on or before a visit.

    Reads the module-level ``eq5d_df`` table.

    Parameters
    ----------
    visit_date : value comparable with the ``date`` column of ``eq5d_df``
    patient_code : value compared against the ``patient code`` column

    Returns
    -------
    pd.Series
        ``[date, eq5d score]`` of the most recent matching record, or two
        NaN values when the patient has no record up to ``visit_date``.
    """
    patient_records = eq5d_df[eq5d_df["patient code"] == patient_code]
    earlier_records = patient_records[patient_records["date"] <= visit_date].sort_values("date")
    if len(earlier_records) == 0:
        return pd.Series([np.nan, np.nan])
    return pd.Series([
        earlier_records["date"].tolist()[-1],
        earlier_records["eq5d score"].tolist()[-1],
    ])

# For every visit, attach the most recent EQ-5D record on or before the visit
demographic_addinfo_df[["eq5d_date", "eq5d_score"]] = demographic_addinfo_df.parallel_apply(
    lambda visit: get_last_eq5d_using_date(visit["visit_date"], visit["patient code"]),
    axis=1,
)
eq5d_age_years = (demographic_addinfo_df["eq5d_date"] - demographic_addinfo_df["birthday"]) / np.timedelta64(1, 'Y')
demographic_addinfo_df["age_at_eq5d"] = np.round(eq5d_age_years, 3)
# Blank the age when a record date exists but its score is missing
eq5d_age_without_score = demographic_addinfo_df["age_at_eq5d"].notna() & demographic_addinfo_df["eq5d_score"].isna()
demographic_addinfo_df.loc[eq5d_age_without_score, "age_at_eq5d"] = np.nan

# SDMT

In [None]:
# Load the SDMT records; rows without a parseable date or without a
# "number passed" value are discarded, the rest is ordered by date
sdmt_df = pd.read_csv(filepath + "sdmt.csv")
sdmt_df["date"] = pd.to_datetime(sdmt_df["date"], format="%Y-%m-%d", errors="coerce")
sdmt_df = sdmt_df.dropna(subset=["date", "number passed"]).sort_values("date")

def get_last_sdmt_using_date(visit_date,patient_code):
    """Return the latest SDMT record of a patient on or before a visit.

    Reads the module-level ``sdmt_df`` table.

    Parameters
    ----------
    visit_date : value comparable with the ``date`` column of ``sdmt_df``
    patient_code : value compared against the ``patient code`` column

    Returns
    -------
    pd.Series
        ``[date, number passed]`` of the most recent matching record, or two
        NaN values when the patient has no record up to ``visit_date``.
    """
    patient_records = sdmt_df[sdmt_df["patient code"] == patient_code]
    earlier_records = patient_records[patient_records["date"] <= visit_date].sort_values("date")
    if len(earlier_records) == 0:
        return pd.Series([np.nan, np.nan])
    return pd.Series([
        earlier_records["date"].tolist()[-1],
        earlier_records["number passed"].tolist()[-1],
    ])

# For every visit, attach the most recent SDMT record on or before the visit
demographic_addinfo_df[["sdmt_date", "sdmt_score"]] = demographic_addinfo_df.parallel_apply(
    lambda visit: get_last_sdmt_using_date(visit["visit_date"], visit["patient code"]),
    axis=1,
)
sdmt_age_years = (demographic_addinfo_df["sdmt_date"] - demographic_addinfo_df["birthday"]) / np.timedelta64(1, 'Y')
demographic_addinfo_df["age_at_sdmt"] = np.round(sdmt_age_years, 3)
# Blank the age when a record date exists but its score is missing
sdmt_age_without_score = demographic_addinfo_df["age_at_sdmt"].notna() & demographic_addinfo_df["sdmt_score"].isna()
demographic_addinfo_df.loc[sdmt_age_without_score, "age_at_sdmt"] = np.nan

# Treatments

In [None]:
def get_terapi_drug(patient_code,visit_date):
    """Return the preparation(s) a patient was on at a visit.

    Reads the module-level ``terapi_df`` table. A therapy row counts as
    active when it started strictly before the visit and either ends on or
    after the visit day or has no end date (NaT).

    Returns
    -------
    str or list
        ``"no treatment"`` when nothing is active or the single active entry
        is ``"Ingen behandling"``; the preparation name when exactly one
        entry is active; otherwise the list of all active preparation names.
    """
    patient_rows = terapi_df[terapi_df["patient code"] == patient_code]
    active_preparations = []
    for start_date, end_date, preparation in zip(
        patient_rows["insert_date"], patient_rows["end_date"], patient_rows["preparation"]
    ):
        started_before_visit = (start_date - visit_date).days < 0
        if not started_before_visit:
            continue
        until_end = end_date - visit_date
        # A missing end date makes the difference NaT: the therapy is still running
        if until_end.days >= 0 or pd.isna(until_end):
            active_preparations.append(preparation)

    if not active_preparations:
        return "no treatment"
    if len(active_preparations) > 1:
        return active_preparations
    only_preparation = active_preparations[0]
    return "no treatment" if only_preparation == "Ingen behandling" else only_preparation


# Load the therapy table and parse start ("insert") and end ("exposed") dates;
# unparseable or missing dates become NaT
terapi_df = pd.read_csv(filepath + "terapi.csv")[["patient code", "insert", "preparation", "exposed"]]
terapi_df["insert_date"] = pd.to_datetime(terapi_df["insert"], format="%Y-%m-%d", errors="coerce")
terapi_df["end_date"] = pd.to_datetime(terapi_df["exposed"], format="%Y-%m-%d", errors="coerce")

# Preparation(s) active at each visit
demographic_addinfo_df["terapi drug"] = demographic_addinfo_df.parallel_apply(
    lambda visit: get_terapi_drug(visit["patient code"], visit["visit_date"]),
    axis=1,
)

In [None]:
# Build the lookup tables that translate a preparation name into drug-class labels.
# The spreadsheet has a "drugs" column (preparation name) and one column per
# drug class, where a value of 1 marks membership.
drug_classification_df = pd.read_excel("../drug_classification.xlsx")
drug_types = [column_name for column_name in drug_classification_df.columns.to_list() if column_name != "drugs"]

# class name -> integer label; "no_treatment" gets the next free label
drug_type_catagory = {drug_type: label for label, drug_type in enumerate(drug_types)}
no_treatment_label = len(drug_type_catagory)
drug_type_catagory["no_treatment"] = no_treatment_label
# integer label -> class name
drug_type_catagory_inv = {label: drug_type for drug_type, label in drug_type_catagory.items()}

# preparation name -> single label, or list of labels when it has several classes
drug_catagorized = {}
for index, row in drug_classification_df.iterrows():
    flagged_types = [drug_type for drug_type in drug_types if row[drug_type] == 1]
    if not flagged_types:
        # No class is flagged for this preparation. Leave it out of the lookup
        # instead of failing with an IndexError; preparations missing from
        # drug_catagorized are recorded in the "not found" column when the
        # visits are labelled.
        continue
    labels = [drug_type_catagory[drug_type] for drug_type in flagged_types]
    drug_catagorized[row["drugs"]] = labels if len(labels) > 1 else labels[0]
drug_catagorized['Ingen behandling'] = no_treatment_label
drug_catagorized['no treatment'] = no_treatment_label

In [None]:
# Turn the preparation(s) of every visit into 0/1 drug-class columns.
# "terapi drug" holds either a single preparation name or a list of names;
# both are handled by the same loop. A preparation that is not present in
# drug_catagorized is written to the "not found" column (with several unknown
# preparations on one visit, the last one wins).
treatment_columns = ["first_line_DMT","second_line_DMT","no_treatment","relapse_treatment_drugs","other_drugs","stem_cell_treatment"]

for index, entry in tqdm.tqdm(demographic_addinfo_df["terapi drug"].items(), total=len(demographic_addinfo_df)):
    drug_names = entry if isinstance(entry, list) else [entry]
    for drugname in drug_names:
        if drugname not in drug_catagorized:
            demographic_addinfo_df.loc[index,"not found"]= drugname
            continue
        drug_labels = drug_catagorized[drugname]
        if not isinstance(drug_labels, list):
            drug_labels = [drug_labels]
        for drug_label in drug_labels:
            rowname = drug_type_catagory_inv[drug_label]
            demographic_addinfo_df.loc[index,rowname]= 1

# A class that never occurred has no column yet; create it so that the fill
# below (and later column selections) do not fail with a KeyError
for column_name in treatment_columns:
    if column_name not in demographic_addinfo_df.columns:
        demographic_addinfo_df[column_name] = np.nan

# Fill 0 in case of NA
demographic_addinfo_df[treatment_columns] = demographic_addinfo_df[treatment_columns].fillna(0)

# If "no treatment" is flagged together with an actual treatment, clear "no treatment".
# NOTE(review): stem_cell_treatment is not part of this check - confirm whether that is intended.
index_to_have_no_treatment_changed = (demographic_addinfo_df["no_treatment"] == 1) & (
    (demographic_addinfo_df["first_line_DMT"] == 1)
    | (demographic_addinfo_df["second_line_DMT"] == 1)
    | (demographic_addinfo_df["relapse_treatment_drugs"] == 1)
    | (demographic_addinfo_df["other_drugs"] == 1)
)
demographic_addinfo_df.loc[index_to_have_no_treatment_changed,"no_treatment"] = 0

# Relapse

In [None]:
# Load the relapse ("skov") table, parse its dates, drop undated rows and
# order the remaining relapses by date
skov_df = pd.read_csv(filepath + "skov.csv")
df = copy.deepcopy(demographic_addinfo_df)
skov_df["date"] = pd.to_datetime(skov_df["date"], format="%Y-%m-%d")
skov_df = skov_df.dropna(subset=["date"]).sort_values("date")

In [None]:
# Clean up the "debut relapse" flag.
# Some patients have several relapses flagged as debut; only the earliest of
# those flagged relapses keeps the flag.
debut_pcode = skov_df[skov_df["debut relapse"] == 1]["patient code"].to_list() # patient codes with debut == 1
duplicated_pcodes = [pcode for pcode, debut_count in Counter(debut_pcode).items() if debut_count > 1]

for pcode in duplicated_pcodes:
    subset_df = skov_df[(skov_df["patient code"] == pcode) & (skov_df["debut relapse"] == 1)]
    first_debut_date_index = subset_df.sort_values("date").index[0]
    skov_df.loc[subset_df.index,"debut relapse"] = 0
    skov_df.loc[first_debut_date_index,"debut relapse"] = 1

# Remove the debut entry of a patient when another relapse precedes it.
# skov_df is ordered by date, so a debut flag that is not on the patient's
# first row has an earlier relapse in front of it. DataFrame.drop returns a
# new frame, so the result has to be assigned back for the removal to apply.
misplaced_debut_index = []
for pcode, sub_df in skov_df.groupby("patient code", sort=False):
    debut_flags = sub_df["debut relapse"].to_list()
    if 1 in debut_flags:
        index_of_1 = debut_flags.index(1)
        if index_of_1 != 0:
            misplaced_debut_index.append(sub_df.index[index_of_1])
skov_df = skov_df.drop(index=misplaced_debut_index)

print ("Number of patients with debut date =",len(set(skov_df[skov_df["debut relapse"] == 1]["patient code"].to_list())))
print ("Total number of patients",len(set(skov_df["patient code"].to_list())))

In [None]:
# If a patient has no relapse flagged as debut, flag the patient's first row.
# skov_df is ordered by date, so the first row of a group is the earliest
# relapse. groupby leaves out rows without a patient code, which would
# otherwise produce an empty selection and an IndexError on index[0].
first_relapse_index = [
    patient_relapses.index[0]
    for _, patient_relapses in skov_df.groupby("patient code", sort=False)
    if not (patient_relapses["debut relapse"] == 1).any()
]
skov_df.loc[first_relapse_index,"debut relapse"] = 1
print ("Number of patients with debut date =",len(set(skov_df[skov_df["debut relapse"] == 1]["patient code"].to_list())))
print ("Total number of patients",len(set(skov_df["patient code"].to_list())))

In [None]:
def get_multi_focal_skov(entry):
    """Return 1 when ``entry`` equals 0.0, otherwise NaN."""
    return 1 if entry == 0.0 else np.nan

# multi_focal is 1 where "mono on" equals 0 and NaN everywhere else
skov_df["multi_focal"] = skov_df["mono on"].apply(get_multi_focal_skov)

In [None]:
# Re-initialise pandarallel with an explicit worker count for the relapse features
pandarallel.initialize(nb_workers=32, progress_bar=True)
# Missing values are treated as 0 so that the per-patient sums below are defined
sub_df = skov_df.fillna(0)
def get_number_of_relapse_using_visitdate(visit_date,patient_code):
    """Summarise the relapses of a patient that happened on or before a visit.

    Reads the module-level ``sub_df`` table and takes its row order as given:
    the last row of the selection is used as the latest relapse and the first
    row as the debut relapse.

    Parameters
    ----------
    visit_date : value comparable with the ``date`` column of ``sub_df``
    patient_code : value compared against the ``patient code`` column

    Returns
    -------
    pd.Series
        Nine values: the sums of ``mono on``, ``multi_focal``, ``monofocal``,
        ``afferent non on`` and ``steroid treatment``; the ``steroid
        treatment`` and ``complete remission`` values of the last row; the
        date of the last row; the date of the first row. When the patient has
        no relapse up to ``visit_date`` all nine values are NaN (previously
        only eight were returned, which did not match the nine target columns).
    """
    n_outputs = 9
    patient_relapses = sub_df[sub_df["patient code"] == patient_code]
    relapses = patient_relapses[patient_relapses["date"] <= visit_date]
    if len(relapses) == 0:
        return pd.Series([np.nan] * n_outputs)

    summed_columns = ["mono on", "multi_focal", "monofocal", "afferent non on", "steroid treatment"]
    sums = [sum(relapses[column]) for column in summed_columns]
    relapse_dates = relapses["date"].to_list()
    return pd.Series(sums + [
        relapses["steroid treatment"].to_list()[-1],   # is the last relapse steroid treated
        relapses["complete remission"].to_list()[-1],  # is the last relapse completely remitted
        relapse_dates[-1],                             # last relapse date
        relapse_dates[0],                              # debut relapse date
    ])

# Relapse summary for every visit (all relapses on or before the visit date)
relapse_feature_columns = [
    "mono_on_sum",
    "multi_focal_sum",
    "monofocal_sum",
    "afferent_non_on_sum",
    "steroid_treatment_sum",
    "is_last_relapse_steroid_treated",
    "is_last_relapse_completely_remitted",
    "last_relapse_date",
    "debut_relapse_date",
]
demographic_addinfo_df[relapse_feature_columns] = demographic_addinfo_df.parallel_apply(
    lambda visit: get_number_of_relapse_using_visitdate(visit["visit_date"], visit["patient code"]),
    axis=1,
)
last_relapse_age_years = (demographic_addinfo_df["last_relapse_date"] - demographic_addinfo_df["birthday"]) / np.timedelta64(1, 'Y')
debut_relapse_age_years = (demographic_addinfo_df["debut_relapse_date"] - demographic_addinfo_df["birthday"]) / np.timedelta64(1, 'Y')
demographic_addinfo_df["age_at_relapse"] = np.round(last_relapse_age_years, 3)
demographic_addinfo_df["age_at_debut_relapse"] = np.round(debut_relapse_age_years, 3)

# MRI

In [None]:
# Load the MRI table; rows without a parseable date are discarded and the
# remaining examinations are ordered by date
mri_df = pd.read_csv(filepath + "mri.csv")
mri_df["date"] = pd.to_datetime(mri_df["date"], format="%Y-%m-%d", errors="coerce")
mri_df = mri_df[mri_df["date"].notna()].sort_values(by=["date"])

## Column definition
Date

"total number of t2 lesions"  range of ['3', '1-9', '>20', '6', '5', '1', '2', '7', '10-20', '0', '8', '4', '9']  
"number of new or advanced t2 lesions"  range of ['3', '1-9', '>20', '6', '5', '1', '2', '7', '10-20', '0', '8', '4', '9']  
"total number of lesions with brain barrier injury" range of ['3', '>20', '6', '5', '1', '2', '7', '10-20', '0', '8', '4', '9']  
"total number of lesions with spinal cord barrier injury" range of ['3', 0.0, 2.0, 1.0, 3.0, 5.0, 6.0, '>20', '6', '5', '1', '2', '7', '10-20', '0', '8', '4', '9']  
"number of lesions compared to previous spinal cord" range of  [1.0, 2.0, 3.0]  
  
"total brain volume" in millilitre  
"bpf" is in percentage  
  
"lesion in the spinal cord" is binary [0,1]  

In [None]:
def lesion_count_to_catagory(input_value):
    """Map a numeric lesion count to its lesion category.

    The count is first snapped to the representative value of its range:
    counts above 20 become 25 (the value ``lesion_count_dict`` assigns to
    ``'>20'``) and counts from 10 to 20 become 20 (the value assigned to
    ``'10-20'``). The category is then looked up through the module-level
    ``lesion_count_dict_inv`` and ``lesion_catagory`` tables.

    Previously every count of 20 or more was clamped to 20, so the category
    of ``'>20'`` could never be returned.

    Raises
    ------
    KeyError
        If the (snapped) count is not a key of ``lesion_count_dict_inv``.
    """
    if input_value > 20:
        input_value = 25
    elif input_value >= 10:
        input_value = 20

    return lesion_catagory[lesion_count_dict_inv[input_value]]

def mri_lesion_count_to_int_number(input_value):
    """Convert a raw MRI lesion-count entry to an integer.

    Values that ``int()`` accepts are converted directly. Anything else is
    looked up in the module-level ``lesion_count_dict`` (range labels such
    as ``'1-9'``). Values found in neither way, e.g. NaN, yield 0.

    Only the conversion and lookup errors are caught, so unrelated
    exceptions such as ``KeyboardInterrupt`` are no longer swallowed.
    """
    try:
        return int(input_value)
    except (TypeError, ValueError, OverflowError):
        pass  # not a plain number: fall through to the range-label lookup
    try:
        return lesion_count_dict[input_value]
    except (KeyError, TypeError):
        return 0

In [None]:
# Lookup tables for the lesion counts.
# Raw entries are either exact counts or one of three range labels.
_RANGE_LABEL_CATEGORY = {'1-9': 1, '10-20': 2, '>20': 3}   # range label -> category
_RANGE_LABEL_COUNT = {'1-9': 9, '10-20': 20, '>20': 25}    # range label -> representative count

# raw entry -> category: 0 for a count of zero, 1 for any other exact count,
# and the range's own category for a range label
lesion_catagory = {}
for entry in [0.0, 1.0, 2.0, 3.0, 5.0, 6.0, '6', '9', '3', '5', '2', '0', '1-9', '>20', '1', '7', '4', '10-20', '8']:
    try:
        exact_count = int(entry)
    except ValueError:
        # Not a number, i.e. one of the range labels
        lesion_catagory[entry] = _RANGE_LABEL_CATEGORY[entry]
    else:
        lesion_catagory[exact_count] = 0 if exact_count == 0 else 1

# raw entry -> numeric count (exact counts map to themselves)
lesion_count_dict = {}
for entry in lesion_catagory:
    lesion_count_dict[entry] = entry if isinstance(entry, int) else _RANGE_LABEL_COUNT[entry]

# numeric count -> raw entry. The count 9 appears twice (exact 9 and '1-9');
# the later entry '1-9' wins, which yields the same category (1).
lesion_count_dict_inv = {count: entry for entry, count in lesion_count_dict.items()}

In [None]:
def process_mri_data(visit_date,patient_code):
    """Return lesion categories from the latest MRI on or before a visit.

    Reads the module-level ``mri_df`` table and converts the raw entries with
    ``mri_lesion_count_to_int_number`` and ``lesion_count_to_catagory``.

    Returns
    -------
    pd.Series
        ``[t2 category, brain barrier category, spinal cord barrier category,
        MRI date]``. The T2 category is derived from the sum of the total and
        the new/advanced T2 lesion counts. Four NaN values are returned when
        the patient has no MRI up to ``visit_date``.
    """
    patient_scans = mri_df[mri_df["patient code"] == patient_code]
    earlier_scans = patient_scans[patient_scans["date"] <= visit_date].sort_values(by=["date"])
    if len(earlier_scans) == 0:
        return pd.Series([np.nan, np.nan, np.nan, np.nan])

    def latest_count(column_name):
        # Numeric lesion count of the most recent scan for the given column
        return mri_lesion_count_to_int_number(earlier_scans[column_name].to_list()[-1])

    t2_total = latest_count("total number of t2 lesions")
    t2_new = latest_count("number of new or advanced t2 lesions")
    t2_lesion_catagory = lesion_count_to_catagory(t2_total + t2_new)

    brain_barrier_lesion_catagory = lesion_count_to_catagory(
        latest_count("total number of lesions with brain barrier injury")
    )
    spinal_barrier_lesion_catagory = lesion_count_to_catagory(
        latest_count("total number of lesions with spinal cord barrier injury")
    )
    checkup_date = earlier_scans["date"].to_list()[-1]

    return pd.Series([
        t2_lesion_catagory,
        brain_barrier_lesion_catagory,
        spinal_barrier_lesion_catagory,
        checkup_date,
    ])

# Lesion categories of the latest MRI on or before each visit
mri_feature_columns = ["t2_lesion_catagory", "brain_barrier_lesion_catagory", "spinal_barrier_lesion_catagory", "mri_date"]
demographic_addinfo_df[mri_feature_columns] = demographic_addinfo_df.parallel_apply(
    lambda visit: process_mri_data(visit["visit_date"], visit["patient code"]),
    axis=1,
)
mri_age_years = (demographic_addinfo_df["mri_date"] - demographic_addinfo_df["birthday"]) / np.timedelta64(1, 'Y')
demographic_addinfo_df["age_at_mri"] = np.round(mri_age_years, 3)

# MSIS-29

In [None]:
# MSIS-29 columns: the record date, the 29 item scores and four summary scores.
# selected_columns holds the names used in the MSIS table;
# selected_columns_renamed holds the names used in the visit table.
_msis_item_columns = ["msis_{:02d}".format(item_number) for item_number in range(1, 30)]
_msis_summary_columns = ['physically', 'psychologically', 'physically_100', 'psychologically_100']

selected_columns = ['date'] + _msis_item_columns + _msis_summary_columns

selected_columns_renamed = (
    ['msis_date']
    + _msis_item_columns
    + ['msis_' + summary_name for summary_name in _msis_summary_columns]
)

In [None]:
# Load the MSIS-29 table, replace blanks in the column names with underscores,
# discard rows without a parseable date and order the rest by date
msis_df = pd.read_csv(filepath + "msis_29.csv")
msis_df.columns = [column_name.replace(" ", "_") for column_name in msis_df.columns.to_list()]
msis_df["date"] = pd.to_datetime(msis_df["date"], format="%Y-%m-%d", errors="coerce")
msis_df = msis_df[msis_df["date"].notna()].sort_values(by=["date"])


def get_last_msis_using_date(visit_date,patient_code):
    """Return the latest MSIS-29 record of a patient on or before a visit.

    Reads the module-level ``msis_df`` table and ``selected_columns`` list.

    Returns
    -------
    pd.Series
        The ``selected_columns`` values of the first row carrying the most
        recent date, or one NaN per selected column when the patient has no
        record up to ``visit_date``.
    """
    patient_records = msis_df[msis_df["patient_code"] == patient_code]
    earlier_records = patient_records[patient_records["date"] <= visit_date].sort_values("date")
    if len(earlier_records) == 0:
        return pd.Series([np.nan] * len(selected_columns))
    last_msis_date = earlier_records["date"].to_list()[-1]
    on_last_date = earlier_records[earlier_records["date"] == last_msis_date]
    return pd.Series(on_last_date[selected_columns].values.tolist()[0])

# Latest MSIS-29 record on or before each visit, stored under the renamed columns
demographic_addinfo_df[selected_columns_renamed] = demographic_addinfo_df.parallel_apply(
    lambda visit: get_last_msis_using_date(visit["visit_date"], visit["patient code"]),
    axis=1,
)
msis_age_years = (demographic_addinfo_df["msis_date"] - demographic_addinfo_df["birthday"]) / np.timedelta64(1, 'Y')
demographic_addinfo_df["age_at_msis"] = np.round(msis_age_years, 3)

# Cleaning and saving the whole data

In [None]:
# Snapshot data: an independent copy of the enriched visit table, ordered by
# patient and visit date, with a fresh 0..n-1 index
df = (
    demographic_addinfo_df.copy(deep=True)
    .sort_values(by=["patient code", "visit_date"])
    .reset_index(drop=True)
)
# sex_label is 1 where sex equals "MAN" and 0 otherwise
df["sex_label"] = (df["sex"] == "MAN").astype("int")

In [None]:
one_day = np.timedelta64(1, 'D')

# Remove entries when debut date is after visit date
visit_before_debut = ((df["visit_date"] - df["debut_date"]) / one_day) < 0
print ("Entries removed =", np.sum(visit_before_debut))
df = df[~visit_before_debut]

# Remove entries if birthday is on or after the visit date
birthday_not_before_visit = ((df["birthday"] - df["visit_date"]) / one_day) >= 0
print ("Entries removed = ",np.sum(birthday_not_before_visit))
df = df[~birthday_not_before_visit]

# Remove entries where last relapse date is before debut relapse date
last_relapse_before_debut = ((df["debut_relapse_date"] - df["last_relapse_date"]) / one_day) > 0
print ("Entries removed = ",np.sum(last_relapse_before_debut))
df = df[~last_relapse_before_debut]

# Revised debut date: the diagnosis date, falling back to the debut date
# where the diagnosis date is missing
df["revised_debut_date"] = df["diagnosis_date"].fillna(df["debut_date"])
df["revised_debut_age"] = np.round((df["revised_debut_date"] - df["birthday"]) / np.timedelta64(1, 'Y'), 3)

# Features that may be missing; missing values are imputed with the padding index (-1)
treatment_feature_columns = ['no_treatment', 'first_line_DMT', 'second_line_DMT', 'other_drugs', 'relapse_treatment_drugs', 'stem_cell_treatment']
questionnaire_feature_columns = ['eq5d_score', 'age_at_eq5d', 'sdmt_score', 'age_at_sdmt']
relapse_summary_columns = ['mono_on_sum', 'monofocal_sum', 'multi_focal_sum', 'afferent_non_on_sum', 'steroid_treatment_sum',
                           'is_last_relapse_steroid_treated', 'is_last_relapse_completely_remitted', 'age_at_relapse', 'age_at_debut_relapse']
mri_category_columns = ['t2_lesion_catagory', 'brain_barrier_lesion_catagory', 'spinal_barrier_lesion_catagory', 'age_at_mri']
msis_feature_columns = (
    ["msis_{:02d}".format(item_number) for item_number in range(1, 30)]
    + ['msis_physically', 'msis_psychologically', 'msis_physically_100', 'msis_psychologically_100', 'age_at_msis']
)
other_columns = (
    treatment_feature_columns
    + questionnaire_feature_columns
    + relapse_summary_columns
    + mri_category_columns
    + msis_feature_columns
    + ['sex_label']
)

# Features that must be present for an entry to be usable
base_columns = ['edss value/score', 'age_at_visit', 'revised_debut_age']

df[other_columns] = df[other_columns].fillna(-1)

df.to_csv("data_for_pca_python_cleaned.csv", index=False)

# Creating data splits (Train/Valid/Test/Calibration)

In [None]:
# Keep only RRMS and SPMS visits (PPMS visits are removed) and add the binary
# target: 0 for RR visits, 1 for SP visits
df = df[df["progress during visit"] != "PP"]
for course_name, course_label in (("RR", 0), ("SP", 1)):
    df.loc[df["progress during visit"] == course_name, "y_label"] = course_label

# Proportions of the train / calibration / validation / test splits;
# the test split receives whatever remains
train_size = 0.65
calibration_size = 0.05
valid_size = 0.05
test_size = 1 - (train_size+valid_size+calibration_size)
random_state = 0

In [None]:
# segregate patients into different catagories for stratification during data splitting
def get_RRMS_SPMS_transitioning_patients(input_df):
    """Assign every patient a stratification label based on their visit labels.

    Parameters
    ----------
    input_df : pd.DataFrame
        Must contain the columns ``patient code`` and ``y_label``; both are
        converted with ``int()``, so neither may contain missing values.

    Returns
    -------
    list of [int, int]
        ``[patient code, stratum]`` pairs, where the stratum is 0 for patients
        with only ``y_label == 0`` visits, 1 for patients with only
        ``y_label == 1`` visits and 2 for patients with both. Pairs are ordered
        by stratum (0, 1, 2) and, within a stratum, by first appearance of the
        patient in ``input_df``.

    The function now reads ``input_df``; it previously ignored the argument
    and read the global ``df``.
    """
    labels_by_pcode = {}
    for raw_pcode, raw_label in input_df[["patient code","y_label"]].values:
        labels_by_pcode.setdefault(int(raw_pcode), set()).add(int(raw_label))

    rr_pcode, sp_pcode, t_pcode = [], [], []
    for pcode, labels in labels_by_pcode.items():
        if len(labels) == 2:
            t_pcode.append([pcode, 2])   # both labels present: transitioning patient
        elif labels == {0}:
            rr_pcode.append([pcode, 0])
        elif labels == {1}:
            sp_pcode.append([pcode, 1])

    return rr_pcode + sp_pcode + t_pcode

def create_data_splits(input_df,train_size=0.7,valid_size=0.15,test_size=0.15,calibration_set_size=0.1,random_state=0):
    """Split the visits into train / valid / test / calibration sets by patient.

    Patients, not visits, are split, so all visits of a patient end up in the
    same set. The split is stratified on the patient stratum returned by
    ``get_RRMS_SPMS_transitioning_patients``.

    The function now works on its own arguments; it previously read the
    globals ``all_pcodes``, ``df`` and ``calibration_size``.

    Parameters
    ----------
    input_df : pd.DataFrame
        Visit table with the columns ``patient code`` and ``y_label``.
    train_size, valid_size, test_size, calibration_set_size : float
        Proportions of the patients per set. The ratios used for the nested
        splits assume that the four values sum to 1. Note that the default
        values sum to 1.1, so all four should be passed explicitly.
    random_state : int
        Seed passed to every ``train_test_split`` call.

    Returns
    -------
    tuple of pd.DataFrame
        ``(train_df, valid_df, test_df, calibration_df)``.
    """
    pcode_strata = np.array(get_RRMS_SPMS_transitioning_patients(input_df))

    # 1) train vs. everything else
    x_train, x_valid_test_calibration = train_test_split(
        pcode_strata,
        test_size=(valid_size + test_size + calibration_set_size),
        random_state=random_state,
        stratify=pcode_strata[:, 1:],
    )
    # 2) calibration vs. valid + test
    x_calibration, x_valid_test = train_test_split(
        x_valid_test_calibration,
        test_size=(valid_size + test_size) / (1 - train_size),
        random_state=random_state,
        stratify=x_valid_test_calibration[:, 1:],
    )
    # 3) valid vs. test
    x_valid, x_test = train_test_split(
        x_valid_test,
        test_size=test_size / (1 - (train_size + calibration_set_size)),
        random_state=random_state,
        stratify=x_valid_test[:, 1:],
    )

    # Column 0 holds the patient codes; plain indexing keeps the result 1-D
    # even when a set contains a single patient
    train_pcodes = x_train[:, 0]
    valid_pcodes = x_valid[:, 0]
    test_pcodes = x_test[:, 0]
    calibration_pcodes = x_calibration[:, 0]

    train_df = input_df[input_df["patient code"].isin(train_pcodes)]
    valid_df = input_df[input_df["patient code"].isin(valid_pcodes)]
    test_df = input_df[input_df["patient code"].isin(test_pcodes)]
    calibration_df = input_df[input_df["patient code"].isin(calibration_pcodes)]

    return train_df,valid_df,test_df,calibration_df

In [None]:
# Patient codes with their stratification label, then the patient-level split
all_pcodes = np.array(get_RRMS_SPMS_transitioning_patients(df))
train_df, valid_df, test_df, calibration_df = create_data_splits(
    df,
    train_size=train_size,
    valid_size=valid_size,
    test_size=test_size,
    calibration_set_size=calibration_size,
    random_state=random_state,
)

In [None]:
# Write every split to data_splits/<name>.csv.
# makedirs with exist_ok=True keeps the cell re-runnable: os.mkdir raises
# FileExistsError when the folder is already there.
os.makedirs("data_splits", exist_ok=True)
train_df.to_csv("data_splits/train.csv",index=False)
valid_df.to_csv("data_splits/valid.csv",index=False)
test_df.to_csv("data_splits/test.csv",index=False)
calibration_df.to_csv("data_splits/calibration.csv",index=False)