# Create datasets of MI patient data

**Objective: extract from the SPUM patient dataset meaningful information related to myocardial infarction (MI)**

## 0.Init

In [1]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Constants

# --- Input / output locations ---
# Path to the original merged SPUM file
PATH_TO_DATA = "../../pocePatientData/spumDataMerged.csv"
# Folder receiving the generated datasets
SAVE_PATH = "patient_mi_data/"

# --- Time thresholds (number of days) ---
# Max nb of days to consider a second MI just the complication of another one
MAX_NB_DAYS_COMPICATION = 30
# Max nb of days between an MI and death to consider the MI related to the
# death (to be checked with a doctor)
MAX_NB_DAYS_BTW_MI_AND_DEATH = 14

## 1. Select columns of interest

### 1.1 Prediction features

Only take a subset of meaningful features, selected from the email of a doctor (Thabo):

I can get you going with the parameters to include in the model. In short, in cardiology there are several clinical parameters that we know are correlated with bad outcomes (baseline cardiovascular risk factors and then clinical factors during the hospitalisation that are also predictive). Here's a list with the parameter names in the SPUM sheet:

CV risk factors :
- Age: age
- Sex: sex
- Hypertension: hypertension_hist
- Diabetes: diabetes_hist
- Hypercholesterolaemia: cholesterolemia_hist
- Previous cardiovascular disease (MI, stroke, peripheral arterial disease): prev_CVD
- Smoking: smoking_bln
- Obesity/BMI: bmi

Clinical factors:
- LVEF (left ventricular ejection fraction i.e. how well the heart is pumping): *lvef_comb*
- Killip class (stage of heart failure on examination - graded 1 to 4 with higher being worse): chf_code_proc
- Cardiac arrest (did the heart ever stop during the admission): resuscitation 
- Grace score (a risk score that is derived from several risk clinical parameters that is known to predict mortality after an MI): grace_calc
- Kidney function: eGFR_bln

In [3]:
# Baseline cardiovascular risk factors (column names of the SPUM sheet)
cv_risk_factor_l = [
    "age",
    "sex",
    "hypertension_hist",
    "diabetes_hist",
    "cholesterolemia_hist",
    "prev_CVD",
    "smoking_bln",
    "bmi",
]

# Clinical factors recorded during the hospitalisation
clincial_factor_l = [
    "lvef_comb",
    "chf_code_proc",
    "resuscitation",
    "grace_calc",
    "eGFR_bln",
]

### 1.2 Targets

The objective may be to predict if there will be another MI, if the patient will need a revascularisation or if he/she will die

In [4]:
# Event indicator columns: up to 4 MIs, one death flag, up to 6 revascularisations
mi_information_l = ["mi_{}".format(k) for k in range(1, 5)]
death_information_l = ["death.x"]
revasc_information_l = ["revasc_{}".format(k) for k in range(1, 7)]

### 1.3 Time to target event

For future applications, the time until a target event may be useful information.

It is also used to exclude events that are related to the original MI and are not a new MI. 

From the doctor (Thabo): The time interval to MI is "time_to_mi_1", the "1" is because some patients have more than one MI. It may be necessary to exclude the MIs that occurred in the first 30 days as it is likely that these were linked to the stent put in for the first MI (stent thrombosis).

In [5]:
# Time-to-event columns, spelled exactly as in the CSV header
# (two revascularisation columns are named inconsistently in the file)
time_information_l = [
    # time to each MI
    "time_to_mi_1",
    "time_to_mi_2",
    "time_to_mi_3",
    "time_to_mi_4",
    # time to death
    "time_to_death",
    # time to each revascularisation
    "time_to_revasc_1",
    "time_to_revasc_2",
    "time_to_revasc_3",
    "time_to_Revasc_4",
    "time_to_revas_5",
    "time_to_revasc_6",
]

### 1.4 Other information

sjid: id of the patient

In [6]:
# Patient identifier column; it is used as the DataFrame index once the file is loaded
patient_information_l = ["sjid"] 

### 1.5 Gather all

In [7]:
# Every column to read from the CSV: features, targets, event times and patient id
all_columns = [
    *cv_risk_factor_l,
    *clincial_factor_l,
    *mi_information_l,
    *death_information_l,
    *revasc_information_l,
    *time_information_l,
    *patient_information_l,
]

## 2. Load and clean the data

### 2.1 Create the df

In [8]:
# Load only the selected columns, index the rows by patient id and give the
# two inconsistently named revascularisation time columns a uniform name
df = (
    pd.read_csv(PATH_TO_DATA, usecols=all_columns)
    .set_index("sjid")
    .rename(
        columns={
            "time_to_Revasc_4": "time_to_revasc_4",
            "time_to_revas_5": "time_to_revasc_5",
        }
    )
)

print("Nb of entries: {}".format(len(df)))
df.head(5)

Nb of entries: 984


Unnamed: 0_level_0,death.x,time_to_death,mi_1,mi_2,mi_3,mi_4,time_to_mi_1,time_to_mi_2,time_to_mi_3,time_to_mi_4,...,diabetes_hist,smoking_bln,hypertension_hist,cholesterolemia_hist,prev_CVD,eGFR_bln,resuscitation,chf_code_proc,lvef_comb,grace_calc
sjid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
003-529-877,,,,,,,,,,,...,0,1,1,1.0,0,72.170715,0.0,1.0,,118.0
003-551-891,,,,,,,,,,,...,0,3,0,0.0,0,85.532219,0.0,1.0,65.0,154.0
003-648-529,,,,,,,,,,,...,0,1,0,1.0,1,80.782486,0.0,1.0,,129.0
005-251-825,,,,,,,,,,,...,0,3,0,1.0,0,105.97847,0.0,1.0,45.0,135.0
005-561-090,,,,,,,,,,,...,0,2,1,1.0,0,52.820919,0.0,1.0,50.0,164.0


### 2.2 Replace NaN for non occured events

Some columns contain NaN simply because the event did not occur

In [9]:
# Event flags and event times are NaN when the event did not occur:
# replace those NaN by 0 for every MI, revascularisation and death column
event_columns = (
    ["mi_" + str(i) for i in range(1, 5)]
    + ["time_to_mi_" + str(i) for i in range(1, 5)]
    + ["revasc_" + str(i) for i in range(1, 7)]
    + ["time_to_revasc_" + str(i) for i in range(1, 7)]
    + ["death.x", "time_to_death"]
)
df[event_columns] = df[event_columns].fillna(0)

## 3. Create correct labels 

Only consider an event as a label if it occurred more than 30 days after the original MI (otherwise it could be related to the original MI). If the event never happened, its time is replaced by -1

### 3.1 Death event and time to death

Only consider deaths that are related to MI (and not to the first one)

<span style="color:red">We consider a death related to an MI if the death occurred within 14 days after the MI. This assumption needs to be checked with a doctor</span>

In [11]:
# Build the "death related to a (non-initial) MI" label and its time.
# Reads: time_to_death, time_to_mi_1..4, death.x (NaN already replaced by 0).
# Writes: death_mi (bool) and time_to_death_mi (time of death, -1 if not flagged).
df["death_mi"] = False

for i in range(1,5):
    # Only consider the deaths that happened at most two weeks after MI number i
    days_from_mi_to_death = df["time_to_death"] - df["time_to_mi_"+str(i)]
    df["death_mi"] = df["death_mi"] | (days_from_mi_to_death <= MAX_NB_DAYS_BTW_MI_AND_DEATH)

# Remove deaths related to first MI
df["death_mi"] = df["death_mi"] & (df["time_to_death"] > MAX_NB_DAYS_COMPICATION)

# Check that the person really died (in theory useless if time correctly indicated in the file).
# The original code AND-ed death_mi with itself, which checked nothing.
# "death.x" was filled with 0 when no death was recorded, so any non-zero
# value is treated as a death here - TODO confirm the coding of death.x.
df["death_mi"] = df["death_mi"] & (df["death.x"] != 0)

# Correct time to death: keep the time of death for MI-related deaths, -1 otherwise.
# The original code wrote time_to_death onto itself, leaving time_to_death_mi at -1 everywhere.
df["time_to_death_mi"] = df["time_to_death"].where(df["death_mi"], -1)

### 3.2 MI event and time to MI

In [12]:
# Validate each recorded MI: it must be flagged (== 1.0) and must have occurred
# more than MAX_NB_DAYS_COMPICATION days after the previous recorded MI
# (for the first one, the delay is its own time value)
previous_mi_time = 0
for k in range(1, 5):
    flag_col = "mi_" + str(k)
    time_col = "time_to_mi_" + str(k)
    days_since_previous = df[time_col] - previous_mi_time
    df[flag_col] = (df[flag_col] == 1.0) & (days_since_previous > MAX_NB_DAYS_COMPICATION)
    previous_mi_time = df[time_col]

# A patient has an MI label as soon as one of the MIs is validated
mi_flag_cols = ["mi_" + str(k) for k in range(1, 5)]
df["mi"] = df[mi_flag_cols].any(axis=1)
df["nb_mi"] = df[mi_flag_cols].astype(int).sum(axis=1)

# Time of the earliest validated MI, or -1 when no MI is validated
df["time_to_mi"] = np.inf
for k in range(1, 5):
    # Non-validated MIs are pushed to +inf so they never win the minimum
    candidate_time = df["time_to_mi_" + str(k)].where(df["mi_" + str(k)], np.inf)
    df["time_to_mi"] = np.minimum(df["time_to_mi"], candidate_time)
df["time_to_mi"] = df["time_to_mi"].replace(np.inf, -1)

### 3.3 Revascularisation event and time to revascularisation

In [13]:
# Build the revascularisation label, count and time.
# Reads: revasc_1..6, time_to_revasc_1..6 (NaN already replaced by 0) and time_to_mi.
# Writes: revasc_1..6 (bool), revasc (bool), nb_revasc (int), time_to_revasc (-1 if none).

# "time_to_mi" holds the sentinel -1 when the patient has no valid MI. Using it
# directly in the subtraction shifted the threshold by one day for those
# patients (a revascularisation at day 30 gave 30 - (-1) = 31 > 30).
# The sentinel is therefore mapped to 0 before measuring the delay.
reference_mi_time = df["time_to_mi"].clip(lower=0)

# Only consider the revascularisation that occured more than 30 days after the first valid MI
for i in range(1,7):
    nb_days_from_first_mi = df["time_to_revasc_"+str(i)] - reference_mi_time
    df["revasc_"+str(i)] = (nb_days_from_first_mi > MAX_NB_DAYS_COMPICATION) & (df["revasc_"+str(i)] == 1.0) 
    
# Consider revasc if one of the revasc is validated
df["revasc"] = df["revasc_1"] | df["revasc_2"] | df["revasc_3"] | df["revasc_4"] | df["revasc_5"] | df["revasc_6"]
df["nb_revasc"] = df["revasc_1"].astype(int) + df["revasc_2"].astype(int) + df["revasc_3"].astype(int) + \
                  df["revasc_4"].astype(int) + df["revasc_5"].astype(int) + df["revasc_6"].astype(int)

# Take the lowest time of valid revasc (-1 when no revascularisation is validated)
df["time_to_revasc"] = np.inf
for i in range(1,7):
    mask = df["revasc_"+str(i)] & (df["time_to_revasc_"+str(i)]<df["time_to_revasc"])
    df.loc[mask, "time_to_revasc"] = df["time_to_revasc_"+str(i)]
df.loc[df["time_to_revasc"]==np.inf, "time_to_revasc"] = -1

## 4. Remove used features

In [14]:
# The per-event columns have been condensed into the aggregated labels
# above, so they are removed from the dataset
used_columns = (
    ["revasc_" + str(i) for i in range(1, 7)]
    + ["time_to_revasc_" + str(i) for i in range(1, 7)]
    + ["mi_" + str(i) for i in range(1, 5)]
    + ["time_to_mi_" + str(i) for i in range(1, 5)]
    + ["death.x", "time_to_death"]
)
df = df.drop(columns=used_columns)

## 5. Save datasets

### 5.1 Raw data

In [15]:
import os

# Force all categories to go from 0 to N
# (each of these columns is shifted down by one; presumably they are coded from 1 in the source file - TODO confirm)
# NOTE: this modifies df in place, so re-running this cell shifts the values again
for category_col in ["sex", "smoking_bln", "chf_code_proc"]:
    df[category_col] -= 1

# Make sure the output folder exists, otherwise to_csv / to_excel fail
os.makedirs(SAVE_PATH, exist_ok=True)

# Save
df.to_csv(SAVE_PATH+"full_mi_patient_data.csv")
df.to_excel(SAVE_PATH+"full_mi_patient_data.xlsx")

### 5.2 NaN filled

Categorical features are filled with their most frequent value, continuous features with their median

In [16]:
# Work on a copy so that the NaN filling below is applied to df_no_na and not to df
df_no_na = df.copy()

In [17]:
# Print the number of missing values of every column that has at least one
nan_counts = df_no_na.isna().sum()
for col, nb_nan in nan_counts[nan_counts != 0].items():
    print("Nb of non valid values for {} is {}".format(col, nb_nan))

Nb of non valid values for bmi is 3
Nb of non valid values for cholesterolemia_hist is 1
Nb of non valid values for eGFR_bln is 31
Nb of non valid values for resuscitation is 3
Nb of non valid values for chf_code_proc is 10
Nb of non valid values for lvef_comb is 214
Nb of non valid values for grace_calc is 82


In [18]:
# Categorical data filled with most frequent
# value_counts() is indexed by the category values, so idxmax() returns the
# most frequent category itself. The former argmax() returns a position in
# current pandas (always 0, as the counts are sorted in descending order),
# which filled the NaN with 0 whatever the most frequent category was.
for col in ["cholesterolemia_hist", "resuscitation", "chf_code_proc"]:
    most_frequent = df_no_na[col].value_counts().idxmax()
    df_no_na[col] = df_no_na[col].fillna(most_frequent)

# Continous data filled with median
for col in ["bmi", "eGFR_bln", "grace_calc", "lvef_comb"]:
    df_no_na[col] = df_no_na[col].fillna(df_no_na[col].median())

In [19]:
# Print the columns that still contain missing values after the filling
remaining_nan = df_no_na.isna().sum()
for col, nb_nan in remaining_nan[remaining_nan != 0].items():
    print("Nb of non valid values for {} is {}".format(col, nb_nan))

In [20]:
# Write the NaN-filled dataset next to the raw one, as CSV and as Excel
csv_path_no_na = SAVE_PATH+"full_mi_patient_data_no_na.csv"
xlsx_path_no_na = SAVE_PATH+"full_mi_patient_data_no_na.xlsx"
df_no_na.to_csv(csv_path_no_na)
df_no_na.to_excel(xlsx_path_no_na)

----