In [39]:
import pandas as pd

### Merging individual Treatment Files

In [40]:
pressor = pd.read_csv("files/Pressor.csv")
ventilation = pd.read_csv("files/ventilation.csv")
dialysis = pd.read_csv("files/Dialysis.csv")

#### Vasopressor: Unite into one activity from the first to the last timestamp

In [41]:
agg_timestamps = {'starttime': ['first'],'endtime': ['last']}
pressor_new = pressor.groupby(['stay_id','linkorderid'], as_index=False).agg(agg_timestamps)
pressor_new = pressor_new.drop(columns=["linkorderid"], axis = 1)
pressor_new = pressor_new.droplevel(level=1, axis = 1)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


#### Vasopressor: Generalize all vasopressor under the name pressor

In [42]:
pressor_new["activity"] = "PRESSOR"

#### Dialysis: Unite ongoing Dialysis into one event

In [43]:
arr = dialysis.to_numpy()
activated = False
activated_tr = ""
activated_stay = 0
activated_line = 0
last_activated_time = ""

lines_to_drop = []
end_timestamps = ['0' for i in range(0,len(arr))]

last_stay = 0
last_stat = 0
#clean activatedness
for line in range(0,len(arr)):
    curr_stay = arr[line][0]
    curr_stat = arr[line][3]
    if curr_stat == 1:
        if last_stat == 0:
            activated = True
            activated_tr = arr[line][4]
            activated_stay = curr_stay
            activated_line = line
            last_activated_time = arr[line][1]
        if last_stat == 1:
            #check if same treatment
            if curr_stay == activated_stay and activated_tr == arr[line][4]:
                last_activated_time = arr[line][1]
                lines_to_drop.append(line)
            else: 
                end_timestamps[activated_line] = last_activated_time
                #store new treatment information
                activated = True
                activated_tr = arr[line][4]
                activated_stay = curr_stay
                activated_line = line
                last_activated_time = arr[line][1]
    else:
        activated = False
        lines_to_drop.append(line)
        if last_stat == 1:
            #check if same treatment
            if curr_stay == activated_stay and activated_tr == arr[line][4]:
                end_timestamps[activated_line] = arr[line][1]
            else: 
                end_timestamps[activated_line] = last_activated_time
    last_stat = curr_stat
    last_stay = curr_stay
if activated:
    end_timestamps[activated_line] = last_activated_time
dialysis["endtime"] = end_timestamps
dialysis = dialysis.drop(lines_to_drop,axis = 0)

In [44]:
dialysis["activity"] = "DIALYSIS"
dialysis = dialysis.rename(columns = {"charttime":"starttime"})
dialysis = dialysis.drop(["dialysis_present","dialysis_active","dialysis_type"],axis = 1)

#### Ventilation: Rename Activity and Drop the status info

In [45]:
ventilation["activity"] = "VENTILATION"
ventilation = ventilation.drop(["ventilation_status"],axis = 1)

### Merge to an event log

In [46]:
log = pd.concat([dialysis, ventilation, pressor_new])
log["starttime"] = pd.to_datetime(log["starttime"])
log["endtime"] = pd.to_datetime(log["endtime"])
log.sort_values(by='starttime', inplace=True)

#### Join treatments of the same kind to one single treatment with start and end

In [47]:
log = log.groupby(['stay_id','activity'], as_index=False).agg(agg_timestamps)
log = log.droplevel(level=1, axis = 1)

### Add Patient Data

In [48]:
pat = pd.read_csv("files/patient_data.csv") ####Add the control variables and comorbidities here

In [49]:
log = log.join(pat.set_index('stay_id'),on="stay_id", how="left",rsuffix="_r")


In [50]:
log["language"] = log["language"].replace("?","Non-native speaker")
log["language"] = log["language"].replace("ENGLISH","Native english speaker")
log["language"].value_counts()
log = log[log['language'].notna()]

### Add admit and discharge event for each patient and sofa score for each event

In [51]:
sofa = pd.read_csv("files/sofa_scores_update.csv")

In [52]:
from datetime import datetime, timedelta
log_arr_new = []
log_arr = [x for _,x in log.groupby("stay_id")]
for df in log_arr:
    first_row = df.iloc[0].copy()
    last_row = df.iloc[0].copy()
    sofa_scores = sofa[sofa["stay_id"]==first_row["stay_id"]].sort_values("starttime")
    sofa_scores["starttime"] = pd.to_datetime(sofa_scores["starttime"])
    sofa_scores["endtime"] = pd.to_datetime(sofa_scores["endtime"])
    df["starttime"] = pd.to_datetime(df["starttime"])
    df["endtime"] = pd.to_datetime(df["endtime"])
    
    first_row["activity"] = "ADMIT"
    last_row["activity"] = "DISCHARGE"
    first_row["starttime"] = first_row["admittime"]
    first_row["endtime"] = first_row["admittime"]
    last_row["starttime"] = last_row["dischtime"]
    last_row["endtime"] = first_row["dischtime"]
    first_row["SOFA"] = sofa_scores.iloc[0]["sofa_24hours"]
    last_row["SOFA"] = sofa_scores.iloc[-1]["sofa_24hours"]
    sofa_score_arr = sofa_scores.to_numpy()
    new_df = pd.DataFrame({})
    for i_row, row in df.iterrows():
        found = False
        for i in range(0,len(sofa_score_arr)):
            #some sofas are recorded after the first treatment, they will be assigned with the first recorded sofa score
            if sofa_score_arr[i][1] <= row["starttime"] and sofa_score_arr[i][2]>= row["starttime"]:
                row["SOFA"] = sofa_score_arr[i][3]
                found = True
                break
        if not found:
            row["SOFA"] = sofa_score_arr[0][3]
        new_df = pd.concat([new_df,row.to_frame().T])  
    log_arr_new.append(pd.concat([new_df,last_row.to_frame().T,first_row.to_frame().T]))

In [53]:
log = pd.concat(log_arr_new)

In [55]:
log = log.sort_values(by=["stay_id","starttime"])

### Write the event log

In [56]:
log.to_csv("files/event_log.csv",index = False)