# 1- Packages Import

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

# 2- Approach

In this notebook, we perform elementary preprocessing operations on our 4 data files: <br/>
- Masterfile 24h <br/>
- Masterfile 6h <br/>
- Masterfile 4h <br/>
- Masterfile 2h

# 3- Data Import

In [8]:
# Read the four creatinine master files (24h, 6h, 4h and 2h), using the first
# CSV column as the index of each frame.
data24, data6, data4, data2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/Creatinine_data/CR_Masterfile24h.csv',
        '../data/Creatinine_data/CR_Masterfile6h.csv',
        '../data/Creatinine_data/CR_Masterfile4h.csv',
        '../data/Creatinine_data/CR_Masterfile2h.csv',
    )
)

In [9]:
# Unique patient identifiers found in the 4h and 2h master files.
patients4 = data4['subject_id'].unique()
patients2 = data2['subject_id'].unique()

print(data2.shape)
print(data4.shape)

# Restrict the 4h, 6h and 24h frames to the patients that appear in the 2h file.
data4 = data4[data4['subject_id'].isin(patients2)]
data6 = data6[data6['subject_id'].isin(patients2)]
data24 = data24[data24['subject_id'].isin(patients2)]

(4786, 85)
(5627, 85)


In [10]:
# Collect the four frames in one dict keyed by the horizon in hours.
data = {
    24: data24,
    6: data6,
    4: data4,
    2: data2,
}

In [11]:
# Show the (rows, columns) shape of every horizon, one per line.
print(*(data[hours].shape for hours in (2, 4, 6, 24)), sep="\n")

(4786, 85)
(4786, 85)
(4786, 85)
(4786, 85)


# 4- Elementary Preprocessing

### 4.1- Trend Features?

### 4.2- Timelag filter

Here, we delete rows with a timelag (`Cr_base_timelag`) greater than 30 days, i.e. rows whose value is below -30 (the filter keeps values >= -30).

In [12]:
# Name of the column the timelag filter is applied to.
timelag = "Cr_base_timelag"
# Largest timelag, in days, that a row may have and still be kept.
nb_max_days = 30

In [13]:
# Keep only the rows whose timelag is at least -nb_max_days. The cutoff comes
# from the nb_max_days constant instead of a second hard-coded 30.
for i in [2,4,6,24]:
    keep = data[i][timelag] >= -nb_max_days
    # Rows with a missing timelag fail the comparison and are removed as well,
    # so the count is rows-before minus rows-kept, not just the rows below the
    # cutoff.
    n_deleted = len(data[i]) - int(keep.sum())
    print("Number of rows deleted for "+str(i) +"hours : ", n_deleted)
    data[i] = data[i].loc[keep]

Number of rows deleted for 2hours :  143
Number of rows deleted for 4hours :  143
Number of rows deleted for 6hours :  143
Number of rows deleted for 24hours :  143


### 4.3- Features selection

Here, we delete features using: <br/>
- our correlation analysis <br/>
- the medical insights about redundant features

In [14]:
# Let pandas display up to 100 columns and 100 rows before truncating output.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

We do not perform any "best subset selection" here.

In [16]:
# Columns to be removed ("tbr"), grouped by the reason for removing them.
# Column names are read from the 24h frame.
i = 24
id_features_tbr = ['subject_id', 'icustay_id', 'hadm_id']
time_features_tbr = ['Cr_baseline_time', 'admittime', 'Cr_initial_time']
other_features_tbr = ['icu_length_of_stay', 'admission_location', 'diagnosis']

# Every column whose name starts with 'MAP' joins the correlated-features group.
map_columns = [name for name in data[i].columns if name.startswith('MAP')]
corr_features_tbr = ['Hematocrit', 'PT', 'MCH', 'MCV'] + map_columns

features_to_be_removed = [
    *id_features_tbr,
    *time_features_tbr,
    *corr_features_tbr,
    *other_features_tbr,
]

In [17]:
# Remove the selected columns from the frame of every horizon.
for i in (2, 4, 6, 24):
    data[i] = data[i].drop(columns=features_to_be_removed)

### 4.4- Missing Values Filter

In [18]:
# For every horizon, build a two-column table: the column name and the
# percentage of its values that are null, rounded to two decimals.
df_missing = {}
for i in (2, 4, 6, 24):
    frame = data[i]
    column_names = list(frame.columns)
    pct_missing = [
        round(int(frame[name].isnull().sum()) / len(frame) * 100, 2)
        for name in frame.columns
    ]
    df_missing[i] = pd.DataFrame({"column": column_names, "% missing": pct_missing})

In [19]:
# Columns that are at least `max_missing_pct` percent null in the 24h file are
# selected for removal; the 24h missing-value table alone decides the list.
i = 24
max_missing_pct = 80
missing_24h = df_missing[i]
sparse_columns = missing_24h.loc[missing_24h['% missing'] >= max_missing_pct, "column"].unique()
# 'vassopresor_doses' is never dropped (a later cell fills its gaps with 0).
# Filtering it out here, rather than calling list.remove(), avoids a
# ValueError when that column is below the threshold and so not in the list.
features_to_drop = [
    c for c in sparse_columns
    if c != "vassopresor_doses" and c in data[i].columns
]

In [20]:
# Remove the mostly-missing columns from the frame of every horizon.
for i in (2, 4, 6, 24):
    data[i] = data[i].drop(columns=features_to_drop)

### 4.5- Encode categorical variables

In [4]:
import re
def get_race(race):
    """Return the part of ``race`` that precedes its first separator.

    A separator is any single punctuation or space character listed in the
    pattern below. When the string contains no separator it is returned
    unchanged.
    """
    separator = re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>? ]', race)
    if separator is None:
        return race
    return race[:separator.start()]
    
def convert_race(race):
    """Return ``race`` if it is one of the four named groups, else 'OTHER'."""
    known_groups = ('WHITE', 'BLACK', 'HISPANIC', 'ASIAN')
    return race if race in known_groups else 'OTHER'

def get_race_straight(race_long):
    """Map a detailed ethnicity label to one of five coarse groups.

    The label is cut at its first punctuation or space character. The leading
    token is returned when it is 'WHITE', 'BLACK', 'HISPANIC' or 'ASIAN';
    any other token yields 'OTHER'.

    Parameters
    ----------
    race_long : str
        Detailed label, e.g. 'BLACK/AFRICAN AMERICAN'. Non-string values
        (such as None or the float NaN that pandas uses for missing cells)
        are mapped to 'OTHER' instead of raising ``TypeError`` in ``re.split``.

    Returns
    -------
    str
        One of 'WHITE', 'BLACK', 'HISPANIC', 'ASIAN' or 'OTHER'.
    """
    if not isinstance(race_long, str):
        # Missing cells reach this function as NaN when it is used with
        # Series.apply; they cannot be split, so they fall in the catch-all.
        return 'OTHER'
    race = re.split(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>? ]', race_long)[0]
    if race in ['WHITE', 'BLACK', 'HISPANIC', 'ASIAN']:
        return race
    return 'OTHER'



In [21]:
def is_female(g):
    """Return 1 when the gender code ``g`` equals 'F', otherwise 0."""
    return 1 if g == 'F' else 0

In [23]:
# Replace each categorical column by integer codes. A fresh LabelEncoder is
# fitted for every (horizon, column) pair.
# NOTE(review): because the encoders are fitted per horizon, a category only
# gets the same code in all four files if every file holds the same set of
# values for that column - confirm.
for i in (2, 4, 6, 24):
    for col in ('admission_type', 'current_service', 'RenalFailure_dx', 'Sepsis_dx', 'Infection_dx'):
        le = preprocessing.LabelEncoder()
        data[i][col] = le.fit_transform(data[i][col])

# A missing vasopressor dose is replaced by 0.
for i in (2, 4, 6, 24):
    data[i]['vassopresor_doses'] = data[i]['vassopresor_doses'].fillna(0)

In [22]:
# Turn 'gender' into a 0/1 'is_female' column, then remove the original
# column from the same frame object.
for i in (2, 4, 6, 24):
    frame = data[i]
    frame['is_female'] = frame['gender'].apply(is_female)
    frame.drop(columns=['gender'], inplace=True)




In [24]:
# Collapse every ethnicity label into the coarse group that
# get_race_straight returns for it.
for i in (2, 4, 6, 24):
    frame = data[i]
    frame['ethnicity'] = frame['ethnicity'].apply(get_race_straight)





# 5- Data Export

In [25]:
# Write each preprocessed frame to its own CSV file; the horizon in hours is
# inserted into the file name.
for i in (2, 4, 6, 24):
    export_name_data = '../data/Creatinine_data/master_file_different_hours/PreProcessed_Masterfile{}h.csv'.format(i)
    data[i].to_csv(export_name_data)