In [1]:
# Libraries used
using DataFrames;
using CSV;
include("preprocessing.jl");

# Preprocessing

First, we drop every column in which more than 35% of the instances are missing.

In [2]:
# Read the raw table and discard the "id" column in one step.
dataset = select(CSV.read("support2.csv", DataFrame; delim = ','), Not("id"))

# Keep `dataset` untouched so the before/after report below can compare sizes.
support2 = copy(dataset)

# Features with more than 35% of their data missing are removed.
featsOut = getMissingColumns(support2, 0.35)
support2 = select(support2, Not(featsOut))

# race and dnr are the only categoricals with missing values, and only a few,
# so the affected rows are dropped rather than imputed.
support2 = dropmissing(support2, [:dnr, :race])

# Before/after report.
println("Features original dataset: ", ncol(dataset))
println("Intances original dataset: ", nrow(dataset))
println()
println("Features with more than 35% missing data: ", featsOut)
println("Features after transform: ", ncol(support2))
println("Instances after transform: ", nrow(support2))

Features original dataset: 47
Intances original dataset: 9105

Features with more than 35% missing data: [:totmcst, :alb, :glucose, :bun, :urine, :adlp]
Features after transform: 41
Instances after transform: 9033


Next, we split the input features into categorical and numerical groups, since each group needs different preprocessing.

In [3]:
# Column groups; each group goes through its own preprocessing later on.
targetName = ["death", "hospdead"]
catNames = ["sex", "dzgroup", "dzclass", "race", "dnr", "dementia", "diabetes"]

# One sub-table per group. Every column that is neither a target nor a
# categorical ends up in support2Num.
support2Target = select(support2, targetName)
support2Cat = select(support2, catNames)
support2Num = select(support2, Not(vcat(targetName, catNames)))

# Among those remaining columns, list the ones whose element type is numeric
# (missing values allowed). `num_feats` is kept as an alias of `numNames`.
numNames = num_feats = names(support2Num, Union{Missing, Number})

println("Target name: ", targetName)
println("Numerical features (", length(numNames), "): ", numNames)
println("Categorical features (", length(catNames), "): ", catNames);

Target name: ["death", "hospdead"]
Numerical features (29): ["age", "slos", "d.time", "num.co", "edu", "scoma", "charges", "totcst", "avtisst", "sps", "aps", "surv2m", "surv6m", "hday", "prg2m", "prg6m", "dnrday", "meanbp", "wblc", "hrt", "resp", "temp", "pafi", "bili", "crea", "sod", "ph", "adls", "adlsc"]
Categorical features (7): ["sex", "dzgroup", "dzclass", "race", "dnr", "dementia", "diabetes"]


- Categorical

In [4]:
###############
# Categorical #
###############

# One-hot encode every categorical feature except the last two entries of
# catNames: dementia and diabetes are already in OHE format.
#
# foldl passes the table returned by each dfOneHotEncoding! call on to the
# next one, exactly like the former `for` loop did. A single top-level
# assignment is used because rebinding a global from inside a top-level loop
# only works under the interactive soft-scope rule and fails when the same
# code is run from a file. It also avoids a loop variable called `cat`, which
# shadowed Base.cat.
support2Cat = foldl(dfOneHotEncoding!, catNames[1:end-2]; init = support2Cat)

# NOTE: catNames is overwritten with the encoded column names here, so the
# slice above is only valid on the first run. Re-run the partition cell
# before re-running this one.
catNames = names(support2Cat)
println("Categorical features (", length(catNames), "): ", catNames);
show(first(support2Cat,2), allcols = true, allrows = true)

Categorical features (24): ["dementia", "diabetes", "sex_male", "sex_female", "dzgroup_Lung Cancer", "dzgroup_Cirrhosis", "dzgroup_ARF/MOSF w/Sepsis", "dzgroup_Coma", "dzgroup_CHF", "dzgroup_Colon Cancer", "dzgroup_COPD", "dzgroup_MOSF w/Malig", "dzclass_Cancer", "dzclass_COPD/CHF/Cirrhosis", "dzclass_ARF/MOSF", "dzclass_Coma", "race_other", "race_white", "race_black", "race_hispanic", "race_asian", "dnr_no dnr", "dnr_dnr after sadm", "dnr_dnr before sadm"]
2×24 DataFrame
 Row │ dementia  diabetes  sex_male  sex_female  dzgroup_Lung Cancer  dzgroup_Cirrhosis  dzgroup_ARF/MOSF w/Sepsis  dzgroup_Coma  dzgroup_CHF  dzgroup_Colon Cancer  dzgroup_COPD  dzgroup_MOSF w/Malig  dzclass_Cancer  dzclass_COPD/CHF/Cirrhosis  dzclass_ARF/MOSF  dzclass_Coma  race_other  race_white  race_black  race_hispanic  race_asian 

- Numerical

In [5]:
#############
# Numerical #
#############
# Some ordinal features do not have an appropriate format, so we correct it.
# Each inner Dict maps the raw label of a feature to its ordinal rank;
# `missing` is mapped to `missing` so that it survives until the final step.
ordinalDict = Dict("income" => Dict(missing => missing,
                                   "under \$11k" => 1,
                                    "\$11-\$25k" => 2,
                                    "\$25-\$50k" => 3,
                                    ">\$50k" => 4),
                    "sfdm2" => Dict(missing => missing,
                                    "no(M2 and SIP pres)" => 1,
                                    "adl>=4 (>=5 if sur)" => 2,
                                    "SIP>=30" => 3,
                                    "Coma or Intub" => 4,
                                    "<2 mo. follow-up" => 5),
                    "ca" => Dict("no" => 1,
                                 "yes" => 2,
                                 "metastatic" => 3))

# Dict iteration order is unspecified and may differ between Julia versions.
# Looping over ordinalDict directly therefore left the processing order, and
# presumably the order of the resulting `_ord` columns in the saved CSV, to
# chance. The explicit list pins the order shown in the recorded output
# (income_ord, ca_ord, sfdm2_ord).
ordinalOrder = ["income", "ca", "sfdm2"]

# foldl passes the table returned by each dfOrdinalEncoding! call on to the
# next one. A single top-level assignment also avoids rebinding a global from
# inside a top-level loop, which only works under interactive soft scope.
support2Num = foldl(ordinalOrder; init = support2Num) do df, feat
    dfOrdinalEncoding!(df, feat, ordinalDict[feat])
end

# Change missing by NaN for using imputer.
support2Num = coalesce.(support2Num, NaN);

Store the results in a CSV file.

In [6]:
# Put the three groups back side by side: numerical, categorical, targets.
support2Clean = hcat(support2Num, support2Cat, support2Target)

# Write the cleaned table to disk.
CSV.write("support2_cleaned.csv", support2Clean)

# Preview the first two rows with every column visible.
show(first(support2Clean, 2); allcols = true, allrows = true)

2×58 DataFrame
 Row │ age      slos   d.time  num.co  edu      scoma  charges  totcst   avtisst  sps      aps    surv2m    surv6m     hday   prg2m    prg6m    dnrday  meanbp   wblc     hrt      resp     temp     pafi     bili      crea     sod      ph       adls     adlsc    income_ord  ca_ord  sfdm2_ord  dementia  diabetes  sex_male  sex_female  dzgroup_Lung Cancer  dzgroup_Cirrhosis  dzgroup_ARF/MOSF w/Sepsis  dzgroup_Coma  dzgroup_CHF  dzgroup_Colon Cancer  dzgroup_COPD  dzgroup_MOSF w/Malig  dzclass_Cancer  dzclass_COPD/CHF/Cirrhosis  dzclass_ARF/MOSF  dzclass_Coma  race_other 