In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)

# Dataset Engineering

Input: data/dataset.xlsx
Output: data/dataset_clean.xlsx

Data Cleaning
 - filtering for covid positive patients
 - removing cols with >= 98% missing
Data Engineering:
 - Imputing numerical vars based on mean for col
 - Grouping vars: Either create binary indicator of whether or not a group of tests was completed OR create numerical variable of # of tests from group completed
 - Y var: any changes here?

In [2]:
#original data
df = pd.read_excel("../data/dataset.xlsx")

#filter for only positive patients
df = df[df['SARS-Cov-2 exam result'] == 'positive']
df.head(5)

Unnamed: 0,Patient ID,Patient age quantile,SARS-Cov-2 exam result,"Patient addmited to regular ward (1=yes, 0=no)","Patient addmited to semi-intensive unit (1=yes, 0=no)","Patient addmited to intensive care unit (1=yes, 0=no)",Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,Basophils,Mean corpuscular hemoglobin (MCH),Eosinophils,Mean corpuscular volume (MCV),Monocytes,Red blood cell distribution width (RDW),Serum Glucose,Respiratory Syncytial Virus,Influenza A,Influenza B,Parainfluenza 1,CoronavirusNL63,Rhinovirus/Enterovirus,Mycoplasma pneumoniae,Coronavirus HKU1,Parainfluenza 3,Chlamydophila pneumoniae,Adenovirus,Parainfluenza 4,Coronavirus229E,CoronavirusOC43,Inf A H1N1 2009,Bordetella pertussis,Metapneumovirus,Parainfluenza 2,Neutrophils,Urea,Proteina C reativa mg/dL,Creatinine,Potassium,Sodium,"Influenza B, rapid test","Influenza A, rapid test",Alanine transaminase,Aspartate transaminase,Gamma-glutamyltransferase,Total Bilirubin,Direct Bilirubin,Indirect Bilirubin,Alkaline phosphatase,Ionized calcium,Strepto A,Magnesium,pCO2 (venous blood gas analysis),Hb saturation (venous blood gas analysis),Base excess (venous blood gas analysis),pO2 (venous blood gas analysis),Fio2 (venous blood gas analysis),Total CO2 (venous blood gas analysis),pH (venous blood gas analysis),HCO3 (venous blood gas analysis),Rods #,Segmented,Promyelocytes,Metamyelocytes,Myelocytes,Myeloblasts,Urine - Esterase,Urine - Aspect,Urine - pH,Urine - Hemoglobin,Urine - Bile pigments,Urine - Ketone Bodies,Urine - Nitrite,Urine - Density,Urine - Urobilinogen,Urine - Protein,Urine - Sugar,Urine - Leukocytes,Urine - Crystals,Urine - Red blood cells,Urine - Hyaline cylinders,Urine - Granular cylinders,Urine - Yeasts,Urine - Color,Partial thromboplastin time (PTT),Relationship (Patient/Normal),International normalized ratio (INR),Lactic Dehydrogenase,"Prothrombin time (PT), Activity",Vitamin B12,Creatine phosphokinase (CPK),Ferritin,Arterial Lactic Acid,Lipase dosage,D-Dimer,Albumin,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),Arteiral Fio2,Phosphor,ctO2 (arterial blood gas analysis)
67,78511c183ae18bc,7,positive,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
284,d7834ed75f2da44,16,positive,1,0,0,,,,,,,,,,,,,,,,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
513,b16b49f7bd3e692,10,positive,0,0,0,,,,,,,,,,,,,,,,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
568,4382f5ea05e60c4,2,positive,0,0,0,0.991838,0.792188,-0.341548,1.469188,1.653476,-0.048383,-0.452899,-0.420197,1.303529,-1.442245,-0.498393,-1.396114,1.933339,0.967144,,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,not_detected,-0.414215,-0.513909,,-0.32234,-0.058626,0.143752,,,-0.504127,-0.317035,-0.330608,1.355535,1.163312,1.198484,3.060642,,,,-0.090035,0.337027,-0.611396,-0.084646,,-0.479346,-0.436537,-0.512865,,,,,,,,,,,,,,,,,,,,,,,,,,0.000994,0.86241,-0.620717,,,0.125483,,,,,,,,,,,,,,,
676,d3729cd2658ca64,15,positive,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Feature Engineering

Reading in and combining all the binary features we created based on lab test groupings.

In [3]:
#engineered features
virus = pd.read_csv('../data/positive_virus_bac_labs.csv', usecols = range(1,4))
pot_sod = pd.read_csv('../data/pot_sod_grp.csv', usecols = range(1,3))
bili_blood = pd.read_csv('../data/bili-bloodgasses-grp.csv', usecols = range(1,6))
urine = pd.read_csv('../data/urine_data.csv', names = ['Patient ID', 'urine_test'], skiprows = 1)
blood = pd.read_csv('../data/blood_test_id.csv')

#combining all new features
features = virus.merge(pot_sod,on='Patient ID').merge(bili_blood,on='Patient ID').merge(urine,on='Patient ID').merge(blood,on='Patient ID')
features.head()

Unnamed: 0,Patient ID,virus_lab_taken,num_virus_or_bacteria_dectected,pot_sod_grp,arterial_grp,venous_grp,art_ven_grp,bili_grp,urine_test,blood_test_grp
0,78511c183ae18bc,0,0,0,0,0,0,0,0,0
1,d7834ed75f2da44,1,0,0,0,0,0,0,0,0
2,b16b49f7bd3e692,1,0,0,0,0,0,0,0,0
3,4382f5ea05e60c4,1,0,1,0,1,0,1,0,1
4,d3729cd2658ca64,0,0,0,0,0,0,0,0,0


## Removing columns with missing values

We decided to remove all columns with 98% or more missing values. First we check the proportion of missing values in the entire dataset. Then we remove all the ones with at least 98% missing values.

There are 25 variables we need to remove.

In [4]:
#find columns with 98% or higher values missing
missing = (df.isnull().sum().sort_values(ascending = False)).to_frame(name = 'num')
missing.reset_index(inplace=True)
missing.rename(columns = {'index':'var'}, inplace = True)
missing['prop'] = missing['num']/len(df)
missing[missing.prop>=.98]

Unnamed: 0,var,num,prop
0,Mycoplasma pneumoniae,558,1.0
1,Vitamin B12,558,1.0
2,Urine - Sugar,558,1.0
3,Urine - Nitrite,558,1.0
4,Fio2 (venous blood gas analysis),558,1.0
5,Partial thromboplastin time (PTT),558,1.0
6,Albumin,558,1.0
7,D-Dimer,558,1.0
8,"Prothrombin time (PT), Activity",558,1.0
9,Phosphor,557,0.998208


In [5]:
#remove above columns from df
remove = list(missing[missing.prop>=.98]['var'])
df.drop(remove, axis = 1, inplace = True)

## Remove infection test results

We have two new variables that tell us whether a patient was tested for bacterial/viral infections and the number of positive test results. Remove individual test result cols. 

In [6]:
df.drop(df.loc[:, 'Respiratory Syncytial Virus':'Parainfluenza 2'].columns, axis = 1, inplace = True)
df.head(10)

Unnamed: 0,Patient ID,Patient age quantile,SARS-Cov-2 exam result,"Patient addmited to regular ward (1=yes, 0=no)","Patient addmited to semi-intensive unit (1=yes, 0=no)","Patient addmited to intensive care unit (1=yes, 0=no)",Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,Basophils,Mean corpuscular hemoglobin (MCH),Eosinophils,Mean corpuscular volume (MCV),Monocytes,Red blood cell distribution width (RDW),Serum Glucose,Neutrophils,Urea,Proteina C reativa mg/dL,Creatinine,Potassium,Sodium,"Influenza B, rapid test","Influenza A, rapid test",Alanine transaminase,Aspartate transaminase,Gamma-glutamyltransferase,Total Bilirubin,Direct Bilirubin,Indirect Bilirubin,Alkaline phosphatase,Strepto A,pCO2 (venous blood gas analysis),Hb saturation (venous blood gas analysis),Base excess (venous blood gas analysis),pO2 (venous blood gas analysis),Total CO2 (venous blood gas analysis),pH (venous blood gas analysis),HCO3 (venous blood gas analysis),Urine - Aspect,Urine - pH,Urine - Hemoglobin,Urine - Bile pigments,Urine - Density,Urine - Urobilinogen,Urine - Leukocytes,Urine - Crystals,Urine - Red blood cells,Urine - Granular cylinders,Urine - Yeasts,Urine - Color,Relationship (Patient/Normal),International normalized ratio (INR),Lactic Dehydrogenase,Creatine phosphokinase (CPK),Arterial Lactic Acid,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),ctO2 (arterial blood gas analysis)
67,78511c183ae18bc,7,positive,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
284,d7834ed75f2da44,16,positive,1,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
513,b16b49f7bd3e692,10,positive,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
568,4382f5ea05e60c4,2,positive,0,0,0,0.991838,0.792188,-0.341548,1.469188,1.653476,-0.048383,-0.452899,-0.420197,1.303529,-1.442245,-0.498393,-1.396114,1.933339,0.967144,,-0.414215,-0.513909,,-0.32234,-0.058626,0.143752,,,-0.504127,-0.317035,-0.330608,1.355535,1.163312,1.198484,3.060642,,-0.090035,0.337027,-0.611396,-0.084646,-0.479346,-0.436537,-0.512865,,,,,,,,,,,,,0.000994,0.86241,-0.620717,0.125483,,,,,,,,,
676,d3729cd2658ca64,15,positive,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
884,ab9745da9d139b9,15,positive,1,0,0,-0.495919,-0.398276,-0.718402,-0.438097,-0.56795,-0.935404,0.244149,-0.820919,-1.140144,0.334989,-0.66695,0.22628,-0.456613,-0.978899,-0.928044,1.282118,0.974759,-0.247246,0.160114,-1.047272,0.862512,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1011,a0435dd40e745fd,7,positive,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1108,9d7c293a6773d6a,14,positive,0,0,0,-0.312811,-0.6489,-0.027502,-0.101517,-0.656101,-0.099557,-1.448681,-0.968407,-0.529226,0.021361,0.175837,0.807138,1.513128,0.347948,-0.110364,-0.315877,0.304858,-0.479726,-0.046652,0.682859,1.940651,negative,negative,-0.504127,-0.394452,-0.409223,-0.787085,-0.586463,-0.771034,-0.243405,negative,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1113,ff5ec3abd7d6088,5,positive,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1129,7b1d6bf5993aa9b,9,positive,0,0,0,,,,,,,,,,,,,,,,,,,,,,negative,negative,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Create Y variable

To predict which ward a patient was placed in we need to take the three binary ward variables and create one y variable with 4 classes:
* Admitted to the regular ward
* Admitted to the semi-intensive unit
* Admitted to the intensive care unit
* discharged

In [7]:
conditions = [
    (df['Patient addmited to regular ward (1=yes, 0=no)'] == 1),
    (df['Patient addmited to semi-intensive unit (1=yes, 0=no)'] == 1),
    (df['Patient addmited to intensive care unit (1=yes, 0=no)'] == 1),
    (df['Patient addmited to intensive care unit (1=yes, 0=no)'] == 0) & 
     (df['Patient addmited to semi-intensive unit (1=yes, 0=no)'] == 0) &
     (df['Patient addmited to regular ward (1=yes, 0=no)'] == 0)]

# create a list of the values we want to assign for each condition
values = ['regular', 'semi', 'icu', 'discharged']

# create a new column and use np.select to assign values to it using our lists as arguments
df['y'] = np.select(conditions, values)

#remove unnecessary columns
df.drop(['Patient addmited to regular ward (1=yes, 0=no)',
        'Patient addmited to semi-intensive unit (1=yes, 0=no)',
        'Patient addmited to intensive care unit (1=yes, 0=no)',
        'SARS-Cov-2 exam result'], axis = 1, inplace = True)

df['y'].value_counts()

discharged    506
regular        36
semi            8
icu             8
Name: y, dtype: int64

## Joining in Features

Joining in the clean dataset with the features we engineered.

In [8]:
df = df.merge(features, on = 'Patient ID')

In [9]:
df.head()

Unnamed: 0,Patient ID,Patient age quantile,Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,Basophils,Mean corpuscular hemoglobin (MCH),Eosinophils,Mean corpuscular volume (MCV),Monocytes,Red blood cell distribution width (RDW),Serum Glucose,Neutrophils,Urea,Proteina C reativa mg/dL,Creatinine,Potassium,Sodium,"Influenza B, rapid test","Influenza A, rapid test",Alanine transaminase,Aspartate transaminase,Gamma-glutamyltransferase,Total Bilirubin,Direct Bilirubin,Indirect Bilirubin,Alkaline phosphatase,Strepto A,pCO2 (venous blood gas analysis),Hb saturation (venous blood gas analysis),Base excess (venous blood gas analysis),pO2 (venous blood gas analysis),Total CO2 (venous blood gas analysis),pH (venous blood gas analysis),HCO3 (venous blood gas analysis),Urine - Aspect,Urine - pH,Urine - Hemoglobin,Urine - Bile pigments,Urine - Density,Urine - Urobilinogen,Urine - Leukocytes,Urine - Crystals,Urine - Red blood cells,Urine - Granular cylinders,Urine - Yeasts,Urine - Color,Relationship (Patient/Normal),International normalized ratio (INR),Lactic Dehydrogenase,Creatine phosphokinase (CPK),Arterial Lactic Acid,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),ctO2 (arterial blood gas analysis),y,virus_lab_taken,num_virus_or_bacteria_dectected,pot_sod_grp,arterial_grp,venous_grp,art_ven_grp,bili_grp,urine_test,blood_test_grp
0,78511c183ae18bc,7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,0,0,0,0,0,0,0,0,0
1,d7834ed75f2da44,16,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,regular,1,0,0,0,0,0,0,0,0
2,b16b49f7bd3e692,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,1,0,0,0,0,0,0,0,0
3,4382f5ea05e60c4,2,0.991838,0.792188,-0.341548,1.469188,1.653476,-0.048383,-0.452899,-0.420197,1.303529,-1.442245,-0.498393,-1.396114,1.933339,0.967144,,-0.414215,-0.513909,,-0.32234,-0.058626,0.143752,,,-0.504127,-0.317035,-0.330608,1.355535,1.163312,1.198484,3.060642,,-0.090035,0.337027,-0.611396,-0.084646,-0.479346,-0.436537,-0.512865,,,,,,,,,,,,,0.000994,0.86241,-0.620717,0.125483,,,,,,,,,,discharged,1,0,1,0,1,0,1,0,1
4,d3729cd2658ca64,15,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,0,0,0,0,0,0,0,0,0


## Change y var

Based on conversations w/ Dr. Zhang + team, we decided to adapt our 4-class classification problem into 2 classes: admitted vs discharged. The data quality is not high enough to build a model with 4 classes. 

In [10]:
# if patient discharged, mark 0; 1 o.w.
df['y_bin'] = np.where(df['y'] == 'discharged', '0', '1')
df.head(10)

Unnamed: 0,Patient ID,Patient age quantile,Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,Basophils,Mean corpuscular hemoglobin (MCH),Eosinophils,Mean corpuscular volume (MCV),Monocytes,Red blood cell distribution width (RDW),Serum Glucose,Neutrophils,Urea,Proteina C reativa mg/dL,Creatinine,Potassium,Sodium,"Influenza B, rapid test","Influenza A, rapid test",Alanine transaminase,Aspartate transaminase,Gamma-glutamyltransferase,Total Bilirubin,Direct Bilirubin,Indirect Bilirubin,Alkaline phosphatase,Strepto A,pCO2 (venous blood gas analysis),Hb saturation (venous blood gas analysis),Base excess (venous blood gas analysis),pO2 (venous blood gas analysis),Total CO2 (venous blood gas analysis),pH (venous blood gas analysis),HCO3 (venous blood gas analysis),Urine - Aspect,Urine - pH,Urine - Hemoglobin,Urine - Bile pigments,Urine - Density,Urine - Urobilinogen,Urine - Leukocytes,Urine - Crystals,Urine - Red blood cells,Urine - Granular cylinders,Urine - Yeasts,Urine - Color,Relationship (Patient/Normal),International normalized ratio (INR),Lactic Dehydrogenase,Creatine phosphokinase (CPK),Arterial Lactic Acid,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),ctO2 (arterial blood gas analysis),y,virus_lab_taken,num_virus_or_bacteria_dectected,pot_sod_grp,arterial_grp,venous_grp,art_ven_grp,bili_grp,urine_test,blood_test_grp,y_bin
0,78511c183ae18bc,7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,0,0,0,0,0,0,0,0,0,0
1,d7834ed75f2da44,16,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,regular,1,0,0,0,0,0,0,0,0,1
2,b16b49f7bd3e692,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,1,0,0,0,0,0,0,0,0,0
3,4382f5ea05e60c4,2,0.991838,0.792188,-0.341548,1.469188,1.653476,-0.048383,-0.452899,-0.420197,1.303529,-1.442245,-0.498393,-1.396114,1.933339,0.967144,,-0.414215,-0.513909,,-0.32234,-0.058626,0.143752,,,-0.504127,-0.317035,-0.330608,1.355535,1.163312,1.198484,3.060642,,-0.090035,0.337027,-0.611396,-0.084646,-0.479346,-0.436537,-0.512865,,,,,,,,,,,,,0.000994,0.86241,-0.620717,0.125483,,,,,,,,,,discharged,1,0,1,0,1,0,1,0,1,0
4,d3729cd2658ca64,15,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,0,0,0,0,0,0,0,0,0,0
5,ab9745da9d139b9,15,-0.495919,-0.398276,-0.718402,-0.438097,-0.56795,-0.935404,0.244149,-0.820919,-1.140144,0.334989,-0.66695,0.22628,-0.456613,-0.978899,-0.928044,1.282118,0.974759,-0.247246,0.160114,-1.047272,0.862512,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,regular,1,0,1,0,0,0,0,0,1,1
6,a0435dd40e745fd,7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,0,0,0,0,0,0,0,0,0,0
7,9d7c293a6773d6a,14,-0.312811,-0.6489,-0.027502,-0.101517,-0.656101,-0.099557,-1.448681,-0.968407,-0.529226,0.021361,0.175837,0.807138,1.513128,0.347948,-0.110364,-0.315877,0.304858,-0.479726,-0.046652,0.682859,1.940651,negative,negative,-0.504127,-0.394452,-0.409223,-0.787085,-0.586463,-0.771034,-0.243405,negative,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,1,0,1,0,0,0,1,0,1,0
8,ff5ec3abd7d6088,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,0,0,0,0,0,0,0,0,0,0
9,7b1d6bf5993aa9b,9,,,,,,,,,,,,,,,,,,,,,,negative,negative,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,0,0,0,0,0,0,0,0,0,0


In [11]:
df['y_bin'].value_counts()

0    506
1     52
Name: y_bin, dtype: int64

This dataset will be saved to github so that Tonnar can impute data in R. 

In [12]:
df.drop(['y'], axis = 1, inplace = True)
pd.write_csv('../data/dataset_new_features.csv')

Unnamed: 0,Patient ID,Patient age quantile,Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,Basophils,Mean corpuscular hemoglobin (MCH),Eosinophils,Mean corpuscular volume (MCV),Monocytes,Red blood cell distribution width (RDW),Serum Glucose,Neutrophils,Urea,Proteina C reativa mg/dL,Creatinine,Potassium,Sodium,"Influenza B, rapid test","Influenza A, rapid test",Alanine transaminase,Aspartate transaminase,Gamma-glutamyltransferase,Total Bilirubin,Direct Bilirubin,Indirect Bilirubin,Alkaline phosphatase,Strepto A,pCO2 (venous blood gas analysis),Hb saturation (venous blood gas analysis),Base excess (venous blood gas analysis),pO2 (venous blood gas analysis),Total CO2 (venous blood gas analysis),pH (venous blood gas analysis),HCO3 (venous blood gas analysis),Urine - Aspect,Urine - pH,Urine - Hemoglobin,Urine - Bile pigments,Urine - Density,Urine - Urobilinogen,Urine - Leukocytes,Urine - Crystals,Urine - Red blood cells,Urine - Granular cylinders,Urine - Yeasts,Urine - Color,Relationship (Patient/Normal),International normalized ratio (INR),Lactic Dehydrogenase,Creatine phosphokinase (CPK),Arterial Lactic Acid,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),ctO2 (arterial blood gas analysis),y,virus_lab_taken,num_virus_or_bacteria_dectected,pot_sod_grp,arterial_grp,venous_grp,art_ven_grp,bili_grp,urine_test,blood_test_grp,y_bin
0,78511c183ae18bc,7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,0,0,0,0,0,0,0,0,0,0
1,d7834ed75f2da44,16,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,regular,1,0,0,0,0,0,0,0,0,1
2,b16b49f7bd3e692,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,1,0,0,0,0,0,0,0,0,0
3,4382f5ea05e60c4,2,0.991838,0.792188,-0.341548,1.469188,1.653476,-0.048383,-0.452899,-0.420197,1.303529,-1.442245,-0.498393,-1.396114,1.933339,0.967144,,-0.414215,-0.513909,,-0.32234,-0.058626,0.143752,,,-0.504127,-0.317035,-0.330608,1.355535,1.163312,1.198484,3.060642,,-0.090035,0.337027,-0.611396,-0.084646,-0.479346,-0.436537,-0.512865,,,,,,,,,,,,,0.000994,0.86241,-0.620717,0.125483,,,,,,,,,,discharged,1,0,1,0,1,0,1,0,1,0
4,d3729cd2658ca64,15,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,discharged,0,0,0,0,0,0,0,0,0,0


<br>

## Impute Values

Now, upload file with imputed data (missing values are mean 0 with added noise of 0.5 standard deviation)

In [3]:
#del df

In [2]:
df = pd.read_csv("../data/data_imputed.csv")
# remove non-numeric cols we don't need bc decided not to impute
df.drop(df.loc[:, 'Influenza B, rapid test':'...1'].columns, axis = 1, inplace = True)
df.head(10)

Unnamed: 0,Patient ID,Patient age quantile,Hematocrit,Hemoglobin,Platelets,Mean platelet volume,Red blood Cells,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,Basophils,Mean corpuscular hemoglobin (MCH),Eosinophils,Mean corpuscular volume (MCV),Monocytes,Red blood cell distribution width (RDW),Serum Glucose,Neutrophils,Urea,Proteina C reativa mg/dL,Creatinine,Potassium,Sodium,Alanine transaminase,Aspartate transaminase,Gamma-glutamyltransferase,Total Bilirubin,Direct Bilirubin,Indirect Bilirubin,Alkaline phosphatase,pCO2 (venous blood gas analysis),Hb saturation (venous blood gas analysis),Base excess (venous blood gas analysis),pO2 (venous blood gas analysis),Total CO2 (venous blood gas analysis),pH (venous blood gas analysis),HCO3 (venous blood gas analysis),Urine - pH,Urine - Density,Urine - Red blood cells,Relationship (Patient/Normal),International normalized ratio (INR),Lactic Dehydrogenase,Creatine phosphokinase (CPK),Arterial Lactic Acid,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),ctO2 (arterial blood gas analysis),virus_lab_taken,num_virus_or_bacteria_dectected,pot_sod_grp,arterial_grp,venous_grp,art_ven_grp,bili_grp,urine_test,blood_test_grp,y_bin
0,78511c183ae18bc,7,-0.369184,-0.695672,0.249053,0.308962,-0.768133,-0.271741,0.046156,0.064047,-0.527989,-0.879165,0.715954,0.343707,-1.052966,0.442654,0.558682,-0.265081,0.194856,-0.492867,0.143893,-0.371118,0.025878,-0.019359,0.421084,-0.461294,-0.507384,0.818203,-0.397609,-0.569274,-0.389812,-0.851242,0.250154,0.531284,-0.184236,-0.486458,-0.428014,0.48746,-0.122666,-0.325212,-1.262923,0.65903,-0.487915,0.026997,-0.257383,-0.235953,0.116966,0.060104,-0.066694,-0.470682,0.61853,0.160981,0.206797,0,0,0,0,0,0,0,0,0,0
1,d7834ed75f2da44,16,0.829992,-0.144919,0.14337,0.54513,0.626207,-0.570106,0.08638,0.533026,-0.24864,-0.34614,-0.198347,-1.062053,-0.986692,0.89194,-0.39448,0.070069,-0.585948,-0.349945,0.481527,-0.260364,-0.270922,-0.198096,-0.177148,-0.173672,-0.099505,0.305496,-0.048645,-0.034385,0.41668,0.514125,-0.721696,0.313188,-0.364969,-0.327281,-0.306035,-0.056456,-0.97628,0.437802,-0.255364,0.184058,-0.177552,-0.866311,0.397573,-0.223466,-0.143102,-0.550219,0.293702,0.148437,-0.051261,0.113754,0.309747,1,0,0,0,0,0,0,0,0,1
2,b16b49f7bd3e692,10,0.395916,-0.640232,0.237237,-0.294564,-0.245377,-0.407412,-0.387671,0.328331,0.954359,0.405043,0.496989,-0.36002,1.202528,0.624098,0.303565,0.360202,-0.202306,0.286742,0.52195,1.272759,0.119637,0.366493,1.725775,-0.027987,0.370701,0.657976,-0.027691,0.727932,0.159831,0.349876,-0.042833,-0.791131,0.536474,-0.521163,0.027237,0.253157,-0.234269,-0.947677,0.27499,0.011772,0.526829,0.08576,0.53009,0.274776,1.436588,0.435771,0.160013,0.517677,-0.224668,-1.044017,-0.875195,1,0,0,0,0,0,0,0,0,0
3,4382f5ea05e60c4,2,0.991838,0.792188,-0.341548,1.469188,1.653476,-0.048383,-0.452899,-0.420197,1.303529,-1.442245,-0.498393,-1.396114,1.933339,0.967144,0.220772,-0.414215,-0.513909,0.766627,-0.32234,-0.058626,0.143752,-0.504127,-0.317035,-0.330608,1.355535,1.163312,1.198484,3.060642,-0.090035,0.337027,-0.611396,-0.084646,-0.479346,-0.436537,-0.512865,-0.41058,-0.013534,-0.264911,0.000994,0.86241,-0.620717,0.125483,-0.125169,-0.516981,0.370335,-0.943797,0.571281,-1.094973,0.623437,0.099754,0.396676,1,0,1,0,1,0,1,0,1,0
4,d3729cd2658ca64,15,-0.265743,0.110848,0.046599,-0.116829,0.66422,0.18116,0.584715,-0.013361,0.062431,-0.345446,-0.025917,-0.926495,0.237849,-1.009884,-0.311546,-0.364648,0.192753,-1.091292,-0.755679,-0.427844,-0.129638,0.114772,0.172218,0.172065,-0.298399,-0.250647,0.164507,-0.592396,-0.937909,0.455299,-0.158021,-0.335337,-0.597833,0.366511,0.734958,-0.233412,0.779065,0.451102,0.142752,0.224727,0.782951,0.045138,-0.049335,0.302745,0.37632,-0.700144,0.168575,-0.610486,0.265842,-0.925952,-0.407494,0,0,0,0,0,0,0,0,0,0
5,ab9745da9d139b9,15,-0.495919,-0.398276,-0.718402,-0.438097,-0.56795,-0.935404,0.244149,-0.820919,-1.140144,0.334989,-0.66695,0.22628,-0.456613,-0.978899,-0.928044,1.282118,0.974759,-0.247246,0.160114,-1.047272,0.862512,0.191551,0.361906,-0.607559,-0.741319,0.710957,-0.03663,-0.472511,-0.487588,0.416414,1.006266,0.429962,-0.925895,-0.331706,0.653742,-0.633978,0.05968,0.587476,-0.137608,0.183053,0.853131,-0.07242,-0.011289,-0.981378,0.206784,-0.084968,-0.419658,0.052373,0.464696,0.82796,-0.086896,1,0,1,0,0,0,0,0,1,1
6,a0435dd40e745fd,7,-0.556451,0.018873,0.38283,-0.057518,0.35424,0.206015,-0.704808,1.170787,-0.933415,0.387816,1.210078,0.164385,-0.499601,0.799359,0.609917,0.598611,0.865398,-0.457533,-0.913376,0.112033,-0.065063,0.901364,0.112728,-0.257636,0.174669,-0.384298,0.580145,0.18236,1.066957,0.710059,0.086824,-0.314262,-1.023203,0.251441,0.246637,-0.075065,-1.490177,0.243432,-0.023284,0.031531,-0.529706,-0.324431,-0.436422,-0.401344,0.547819,-0.276775,-0.243713,-0.318266,0.246625,-0.100789,-0.059839,0,0,0,0,0,0,0,0,0,0
7,9d7c293a6773d6a,14,-0.312811,-0.6489,-0.027502,-0.101517,-0.656101,-0.099557,-1.448681,-0.968407,-0.529226,0.021361,0.175837,0.807138,1.513128,0.347948,-0.110364,-0.315877,0.304858,-0.479726,-0.046652,0.682859,1.940651,-0.504127,-0.394452,-0.409223,-0.787085,-0.586463,-0.771034,-0.243405,0.520829,-0.23752,-0.415479,0.012059,0.177081,0.233668,0.843936,0.394184,0.608989,-0.01116,0.296027,0.322525,-0.145402,0.328614,-0.531676,-0.199982,0.075521,0.181489,-0.31742,0.338176,0.046049,-0.658097,0.544111,1,0,1,0,0,0,1,0,1,0
8,ff5ec3abd7d6088,5,-0.249594,0.181611,0.271055,0.899674,0.037829,-0.716312,0.192413,0.336056,0.018442,-0.041397,0.227432,0.077893,-0.587499,0.144233,0.042631,-0.621475,0.475431,-0.334078,0.00114,0.517584,0.666772,-0.588838,-1.397801,-0.361118,0.661732,0.210945,-0.448501,-0.439313,-0.38729,0.859485,-0.535966,-0.233341,0.434339,-0.73577,0.369595,0.259825,0.632985,-0.120016,0.028746,-0.303512,1.16586,0.113056,-0.2478,-0.000802,0.348299,-0.24377,0.245415,0.343282,-0.321653,0.13655,-0.002337,0,0,0,0,0,0,0,0,0,0
9,7b1d6bf5993aa9b,9,-0.132217,-0.407344,-0.030726,1.085458,-0.396083,-0.296004,0.539668,-0.233758,-0.113868,0.792161,0.259712,0.204484,0.244841,0.513421,-0.532301,0.414344,0.033545,0.786823,-0.327183,0.678791,0.204539,-0.545808,0.536301,-0.009274,0.074615,-0.238366,0.168948,-0.214219,-0.392048,-0.577602,0.894167,0.510392,-0.944354,-0.667898,0.209242,0.41189,1.4903,-0.134672,-0.455642,-0.705182,-0.591996,0.461069,0.267642,-0.002783,-0.111013,-0.42514,0.246859,0.020285,0.973677,-0.135126,-0.566229,0,0,0,0,0,0,0,0,0,0


Drop patient ID col

In [3]:
df.drop('Patient ID', axis = 1, inplace = True)

Save untouched data

In [5]:
df.to_csv("../data/data_original_sampling.csv")

## Balancing dataset

Read in data, split into test and train. 

In [4]:
# import unbalanced, feature-engineered TRAIN data:
train = pd.read_csv("../data/final/train_original_sampling.csv")

# split into X & y
X, y = train.drop(['y_bin'], axis = 1, inplace = False), train['y_bin']

### Option 1: Oversamling using SMOTE

In [40]:
from imblearn.over_sampling import SMOTE

# init
sm = SMOTE(random_state=42, sampling_strategy = "not majority")

# resample
X_sm, y_sm = sm.fit_resample(X, y)

print(f'''Shape of X before SMOTE: {X.shape}
Shape of X after SMOTE: {X_sm.shape}''')

print('\nBalance of positive and negative classes (%):')
y_sm.value_counts()

Shape of X before SMOTE: (558, 61)
Shape of X after SMOTE: (1012, 61)

Balance of positive and negative classes (%):


1    506
0    506
Name: y_bin, dtype: int64

Save results

In [42]:
final_sm = pd.concat([y_sm, X_sm], axis = 1)
#final_sm.head(5)
final_sm.to_csv("../data/data_oversampling.csv")

### Option 2: Combine undersampling and oversampling strategies.

Note that both undersampling and oversampling add bias into our data. Undersampling removes information from our data and oversampling introduces noise. By combining these approaches, we hope to minimize the bias from both techniques. 

See https://imbalanced-learn.org/stable/combine.html for details

In [5]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# init samplers
over_sample = RandomOverSampler(sampling_strategy = 0.5, random_state = 30)
under_sample = RandomUnderSampler(sampling_strategy = 0.8, random_state = 30)

# oversample minority
X_over, y_over = over_sample.fit_resample(X, y)
print(f"Oversampled: {Counter(y_over)}")

# undersample majority
X_combined, y_combined = under_sample.fit_resample(X_over, y_over)
print(f"Combined Random Sampling: {Counter(y_combined)}")

Oversampled: Counter({0: 353, 1: 176})
Combined Random Sampling: Counter({0: 220, 1: 176})


Save results

In [6]:
final_combined = pd.concat([y_combined, X_combined], axis = 1)
final_combined.head(5)

Unnamed: 0,y_bin,patient_age_quantile,hematocrit,hemoglobin,platelets,mean_platelet_volume,red_blood_cells,lymphocytes,mean_corpuscular_hemoglobin_concentration_mchc,leukocytes,basophils,mean_corpuscular_hemoglobin_mch,eosinophils,mean_corpuscular_volume_mcv,monocytes,red_blood_cell_distribution_width_rdw,serum_glucose,neutrophils,urea,proteina_c_reativa_mg_d_l,creatinine,potassium,sodium,alanine_transaminase,aspartate_transaminase,gamma_glutamyltransferase,total_bilirubin,direct_bilirubin,indirect_bilirubin,alkaline_phosphatase,p_co2_venous_blood_gas_analysis,hb_saturation_venous_blood_gas_analysis,base_excess_venous_blood_gas_analysis,p_o2_venous_blood_gas_analysis,total_co2_venous_blood_gas_analysis,p_h_venous_blood_gas_analysis,hco3_venous_blood_gas_analysis,urine_p_h,urine_density,urine_red_blood_cells,relationship_patient_normal,international_normalized_ratio_inr,lactic_dehydrogenase,creatine_phosphokinase_cpk,arterial_lactic_acid,hb_saturation_arterial_blood_gases,p_co2_arterial_blood_gas_analysis,base_excess_arterial_blood_gas_analysis,p_h_arterial_blood_gas_analysis,total_co2_arterial_blood_gas_analysis,hco3_arterial_blood_gas_analysis,p_o2_arterial_blood_gas_analysis,ct_o2_arterial_blood_gas_analysis,virus_lab_taken,num_virus_or_bacteria_dectected,pot_sod_grp,arterial_grp,venous_grp,art_ven_grp,bili_grp,urine_test,blood_test_grp
0,0,12,-0.529081,0.11741,-0.187611,0.508671,0.944025,0.479895,-0.192793,0.089316,-0.890499,-0.398251,-0.257933,-0.672354,0.407674,0.158719,-0.519022,-0.522584,0.75354,0.030468,-0.025872,0.633923,-0.623298,-0.265801,0.814973,-0.190461,-0.430021,-0.010605,-0.312163,-0.286307,0.013171,-0.064359,-0.462544,0.805068,-0.453275,0.025638,-0.430598,-0.056292,0.168551,0.301423,-0.032276,-0.676321,0.177096,0.372887,-0.930791,-0.041502,0.12343,0.910263,-0.45186,-0.264212,0.422759,-0.704904,0.400156,0,0,0,0,0,0,0,0,0
1,0,8,-0.051193,-0.653691,0.418722,-0.119454,-0.102946,-0.377924,0.268642,0.867506,0.172807,-0.023938,0.564462,0.032563,0.121847,-0.549777,-0.097025,0.39204,-0.420566,-0.411044,0.463408,0.548386,0.047681,-0.127324,0.569495,-0.110478,0.09722,-0.882672,0.259571,0.404534,-0.869586,0.329919,0.316182,0.224966,0.004061,0.237713,0.51314,-0.480294,-0.377237,0.603365,-0.137233,0.087603,-0.312927,-0.569243,0.226842,0.512076,0.710768,0.567195,0.285523,-1.034775,0.641923,0.196895,-0.431753,0,0,0,0,0,0,0,0,0
2,0,17,-1.061625,0.695669,0.748753,0.995594,-0.411847,0.642234,0.757859,-0.818495,0.469294,-0.772189,1.047479,-0.153827,0.627613,-0.34486,0.028613,-0.870748,0.837966,-0.388924,-0.656243,0.799423,0.318389,0.28322,-0.686327,0.546457,0.03336,-0.379124,-0.718067,0.690444,-0.096369,0.660774,-0.7798,0.389501,0.348258,0.893893,-0.09773,-0.268383,0.909228,-0.251452,-0.50699,0.686309,0.25025,-1.333669,-0.695395,-0.079127,-0.286265,-0.561773,-0.358891,0.275635,0.121292,0.174776,0.29918,0,0,0,0,0,0,0,0,0
3,0,18,0.762952,0.541564,-1.74847,-0.213711,0.278308,0.864224,-0.652057,-0.261578,-1.140144,0.387261,0.00728,0.74705,-1.00814,0.259492,-0.320478,0.849638,0.081558,-0.531388,-0.218957,0.93002,0.503132,0.267684,-0.1622,0.040005,1.355535,0.580054,1.690864,-0.319535,0.62943,0.650094,-0.283748,0.353194,0.134795,-0.834564,0.100957,0.602543,0.21578,0.43213,0.272419,0.183374,-0.185902,0.082377,0.299857,0.38126,-0.222827,0.093576,-0.13815,-0.028998,-0.696267,-0.151542,-0.591851,1,0,1,0,1,0,1,0,1
4,0,5,0.425047,-0.709196,0.088248,0.181806,0.42148,-0.630462,-0.082175,-0.037511,0.634774,-0.62665,-0.621506,-0.700366,-0.806705,0.112039,-0.468883,0.743411,0.298726,0.06386,0.435101,-0.307179,-0.154592,-0.18235,0.32337,0.299133,0.287909,-0.066919,0.199563,0.791961,-0.543287,0.486814,0.301327,-0.246722,-0.608941,0.572929,0.460096,-0.866829,0.62982,0.184958,0.683699,0.162462,0.368442,0.096874,0.013169,-0.645104,-0.442904,-0.226839,-0.440291,-0.431714,0.32977,0.184999,-0.050811,0,0,0,0,0,0,0,0,0


In [7]:
final_combined.to_csv("../data/final/train_combined_sampling.csv")

Same thing using SMOTEEN

See https://imbalanced-learn.org/stable/references/generated/imblearn.combine.SMOTEENN.html#imblearn.combine.SMOTEENN for details.

In [15]:
from imblearn.combine import SMOTEENN 
from imblearn.under_sampling import EditedNearestNeighbours

# init
enn = EditedNearestNeighbours(sampling_strategy = "all", n_neighbors = 5)
sme = SMOTEENN(random_state=42, enn = enn)

# resample
X_sme, y_sme = sme.fit_resample(X, y)

print('Original dataset shape %s' % Counter(y))
print('Resampled dataset shape %s' % Counter(y_sme))

Original dataset shape Counter({0: 506, 1: 52})
Resampled dataset shape Counter({1: 401, 0: 244})


In [16]:
final_sme = pd.concat([y_sme, X_sme], axis = 1)
final_combined.to_csv("../data/final/train_sme_sampling.csv")

### Option 3: Use class weights in sklearn models

Leave the data as-is (with 2 classes) and include hyperparam in our models to adjust for class imbalance

In [None]:
from sklearn.utils import class_weight
class_weight = class_weight.compute_class_weight('balanced', np.unique(y), y)

# then in model.fit, we add class_weight = class_weight as a param

### Option 4: MCMC (markov chain monte carlo) sampling

This requires effort way beyond scope of this project.
MCMC lets you draw data from a data distribution w/o having to actually know what that distribution is by generating it from your sample. It's actually somewhat theoretically complicated and it's been over a year since I learned it/used it so... too much work. 