Author: Kevin ALBERT

Created: Nov 2020

### import modules

In [1]:
# install python modules
# ! sudo /anaconda/bin/conda install -y --update-all --name py38_datareport -c conda-forge/label/main azure-storage-blob

In [62]:
import pandas as pd

In [63]:
import sys  
sys.path.insert(0, '../scripts')
import blobstorage
from io import BytesIO

### load dataset

In [64]:
# read the bronze-layer CSV files into the two working DataFrames
synthetic_df, real_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/bronze/synthetic_data.csv",
        "../../data/bronze/real_data.csv",
    )
)

### data type definitions

### feature engineering

In [65]:
# # creating unique patient ID
# synthetic_df["id"] = synthetic_df.index
# real_df["id"] = real_df.index

In [66]:
# binary flag: False when num_procedures is 0, True otherwise
# (vectorised comparison instead of a row-wise lambda; produces the same boolean column)
synthetic_df["num_procedures_bin"] = synthetic_df["num_procedures"] != 0
real_df["num_procedures_bin"] = real_df["num_procedures"] != 0

In [67]:
# medications per day: num_medications divided element-wise by time_in_hospital
synthetic_df["num_medications_perday"] = synthetic_df["num_medications"].div(
    synthetic_df["time_in_hospital"]
)
real_df["num_medications_perday"] = real_df["num_medications"].div(
    real_df["time_in_hospital"]
)

In [68]:
# synthetic_df["number_outpatient_perday"] = synthetic_df["number_outpatient"]/synthetic_df["time_in_hospital"]
# synthetic_df["number_inpatient_perday"] = synthetic_df["number_inpatient"]/synthetic_df["time_in_hospital"]

In [69]:
# severity level: time_in_hospital cut into quartiles and labelled in ascending order
# (each DataFrame is binned on its own quantiles; q=4 means edges at 0, 0.25, 0.5, 0.75, 1)
bin_labels_4 = ['Normal', 'Mild', 'Moderate', 'Severe']
synthetic_df['time_in_hospital_severitylvl'] = pd.qcut(
    synthetic_df['time_in_hospital'],
    q=4,
    labels=bin_labels_4,
)
real_df['time_in_hospital_severitylvl'] = pd.qcut(
    real_df['time_in_hospital'],
    q=4,
    labels=bin_labels_4,
)

In [70]:
# synthetic_df["number_emergency"].apply(lambda x: "normalPatient" if (x==0) else x)

In [71]:
# remove the [] and () from age: drop every character that is not a hyphen,
# a word character or a space
# regex=True is passed explicitly because pandas >= 2.0 defaults Series.str.replace
# to regex=False, which would treat the pattern as a literal string and strip nothing
synthetic_df["age"] = synthetic_df["age"].str.replace(r'[^\-\w ]', '', regex=True)
real_df["age"] = real_df["age"].str.replace(r'[^\-\w ]', '', regex=True)

In [73]:
# display the column names of synthetic_df after the feature engineering steps
synthetic_df.columns

Index(['race', 'gender', 'age', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       '_diag_1', '_diag_2', '_diag_3', 'num_procedures_bin',
       'num_medications_perday', 'time_in_hospital_severitylvl'],
      dtype='object')

In [74]:
# rename a few columns with '_' underscores for the DB loading
# Every '-' in a column name becomes '_' (e.g. 'glyburide-metformin' ->
# 'glyburide_metformin'); all other names are left untouched. Renaming by
# pattern avoids positionally re-assigning a hard-coded list of all 45 names,
# which would silently mislabel columns if their order ever changed.
# NOTE(review): assumes real_df carries the same column names as synthetic_df - confirm
synthetic_df.columns = synthetic_df.columns.str.replace('-', '_', regex=False)
real_df.columns = real_df.columns.str.replace('-', '_', regex=False)

### save new dataset

In [75]:
# store to cloud datalake as *.parquet (preserve schema)
for blob_name, frame in (
    ("silver/synthetic_data_processed.parquet", synthetic_df),
    ("silver/real_data_processed.parquet", real_df),
):
    blobstorage.write_blob(blob_name, frame)

# store local as *.csv files (comma separated, index not written)
for csv_path, frame in (
    ("../../data/silver/synthetic_data_processed.csv", synthetic_df),
    ("../../data/silver/real_data_processed.csv", real_df),
):
    frame.to_csv(csv_path, sep=',', index=False)

# store local as *.parquet
for parquet_path, frame in (
    ("../../data/silver/synthetic_data_processed.parquet", synthetic_df),
    ("../../data/silver/real_data_processed.parquet", real_df),
):
    frame.to_parquet(parquet_path)

## analyze ML model with 3 scenarios:
 * '<30'
 * '>30'
 * 'NO'
 
then use predictor variable "change"

In [76]:
# option 1 <30: write only the rows whose "readmitted" value is '<30'
blobstorage.write_blob(
    "silver/synthetic_data_processed_lessthan30.parquet",
    synthetic_df.loc[synthetic_df["readmitted"].eq('<30')],
)

In [80]:
# (rows, columns) of the subset where "readmitted" is '<30'
synthetic_df.loc[synthetic_df["readmitted"].eq('<30')].shape

(8867, 45)

In [77]:
# option 2 >30: write only the rows whose "readmitted" value is '>30'
blobstorage.write_blob(
    "silver/synthetic_data_processed_morethan30.parquet",
    synthetic_df.loc[synthetic_df["readmitted"].eq('>30')],
)

In [81]:
# (rows, columns) of the subset where "readmitted" is '>30'
synthetic_df.loc[synthetic_df["readmitted"].eq('>30')].shape

(27895, 45)

In [85]:
# option 3 NO: write only the rows whose "readmitted" value is 'NO'
blobstorage.write_blob(
    "silver/synthetic_data_processed_withNo.parquet",
    synthetic_df.loc[synthetic_df["readmitted"].eq('NO')],
)

In [86]:
# (rows, columns) of the subset where "readmitted" is 'NO'
synthetic_df.loc[synthetic_df["readmitted"].eq('NO')].shape

(41679, 45)

## new test - HBA1C_STATUS remove 'None'

In [91]:
# write only the rows that have an actual A1Cresult value, i.e. drop the 'None' entries
# NOTE: with pandas >= 2.0, read_csv parses the text "None" as a missing value (NaN)
# by default, so a plain `!= 'None'` comparison would keep every row; missing values
# are therefore excluded as well
blobstorage.write_blob(
    "silver/synthetic_data_processed_remove_hba1c.parquet",
    synthetic_df[synthetic_df["A1Cresult"].notna() & (synthetic_df["A1Cresult"] != 'None')],
)

In [92]:
# synthetic_df[synthetic_df["A1Cresult"] != 'None']["A1Cresult"].head(20)