## General example
This example covers some of the core functionalities of the toolbox.
### Outline
- Load a test dataset
- Preliminary analysis
  - Show summary
  - Show some plots
- Preprocess data
- Evaluate a classifier
- Show some model selection plots

In [35]:
import sys
# Extend the module search path so the toolbox can be imported (presumably this
# is where `utility.preprocessing` is resolved from; confirm).
# NOTE(review): both entries are absolute paths on one specific Windows machine;
# on any other checkout they must be edited to point at the local copy of the
# toolbox, otherwise the imports that follow will fail.
sys.path.append("C:\\Users\\aartetxe\\ARKAITZ\\par-toolbox\\partb")
sys.path.append("C:\\Users\\aartetxe\\ARKAITZ\\par-toolbox")

In [36]:
# Dependencies
import pandas as pd
import numpy as np
from datetime import datetime
from utility.preprocessing import PreProcessing

In [37]:
# Allow hot reload of modules: with autoreload mode 2, imported modules are
# reloaded before each cell executes, so edits to the toolbox source are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Show plots within notebook (render matplotlib figures inline in cell output)
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
# Read the example readmission dataset (semicolon-separated CSV) into a DataFrame
df = pd.read_csv(
    '../data/readmission_example.csv',
    sep=';',
)

In [48]:
# Construct preprocessor around the loaded frame. 'OUTCOME' is passed as the
# second argument (presumably the name of the label column; confirm against
# the PreProcessing class).
prepro = PreProcessing(df, 'OUTCOME')

# Calculate and set length of stay (LOS) from the admission and discharge dates
prepro.set_los('HOSP_DATE', 'DISCHARGE_DATE')
# NOTE(review): judging by the displayed frame, this turns FIRSTDIAG into a
# number relative to HOSP_DATE; the exact unit should be confirmed in the toolbox.
prepro.set_first_diagnostic('FIRSTDIAG', 'HOSP_DATE')
# Derive the patient's age at admission from the birth date (an AGE column
# appears in the displayed frame).
prepro.set_age('DAY_OF_BIRTH', 'HOSP_DATE')

# The calls above appear to modify `df` itself: the preview of `df` includes
# the new LOS and AGE columns.
df.head()

Unnamed: 0,PATIENT_ID,HOSP_DATE,DISCHARGE_DATE,DAY_OF_BIRTH,SEX,SMOKE,LVEF,FIRSTDIAG,NEED_OXYGEN,UREA,CREATININE,SODIUM,POTASSIUM,HEMOGLOBIN,SINUS_RHYTHM,ATRIAL_FIBRILLATION,PACEMAKER_RHYTHM,OUTCOME,LOS,AGE
0,PATIENT001,22/02/2014,05/03/2014,01/01/1949,female,yes,20,0.0,0,45,0.94,139,5.3,15.3,1,0,0,readmitted,11.0,65.0
1,PATIENT002,14/03/2014,21/03/2014,01/01/1924,male,no,20,34.0,0,95,1.39,141,4.3,10.2,1,0,0,readmitted,7.0,90.0
2,PATIENT003,14/05/2014,16/05/2014,01/01/1947,male,yes,40,1.0,0,47,1.01,145,3.06,16.6,1,0,0,not-readmitted,2.0,67.0
3,PATIENT004,29/03/2014,09/04/2014,01/01/1928,male,no,25,18.0,0,50,1.13,145,2.94,14.0,0,0,1,readmitted,11.0,86.0
4,PATIENT005,,,01/01/1949,male,former,60,,0,42,1.33,141,4.3,10.5,0,1,0,not-readmitted,,


In [45]:
# TODO: Remove
def encode_fdiag(x):
    """Replace a row's FIRSTDIAG date with the whole years elapsed before admission.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series from ``df.apply(..., axis=1)``)
        Must provide 'HOSP_DATE' and 'FIRSTDIAG', each either a ``dd/mm/YYYY``
        string or a missing value.

    Returns
    -------
    The same row object, with 'FIRSTDIAG' overwritten in place by the number
    of whole 365-day years between first diagnosis and admission, or by NaN
    when either date is missing.

    Raises
    ------
    ValueError
        If a non-missing date string does not match ``dd/mm/YYYY``.
    """
    date_format = "%d/%m/%Y"
    date_admission = x['HOSP_DATE']
    date_first_diagnostic = x['FIRSTDIAG']

    if pd.isnull(date_admission) or pd.isnull(date_first_diagnostic):
        # Use a real NaN rather than the string 'NaN': a string sentinel is not
        # recognised by pd.isnull and forces the column to mixed str/number.
        x['FIRSTDIAG'] = float('nan')
    else:
        d0 = datetime.strptime(date_first_diagnostic, date_format)
        d1 = datetime.strptime(date_admission, date_format)
        delta = d1 - d0
        # Floor division yields whole years on both Python 2 and Python 3
        # (plain `/` is integer division on 2 but true division on 3).
        x['FIRSTDIAG'] = delta.days // 365

    return x

# df = df.apply(encode_fdiag, axis=1)

In [49]:
# Encode strings as binary indicators: OUTCOME -> 1 for 'readmitted', SEX -> 1
# for 'male'; every other value (including missing) becomes 0.
# A vectorised comparison is used instead of the builtin map(): on Python 3
# map() returns a lazy iterator, which cannot be assigned as a column.
# Note: re-running this cell on already-encoded columns yields all zeros.
df['OUTCOME'] = (df['OUTCOME'] == 'readmitted').astype('int64')
df['SEX'] = (df['SEX'] == 'male').astype('int64')

def encode_smoke(x):
    """Map a smoking-status label to its ordinal code.

    'yes' -> 2 (smoker), 'former' -> 1 (former smoker), 'no' -> 0 (not smoker).
    Any other value produces None.
    """
    # Checked in the same order as the labels are listed here.
    smoke_codes = (
        ('yes', 2),     # smoker
        ('no', 0),      # not smoker
        ('former', 1),  # former smoker
    )
    for label, code in smoke_codes:
        if x == label:
            return code
    return None

# Series.map applies encode_smoke element-wise and returns a Series aligned
# with df's index; the builtin map() would yield a lazy iterator on Python 3,
# which cannot be assigned as a column.
df['SMOKE'] = df['SMOKE'].map(encode_smoke)
    
df.head()

Unnamed: 0,PATIENT_ID,HOSP_DATE,DISCHARGE_DATE,DAY_OF_BIRTH,SEX,SMOKE,LVEF,FIRSTDIAG,NEED_OXYGEN,UREA,CREATININE,SODIUM,POTASSIUM,HEMOGLOBIN,SINUS_RHYTHM,ATRIAL_FIBRILLATION,PACEMAKER_RHYTHM,OUTCOME,LOS,AGE
0,PATIENT001,22/02/2014,05/03/2014,01/01/1949,0,2,20,0.0,0,45,0.94,139,5.3,15.3,1,0,0,1,11.0,65.0
1,PATIENT002,14/03/2014,21/03/2014,01/01/1924,1,0,20,34.0,0,95,1.39,141,4.3,10.2,1,0,0,1,7.0,90.0
2,PATIENT003,14/05/2014,16/05/2014,01/01/1947,1,2,40,1.0,0,47,1.01,145,3.06,16.6,1,0,0,0,2.0,67.0
3,PATIENT004,29/03/2014,09/04/2014,01/01/1928,1,0,25,18.0,0,50,1.13,145,2.94,14.0,0,0,1,1,11.0,86.0
4,PATIENT005,,,01/01/1949,1,1,60,,0,42,1.33,141,4.3,10.5,0,1,0,0,,


In [50]:
# Remove unnecessary data: the patient identifier and the raw date columns
# are deleted from df in place, one after another.
for unused_column in ('PATIENT_ID', 'HOSP_DATE', 'DISCHARGE_DATE', 'DAY_OF_BIRTH'):
    del df[unused_column]

df.head()

Unnamed: 0,SEX,SMOKE,LVEF,FIRSTDIAG,NEED_OXYGEN,UREA,CREATININE,SODIUM,POTASSIUM,HEMOGLOBIN,SINUS_RHYTHM,ATRIAL_FIBRILLATION,PACEMAKER_RHYTHM,OUTCOME,LOS,AGE
0,0,2,20,0.0,0,45,0.94,139,5.3,15.3,1,0,0,1,11.0,65.0
1,1,0,20,34.0,0,95,1.39,141,4.3,10.2,1,0,0,1,7.0,90.0
2,1,2,40,1.0,0,47,1.01,145,3.06,16.6,1,0,0,0,2.0,67.0
3,1,0,25,18.0,0,50,1.13,145,2.94,14.0,0,0,1,1,11.0,86.0
4,1,1,60,,0,42,1.33,141,4.3,10.5,0,1,0,0,,


In [None]:
# TODO: Missing value imputation

In [None]:
# Split data and label: every column except OUTCOME is a feature
dataset = df.drop(['OUTCOME'], axis=1)
target = df['OUTCOME']
# Format dataset and label as NumPy arrays.
# `.values` replaces `as_matrix()`, which was deprecated and later removed from
# pandas; `.values` is available on both old and current versions.
X = dataset.values
y = target.values.astype(int)