## Federated Learning to Predict Hospital Admissions
### Section 2: Single Data Source, Decision Tree
The second script trains a decision tree on the circulatory provider's data.

This notebook illustrates the conventional approach of training a model on a single
centralized data source. Our results here will be compared to a decision tree trained using federated learning that combines multiple data sources.

I use the ID3 implementation found here: https://github.com/svaante/decision-tree-id3

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from id3 import Id3Estimator
from id3 import export_graphviz, export_text
from sklearn import tree
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Largest tree depth to train: the evaluation loop further down fits one tree
# for every depth from 1 up to and including this value.
max_depth: int = 6

In [3]:
# Read the train and test CSV partitions for this provider, then preview the
# first rows of each frame.
provider_train, provider_test = map(
    pd.read_csv,
    ('./Party3/Provider_train.csv', './Party3/Provider_test.csv'),
)

display(provider_train.head(), provider_test.head())

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,class
0,Hispanic,Female,[70-80),3,5,1,"(1.096, 1.499]","(53.4, 79.6]","(-inf, 0.389]","(2.179, 2.921]",...,No,Steady,No,No,No,No,No,No,Yes,<30
1,Caucasian,Female,[70-80),5,6,17,"(1.096, 1.499]","(-inf, 27.2]","(-inf, 0.389]","(2.921, 3.664]",...,No,Up,No,No,No,No,No,Ch,Yes,<30
2,Caucasian,Male,[60-70),1,1,7,"(1.096, 1.499]","(-inf, 27.2]","(1.168, 1.557]","(2.179, 2.921]",...,No,No,No,No,No,No,No,No,Yes,<30
3,AfricanAmerican,Male,[70-80),1,3,7,"(1.902, 2.305]","(27.2, 53.4]","(0.389, 0.778]","(2.921, 3.664]",...,No,Up,No,No,No,No,No,Ch,Yes,<30
4,Caucasian,Female,[70-80),3,1,1,"(-inf, 1.096]","(-inf, 27.2]","(1.557, inf]","(2.179, 2.921]",...,No,Down,No,No,No,No,No,Ch,Yes,<30


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,class
0,AfricanAmerican,Female,[70-80),3,1,1,"(1.096, 1.499]","(-inf, 27.2]","(0.778, 1.168]","(-inf, 1.436]",...,No,No,No,No,No,No,No,No,No,NO
1,AfricanAmerican,Male,[40-50),1,3,7,"(1.902, 2.305]","(53.4, 79.6]","(0.389, 0.778]","(2.179, 2.921]",...,No,No,No,No,No,No,No,No,No,NO
2,Other,Male,[60-70),1,2,7,"(1.902, 2.305]","(79.6, 105.8]","(1.557, inf]","(2.921, 3.664]",...,No,Down,No,No,No,No,No,Ch,Yes,<30
3,Caucasian,Male,[50-60),1,1,7,"(-inf, 1.096]","(27.2, 53.4]","(-inf, 0.389]","(2.179, 2.921]",...,No,Steady,No,No,No,No,No,Ch,Yes,NO
4,AfricanAmerican,Male,[60-70),1,1,7,"(1.902, 2.305]","(53.4, 79.6]","(1.557, inf]","(2.921, 3.664]",...,No,Down,No,No,No,No,No,Ch,Yes,>30


In [4]:
# Split each partition into a feature frame (everything except 'class') and
# the 'class' label column.
X_train, y_train = provider_train.drop(columns=['class']), provider_train['class']
X_test, y_test = provider_test.drop(columns=['class']), provider_test['class']

# Column labels of the feature matrix, in the same order as its columns.
features = X_train.columns

# Build the arrays handed to the estimator. Train and test go through the
# exact same conversion: stringify every cell, then take a fixed-width unicode
# ndarray. (Previously the stringified training frame was computed and then
# discarded, so only the test partition actually used it.)
# NOTE: '<U30' silently truncates any value longer than 30 characters.
provider_X_train = X_train.applymap(str).values.astype('<U30')
provider_y_train = y_train.values

X_test = X_test.applymap(str).values.astype('<U30')
y_test = y_test.values

In [5]:
# Train decision tree at certain depth
# saves model in text format
def run_id3(depth):
    """Fit an ID3 tree capped at ``depth`` and save a text export of it.

    The estimator is trained on the module-level ``provider_X_train`` /
    ``provider_y_train`` arrays. The exported tree is written to
    ``Models/Base DT_<depth>.txt``; the ``Models`` directory is created if it
    does not exist yet.

    Parameters
    ----------
    depth : int
        Value passed to ``Id3Estimator`` as ``max_depth``.

    Returns
    -------
    Id3Estimator
        The fitted estimator.
    """
    model = Id3Estimator(max_depth=depth)
    model.fit(provider_X_train, provider_y_train, check_input=False)

    # Name the nodes after the training feature columns (`features` excludes
    # the 'class' label) rather than the raw test-frame columns, so the names
    # cannot drift out of step with the columns the tree was fitted on.
    # The local is not called `tree`, which would shadow the sklearn import.
    tree_dump = export_text(model.tree_, feature_names=features)

    # Save Model
    os.makedirs('Models', exist_ok=True)
    with open(f'Models/Base DT_{depth}.txt', 'w') as f:
        f.writelines(tree_dump)

    return model

In [6]:
import warnings

# Suppress every warning from this point on.
warnings.filterwarnings("ignore")

# Fit one tree per depth (1..max_depth) and report how well each tree fits the
# same data it was trained on.
for i in range(1, max_depth + 1):
    print(f'... Running ID3 with depth {i}...')
    model = run_id3(i)

    # Predictions are made on the training arrays, so these are training
    # scores rather than held-out scores.
    preds = model.predict(provider_X_train)
    train_accuracy = round(accuracy_score(provider_y_train, preds), 2)
    print(f"Accuracy at depth {i}: ", str(train_accuracy))
    print(classification_report(provider_y_train, preds))
    print("\n")

... Running ID3 with depth 1...
Accuracy at depth 1:  0.39
              precision    recall  f1-score   support

         <30       0.50      0.25      0.33      1669
         >30       0.38      0.20      0.26      1669
          NO       0.37      0.74      0.50      1669

   micro avg       0.39      0.39      0.39      5007
   macro avg       0.42      0.39      0.36      5007
weighted avg       0.42      0.39      0.36      5007



... Running ID3 with depth 2...
Accuracy at depth 2:  0.44
              precision    recall  f1-score   support

         <30       0.47      0.43      0.45      1669
         >30       0.41      0.27      0.33      1669
          NO       0.44      0.61      0.51      1669

   micro avg       0.44      0.44      0.44      5007
   macro avg       0.44      0.44      0.43      5007
weighted avg       0.44      0.44      0.43      5007



... Running ID3 with depth 3...
Accuracy at depth 3:  0.48
              precision    recall  f1-score   support

  