# Hospitalization Model

In [4]:
import glob
import torch, os
import pandas as pd
import torch.nn as nn
from torch import optim, as_tensor
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.model_selection import train_test_split

### Getting all patient data

In [None]:
raw_data = []
for file in glob.glob('fhir/general/*'):
    raw_data.append(pd.read_json(file))

### Rebuilding data to include only features

In [None]:
training_data = {}
for patient in raw_data:
    id = patient['entry'][0]['resource']['id']
    training_data[id] = {}
    training_data[id]['Age'] = -1
    training_data[id]['Age Over 50'] = 0
    training_data[id]['Hospitalized'] = 0
    training_data[id]['Prediabetes'] = 0
    training_data[id]['CHF'] = 0 # number of times patient had congestive heart failure
    training_data[id]['Body Mass Index'] = {}
    training_data[id]['Diastolic'] = {}
    training_data[id]['Systolic'] = {}
    for entry in patient['entry']:
        if 'birthDate' in entry['resource']:
            training_data[id]['Age'] = 2020 - (int)(entry['resource']['birthDate'][0:4])
            if training_data[id]['Age'] > 50:
                training_data[id]['Age Over 50'] = 1
    
        resource_type = entry['resource']['resourceType']
        if resource_type == "Observation":
            category_code = entry['resource']['category'][0]['coding'][0]['code']
            if category_code == "vital-signs":
                display = entry['resource']['code']['text']
                if display == "Body Mass Index":
                    value = entry['resource']['valueQuantity']['value']
                    date = entry['resource']['effectiveDateTime']
                    training_data[id][display][date] = value
                if display == "Blood Pressure":
                    diastolic_val = entry['resource']['component'][0]['valueQuantity']['value']
                    systolic_val = entry['resource']['component'][1]['valueQuantity']['value']
                    date = entry['resource']['effectiveDateTime']
                    training_data[id]['Diastolic'][date] = diastolic_val
                    training_data[id]['Systolic'][date] = systolic_val
                    
                    
        if resource_type == "Encounter":
            class_code = entry['resource']['class']['code']
            if class_code == "IMP" or class_code == "EMER":
                training_data[id]['Hospitalized'] += 1
            try:
                reason = entry['resource']['reasonCode'][0]['coding'][0]['display']
                if reason == "Chronic congestive heart failure (disorder)":
                    training_data[id]['CHF'] += 1
            except:
                continue
                
        if resource_type == "Condition":
            display = entry['resource']['code']['coding'][0]['display']
            if display == 'Prediabetes':
                training_data[id]['Prediabetes'] += 1

### Finding max value for BMI and BP features

In [5]:
missing_data = []
for id in training_data:
    if training_data[id]["Diastolic"] and training_data[id]["Systolic"] and training_data[id]["Body Mass Index"]:
        training_data[id]["Diastolic"] = max(training_data[id]["Diastolic"].values())
        training_data[id]["Systolic"] = max(training_data[id]["Systolic"].values())
        training_data[id]["Body Mass Index"] = max(training_data[id]["Body Mass Index"].values())
    else:
        missing_data.append(id)
        

for patient in missing_data:
    del training_data[patient]

In [None]:
data = pd.DataFrame.from_dict(training_data, orient="index")

### Cleaning dataframe and fixing hospitalization as a label

In [132]:
data = data.reset_index()
bins = [0,3,float("inf")]
labels = [0,1]
data['Hospitalized'] = pd.cut(data['Hospitalized'], bins=bins, labels=labels, include_lowest=True)
id_col = ['index']
target_feature = ['Hospitalized']
label = data['Hospitalized']
dropped_cols = id_col+target_feature+['Age']
data = data.drop(dropped_cols, axis=1)

In [141]:
data

Unnamed: 0,Age Over 50,Prediabetes,CHF,Body Mass Index,Diastolic,Systolic
0,0,1,0,30.09,84.0,126.0
1,0,0,0,21.85,90.0,139.0
2,1,0,0,27.89,86.0,135.0
3,0,0,0,29.55,88.0,134.0
4,1,1,0,30.37,83.0,134.0
...,...,...,...,...,...,...
1280,0,0,0,21.48,88.0,138.0
1281,1,1,0,28.12,83.0,133.0
1282,0,0,0,27.29,82.0,128.0
1283,0,0,0,26.98,85.0,138.0


### Splitting Data

In [134]:
train_data, test_data, train_labels, test_labels = train_test_split(data.values, label.values, test_size = 0.2)

### Creating tensors

In [135]:
temp_tensor = torch.tensor(train_data, dtype=torch.float64)
label_tensor = torch.tensor(train_labels, dtype=torch.long)
test_tensor = torch.tensor(test_data, dtype=torch.long)
test_label_tensor = torch.tensor(test_labels, dtype=torch.long)
train_data = [(temp_tensor[i], label_tensor[i]) for i in range(temp_tensor.size()[0])]
test_data = [(test_tensor[i], test_label_tensor[i]) for i in range(test_tensor.size()[0])]

In [136]:
class Brain(nn.Module):
    def __init__(self, input_size, hidden1_size, hidden2_size, num_classes):
        super(Brain, self).__init__()
        self.fc1= nn.Linear(input_size, hidden1_size)
        self.ReLU1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.ReLU2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden2_size, num_classes)
    
    def  forward(self,x):
        out = self.fc1(x)
        out = self.ReLU1(out)
        out = self.fc2(out)
        out = self.ReLU2(out)
        out = self.fc3(out)
        return out

In [137]:
model = Brain(6, 64, 32, 2)
model.double()
#lr = learning rate 
optimizer = optim.SGD(model.parameters(), lr = 0.001)
num_epochs = 30
print(model)

Brain(
  (fc1): Linear(in_features=6, out_features=64, bias=True)
  (ReLU1): ReLU()
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (ReLU2): ReLU()
  (fc3): Linear(in_features=32, out_features=2, bias=True)
)


In [138]:
batch_size = 30
train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size = batch_size, shuffle = True)
#test_loader = torch.utils.data.DataLoader(dataset=testData, batch_size = batch_size, shuffle = True)
criterion = nn.CrossEntropyLoss()

In [139]:
for i in range(num_epochs):
    cum_loss = 0
    for inputs, labels in train_loader:
        # print(inputs)
        optimizer.zero_grad()
       # print(inputs.size())
        output = model(inputs)
#         output = output.long()
      #  print(output)
        loss = criterion(output, labels.long())
        loss.backward()
        optimizer.step()
        
        cum_loss += loss.item()
        
    print("Training loss: ", cum_loss / len(train_loader))

Training loss:  0.9545280344506949
Training loss:  0.7605840163061831
Training loss:  0.7453525244462809
Training loss:  0.615535261069329
Training loss:  0.5725576130843228
Training loss:  0.5529453796819779
Training loss:  0.5454659220249843
Training loss:  0.5498354624972459
Training loss:  0.5529750059682874
Training loss:  0.5492941404732795
Training loss:  0.543670328120895
Training loss:  0.5269076057078473
Training loss:  0.5276810243133377
Training loss:  0.5389015453767195
Training loss:  0.5295789631019318
Training loss:  0.5235411093880289
Training loss:  0.555004279207623
Training loss:  0.5440198625248637
Training loss:  0.545899467121463
Training loss:  0.5393650843931138
Training loss:  0.5167458122961649
Training loss:  0.5282042719031483
Training loss:  0.5230047976361331
Training loss:  0.5126988478743267
Training loss:  0.5258255568757589
Training loss:  0.5107622639238086
Training loss:  0.5077496021381283
Training loss:  0.5173876817861809
Training loss:  0.516823

In [140]:
with torch.no_grad():
  numCorrect = 0
  numTotal = 0
  for item, label in test_data:
    item = item.double()
    prediction = model(item).max(0)[1].item()
    if prediction == label:
        numCorrect += 1
    numTotal += 1
  print("Accuracy = ", (numCorrect / numTotal))
  print("Correct/Total:", numCorrect, "/", numTotal)

Accuracy =  0.7859922178988327
Correct/Total: 202 / 257
