In [37]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [38]:
# Load the raw encounter table. The path is relative, so the CSV must sit in
# the notebook's working directory.
diabetes_df = pd.read_csv("diabetic_data.csv")

In [39]:
from sklearn.model_selection import train_test_split

class ReadmissionPredictionDataset(torch.utils.data.Dataset):
    """Map-style dataset of (feature vector, readmission label) pairs.

    The constructor takes the raw encounter table and:

    1. removes the columns listed in ``DROPPED_COLUMNS``;
    2. discards rows whose ``race`` is ``'?'``, whose ``admission_type_id``
       is 5 or 6, or whose ``admission_source_id`` is 17;
    3. one-hot encodes the columns listed in ``CATEGORICAL_COLUMNS``;
    4. replaces each ``age`` bracket with its midpoint;
    5. encodes ``readmitted`` as 1 for ``'<30'``/``'>30'``/``'YES'`` and 0
       for ``'NO'``.

    The caller's DataFrame is never modified.

    Attributes:
        input: float32 tensor holding one row of features per kept encounter.
        output: float32 tensor holding the 0/1 label of each kept encounter.
    """

    # Columns removed before any feature is built.
    DROPPED_COLUMNS = (
        'payer_code', 'patient_nbr', 'medical_specialty', 'encounter_id',
        'weight', 'diag_1', 'diag_2', 'diag_3',
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
        'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
        'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
        'insulin', 'glyburide-metformin', 'glipizide-metformin',
        'glimepiride-pioglitazone', 'metformin-rosiglitazone',
        'metformin-pioglitazone',
        'number_diagnoses', 'num_lab_procedures', 'number_outpatient',
    )

    # Columns expanded into indicator columns, in encoding order.
    CATEGORICAL_COLUMNS = (
        'gender', 'race', 'admission_type_id', 'admission_source_id',
        'discharge_disposition_id', 'max_glu_serum', 'A1Cresult',
        'change', 'diabetesMed',
    )

    # Age bracket -> bracket midpoint.
    AGE_MIDPOINTS = {
        '[0-10)': 5,
        '[10-20)': 15,
        '[20-30)': 25,
        '[30-40)': 35,
        '[40-50)': 45,
        '[50-60)': 55,
        '[60-70)': 65,
        '[70-80)': 75,
        '[80-90)': 85,
        '[90-100)': 95,
    }

    # Raw label -> binary target. 'YES' is accepted as well so a frame whose
    # labels were already collapsed to YES/NO still loads.
    READMIT_LABELS = {'<30': 1, '>30': 1, 'YES': 1, 'NO': 0}

    def __init__(self, diabetes_df: pd.DataFrame, verbose = False):
        """Build the feature and label tensors.

        Args:
            diabetes_df: Raw encounter table. It must contain every column in
                ``DROPPED_COLUMNS`` and ``CATEGORICAL_COLUMNS`` plus ``age``
                and ``readmitted``; it is left untouched.
            verbose: Accepted for backward compatibility; currently unused.

        Raises:
            KeyError: If an expected column is missing.
            ValueError: If ``age`` or ``readmitted`` holds a value that has
                no entry in the corresponding mapping.
        """
        # ``drop`` returns a new frame, so everything below operates on a
        # derived object instead of writing into the caller's DataFrame.
        frame = diabetes_df.drop(columns=list(self.DROPPED_COLUMNS))

        keep = (
            (frame['race'] != '?')
            & ~frame['admission_type_id'].isin([5, 6])
            & (frame['admission_source_id'] != 17)
        )
        frame = frame.loc[keep]

        # The default prefix is the source column name, which keeps the
        # indicator labels unique even when several id columns share the
        # same integer codes.
        frame = pd.get_dummies(frame, columns=list(self.CATEGORICAL_COLUMNS), dtype='int')

        frame['age'] = self._encode(frame['age'], self.AGE_MIDPOINTS)
        frame['readmitted'] = self._encode(frame['readmitted'], self.READMIT_LABELS)

        # Only numeric columns become features; the label is split off.
        features = frame.select_dtypes(include=[int, float]).drop(columns='readmitted')
        labels = frame['readmitted']

        self.input = torch.tensor(features.to_numpy(), dtype=torch.float32)
        self.output = torch.tensor(labels.to_numpy(), dtype=torch.float32)

    @staticmethod
    def _encode(values: pd.Series, mapping: dict) -> pd.Series:
        """Translate ``values`` through ``mapping`` into an int64 Series.

        An explicit ``map`` is used instead of ``replace`` so the resulting
        dtype does not depend on pandas implicitly downcasting an object
        column, and so unexpected categories fail loudly rather than being
        filtered out later as a non-numeric column.
        """
        encoded = values.map(mapping)
        unknown = encoded.isna()
        if unknown.any():
            unexpected = sorted(set(values[unknown].astype(str)))
            raise ValueError(
                'Unexpected value(s) in column %r: %s' % (values.name, unexpected)
            )
        return encoded.astype('int64')

    def __len__(self):
        """Return the number of encounters kept after filtering."""
        return len(self.input)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return (self.input[idx], self.output[idx])

X = ReadmissionPredictionDataset(diabetes_df)

# Split row indices (seeded 50/50) and wrap them in Subset views, rather than
# handing the Dataset object itself to train_test_split, so the tensors built
# above are shared instead of being copied out sample by sample.
train_indices, test_indices = train_test_split(
    list(range(len(X))), test_size=0.5, random_state=42
)
train_dataset = torch.utils.data.Subset(X, train_indices)
test_dataset = torch.utils.data.Subset(X, test_indices)

# Sanity check: show one (features, label) pair.
print(train_dataset[0])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = 500, shuffle = True)
# Evaluation does not benefit from shuffling; a fixed order keeps the arrays
# returned by the evaluation step aligned with test_indices on every run.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = 500, shuffle = False)


(tensor([45.,  1.,  5., 12.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,
         1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.]), tensor(1.))


In [40]:
class SimpleNet(nn.Module):
    """Fully connected binary classifier with sigmoid activations.

    Layer widths are ``input_size -> 256 -> 512 -> 1024 -> 1``. The final
    sigmoid squashes the output into (0, 1).

    Args:
        input_size: Number of features per sample. Defaults to 38, the width
            this network was originally hard-coded for; pass the actual
            feature count when the preprocessing yields a different width.
    """

    def __init__(self, input_size=38):
        super().__init__()
        # Kept as a single nn.Sequential named ``layers`` so parameter names
        # (``layers.0.weight`` ...) stay the same as before.
        self.layers = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.Sigmoid(),
            nn.Linear(256, 512),
            nn.Sigmoid(),
            nn.Linear(512, 1024),
            nn.Sigmoid(),
            nn.Linear(1024, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the network output for ``x``; the last dimension has size 1."""
        return self.layers(x)

In [41]:
from tqdm.notebook import tqdm

def train_network(model, train_loader, criterion, optimizer, nepoch=100):
    """Train ``model`` in place and print the mean batch loss of every epoch.

    Args:
        model: Module whose output has a trailing dimension of size 1.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(predictions, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        nepoch: Number of passes over ``train_loader``.

    Returns:
        None. Progress is reported through ``print``. A KeyboardInterrupt
        stops training early without propagating.
    """
    model.train()
    try:
        for epoch in tqdm(range(nepoch)):
            print('EPOCH %d'%epoch)
            total_loss = 0.0
            count = 0
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                # Drop only the trailing size-1 axis. A bare squeeze() would
                # also remove the batch axis of a one-sample batch, leaving a
                # 0-dim prediction that no longer matches the label shape.
                loss = criterion(outputs.squeeze(-1), labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                count += 1
            # An empty loader has no batches to average over.
            mean_loss = total_loss / count if count else float('nan')
            print('{:>12s} {:>7.5f}'.format('Train loss:', mean_loss))
    except KeyboardInterrupt:
        print('Exiting from training early')
    return

In [42]:
def test_network(model, test_loader):
    correct = 0
    total = 0
    true, pred = [], []
    with torch.no_grad():
        for inputs, labels  in test_loader:
            outputs = model(inputs)
            predicted = torch.round(outputs).squeeze()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            true.append(labels)
            pred.append(predicted)
    acc = (100 * correct / total)
    print('accuracy: %0.3f' % (acc))
    true = np.concatenate(true)
    pred = np.concatenate(pred)
    return acc, true, pred

In [43]:
# Seed torch's global RNG before the model is created so weight
# initialisation (and later batch shuffling) is repeatable across runs.
torch.manual_seed(42)

model = SimpleNet()
# Binary cross-entropy on probabilities; the model already ends in a sigmoid.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.001)

In [44]:
# Train for 300 epochs; train_network prints the mean batch loss after each one.
train_network(model, train_loader, criterion, optimizer, nepoch=300)  

  0%|          | 0/300 [00:00<?, ?it/s]

EPOCH 0
 Train loss: 0.76671
EPOCH 1
 Train loss: 0.70191
EPOCH 2
 Train loss: 0.69981
EPOCH 3
 Train loss: 0.69787
EPOCH 4
 Train loss: 0.68865
EPOCH 5
 Train loss: 0.69384
EPOCH 6
 Train loss: 0.68616
EPOCH 7
 Train loss: 0.69002
EPOCH 8
 Train loss: 0.68681
EPOCH 9
 Train loss: 0.68853
EPOCH 10
 Train loss: 0.68696
EPOCH 11
 Train loss: 0.68460
EPOCH 12
 Train loss: 0.68379
EPOCH 13
 Train loss: 0.68251
EPOCH 14
 Train loss: 0.68367
EPOCH 15
 Train loss: 0.68011
EPOCH 16
 Train loss: 0.67895
EPOCH 17
 Train loss: 0.67600
EPOCH 18
 Train loss: 0.67589
EPOCH 19
 Train loss: 0.67347
EPOCH 20
 Train loss: 0.67359
EPOCH 21
 Train loss: 0.67087
EPOCH 22
 Train loss: 0.67060
EPOCH 23
 Train loss: 0.66849
EPOCH 24
 Train loss: 0.66776
EPOCH 25
 Train loss: 0.66756
EPOCH 26
 Train loss: 0.66242
EPOCH 27
 Train loss: 0.66139
EPOCH 28
 Train loss: 0.65781
EPOCH 29
 Train loss: 0.65997
EPOCH 30
 Train loss: 0.65570
EPOCH 31
 Train loss: 0.64922
EPOCH 32
 Train loss: 0.64812
EPOCH 33
 Train loss

In [45]:
# Score the trained model on the test loader (accuracy is printed by
# test_network), then show the rounded 0/1 predictions.
acc, true, pred = test_network(model, test_loader)
print(pred)

accuracy: 59.010
[1. 1. 0. ... 0. 0. 1.]
