In [9]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import resample
from imblearn.over_sampling import RandomOverSampler

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [10]:
diabetes_df = pd.read_csv("diabetic_data.csv")

In [11]:
from sklearn.model_selection import train_test_split

class ReadmissionPredictionDataset(torch.utils.data.Dataset):
    def __init__(self, diabetes_df: pd.DataFrame, verbose = False):
        # groups the readmitted patients because we want to focus on patients readmitted in 30 days
        diabetes_df['readmitted'] = diabetes_df['readmitted'].replace({'<30': 'Yes', '>30': 'No'})
        
        # keeps only the last occurence of a patient
        diabetes_df = diabetes_df.drop_duplicates(subset='patient_nbr', keep='last')
        
        # drop unnecessary columns
        # patient_nbr, encounter_id are unecessary indexes
        # medical_specialty, weight, payer_code has too many null values
        # only focusing on initial diagnosis, so drop diag_1 and diag_2
        # statistical analysis found that number_emergency, number_impatient, and number_outpatient are unecessary
        columns_to_drop = ['patient_nbr', 'medical_specialty', 'weight', 'payer_code', 'encounter_id', 'diag_2', 'diag_3', 'number_emergency', 'number_inpatient', 'number_outpatient']
        diabetes_df = diabetes_df.drop(columns=columns_to_drop)

        # statistcal analysis found that all the medications besides insulin and metformin were unecessary
        medications_to_drop = ['repaglinide', 'nateglinide', 'chlorpropamide', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
                   'glyburide-metformin', 'glipizide-metformin', 'metformin-pioglitazone', 
                   'metformin-rosiglitazone', 'glimepiride-pioglitazone', 'acetohexamide', 'citoglipton', 'examide', 'tolbutamide', 'glimepiride', 'glipizide', 
                    'glyburide', 'pioglitazone', 'rosiglitazone']
        diabetes_df = diabetes_df.drop(columns=medications_to_drop)

        # remove null values from remaining columns
        for column in diabetes_df:
            diabetes_df = diabetes_df[diabetes_df[column] != 'Unknown/Invalid']
            diabetes_df = diabetes_df[diabetes_df[column] != '?']

        # remove NA values from A1Cresult and max_glu_serum
        # groups = diabetes_df.groupby('max_glu_serum')
        # print(groups.size())

        # unique_values = diabetes_df['max_glu_serum'].unique()
        # print(unique_values)

        diabetes_df.dropna(subset=['A1Cresult'], inplace=True)
        #diabetes_df.dropna(subset=['max_glu_serum'], inplace=True)
        diabetes_df.fillna('Norm', inplace=True)        

        # max_glu_mapping = {
        #     'Norm': 0,
        #     '>200': 1,
        #     '>300': 2
        # }
        # diabetes_df['max_glu_serum'] = diabetes_df['max_glu_serum'].replace(max_glu_mapping)

        # unique_values = diabetes_df['max_glu_serum'].unique()
        # print(unique_values)

        #diabetes_df.dropna(subset=['max_glu_serum'], inplace=True)

        # remove people that died since they won't be readmitted
        values_to_remove = [11, 13, 14, 19, 20, 21] 
        diabetes_df = diabetes_df[~diabetes_df['discharge_disposition_id'].isin(values_to_remove)]
            
        # group similar discharge types into Home or Other
        diabetes_df['discharge_disposition_id'] = diabetes_df['discharge_disposition_id'].apply(lambda x: 'Home' if x == 1 else 'Other')

        # group similar admission types together
        admission_mapping = {
            2: 'Emergency',
            1: 'Emergency',
            7: 'Emergency',
            6: 'Other',
            5: 'Other',
            8: 'Other',
            3: 'Elective',
            4: 'Newborn'
        }
        diabetes_df['admission_type_id'] = diabetes_df['admission_type_id'].replace(admission_mapping)

        # group similar admission sources together
        diabetes_df['admission_source_id'] = diabetes_df['admission_source_id'].map({
            1: 'Physician Referral',
            2: 'Physician Referral',
            3: 'Physician Referral',
            4: 'Other',
            5: 'Other',
            6: 'Other',
            7: 'Emergency Room',
            8: 'Other',
            9: 'Other',
            10: 'Other',
            11: 'Other',
            12: 'Other',
            13: 'Other',
            14: 'Other',
            15: 'Other',
            17: 'Other',
            18: 'Other',
            19: 'Other',
            20: 'Other',
            21: 'Other',
            22: 'Other',
            23: 'Other',
            24: 'Other',
            25: 'Other',
            26: 'Other'
        })
        diabetes_df['admission_source_id'] = diabetes_df['admission_source_id'].replace(admission_mapping)

        # apply a mapping for similar diagnosis groups
        diag_mapping = {
            'Infectious': [str(i) for i in range(1, 140)],
            'Neoplasms': [str(i) for i in range(140, 240)],
            'Endocrine': [str(i) for i in range(240, 280)],
            'Circulatory': [str(i) for i in range(390, 460)],
            'Respiratory': [str(i) for i in range(460, 520)],
            'Digestive': [str(i) for i in range(520, 580)],
            'Musculoskeletal': [str(i) for i in range(710, 740)],
            'Genitourinary': [str(i) for i in range(580, 630)],
            'Nervous': [str(i) for i in range(320, 390)],
            'Symptoms': [str(i) for i in range(780, 800)]
        }
        def map_to_group(code):
            for group, code_range in diag_mapping.items():
                if code in code_range:
                    return group
            return 'Other'
        diabetes_df['diag_1'] = diabetes_df['diag_1'].apply(map_to_group)

        # one hot encoding for categorical columns
        columns_to_convert = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'metformin', 'insulin', 'diag_1', 'gender', 'race', 'max_glu_serum', 'A1Cresult']
        for column in columns_to_convert:
            diabetes_df = pd.get_dummies(diabetes_df, columns=[column], dtype='int')

        # convert age groups to integer values
        age_mapping = {
            '[0-10)': 5,
            '[10-20)': 15,
            '[20-30)': 25,
            '[30-40)': 35,
            '[40-50)': 45,
            '[50-60)': 55,
            '[60-70)': 65,
            '[70-80)': 75,
            '[80-90)': 85,
            '[90-100)': 95
        }
        diabetes_df['age'] = diabetes_df['age'].replace(age_mapping)

        # convert binary categories to integer values
        readmit_mapping = {
            'Yes': 1,
            'no': 0,
            'No': 0,
            'YES' : 1,
            'Ch': 1,
            'NO': 0
        }
        diabetes_df['readmitted'] = diabetes_df['readmitted'].replace(readmit_mapping)
        diabetes_df['diabetesMed'] = diabetes_df['diabetesMed'].replace(readmit_mapping)
        diabetes_df['change'] = diabetes_df['change'].replace(readmit_mapping)

        # remove the outliers from the quantitative variables (remove outliers beyond 1.5 * IQR)
        def outliers_remover(df, columns_to_process):
            aa = []
            for column_name in columns_to_process:
                if pd.api.types.is_numeric_dtype(df[column_name]):
                    column = df[column_name]
                    q1 = column.quantile(0.25)
                    q3 = column.quantile(0.75)
                    iqr = q3 - q1
                    upper = q3 + 1.5 * iqr
                    lower = q1 - 1.5 * iqr
                    outliers = (column > upper) | (column < lower)
                    aa.extend(outliers[outliers].index)
            df = df.drop(aa).reset_index(drop=True)
            return df
        columns_to_process = ['num_medications', 'num_procedures', 'num_lab_procedures', 'time_in_hospital']
        diabetes_df = outliers_remover(diabetes_df, columns_to_process)

        # Randomly oversample the readmitted group to even out the distribution
        # minority_class = diabetes_df[diabetes_df['readmitted'] == 1]
        # majority_class = diabetes_df[diabetes_df['readmitted'] == 0]
        # print(len(majority_class))
        # print(len(minority_class))
        # if len(minority_class) < len(majority_class):
        #     minority_class = resample(minority_class, replace=True, n_samples=len(majority_class))
        # diabetes_df = pd.concat([majority_class, minority_class])
        # print(len(majority_class))
        # print(len(minority_class))

        # Split into independent (x) and dependent (y) variables
        data_x = diabetes_df.select_dtypes(include=[int, float]).drop('readmitted', axis=1)
        data_y = diabetes_df['readmitted']
        self.input = torch.tensor(data_x.values).type(torch.float32)
        self.output = torch.tensor(data_y.values).type(torch.float32)

        self.df = diabetes_df
    
    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        return (self.input[idx], self.output[idx])


X = ReadmissionPredictionDataset(diabetes_df)
train_dataset, test_dataset = train_test_split(X, test_size=.2, random_state=42)

X_train, y_train = zip(*train_dataset)
print(len(train_dataset))
print(len(test_dataset))

oversampler = RandomOverSampler(sampling_strategy='minority')

X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
oversampled_train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train_resampled), torch.tensor(y_train_resampled))
train_loader = torch.utils.data.DataLoader(oversampled_train_dataset, batch_size=350, shuffle=True)

print(len(oversampled_train_dataset))
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = 350, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = 350, shuffle = True)

8323
2081
15924


In [12]:
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.layers = nn.Sequential(nn.Linear(49,16), nn.Sigmoid(), nn.Linear(16,32), nn.Sigmoid(), nn.Linear(32, 64), nn.Sigmoid(), nn.Linear(64, 1), nn.Sigmoid())
    
    def forward(self, x):
        yhat = self.layers(x)
        return yhat

In [13]:
from tqdm.notebook import tqdm

def train_network(model, train_loader, criterion, optimizer, nepoch=100):
    try:
        for epoch in tqdm(range(nepoch)):
            print('EPOCH %d'%epoch)
            total_loss = 0
            count = 0
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs.squeeze(), labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                count += 1
            print('{:>12s} {:>7.5f}'.format('Train loss:', total_loss/count))
    except KeyboardInterrupt:
        print('Exiting from training early')
    return

In [14]:
def test_network(model, test_loader):
    correct = 0
    total = 0
    true, pred = [], []
    with torch.no_grad():
        for inputs, labels  in test_loader:
            outputs = model(inputs)
            predicted = torch.round(outputs).squeeze()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            true.append(labels)
            pred.append(predicted)
    acc = (100 * correct / total)
    print('accuracy: %0.3f' % (acc))
    true = np.concatenate(true)
    pred = np.concatenate(pred)
    return acc, true, pred

In [15]:
model = SimpleNet()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.01)

In [16]:
train_network(model, train_loader, criterion, optimizer, nepoch=500)  

  0%|          | 0/500 [00:00<?, ?it/s]

EPOCH 0
 Train loss: 0.69703
EPOCH 1
 Train loss: 0.67573
EPOCH 2
 Train loss: 0.64713
EPOCH 3
 Train loss: 0.64179
EPOCH 4
 Train loss: 0.63374
EPOCH 5
 Train loss: 0.62390
EPOCH 6
 Train loss: 0.62196
EPOCH 7
 Train loss: 0.61519
EPOCH 8
 Train loss: 0.60032
EPOCH 9
 Train loss: 0.58657
EPOCH 10
 Train loss: 0.57519
EPOCH 11
 Train loss: 0.56191
EPOCH 12
 Train loss: 0.54593
EPOCH 13
 Train loss: 0.53762
EPOCH 14
 Train loss: 0.52703
EPOCH 15
 Train loss: 0.52794
EPOCH 16
 Train loss: 0.51387
EPOCH 17
 Train loss: 0.51493
EPOCH 18
 Train loss: 0.50116
EPOCH 19
 Train loss: 0.49069
EPOCH 20
 Train loss: 0.50081
EPOCH 21
 Train loss: 0.48004
EPOCH 22
 Train loss: 0.48628
EPOCH 23
 Train loss: 0.48785
EPOCH 24
 Train loss: 0.46890
EPOCH 25
 Train loss: 0.46337
EPOCH 26
 Train loss: 0.45755
EPOCH 27
 Train loss: 0.45713
EPOCH 28
 Train loss: 0.46842
EPOCH 29
 Train loss: 0.45177
EPOCH 30
 Train loss: 0.45856
EPOCH 31
 Train loss: 0.45155
EPOCH 32
 Train loss: 0.44400
EPOCH 33
 Train loss

In [17]:
acc, true, pred = test_network(model, test_loader)
print(pred)

accuracy: 64.200
[0. 0. 0. ... 0. 0. 1.]


In [18]:
torch.save(model.state_dict(), 'my_model.pth')