In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset

In [2]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure,Triceps skin fold thickness,2-Hour serum insulin,Body mass index,Age,Class
0,6,148,72,35,0,33.6,50,positive
1,1,85,66,29,0,26.6,31,negative
2,8,183,64,0,0,23.3,32,positive
3,1,89,66,23,94,28.1,21,negative
4,0,137,40,35,168,43.1,33,positive


#### Preprocessing

In [3]:
x = data.iloc[:,0:-1].values
y_string = list(data.iloc[:,-1])

print(x)
print(x.shape)
print(len(y_string))

[[  6.  148.   72.  ...   0.   33.6  50. ]
 [  1.   85.   66.  ...   0.   26.6  31. ]
 [  8.  183.   64.  ...   0.   23.3  32. ]
 ...
 [  5.  121.   72.  ... 112.   26.2  30. ]
 [  1.  126.   60.  ...   0.   30.1  47. ]
 [  1.   93.   70.  ...   0.   30.4  23. ]]
(768, 7)
768


In [4]:
y_int = []

for s in y_string:
    if s == 'positive':
        y_int.append(1)
    else:
        y_int.append(0)
        
y = np.array(y_int, dtype='float64')

In [5]:
# Normalize between [-1,1]
# x' = (x-mean)/SD

sc = StandardScaler()
x = sc.fit_transform(x)

print(x)

[[ 0.63994726  0.84832379  0.14964075 ... -0.69289057  0.20401277
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.69289057 -0.68442195
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -0.69289057 -1.10325546
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ...  0.27959377 -0.73518964
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.69289057 -0.24020459
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.69289057 -0.20212881
  -0.87137393]]


In [6]:
#numpy to torch tensors

x = torch.tensor(x)
y = torch.tensor(y).unsqueeze(1)

print(x.shape)
print(y.shape)

torch.Size([768, 7])
torch.Size([768, 1])


#### Create Dataset

In [7]:
class Dataset(Dataset):
    
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return len(self.x)

In [8]:
dataset = Dataset(x, y)
len(dataset)

768

In [9]:
#load data to DataLoader for batch processing

train_loader = torch.utils.data.DataLoader(dataset = dataset,
                                           batch_size = 32,
                                           shuffle = True)

print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x0000016EB9F6B430>


In [10]:
print(f'There is {len(train_loader)} batches in the dataset')
for (x,y) in train_loader:
    print('For one iteration (batch) there is : ')
    print(f'Data : {x.shape}')
    print(f'Labels : {y.shape}')
    break

There is 24 batches in the dataset
For one iteration (batch) there is : 
Data : torch.Size([32, 7])
Labels : torch.Size([32, 1])


#### Building the Model

In [11]:
class Model(nn.Module):
    
    def __init__ (self, input_features, output_features):
        super(Model, self).__init__()
        
        self.fc1 = nn.Linear(input_features,5)
        self.fc2 = nn.Linear(5,4)
        self.fc3 = nn.Linear(4,3)
        self.fc4 = nn.Linear(3,output_features)
        
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        
    def forward(self, x):
        
        out = self.fc1(x)
        out = self.tanh(out)
        out = self.fc2(out)
        out = self.tanh(out)
        out = self.fc3(out)
        out = self.tanh(out)
        out = self.fc4(out)
        out = self.sigmoid(out)
        return out

In [12]:
#create the network
net = Model(7,1)

#binary cross-entropy
#size average = True, average the losses over minibatch
criterion = torch.nn.BCELoss(reduction='mean')

#optimizer = SGD with momentum
optimizer = torch.optim.SGD(net.parameters(), lr = 0.1, momentum = 0.9)

#### Training the network

In [13]:
epochs = 200

for epoch in range(epochs):
    for inputs, labels in train_loader:
        inputs = inputs.float()
        labels = labels.float()
        
        #forward prop
        outputs = net(inputs)
        
        #loss
        loss = criterion(outputs, labels)
        
        #clear gradient buffer
        optimizer.zero_grad()
        
        #backprop - calculates matrix of gradients
        loss.backward()
        
        #update weights --> w = w - lr*gradients
        optimizer.step()
        
    #accuracy
    output = (outputs > 0.5).float()
    accuracy = (output == labels).float().mean()
    
    #print stats
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss:.3f}, Accuracy: {accuracy:.3f}')

Epoch 1/200, Loss: 0.535, Accuracy: 0.781
Epoch 2/200, Loss: 0.575, Accuracy: 0.750
Epoch 3/200, Loss: 0.362, Accuracy: 0.875
Epoch 4/200, Loss: 0.531, Accuracy: 0.688
Epoch 5/200, Loss: 0.391, Accuracy: 0.844
Epoch 6/200, Loss: 0.423, Accuracy: 0.812
Epoch 7/200, Loss: 0.485, Accuracy: 0.750
Epoch 8/200, Loss: 0.609, Accuracy: 0.656
Epoch 9/200, Loss: 0.571, Accuracy: 0.625
Epoch 10/200, Loss: 0.404, Accuracy: 0.781
Epoch 11/200, Loss: 0.413, Accuracy: 0.844
Epoch 12/200, Loss: 0.712, Accuracy: 0.594
Epoch 13/200, Loss: 0.497, Accuracy: 0.719
Epoch 14/200, Loss: 0.575, Accuracy: 0.656
Epoch 15/200, Loss: 0.458, Accuracy: 0.812
Epoch 16/200, Loss: 0.382, Accuracy: 0.812
Epoch 17/200, Loss: 0.458, Accuracy: 0.719
Epoch 18/200, Loss: 0.486, Accuracy: 0.750
Epoch 19/200, Loss: 0.502, Accuracy: 0.656
Epoch 20/200, Loss: 0.426, Accuracy: 0.750
Epoch 21/200, Loss: 0.390, Accuracy: 0.750
Epoch 22/200, Loss: 0.503, Accuracy: 0.844
Epoch 23/200, Loss: 0.361, Accuracy: 0.750
Epoch 24/200, Loss: 

In [15]:
print(f'final accuracy = {accuracy*100}')

final accuracy = 87.5
