This is an example of a neural network implemented in **PyTorch** for classifying tabular data (diabetes_data) downloaded from the [UCI repository](https://archive.ics.uci.edu/ml/datasets.php).

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import sys
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV and checkpoint paths used
# later in this file live under this mount point.
drive.mount('/content/drive')
# Unless warning options were given explicitly (sys.warnoptions is non-empty),
# silence every warning for the rest of the session.
# NOTE(review): this also hides warnings raised by torch/pandas/sklearn.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


The **preprocessing** function takes a data frame (df) and returns the table with numerical values. **categorical_cols** is a list that stores the names of all the categorical features.
[LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) is a scikit-learn class that takes a categorical column and encodes it as numerical values.

In [None]:
def preprocessing(df):
    """Return a numeric copy of *df* with missing values filled.

    Every column whose dtype is not one of int32/int64/float32/float64 is
    treated as categorical and replaced by the integer codes produced by
    ``LabelEncoder``. After encoding, remaining missing values are replaced
    by the mean of their column.

    Args:
        df: pandas DataFrame to convert.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    numeric_dtypes = ('int64', 'int32', 'float64', 'float32')

    # Work on a copy so the encoded columns do not overwrite the caller's data.
    df = df.copy()

    # Inspect the frame that was passed in rather than a module-level global.
    categorical_cols = [c for c in df.columns if df[c].dtype not in numeric_dtypes]

    for col in categorical_cols:
        # A fresh encoder per column: codes are only meaningful within a column.
        df[col] = LabelEncoder().fit_transform(df[col])

    return df.fillna(df.mean())

The class **TableDataset** inherits from the **Dataset** class of PyTorch. It takes the numerical tabular data (**df**), the **output column** of the table, and a **mode**. The value of mode must be one of (train, valid, test). If mode = train it returns the training data, and similarly for the validation and test data. Each item returned by this class is a dictionary which contains the **features** array and the **labels** value of one row.

In [None]:
# Dataset class

class TableDataset(Dataset):
    """Map-style dataset exposing one split of a numeric table.

    The rows of ``df`` are split sequentially (no shuffling) into the first
    70% for training, the next 15% for validation and the following 15% for
    testing. Split sizes are truncated to integers, so a few trailing rows
    may belong to no split.

    Args:
        df: pandas DataFrame holding the features and the output column.
        output_column: name of the column used as the label.
        mode: which split this instance exposes: 'train', 'valid' or 'test'.

    Raises:
        ValueError: if ``mode`` is not one of the three supported values.
    """

    def __init__(self, df, output_column, mode):
        super().__init__()
        if mode not in ('train', 'valid', 'test'):
            # Previously an unknown mode was sized like 'test' but served
            # rows from 'train'; reject it instead of returning wrong data.
            raise ValueError(
                "mode must be 'train', 'valid' or 'test', got {!r}".format(mode))
        self.mode = mode

        # Keep plain arrays so __getitem__ indexes by row position; indexing
        # a Series with an int is a label lookup and depends on df's index.
        self.labels = df[output_column].values
        self.features = df[[c for c in df.columns if c != output_column]].values

        n_rows = df.shape[0]
        self.train_size = int(n_rows*0.7)
        self.val_size = int(n_rows*0.15)
        self.test_size = int(n_rows*0.15)

    def __len__(self):
        """Return the number of rows in the split selected by ``mode``."""
        if self.mode == 'train':
            return self.train_size
        if self.mode == 'valid':
            return self.val_size
        return self.test_size

    def __getitem__(self, index):
        """Return row ``index`` of the selected split.

        Returns:
            dict with key 'features' (the row's feature values) and key
            'labels' (the row's value in the output column).

        Raises:
            IndexError: if ``index`` lies outside the selected split.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            # Without this check, iteration would run on into the rows of
            # the following splits instead of stopping.
            raise IndexError(
                "index out of range for '{}' split of size {}".format(self.mode, size))

        # Shift the split-relative index to its absolute row position.
        if self.mode == 'valid':
            index += self.train_size
        elif self.mode == 'test':
            index += self.train_size + self.val_size

        return {'features': self.features[index],
                'labels': self.labels[index]}

Here, I have defined the **Model** class, which contains a neural network with two hidden layers: the first layer contains 200 neurons and the second layer contains 100 neurons. The model takes the features and returns the output.

In [None]:
# The NN Model		

class Model(nn.Module):
    """Fully connected classifier with two batch-normalised hidden layers.

    Layout: num_features -> 200 -> 100 -> num_classes. Each hidden layer is
    followed by BatchNorm1d and ReLU; the output layer is passed through a
    sigmoid, so every output value lies between 0 and 1.

    Args:
        num_features: number of input features per sample.
        num_classes: number of output units.
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        # First hidden layer and its batch norm.
        self.fc1 = nn.Linear(in_features=num_features, out_features=200)
        self.bn1 = nn.BatchNorm1d(num_features=200)
        # Second hidden layer and its batch norm.
        self.fc2 = nn.Linear(in_features=200, out_features=100)
        self.bn2 = nn.BatchNorm1d(num_features=100)
        # Output layer.
        self.fc3 = nn.Linear(in_features=100, out_features=num_classes)

    def forward(self, x):
        """Map a batch of feature rows to sigmoid-activated outputs."""
        hidden = self.fc1(x)
        hidden = F.relu(self.bn1(hidden))

        hidden = self.fc2(hidden)
        hidden = F.relu(self.bn2(hidden))

        logits = self.fc3(hidden)
        return torch.sigmoid(logits)

Now, we need to define the path of the input CSV file, the path where the best checkpoint is saved, and the training hyperparameters such as **batch size**, **learning rate**, and **number of epochs**. Further, I have defined the train, validation, and test loaders to load the respective data.

In [None]:
# Input CSV and the file the best checkpoint is written to.
csv_filepath = '/content/drive/MyDrive/diabetes_data_upload.csv'
save_path = '/content/drive/MyDrive/best_checkpoint.pt'

# Training hyperparameters.
batch_size, learning_rate, n_epochs = 10, 0.001, 50

# Load the raw table and convert it to numeric form.
data = pd.read_csv(csv_filepath)
df = preprocessing(data)

# One dataset per split, all built from the same preprocessed frame.
train_data, valid_data, test_data = (
    TableDataset(df, output_column='class', mode=split)
    for split in ('train', 'valid', 'test')
)

# Only the training batches are shuffled.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

print(f'train size = {len(train_data)}')
print(f'valid size = {len(valid_data)}')
print(f'test size = {len(test_data)}')

Now, we create an instance of the Model class. The parameter **num_features=16** corresponds to the 16 feature columns of the tabular data, and **num_classes=1** is the number of output bits required to represent two classes.

**BCELoss** (Binary Cross Entropy Loss) is used, which takes two 1D arrays: the first array holds the outputs of the model and the second holds the true labels.

In [None]:
def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy_percent)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and gradient
    tracking is switched off.

    Both metrics are averaged over samples, not over batches, so a shorter
    final batch does not get extra weight.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    n_correct = 0
    n_samples = 0

    with torch.set_grad_enabled(training):
        for batch in loader:
            # as_tensor reuses the tensors already produced by the loader.
            feats = torch.as_tensor(batch['features']).float()
            labels = torch.as_tensor(batch['labels']).float()

            outputs = model(feats).view(-1)
            loss = criterion(outputs, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # .item() detaches the metrics from the autograd graph so the
            # graph of each batch can be freed immediately.
            batch_len = labels.numel()
            loss_sum += loss.item() * batch_len
            n_correct += ((outputs > 0.5).float() == labels).sum().item()
            n_samples += batch_len

    n_samples = max(n_samples, 1)  # guard against an empty loader
    return loss_sum / n_samples, 100 * n_correct / n_samples


model = Model(num_features=16, num_classes=1)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.BCELoss()
best_val_acc = 0
best_epoch = 0

for epoch in range(1, n_epochs+1):
    # Training pass followed by a validation pass.
    train_loss, train_acc = _run_epoch(model, train_loader, criterion, optimizer)
    valid_loss, valid_acc = _run_epoch(model, valid_loader, criterion)

    print("Ep:",epoch,
          "TrainLoss: {:.4f}" .format(train_loss),
          "TrainAcc: {:.2f}%" .format(train_acc),
          "ValLoss: {:.4f}" .format(valid_loss),
          "ValAcc: {:.2f}%" .format(valid_acc))

    # Save a checkpoint whenever the validation accuracy improves.
    if best_val_acc < valid_acc:
        best_val_acc = valid_acc
        best_epoch = epoch

        torch.save({
            'epoch': best_epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'valid_loss': valid_loss,
            'train_acc': train_acc,
            'valid_acc': valid_acc,
        }, save_path)

Now, we test the best model (the one with the highest validation accuracy).

In [None]:
# Testing Phase of the Model

# Evaluate the checkpoint with the best validation accuracy on the test split.
with torch.no_grad():
    model.eval()

    # Restore the weights (and optimizer state) stored with the best checkpoint.
    checkpoint = torch.load(save_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    n_correct = 0
    n_samples = 0
    for test_batch in test_loader:
        # as_tensor reuses the tensors already produced by the loader.
        feats = torch.as_tensor(test_batch['features']).float()
        labels = torch.as_tensor(test_batch['labels']).float()

        outputs = model(feats).view(-1)

        # Count correct predictions per sample so that a shorter final batch
        # is not over-weighted in the reported accuracy.
        n_correct += ((outputs > 0.5).float() == labels).sum().item()
        n_samples += labels.numel()

    test_acc = 100 * n_correct / max(n_samples, 1)

    print("Testing best model found after epoch: {}, TrainAcc:{:.2f}%, ValAcc:{:.2f}%".format(
        checkpoint['epoch'], checkpoint['train_acc'], checkpoint['valid_acc']))
    print("Test Accuracy: {:.2f}%" .format(test_acc))

