# COURSE: A deep understanding of deep learning
## SECTION: Metaparameters (activation, batch, optimizers)
### LECTURE: The "wine quality" dataset
#### TEACHER: Mike X Cohen, sincxpress.com
##### COURSE URL: udemy.com/course/deeplearning_x/?couponCode=202305

In [None]:
### import libraries

# for DL modeling
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,TensorDataset
from sklearn.model_selection import train_test_split

# for number-crunching
import numpy as np
import scipy.stats as stats

# for dataset management
import pandas as pd

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

import time

# Import and process the data

In [None]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# the file is semicolon-delimited
data = pd.read_csv(url, sep = ';')

# drop a few outliers; .copy() makes the filtered frame independent of the
# original so that the column assignments below never write into a slice
data = data[data['total sulfur dioxide'] < 200].copy()

# z-score all columns except quality
cols2zscore = data.keys().drop('quality')
data[cols2zscore] = data[cols2zscore].apply(stats.zscore)

# create a new column for binarized quality: 1 when quality > 5, otherwise 0
data['boolQuality'] = 0
# single .loc assignment instead of chained indexing (data[col][mask] = 1),
# which triggers SettingWithCopyWarning and has no effect under copy-on-write
data.loc[data['quality'] > 5, 'boolQuality'] = 1

# notebook display: show the original and binarized quality side by side
data[['quality','boolQuality']]

# Re-organize the data: train/test in DataLoaders

In [None]:
# features: the z-scored columns of the dataframe as a float tensor
feature_array = data[cols2zscore].values
dataT = torch.tensor(feature_array).float()

# labels: the binarized quality as a float tensor, with a trailing singleton
# dimension added so each label is its own row
labels = torch.tensor(data['boolQuality'].values).float().unsqueeze(1)

In [None]:
# partition the tensors with scikit-learn, holding out 10% of the samples for testing
train_data, test_data, train_labels, test_labels = train_test_split(
    dataT,
    labels,
    test_size=0.1,
)

# pair each partition with its labels in a PyTorch Dataset
# (the inputs are already tensors, so no conversion is needed)
train_data, test_data = (
    TensorDataset(train_data, train_labels),
    TensorDataset(test_data, test_labels),
)

# wrap the train/test datasets in DataLoader objects
def createData(batchsize = 1):
    """Build the train and test DataLoaders for a given mini-batch size.

    Reads the module-level ``train_data`` and ``test_data`` datasets.

    Args:
        batchsize: number of samples per training mini-batch (default 1).

    Returns:
        (train_loader, test_loader). The training loader shuffles and drops
        the final incomplete batch; the test loader serves the entire test
        set as a single batch.
    """
    # one batch that spans the whole test set
    n_test_samples = test_data.tensors[0].shape[0]

    return (
        DataLoader(train_data, batch_size=batchsize, shuffle=True, drop_last=True),
        DataLoader(test_data, batch_size=n_test_samples),
    )

# Construct the model and training plans

In [None]:
# build a fresh, untrained network together with its loss function and optimizer
def createANewModel():
    """Create a new feed-forward binary classifier and its training tools.

    Returns:
        (ANNWine, lossfun, optimizer):
            ANNWine   -- nn.Sequential: 11 inputs -> 64 -> 64 -> 1, with ReLU
                         between the linear layers and a sigmoid on the output.
            lossfun   -- binary cross-entropy loss (nn.BCELoss).
            optimizer -- SGD over the model's parameters, learning rate 0.001.
    """
    layers = [
        nn.Linear(11, 64),   # input layer
        nn.ReLU(),
        nn.Linear(64, 64),   # hidden layer
        nn.ReLU(),
        nn.Linear(64, 1),    # output unit
        nn.Sigmoid(),        # squash the output into (0, 1)
    ]
    ANNWine = nn.Sequential(*layers)

    lossfun = nn.BCELoss()
    optimizer = torch.optim.SGD(ANNWine.parameters(), lr=0.001)

    return ANNWine, lossfun, optimizer

In [None]:
# train the model

# global parameter: default number of training epochs
numepochs = 1000

def trainTheModel(ANNWine, train_loader, test_loader, *, loss_fn=None, opt=None, epochs=None):
    """Train a model with mini-batches and track per-epoch performance.

    Args:
        ANNWine: the network to train. Its outputs are thresholded at 0.5 to
            obtain predicted labels.
        train_loader: iterable yielding (X, y) training mini-batches.
        test_loader: iterable whose first batch is used as the test set.
        loss_fn: loss function called as ``loss_fn(yHat, y)``. Defaults to the
            module-level ``lossfun``.
        opt: optimizer that updates the model's parameters. Defaults to the
            module-level ``optimizer``.
        epochs: number of passes over the training data. Defaults to the
            module-level ``numepochs``.

    Returns:
        (trainAcc, testAcc, losses, elapsed):
            trainAcc -- per-epoch mean of the batch accuracies (%).
            testAcc  -- per-epoch accuracy (%) on the first test batch.
            losses   -- per-epoch mean of the batch losses.
            elapsed  -- CPU time of the whole call in seconds, as measured by
                        ``time.process_time``.
    """
    # fall back to the notebook-level globals when nothing is passed explicitly,
    # so existing three-argument calls keep working
    if loss_fn is None:
        loss_fn = lossfun
    if opt is None:
        opt = optimizer
    if epochs is None:
        epochs = numepochs

    # per-epoch results
    trainAcc   = []
    testAcc    = []
    losses     = []
    start_time = time.process_time()

    # loop over epochs
    for epochi in range(epochs):

        # switch to training mode (matters for layers such as dropout/batchnorm)
        ANNWine.train()

        # loop over training data batches
        batchAcc  = []
        batchLoss = []
        for X, y in train_loader:

            # forward pass and loss
            yHat = ANNWine(X)
            loss = loss_fn(yHat, y)

            # backprop
            opt.zero_grad()
            loss.backward()
            opt.step()

            # accuracy and loss for this batch, stored as plain Python floats
            # so that averaging with numpy does not depend on the tensor device
            batchAcc.append(100 * torch.mean(((yHat > 0.5) == y).float()).item())
            batchLoss.append(loss.item())
        # end of batch loop...

        # average over the batches of this epoch
        trainAcc.append(np.mean(batchAcc))
        losses.append(np.mean(batchLoss))

        # test accuracy: evaluate on the first batch of the test loader
        X, y = next(iter(test_loader))

        # switch to evaluation mode and skip gradient tracking
        ANNWine.eval()
        with torch.no_grad():
            predlabels = ANNWine(X) > .5

        testAcc.append(100 * torch.mean((predlabels == y).float()))

    # function output
    return trainAcc, testAcc, losses, (time.process_time() - start_time)

# Test it out

In [None]:
# mini-batch sizes to compare: odd powers of two (2, 8, 32, 128, 512)
batch_sizes = [2 ** power for power in range(1, 10, 2)]

# result containers: one column (or entry) per batch size
n_sizes = len(batch_sizes)
all_accuracies_train = np.zeros((numepochs, n_sizes))
all_accuracies_test = np.zeros((numepochs, n_sizes))
all_computational_time = np.zeros(n_sizes)

for index, batchsize in enumerate(batch_sizes):
    # fresh dataloaders for this mini-batch size
    train_loader, test_loader = createData(batchsize)

    # fresh model; trainTheModel picks up lossfun and optimizer from these globals
    ANNWine, lossfun, optimizer = createANewModel()

    # train and time the model
    trainAcc, testAcc, losses, computational_time = trainTheModel(
        ANNWine, train_loader, test_loader
    )

    # store this run's results in its column
    all_accuracies_train[:, index] = trainAcc
    all_accuracies_test[:, index] = testAcc
    all_computational_time[index] = computational_time

In [None]:
# accuracy curves: one panel for train, one for test, one line per batch size
fig, ax = plt.subplots(1, 2, figsize=(18, 6))

panels = zip(
    ax,
    (all_accuracies_train, all_accuracies_test),
    ('Train accuracy', 'Test accuracy'),
)
for axis, accuracies, title in panels:
    axis.plot(accuracies)
    axis.set_title(title)
    axis.set_xlabel('Epochs')
    axis.set_ylabel('Accuracy (%)')
    axis.set_ylim([30, 100])
    axis.legend(batch_sizes)
    axis.grid()

plt.show()

# bar chart of the time spent training with each mini-batch size
bar_positions = range(len(all_computational_time))
plt.bar(bar_positions, all_computational_time, tick_label=batch_sizes)
plt.xlabel('Mini-batch size')
plt.ylabel('Computational time (s)')

plt.show()