In [1]:
import torch
from torch import nn
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import os
import numpy as np

In [2]:
def notify(title, text, model):
    """Show a desktop notification through macOS ``osascript`` (best effort).

    Args:
        title: Notification title.
        text: Notification body.
        model: Not used by this function; kept so existing callers keep working.

    Returns:
        None. When no ``osascript`` executable is on PATH the call is a no-op.
    """
    import shutil
    import subprocess

    osascript = shutil.which("osascript")
    if osascript is None:
        return

    def _applescript_quote(value):
        # Backslashes and double quotes would otherwise end the AppleScript string early.
        return str(value).replace("\\", "\\\\").replace('"', '\\"')

    script = 'display notification "{}" with title "{}"'.format(
        _applescript_quote(text), _applescript_quote(title)
    )
    # Argument-list form: the text never passes through a shell, so no shell quoting issues.
    subprocess.run([osascript, "-e", script], check=False)

In [3]:
# Load the white and red wine tables; both files use ';' as the field separator.
rawWhitesDF = pd.read_csv(
    "raw-wine-data/winequality-white.csv",
    delimiter=";",
)
rawRedsDF = pd.read_csv(
    "raw-wine-data/winequality-red.csv",
    delimiter=";",
)

In [4]:
# Preview the first five white-wine rows.
rawWhitesDF.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
# Summary statistics for the white wines. Leaving the frame as the cell's last
# expression uses the notebook's rich table display instead of wrapped text.
rawWhitesDF.describe()

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    4898.000000       4898.000000  4898.000000     4898.000000   
mean        6.854788          0.278241     0.334192        6.391415   
std         0.843868          0.100795     0.121020        5.072058   
min         3.800000          0.080000     0.000000        0.600000   
25%         6.300000          0.210000     0.270000        1.700000   
50%         6.800000          0.260000     0.320000        5.200000   
75%         7.300000          0.320000     0.390000        9.900000   
max        14.200000          1.100000     1.660000       65.800000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  4898.000000          4898.000000           4898.000000  4898.000000   
mean      0.045772            35.308085            138.360657     0.994027   
std       0.021848            17.007137             42.498065     0.002991   
min       0.009000             2.000000         

In [6]:
# Summary statistics for the red wines. Leaving the frame as the cell's last
# expression uses the notebook's rich table display instead of wrapped text.
rawRedsDF.describe()

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [7]:
# Column names of the white-wine table; every column except the last is scaled below.
featureLabels = rawWhitesDF.columns.values.tolist()
# Quality classes 1..10.
classes = list(range(1, 11))

# Min-max scale each non-final column of each table to [0, 1], working on copies
# so the raw frames stay untouched.
# NOTE(review): each table is scaled with its OWN min/max, so the red features are
# not on the scale the white-trained models see, and the white statistics include
# rows later used for validation/testing - confirm both are intended.
scaledWhitesDF = rawWhitesDF.copy()
scaledRedsDF = rawRedsDF.copy()

for frame in (scaledWhitesDF, scaledRedsDF):
    for name in featureLabels[:-1]:
        low, high = frame[name].min(), frame[name].max()
        frame[name] = (frame[name] - low) / (high - low)

In [8]:
# Integer quality scores (targets) and float32 input features for each wine colour.
# The features are model inputs, not trainable parameters, so they are created
# without requires_grad: otherwise every indexing/batching operation on them
# would record autograd history for no benefit.
whiteTargetsInt = torch.tensor(scaledWhitesDF['quality'].to_numpy())
whiteFeatures = torch.tensor(scaledWhitesDF[featureLabels[0:-1]].to_numpy(), dtype=torch.float32)

redTargetsInt = torch.tensor(scaledRedsDF['quality'].to_numpy())
redFeatures = torch.tensor(scaledRedsDF[featureLabels[0:-1]].to_numpy(), dtype=torch.float32)

In [9]:
# One-hot encode the quality scores as float32 rows of width len(classes):
# a score q sets position q - 1. one_hot rejects out-of-range scores (for example
# 0, which the previous per-row loop silently wrapped to the last class).
whiteTargets = nn.functional.one_hot(
    (whiteTargetsInt - 1).long(), num_classes=len(classes)
).to(torch.float32)

redTargets = nn.functional.one_hot(
    (redTargetsInt - 1).long(), num_classes=len(classes)
).to(torch.float32)

In [10]:
class WineDataset(Dataset):
    """Dataset pairing a feature collection with a target collection by index."""

    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        # The dataset length is taken from the features alone.
        return len(self.features)

    def __getitem__(self, idx):
        # Each sample is a dict with the keys 'features' and 'target'.
        return dict(features=self.features[idx], target=self.targets[idx])

In [11]:
# Hold out 20% of the white-wine rows for testing, then 10% of the remainder for validation.
train_indices, test_indices = train_test_split(
    range(len(whiteFeatures)),
    test_size=0.2,
    random_state=22,
)
train_indices, val_indices = train_test_split(
    train_indices,
    test_size=0.1,
    random_state=22,
)

In [12]:
batch_size = 64

# White-wine train / validation / test datasets built from the index splits.
train_dataset = WineDataset(features=whiteFeatures[train_indices], targets=whiteTargets[train_indices])
val_dataset = WineDataset(features=whiteFeatures[val_indices], targets=whiteTargets[val_indices])
test_dataset = WineDataset(features=whiteFeatures[test_indices], targets=whiteTargets[test_indices])

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Every red-wine row goes into one evaluation-only loader.
red_test_dataset = WineDataset(features=redFeatures, targets=redTargets)
red_test_loader = DataLoader(red_test_dataset, batch_size=batch_size, shuffle=False)

In [13]:
class FeedForwardModel(nn.Module):
    """Three-layer MLP: two ReLU + dropout(0.5) hidden layers, then a linear output.

    The output is returned without any activation.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        hidden = x
        # Both hidden layers share the same linear -> ReLU -> dropout pattern.
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(self.relu(layer(hidden)))
        return self.fc3(hidden)

class ConvModel(nn.Module):
    """Small 1-D CNN that treats each feature vector as a one-channel signal.

    Args:
        input_channels: In-channels of the first convolution. ``forward`` inserts
            exactly one channel dimension, so this must be 1 for 2-D inputs.
        output_size: Number of output values per sample.
        hidden_size: Width of the fully connected hidden layer.
        input_length: Number of features per sample. Defaults to 11, which gives
            the previously hard-coded flattened size of 64.

    Raises:
        ValueError: If ``input_length`` is below 4 (nothing would remain after
            the two pooling steps).
    """

    def __init__(self, input_channels, output_size, hidden_size, input_length=11):
        super(ConvModel, self).__init__()
        if input_length < 4:
            raise ValueError("input_length must be at least 4, got {}".format(input_length))
        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        # The convolutions preserve length (kernel 3, padding 1, stride 1); each
        # pooling step floor-halves it. conv2 emits 32 channels.
        flattened_size = 32 * ((input_length // 2) // 2)
        self.fc1 = nn.Linear(flattened_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # (batch, length) -> (batch, 1, length)
        x = x.unsqueeze(1)
        # Convolutional layers
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        # Flatten to (batch, channels * remaining_length)
        x = x.reshape(x.shape[0], -1)
        # Fully connected layers; the output is returned without activation
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x
    
class TransformerModel(nn.Module):
    """Transformer-encoder classifier over a single embedded feature vector.

    Each sample is linearly embedded and passed through the encoder as a
    sequence of length 1, then projected to ``output_size`` raw logits.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output logits per sample.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per encoder layer.
        hidden_size: Embedding width; also used as the encoder feed-forward width.
        dropout: Dropout probability inside the encoder layers.
    """

    def __init__(self, input_size, output_size, num_layers, num_heads, hidden_size, dropout):
        super(TransformerModel, self).__init__()

        # Embedding layer
        self.embedding = nn.Linear(input_size, hidden_size)

        # Transformer encoder layers (positional args: d_model, nhead, dim_feedforward, dropout)
        encoder_layer = nn.TransformerEncoderLayer(hidden_size, num_heads, hidden_size, dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        # Output layer
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Apply embedding
        x = self.embedding(x)

        x = x.unsqueeze(1) # (batch_size, seq_len=1, hidden_size)

        # Apply transformer encoder
        x = self.transformer_encoder(x)

        x = x.squeeze(1) # (batch_size, hidden_size)

        # Return raw logits. A ReLU here would clamp negative logits to zero before
        # the cross-entropy loss, and the other models in this notebook also
        # return unactivated outputs.
        return self.output_layer(x)

In [14]:
def train(model, device, train_loader, optimiser, nepoch, scheduler_step_size, scheduler_gamma):
    """Train ``model`` and plot the mean training/validation loss per epoch.

    Besides its parameters, this function reads two module-level names:
    ``criterion`` (the loss function) and ``val_loader`` (validation batches).

    Args:
        model: Module to train; moved to ``device``.
        device: Device that the model and every batch are moved to.
        train_loader: Iterable of batches, each a dict with 'features' and 'target'.
        optimiser: Optimiser already bound to ``model``'s parameters.
        nepoch: Number of epochs to run.
        scheduler_step_size: Epochs between learning-rate decays (StepLR ``step_size``).
        scheduler_gamma: Multiplicative learning-rate decay factor (StepLR ``gamma``).

    Returns:
        None. Shows a semilog loss plot and then calls ``notify``.
    """
    scheduler = torch.optim.lr_scheduler.StepLR(optimiser, step_size=scheduler_step_size, gamma=scheduler_gamma)
    model.to(device)

    train_losses, val_losses = [], []

    for _epoch in tqdm(range(nepoch), leave=False, unit='epoch', desc= "Epochs"):

        total_train_loss = 0
        train_count = 0
        total_val_loss = 0
        val_count = 0

        # Re-enter training mode every epoch: the validation pass below switches to eval mode.
        model.train()
        for batch in train_loader:
            inputs, targets = batch['features'], batch['target']
            # detach() keeps the optimisation graph limited to the model, in case
            # the stored tensors were created with requires_grad=True.
            inputs, targets = inputs.to(device).detach(), targets.to(device).detach()

            optimiser.zero_grad()
            outputs = model(inputs)

            train_loss = criterion(outputs, targets)
            train_loss.backward()
            optimiser.step()

            total_train_loss += train_loss.item()
            train_count += 1

        # Validate in eval mode so dropout is disabled; otherwise the validation
        # loss is computed on randomly dropped activations.
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                inputs, targets = batch['features'], batch['target']
                inputs, targets = inputs.to(device), targets.to(device)

                outputs = model(inputs)

                val_loss = criterion(outputs, targets)

                total_val_loss += val_loss.item()
                val_count += 1

        # Per-epoch losses are means over batches, not over samples.
        train_losses.append(total_train_loss/train_count)
        val_losses.append(total_val_loss/val_count)
        scheduler.step()

    plt.semilogy(train_losses)
    plt.semilogy(val_losses)
    plt.legend(["Train loss", "Val. loss"])
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

    notify("Training Finished", "", model)

def test(model, device, test_loader):
    model.to(device)
    
    correct, total = 0, 0
    model.eval()
    
    with torch.no_grad():
        for batch in test_loader:
            inputs, targets = batch['features'], batch['target']
            inputs, targets = inputs.to(device), targets.to(device)
            
            outputs = model(inputs)
            
            # predicted = torch.argmax(outputs, dim=1)
            predicted = torch.topk(outputs, 2, dim=1).indices
            actual = torch.argmax(targets, dim=1)
            
            total += targets.size(0)
            correct += torch.sum(torch.any(predicted.eq(actual.unsqueeze(1)), dim=1)).item()
            
    acc = round(100 * correct / total, 4)
    return acc

In [15]:
# Settings reused by the later training cells in this notebook.
nepoch = 300
scheduler_step_size = 100
scheduler_gamma = 0.1

device = torch.device("cpu")
# Alternative: torch.device("mps" if torch.backends.mps.is_built() else "cpu")

# Loss function; train() reads it as a module-level name.
criterion = nn.CrossEntropyLoss()

# Base model: 6 encoder layers, 16 heads, width 128, dropout 0.5.
baseModel = TransformerModel(
    input_size=whiteFeatures.shape[1],
    output_size=len(classes),
    num_layers=6,
    num_heads=16,
    hidden_size=128,
    dropout=0.5,
)
baseOptimiser = torch.optim.Adam(baseModel.parameters(), lr=1e-4, weight_decay=1e-5)

train(
    model=baseModel,
    device=device,
    train_loader=train_loader,
    optimiser=baseOptimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

Epochs:   0%|          | 0/300 [00:00<?, ?epoch/s]

KeyboardInterrupt: 

In [None]:
# Evaluate the base model on the held-out white-wine test split.
white_base_acc = test(model=baseModel, device=device, test_loader=test_loader)
print("Base model top-2 accuracy on white wine dataset:", white_base_acc)

In [None]:
# Print a layer-by-layer summary of the base model. The example input shape is
# derived from the notebook's batch size and feature count rather than hard-coded.
import torchinfo
print(torchinfo.summary(
    baseModel,
    (batch_size, whiteFeatures.shape[1]),
    col_names = ("input_size", "output_size", "num_params"),
))

Computational Experiment 1: Testing base model generalisability on red wine dataset.

In [None]:
# Evaluate the white-trained base model on the full red-wine dataset.
red_base_acc = test(model=baseModel, device=device, test_loader=red_test_loader)
print("Base model top-2 accuracy on red wine dataset:", red_base_acc)

Computational Experiment 2: Comparing performance of transformer, feed-forward and CNN.

In [None]:
# Feed-forward baseline with the same optimiser settings and schedule as the base model.
FFModel = FeedForwardModel(
    input_size=whiteFeatures.shape[1],
    hidden_size=128,
    output_size=len(classes),
)
FFOptimiser = torch.optim.Adam(FFModel.parameters(), lr=1e-4, weight_decay=1e-5)

train(
    model=FFModel,
    device=device,
    train_loader=train_loader,
    optimiser=FFOptimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

In [None]:
# Evaluate the feed-forward model on the held-out white-wine test split.
white_FF_acc = test(model=FFModel, device=device, test_loader=test_loader)
print("Feed-forward model top-2 accuracy on white wine dataset:", white_FF_acc)

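The results are pretty similar. This makes sense, since transformers are best suited to sequential data because of their multi-head attention, whereas each wine sample here is a single feature vector rather than a sequence. It might not even be worth using a transformer, given its much longer training time.  \
    - ~6 mins to train the transformer \
    - feed-forward training time: TODO (not yet recorded)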

In [None]:
# CNN baseline with the same optimiser settings and schedule as the base model.
CNNModel = ConvModel(
    input_channels=1,
    output_size=len(classes),
    hidden_size=128,
)
CNNOptimiser = torch.optim.Adam(CNNModel.parameters(), lr=1e-4, weight_decay=1e-5)

train(
    model=CNNModel,
    device=device,
    train_loader=train_loader,
    optimiser=CNNOptimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

In [None]:
# Evaluate the CNN model on the held-out white-wine test split.
white_CNN_acc = test(model=CNNModel, device=device, test_loader=test_loader)
print("CNN model top-2 accuracy on white wine dataset:", white_CNN_acc)

You could call the three roughly even, since I spent the most time tuning the transformer model's hyperparameters to try to get it optimal. It's not how well each of them works; it's how well they work in unison.

Computational Experiment 3: Effect of dropout rate and number of encoder layers on the transformer model.

Dropout at 35%

In [None]:
# Base configuration with encoder dropout lowered to 0.35.
drop35Model = TransformerModel(
    input_size=whiteFeatures.shape[1],
    output_size=len(classes),
    num_layers=6,
    num_heads=16,
    hidden_size=128,
    dropout=0.35,
)
drop35Optimiser = torch.optim.Adam(drop35Model.parameters(), lr=1e-4, weight_decay=1e-5)
train(
    model=drop35Model,
    device=device,
    train_loader=train_loader,
    optimiser=drop35Optimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

In [None]:
# Evaluate the dropout-35% model on the held-out white-wine test split.
white_drop35_acc = test(model=drop35Model, device=device, test_loader=test_loader)
print("Dropout-35% model top-2 accuracy on white wine dataset:", white_drop35_acc)

Testing dropout at 20%.

In [None]:
# Base configuration with encoder dropout lowered to 0.2.
drop20Model = TransformerModel(
    input_size=whiteFeatures.shape[1],
    output_size=len(classes),
    num_layers=6,
    num_heads=16,
    hidden_size=128,
    dropout=0.2,
)
drop20Optimiser = torch.optim.Adam(drop20Model.parameters(), lr=1e-4, weight_decay=1e-5)
train(
    model=drop20Model,
    device=device,
    train_loader=train_loader,
    optimiser=drop20Optimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

In [None]:
# Evaluate the dropout-20% model on the held-out white-wine test split.
white_drop20_acc = test(model=drop20Model, device=device, test_loader=test_loader)
print("Dropout-20% model top-2 accuracy on white wine dataset:", white_drop20_acc)

Variations in number of layers, starting with 3 hidden layers

In [None]:
# Base configuration with 3 encoder layers.
hidden3Model = TransformerModel(
    input_size=whiteFeatures.shape[1],
    output_size=len(classes),
    num_layers=3,
    num_heads=16,
    hidden_size=128,
    dropout=0.5,
)
hidden3Optimiser = torch.optim.Adam(hidden3Model.parameters(), lr=1e-4, weight_decay=1e-5)
train(
    model=hidden3Model,
    device=device,
    train_loader=train_loader,
    optimiser=hidden3Optimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

In [None]:
# Evaluate the hidden-3 model on the held-out white-wine test split.
white_hidden3_acc = test(model=hidden3Model, device=device, test_loader=test_loader)
print("Hidden-3 model top-2 accuracy on white wine dataset:", white_hidden3_acc)

4 hidden layers

In [None]:
# Base configuration with 4 encoder layers.
hidden4Model = TransformerModel(
    input_size=whiteFeatures.shape[1],
    output_size=len(classes),
    num_layers=4,
    num_heads=16,
    hidden_size=128,
    dropout=0.5,
)
hidden4Optimiser = torch.optim.Adam(hidden4Model.parameters(), lr=1e-4, weight_decay=1e-5)
train(
    model=hidden4Model,
    device=device,
    train_loader=train_loader,
    optimiser=hidden4Optimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

In [None]:
# Evaluate the hidden-4 model on the held-out white-wine test split.
white_hidden4_acc = test(model=hidden4Model, device=device, test_loader=test_loader)
print("Hidden-4 model top-2 accuracy on white wine dataset:", white_hidden4_acc)

5 hidden layers

In [None]:
# Base configuration with 5 encoder layers.
hidden5Model = TransformerModel(
    input_size=whiteFeatures.shape[1],
    output_size=len(classes),
    num_layers=5,
    num_heads=16,
    hidden_size=128,
    dropout=0.5,
)
hidden5Optimiser = torch.optim.Adam(hidden5Model.parameters(), lr=1e-4, weight_decay=1e-5)
train(
    model=hidden5Model,
    device=device,
    train_loader=train_loader,
    optimiser=hidden5Optimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

In [None]:
# Evaluate the hidden-5 model on the held-out white-wine test split.
white_hidden5_acc = test(model=hidden5Model, device=device, test_loader=test_loader)
print("Hidden-5 model top-2 accuracy on white wine dataset:", white_hidden5_acc)

6 hidden layers

In [None]:
# 6 encoder layers: the same hyper-parameters as the base model, trained afresh.
hidden6Model = TransformerModel(
    input_size=whiteFeatures.shape[1],
    output_size=len(classes),
    num_layers=6,
    num_heads=16,
    hidden_size=128,
    dropout=0.5,
)
hidden6Optimiser = torch.optim.Adam(hidden6Model.parameters(), lr=1e-4, weight_decay=1e-5)
train(
    model=hidden6Model,
    device=device,
    train_loader=train_loader,
    optimiser=hidden6Optimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

In [None]:
# Evaluate the hidden-6 model on the held-out white-wine test split.
white_hidden6_acc = test(model=hidden6Model, device=device, test_loader=test_loader)
print("Hidden-6 model top-2 accuracy on white wine dataset:", white_hidden6_acc)

7 hidden layers

In [None]:
# Base configuration with 7 encoder layers.
hidden7Model = TransformerModel(
    input_size=whiteFeatures.shape[1],
    output_size=len(classes),
    num_layers=7,
    num_heads=16,
    hidden_size=128,
    dropout=0.5,
)
hidden7Optimiser = torch.optim.Adam(hidden7Model.parameters(), lr=1e-4, weight_decay=1e-5)
train(
    model=hidden7Model,
    device=device,
    train_loader=train_loader,
    optimiser=hidden7Optimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

In [None]:
# Evaluate the hidden-7 model on the held-out white-wine test split.
white_hidden7_acc = test(model=hidden7Model, device=device, test_loader=test_loader)
print("Hidden-7 model top-2 accuracy on white wine dataset:", white_hidden7_acc)

8 hidden layers

In [None]:
# Base configuration with 8 encoder layers.
hidden8Model = TransformerModel(
    input_size=whiteFeatures.shape[1],
    output_size=len(classes),
    num_layers=8,
    num_heads=16,
    hidden_size=128,
    dropout=0.5,
)
hidden8Optimiser = torch.optim.Adam(hidden8Model.parameters(), lr=1e-4, weight_decay=1e-5)
train(
    model=hidden8Model,
    device=device,
    train_loader=train_loader,
    optimiser=hidden8Optimiser,
    nepoch=nepoch,
    scheduler_step_size=scheduler_step_size,
    scheduler_gamma=scheduler_gamma,
)

In [None]:
# Evaluate the hidden-8 model on the held-out white-wine test split.
white_hidden8_acc = test(model=hidden8Model, device=device, test_loader=test_loader)
print("Hidden-8 model top-2 accuracy on white wine dataset:", white_hidden8_acc)