We will train two different models. One, a CNN, will use the written-digit training data to predict the label, while the other, an RNN, will use the spoken audio data to do the same. To make our final predictions, we will use, for each sample, the guess of whichever model exhibits the highest confidence. We will proceed first by training the CNN:

In [254]:
#import deps

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchvision import datasets, transforms
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import f1_score

In [255]:
#custom dataset wrapper
class CustomDataset(Dataset):
    """Dataset over a ``.npy`` feature array with optional CSV labels.

    Args:
        data_file: Path to a ``.npy`` file read with ``np.load``; the first
            axis indexes samples.
        labels_file: Path to a CSV file containing a ``label`` column.
            Required unless ``is_test`` is true.
        is_test: When true, no labels are loaded and each sample only carries
            the ``'data'`` entry.
        RNN: When false, every sample is reshaped to ``(1, 28, 28)``; when
            true, the sample keeps the shape it has in the array.

    Raises:
        ValueError: If labels are required but ``labels_file`` is missing, or
            if the number of labels differs from the number of samples.
    """

    def __init__(self, data_file, labels_file=None, is_test=False, RNN=False):
        self.RNN = RNN
        self.data = np.load(data_file)
        if is_test:
            self.labels = None
            return

        if labels_file is None:
            raise ValueError("labels_file is required when is_test is False")
        self.labels = pd.read_csv(labels_file)["label"]
        # A length mismatch would silently pair samples with the wrong labels.
        if len(self.labels) != len(self.data):
            raise ValueError(
                f"found {len(self.data)} samples but {len(self.labels)} labels"
            )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        features = self.data[idx]
        if not self.RNN:
            features = features.reshape(1, 28, 28)
        sample = {'data': torch.tensor(features, dtype=torch.float)}

        if self.labels is not None:
            # Positional lookup so labels follow the same indexing rules as
            # the numpy array (e.g. negative indices).
            sample['label'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)

        return sample


# helper function to retrieve and format data into loaders
def get_data_loaders(data_file, labels_file, batch_size=64, validation_size=0.2, RNN=False):
    """Build training and validation loaders over one labelled dataset.

    The sample indices are split with a stratified ``train_test_split``
    (fixed ``random_state=21``) and each part is served through its own
    ``SubsetRandomSampler`` over the same ``CustomDataset``.

    Args:
        data_file: Path forwarded to ``CustomDataset``.
        labels_file: Path forwarded to ``CustomDataset``.
        batch_size: Batch size of both loaders.
        validation_size: ``test_size`` passed to ``train_test_split``.
        RNN: Forwarded to ``CustomDataset``.

    Returns:
        ``(train_loader, val_loader)``.
    """
    full_dataset = CustomDataset(data_file, labels_file, RNN=RNN)

    every_index = np.arange(len(full_dataset))
    index_parts = train_test_split(
        every_index,
        test_size=validation_size,
        random_state=21,
        stratify=full_dataset.labels,
    )

    def make_loader(indices):
        # Each loader draws, in random order, only from its own index subset.
        sampler = torch.utils.data.SubsetRandomSampler(indices)
        return DataLoader(full_dataset, batch_size=batch_size, sampler=sampler)

    train_loader, val_loader = (make_loader(part) for part in index_parts)
    return train_loader, val_loader

In [256]:
# Load the labelled written-digit training set
img_dataset = CustomDataset(
    data_file="data/x_train_wr.npy",
    labels_file="data/y_train.csv",
)

In [257]:
#custom cnn implementation
class cnn_block(nn.Module):
    """Residual block made of two same-padded, bias-free convolutions.

    The first convolution maps ``in_channels`` to ``n_hidden`` feature maps
    and the second maps back to ``in_channels``. Each convolution is followed
    by batch normalisation, ReLU and dropout (p=0.2). ``forward`` returns the
    input plus the output of that stack, so the shape is unchanged.
    """

    def __init__(self, in_channels = 3, n_hidden = 5, kernel_size = (2, 2)):
        super().__init__()
        stages = []
        # (input channels, output channels) of the two convolution stages
        for src, dst in ((in_channels, n_hidden), (n_hidden, in_channels)):
            stages += [
                nn.Conv2d(src, dst, kernel_size, bias=False, padding='same'),
                nn.BatchNorm2d(dst),
                nn.ReLU(),
                nn.Dropout(p=0.2),
            ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        residual = self.layers(x)
        return x + residual


class linear_block(nn.Module):
    """Residual fully connected block.

    Projects ``in_features`` to ``n_hidden`` and back again, with ReLU after
    each linear layer and dropout (p=0.2) between them. ``forward`` adds the
    result to the input, so the feature size is unchanged.
    """

    def __init__(self, in_features, n_hidden):
        super().__init__()
        # Kept as an (in_features, n_hidden) pair, as stored originally.
        self.in_features = (in_features, n_hidden)
        expand = nn.Linear(in_features, n_hidden)
        contract = nn.Linear(n_hidden, in_features)
        self.layers = nn.Sequential(
            expand,
            nn.ReLU(),
            nn.Dropout(p=0.2),
            contract,
            nn.ReLU(),
        )

    def forward(self, x):
        residual = self.layers(x)
        return x + residual

class CNNClassifier(nn.Module):
    """Image classifier: residual conv blocks followed by residual linear blocks.

    Pipeline: three ``cnn_block`` modules, a 1x1 convolution down to a single
    channel, a flatten, two ``linear_block`` modules and a final linear layer
    producing ``n_classes`` outputs. The linear layers are sized for
    ``28*28`` flattened features.
    """

    def __init__(self, in_channels = 1, cnn_channels = 1, linear_hidden = 500, n_classes = 10, kernel_size = (3, 3)):
        super().__init__()
        flat_features = 28 * 28

        self.cnn_layers = nn.Sequential(
            *[cnn_block(in_channels, cnn_channels, kernel_size) for _ in range(3)]
        )

        # 1x1 convolution collapsing the feature maps to one channel
        self.down_sample = nn.Conv2d(in_channels, 1, kernel_size=(1, 1))

        self.linear_layers = nn.Sequential(
            *[linear_block(flat_features, linear_hidden) for _ in range(2)]
        )
        self.last_layer = nn.Linear(flat_features, n_classes)

        # Full forward pipeline assembled from the pieces above
        self.all = nn.Sequential(
            self.cnn_layers,
            self.down_sample,
            nn.Flatten(),
            self.linear_layers,
            self.last_layer,
        )

    def forward(self, x):
        scores = self.all(x)
        return scores

In [258]:
#helper function to train cnn
def train_cnn(model, train_loader, val_loader, optimizer, criterion, epochs=5, device='cpu'):
    """Train ``model`` for ``epochs`` epochs and print per-epoch statistics.

    Args:
        model: Module called as ``model(inputs)``.
        train_loader: Iterable of batches with ``'data'`` and ``'label'`` entries.
        val_loader: Iterable of batches with the same entries, used for the
            validation loss only.
        optimizer: Optimizer stepped once per training batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        epochs: Number of passes over ``train_loader``.
        device: Device the model and every batch are moved to.

    Prints the mean training loss, training accuracy and mean validation
    loss after each epoch. Returns ``None``.
    """
    model.to(device)
    for epoch_idx in range(epochs):
        epoch_no = epoch_idx + 1

        # Training pass
        model.train()
        running_loss = 0.0
        n_correct = 0
        n_seen = 0
        progress = tqdm(train_loader, desc=f"Epoch {epoch_no}/ {epochs}", unit="batch")
        for batch in progress:
            inputs = batch['data'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            predicted = outputs.max(dim=1).indices
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

        # Loss is averaged over batches, accuracy over samples
        train_loss = running_loss / len(train_loader)
        train_acc = n_correct / n_seen

        # Validation pass (no gradient tracking)
        model.eval()
        val_total = 0.0
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['data'].to(device)
                labels = batch['label'].to(device)
                val_total += criterion(model(inputs), labels).item()
        val_loss = val_total / len(val_loader)

        print(f"Epoch {epoch_no}/{epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, Validation Loss: {val_loss:.4f}")


In [259]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

#measures f1 score of model
def validate(model, val_loader, RNN=False):
    """Return the macro-averaged F1 score of ``model`` over ``val_loader``.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        val_loader: Iterable of batches with ``'data'`` and ``'label'`` entries.
        RNN: When true, the model is called as ``model(inputs, hidden)`` with
            ``hidden`` taken from ``model.initHidden()``; otherwise as
            ``model(inputs)``.

    Returns:
        The value of ``f1_score(..., average='macro')`` over all batches.
    """
    model.eval()

    # Evaluate on the device the model's weights already live on. Moving the
    # inputs to the global ``device`` instead fails when the model was trained
    # with the training helpers' default device ('cpu') on a CUDA machine.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for batch in val_loader:
            inputs = batch['data'].to(run_device)
            labels = batch['label']
            if RNN:
                hidden = model.initHidden().to(run_device)
                outputs = model(inputs, hidden)
            else:
                outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)

            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())

    return f1_score(np.array(true_labels), np.array(predicted_labels), average='macro')

In [260]:
# 80/20 train/validation split of the written-digit training data
batch_size = 64
validation_size = 0.2
train_loader, val_loader = get_data_loaders(
    "data/x_train_wr.npy",
    "data/y_train.csv",
    batch_size=batch_size,
    validation_size=validation_size,
)

The next portion of the code tries out different hyperparams by training different models on a small subset of the training data (1/5 of the training data) for only 3 epochs. It takes a while to run, but after experimenting, {'cnn_channels': 3, 'linear_hidden': 25, 'kernel_size': (5, 5)} TENDS to work the best (although other hyperparams are close) with an F1 score of roughly .94 (which is pretty good considering it's only training on a small subset of the total data). If you want to check my hyperparam comparison, you can simply change the variable below to True to run this process. Otherwise, we simply default to the previously described hyperparams.

In [261]:
waste_a_lot_of_time = False  # set to True to run the (slow) hyperparameter search

In [262]:
# Smaller split for hyperparameter comparisons: with validation_size=.8
# the models are fitted on only .2 of the given data.
hyper_loader, hyper_val_loader = get_data_loaders(
    "data/x_train_wr.npy",
    "data/y_train.csv",
    batch_size=batch_size,
    validation_size=.8,
)

In [263]:
#test different hyperparams 

import itertools

# Candidate values explored by the CNN hyperparameter search
hyperparameters = {
    'cnn_channels': [2, 3],  # Vary the number of channels in the CNN layers
    'linear_hidden': [25, 50],  # Vary the number of hidden units in linear layers
    'kernel_size': [(3, 3), (5, 5)]  # Vary the kernel size of the convolutional layers
}

# Perform hyperparameter tuning
best_f1 = 0.0
best_hyperparameters = {}
if waste_a_lot_of_time:
    # Start below any attainable score so the first candidate is always kept,
    # even if every model scores an F1 of 0.
    best_f1 = float("-inf")
    search_grid = itertools.product(
        hyperparameters['cnn_channels'],
        hyperparameters['linear_hidden'],
        hyperparameters['kernel_size'],
    )
    for cnn_channels, linear_hidden, kernel_size in search_grid:
        print(f"Training model: cnn_channels={cnn_channels}, linear_hidden={linear_hidden}, kernel_size={kernel_size}")
        model = CNNClassifier(cnn_channels=cnn_channels, linear_hidden=linear_hidden, kernel_size=kernel_size)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()
        # Train on the same device that validate() sends its inputs to.
        train_cnn(model, hyper_loader, hyper_val_loader, optimizer, criterion, epochs=3, device=device)
        val_f1 = validate(model, hyper_val_loader)
        print(f"Validation F1 for cnn_channels={cnn_channels}, linear_hidden={linear_hidden}, kernel_size={kernel_size}: {val_f1}")
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_hyperparameters = {'cnn_channels': cnn_channels, 'linear_hidden': linear_hidden, 'kernel_size': kernel_size}
else:
    # Search skipped: fall back to the previously selected configuration.
    best_hyperparameters = {'cnn_channels': 3, 'linear_hidden': 25, 'kernel_size': (5, 5)}

print("Best Hyperparameters:", best_hyperparameters)
if waste_a_lot_of_time:
    print("Best Validation F1:", best_f1)
else:
    # No model was evaluated, so there is no score to report.
    print("Best Validation F1: not evaluated (search skipped)")

Best Hyperparameters: {'cnn_channels': 3, 'linear_hidden': 25, 'kernel_size': (5, 5)}
Best Validation Accuracy: 0.0


In [264]:
#train model with selected hyperparams
best_model = CNNClassifier(
    cnn_channels=best_hyperparameters['cnn_channels'],
    linear_hidden=best_hyperparameters['linear_hidden'],
    kernel_size=best_hyperparameters['kernel_size'],
)
optimizer = optim.Adam(best_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# Train on the shared `device`: the later evaluation and prediction cells move
# their inputs there, so leaving the model on the default 'cpu' fails on a
# CUDA machine.
train_cnn(best_model, train_loader, val_loader, optimizer, criterion, epochs=8, device=device)

Epoch 1/ 5: 100%|██████████| 750/750 [00:26<00:00, 28.64batch/s]


Epoch 1/5, Train Loss: 0.3579, Train Accuracy: 0.8863, Validation Loss: 0.1557


Epoch 2/ 5: 100%|██████████| 750/750 [00:37<00:00, 20.09batch/s]


Epoch 2/5, Train Loss: 0.1774, Train Accuracy: 0.9444, Validation Loss: 0.1285


Epoch 3/ 5: 100%|██████████| 750/750 [00:46<00:00, 16.16batch/s]


Epoch 3/5, Train Loss: 0.1461, Train Accuracy: 0.9545, Validation Loss: 0.1156


Epoch 4/ 5: 100%|██████████| 750/750 [00:37<00:00, 19.87batch/s]


Epoch 4/5, Train Loss: 0.1284, Train Accuracy: 0.9594, Validation Loss: 0.1009


Epoch 5/ 5: 100%|██████████| 750/750 [00:33<00:00, 22.26batch/s]


Epoch 5/5, Train Loss: 0.1180, Train Accuracy: 0.9614, Validation Loss: 0.0996


In [265]:
#helper function to get the test data

def get_test_loader(data_file, batch_size=64, RNN=False):
    """Return an unshuffled loader over an unlabelled ``CustomDataset``.

    Args:
        data_file: Path forwarded to ``CustomDataset``.
        batch_size: Batch size of the loader.
        RNN: Forwarded to ``CustomDataset``.
    """
    unlabeled = CustomDataset(data_file, is_test=True, RNN=RNN)
    # shuffle=False keeps batches in the order of the rows in the file.
    return DataLoader(unlabeled, batch_size=batch_size, shuffle=False)

# Loader over the written-digit test set
test_data_loader = get_test_loader(data_file="data/x_test_wr.npy")

In [266]:
#get predicted labels on the test data

best_model.eval()  # set the mode to eval
# Run on the device the trained weights live on, so inputs and model match.
cnn_device = next(best_model.parameters()).device
predicted_labels = []
confidence_scores = []

with torch.no_grad():
    for batch in test_data_loader:
        inputs = batch['data'].to(cnn_device)
        outputs = best_model(inputs)
        probabilities = nn.functional.softmax(outputs, dim=1)
        # The highest class probability is the confidence and its index is the
        # predicted label. Reducing along dim=1 always yields 1-D tensors, so
        # a final batch of size 1 is handled (a bare .squeeze() would give a
        # 0-d array that extend() cannot iterate).
        max_probabilities, predicted = torch.max(probabilities, 1)
        predicted_labels.extend(predicted.cpu().numpy())
        confidence_scores.extend(max_probabilities.cpu().numpy())

# Convert predictions to numpy array
predicted_labels = np.array(predicted_labels)
confidence_scores = np.array(confidence_scores)

In [267]:
# Macro F1 of the trained CNN on its validation loader
cnn_f1 = validate(best_model, val_loader)

Now we will turn to the audio data, which we will classify using a custom RNN model.

In [268]:
#custom wrapper to format audio data

class AudioDataset(Dataset):
    """Dataset over a ``.npy`` feature array with optional CSV labels.

    Each sample's ``'data'`` tensor gets an extra leading axis of size 1.

    Args:
        data_file: Path to a ``.npy`` file read with ``np.load``; the first
            axis indexes samples.
        labels_file: Path to a CSV file containing a ``label`` column.
            Required unless ``is_test`` is true.
        is_test: When true, no labels are loaded and each sample only carries
            the ``'data'`` entry.

    Raises:
        ValueError: If labels are required but ``labels_file`` is missing, or
            if the number of labels differs from the number of samples.
    """

    def __init__(self, data_file, labels_file=None, is_test=False):
        self.data = np.load(data_file)
        if is_test:
            self.labels = None
            return

        if labels_file is None:
            raise ValueError("labels_file is required when is_test is False")
        self.labels = pd.read_csv(labels_file)["label"]
        # A length mismatch would silently pair samples with the wrong labels.
        if len(self.labels) != len(self.data):
            raise ValueError(
                f"found {len(self.data)} samples but {len(self.labels)} labels"
            )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = {'data': torch.tensor(self.data[idx], dtype=torch.float).unsqueeze(0)}

        if self.labels is not None:
            # Positional lookup so labels follow the same indexing rules as
            # the numpy array (e.g. negative indices).
            sample['label'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)

        return sample

In [269]:

#train rnn
def train_rnn(model, train_loader, val_loader, optimizer, criterion, epochs=8, device='cpu'):
    """Train a hidden-state model for ``epochs`` epochs and print statistics.

    Args:
        model: Module called as ``model(inputs, hidden)`` that also provides
            ``initHidden()``.
        train_loader: Iterable of batches with ``'data'`` and ``'label'`` entries.
        val_loader: Iterable of batches with the same entries, used for the
            validation loss only.
        optimizer: Optimizer stepped once per training batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        epochs: Number of passes over ``train_loader``.
        device: Device the model, hidden state and batches are moved to.

    Prints the mean training loss, training accuracy and mean validation
    loss after each epoch. Returns ``None``.
    """
    model.to(device)
    for epoch_idx in range(epochs):
        epoch_no = epoch_idx + 1

        # Training pass
        model.train()
        running_loss = 0.0
        n_correct = 0
        n_seen = 0
        progress = tqdm(train_loader, desc=f"Epoch {epoch_no}/ {epochs}", unit="batch")
        for batch in progress:
            inputs = batch['data'].to(device)
            labels = batch['label'].to(device)

            # A fresh hidden state is created for every batch
            hidden = model.initHidden().to(device)

            optimizer.zero_grad()
            outputs = model(inputs, hidden)
            labels = labels.long()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            predicted = outputs.max(dim=1).indices
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

        # Loss is averaged over batches, accuracy over samples
        train_loss = running_loss / len(train_loader)
        train_acc = n_correct / n_seen

        # Validation pass (no gradient tracking)
        model.eval()
        val_total = 0.0
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['data'].to(device)
                labels = batch['label'].to(device)
                hidden = model.initHidden().to(device)
                val_total += criterion(model(inputs, hidden), labels).item()
        val_loss = val_total / len(val_loader)

        print(f"Epoch {epoch_no}/{epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, Validation Loss: {val_loss:.4f}")


In [270]:
# Load the labelled audio data and build stratified train/validation loaders
dataset = AudioDataset("data/x_train_sp.npy", "data/y_train.csv")

train_indices, val_indices = train_test_split(
    np.arange(len(dataset)),
    test_size=validation_size,
    random_state=21,
    stratify=dataset.labels,
)

# Each loader samples, in random order, only from its own index subset
train_loader = DataLoader(
    dataset,
    sampler=torch.utils.data.SubsetRandomSampler(train_indices),
    batch_size=batch_size,
)
val_loader = DataLoader(
    dataset,
    sampler=torch.utils.data.SubsetRandomSampler(val_indices),
    batch_size=batch_size,
)

In [271]:
# custom RNN implementation

import torch.nn.functional as F

class RNN(nn.Module):
    """Classifier that combines an input projection with a hidden-state projection.

    ``forward`` adds a linear projection of the input to a linear projection
    of the supplied hidden state, applies tanh, passes the result through two
    more tanh-activated linear layers, and returns log-probabilities over
    ``output_size`` classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.middle_layer1 = nn.Linear(hidden_size, hidden_size)  # applied to the hidden state
        self.middle_layer2 = nn.Linear(hidden_size, hidden_size)
        self.middle_layer3 = nn.Linear(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        # Mix the projected input with the projected hidden state
        state = F.tanh(self.input_layer(input) + self.middle_layer1(hidden))
        for layer in (self.middle_layer2, self.middle_layer3):
            state = F.tanh(layer(state))
        # Drop axis 1 (when it has size 1) before normalising over classes
        scores = self.output_layer(state).squeeze(1)
        return F.log_softmax(scores, dim=1)

    def initHidden(self):
        """Return a zero hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros((1, self.hidden_size))
    


Now we will perform hyperparam tuning. This is done by training several models using different combinations of hyperparams on a small subset of the data. Like last time, it takes quite a while. By default, this process will not run unless the variable below is changed to True. Otherwise, we will default to {'learning_rate': 0.001, 'linear_hidden': 500}, which tends to perform the best.

In [272]:
waste_a_lot_of_time = False  # set to True to run the (slow) hyperparameter search

In [273]:
# Data for hyperparameter tuning. With test_size=.8 only 20% of the samples
# end up in hyper_loader (used for fitting); the remaining 80% go to
# hyper_val_loader.
dataset = AudioDataset("data/x_train_sp.npy", "data/y_train.csv")

train_indices, val_indices = train_test_split(
    np.arange(len(dataset)),
    test_size=.8,
    random_state=21,
    stratify=dataset.labels,
)

# Each loader samples, in random order, only from its own index subset
hyper_loader = DataLoader(
    dataset,
    sampler=torch.utils.data.SubsetRandomSampler(train_indices),
    batch_size=batch_size,
)
hyper_val_loader = DataLoader(
    dataset,
    sampler=torch.utils.data.SubsetRandomSampler(val_indices),
    batch_size=batch_size,
)

In [274]:
# Perform hyperparameter tuning

import itertools

# Candidate values explored by the RNN hyperparameter search
hyperparameters = {
    'learning_rate': [.001, .005, .01],
    'linear_hidden': [400, 500, 600],
}

best_f1 = 0.0
best_hyperparameters = {}
if waste_a_lot_of_time:
    # Start below any attainable score so the first candidate is always kept,
    # even if every model scores an F1 of 0.
    best_f1 = float("-inf")
    search_grid = itertools.product(
        hyperparameters['learning_rate'],
        hyperparameters['linear_hidden'],
    )
    for learning_rate, linear_hidden in search_grid:
        print(f"Training model: hidden_size={linear_hidden}, learning_rate={learning_rate}")
        model = RNN(input_size=507, hidden_size=linear_hidden, output_size=10)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()
        # Train on the same device that validate() sends its inputs to.
        train_rnn(model, hyper_loader, hyper_val_loader, optimizer, criterion, epochs=3, device=device)
        val_f1 = validate(model, hyper_val_loader, RNN=True)
        print(f"Validation F1 for hidden_size={linear_hidden}, learning_rate={learning_rate}: {val_f1}")
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_hyperparameters = {'learning_rate': learning_rate, 'linear_hidden': linear_hidden}
else:
    # Search skipped: fall back to the previously selected configuration.
    best_hyperparameters =  {'learning_rate': .001, 'linear_hidden': 500}

print("Best Hyperparameters:", best_hyperparameters)
if waste_a_lot_of_time:
    print("Best Validation F1:", best_f1)
else:
    # No model was evaluated, so there is no score to report.
    print("Best Validation F1: not evaluated (search skipped)")

Best Hyperparameters: {'learning_rate': 0.001, 'linear_hidden': 500}
Best Validation Accuracy: 0.0


In [275]:
#train the final rnn model on the chosen hyperparams 
model = RNN(507, best_hyperparameters["linear_hidden"], 10)
optimizer = optim.Adam(model.parameters(), lr=best_hyperparameters["learning_rate"])
criterion = nn.CrossEntropyLoss()
# Train on the shared `device`: the later evaluation and prediction cells move
# their inputs and hidden state there, so leaving the model on the default
# 'cpu' fails on a CUDA machine.
train_rnn(
    model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=14,
    device=device,
)


Epoch 1/ 14:   0%|          | 0/750 [00:00<?, ?batch/s]

Epoch 1/ 14: 100%|██████████| 750/750 [00:08<00:00, 86.17batch/s]


Epoch 1/14, Train Loss: 1.7184, Train Accuracy: 0.3945, Validation Loss: 1.6881


Epoch 2/ 14: 100%|██████████| 750/750 [00:08<00:00, 93.55batch/s] 


Epoch 2/14, Train Loss: 1.4643, Train Accuracy: 0.5009, Validation Loss: 1.3589


Epoch 3/ 14: 100%|██████████| 750/750 [00:08<00:00, 91.05batch/s]


Epoch 3/14, Train Loss: 1.3191, Train Accuracy: 0.5495, Validation Loss: 1.2583


Epoch 4/ 14: 100%|██████████| 750/750 [00:08<00:00, 90.86batch/s]


Epoch 4/14, Train Loss: 1.2050, Train Accuracy: 0.5854, Validation Loss: 1.2295


Epoch 5/ 14: 100%|██████████| 750/750 [00:08<00:00, 91.18batch/s]


Epoch 5/14, Train Loss: 1.0391, Train Accuracy: 0.6435, Validation Loss: 1.0460


Epoch 6/ 14: 100%|██████████| 750/750 [00:08<00:00, 89.31batch/s]


Epoch 6/14, Train Loss: 0.9219, Train Accuracy: 0.6828, Validation Loss: 0.9091


Epoch 7/ 14: 100%|██████████| 750/750 [00:08<00:00, 90.54batch/s]


Epoch 7/14, Train Loss: 0.8456, Train Accuracy: 0.7119, Validation Loss: 0.8387


Epoch 8/ 14: 100%|██████████| 750/750 [00:08<00:00, 91.82batch/s]


Epoch 8/14, Train Loss: 0.7778, Train Accuracy: 0.7315, Validation Loss: 0.6781


Epoch 9/ 14: 100%|██████████| 750/750 [00:09<00:00, 79.94batch/s]


Epoch 9/14, Train Loss: 0.7397, Train Accuracy: 0.7473, Validation Loss: 0.8252


Epoch 10/ 14: 100%|██████████| 750/750 [00:08<00:00, 85.20batch/s]


Epoch 10/14, Train Loss: 0.7010, Train Accuracy: 0.7599, Validation Loss: 0.6888


Epoch 11/ 14: 100%|██████████| 750/750 [00:09<00:00, 80.93batch/s]


Epoch 11/14, Train Loss: 0.6511, Train Accuracy: 0.7757, Validation Loss: 0.6187


Epoch 12/ 14: 100%|██████████| 750/750 [00:08<00:00, 86.21batch/s]


Epoch 12/14, Train Loss: 0.6236, Train Accuracy: 0.7855, Validation Loss: 0.6697


Epoch 13/ 14: 100%|██████████| 750/750 [00:08<00:00, 87.89batch/s]


Epoch 13/14, Train Loss: 0.5941, Train Accuracy: 0.7961, Validation Loss: 0.6093


Epoch 14/ 14: 100%|██████████| 750/750 [00:08<00:00, 85.71batch/s]


Epoch 14/14, Train Loss: 0.5712, Train Accuracy: 0.8041, Validation Loss: 0.6711


In [276]:
# Macro F1 of the trained RNN on the audio validation loader
rnn_f1 = validate(model, val_loader, RNN=True)

Now we will use the model to get our prediction and confidence levels for each sample in the test data

In [277]:
#get and format the test data
test_dataset = AudioDataset("data/x_test_sp.npy", is_test=True)

# Batch the test set (the DataLoader default is one sample per batch, i.e. one
# forward pass per sample). shuffle=False keeps predictions in row order.
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [278]:
#get the predictions and confidences
model.eval()  # set the mode to eval
# Run on the device the trained weights live on, so inputs and model match.
rnn_device = next(model.parameters()).device
rnn_predicted_labels = []
rnn_confidence_scores = []

with torch.no_grad():
    for batch in test_data_loader:
        inputs = batch['data'].to(rnn_device)
        hidden = model.initHidden().to(rnn_device)
        outputs = model(inputs, hidden)
        probabilities = nn.functional.softmax(outputs, dim=1)
        # Highest class probability = confidence; its index = predicted label
        max_probabilities, predicted = torch.max(probabilities, 1)
        rnn_predicted_labels.extend(predicted.cpu().numpy())
        rnn_confidence_scores.extend(max_probabilities.cpu().numpy())

# Convert the RNN's own predictions to numpy arrays. (Converting the CNN's
# `predicted_labels` / `confidence_scores` here would discard the RNN results.)
rnn_predicted_labels = np.array(rnn_predicted_labels)
rnn_confidence_scores = np.array(rnn_confidence_scores)


In [279]:
#make final predictions by using the model which produces the highest confidence score
# Both models must have scored the same test rows, otherwise the element-wise
# comparison below would pair up unrelated samples.
if not (len(rnn_predicted_labels) == len(rnn_confidence_scores)
        == len(predicted_labels) == len(confidence_scores)):
    raise ValueError("CNN and RNN prediction arrays must have the same length")

#will use a weighted average for selection, as I don't want to treat the confidence levels the same since the CNN reliably outperforms the RNN most of the time
# Each confidence is scaled by its model's validation F1; the RNN's label is
# used only where its weighted confidence is strictly higher.
prefer_rnn = (rnn_f1 * np.asarray(rnn_confidence_scores)) > (cnn_f1 * np.asarray(confidence_scores))
final_predictions = np.where(prefer_rnn, rnn_predicted_labels, predicted_labels)

In [280]:
# Write one row per test sample: its position and the chosen integer label
row_ids = np.arange(len(predicted_labels))
chosen_labels = final_predictions.astype(int).flatten()
predictions_df = pd.DataFrame({'row_id': row_ids, 'label': chosen_labels})
predictions_df.to_csv('Caleb Elizondo preds.csv', index=False)