In [1]:
# This notebook contains a naive mlp for garbage classification
%load_ext autoreload
%autoreload 2
import torch
import torch.nn as nn
import utils
from vocab import Vocabulary
import pandas as pd
import numpy as np
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader

seed = 666
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [2]:
import os
EMB_DIM = 100 # embedding dimension to be used, choose from [50, 100, 200, 300]
glove_path = os.path.expanduser('~/Downloads/glove.6B.{}d.txt'.format(EMB_DIM))
embedding_dict = utils.load_glove_to_dict(glove_path)

400000it [00:09, 43553.46it/s]


In [3]:
MAX_SEQ_LEN = 10 # The maximum input sequence length, if longer, truncate; if shorter, pad

complete_csv_path = './preprocessed.csv'
complete_df = pd.read_csv(complete_csv_path)

complete_vocab = utils.build_vocab(complete_df)

wvecs = utils.build_wvecs(embedding_dict, complete_vocab)

train_csv_path = './train.csv'
val_csv_path = './val.csv'
test_csv_path = './test.csv'

In [4]:
# Load dataset
train_dataset = utils.GarbageDataset(train_csv_path, complete_vocab, MAX_SEQ_LEN)
val_dataset = utils.GarbageDataset(val_csv_path, complete_vocab, MAX_SEQ_LEN)
test_dataset = utils.GarbageDataset(test_csv_path, complete_vocab, MAX_SEQ_LEN)

# Construct dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=utils.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=utils.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=utils.collate_fn)

In [5]:
class baseline_mlp(nn.Module):
    # Baseline mlp for garbage classification
    def __init__(self, embedding_dim, vocab_size, max_seq_length=MAX_SEQ_LEN):
        super(baseline_mlp, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc1 = nn.Linear(embedding_dim * max_seq_length, 100)
        self.fc2 = nn.Linear(100, 10)
        self.out = nn.Linear(10, 4)
    
    def forward(self, sequence):
        batch_size = sequence.shape[0]
        embedded = self.embedding(sequence)
        embedded = embedded.view(batch_size, -1)
        z = self.fc1(embedded)
        z = self.fc2(z)
        z = self.out(z)
        return z


In [9]:
def eval(loader: DataLoader, model):
    """Evaluate the classification accuracy of model on the given dataset

    Args:
        loader (DataLoader): 
        model ([type])
    """
    correct_counts = 0
    class_counts = [0] * 4
    correct_counts_per_class = [0] * 4
    loss_sum = 0
    with torch.no_grad():
        for batch in loader:
            input, target = batch
            input, target = input.to(device), target.to(device)
            pred = model(input).squeeze(1)
            pred_classes = torch.argmax(pred, dim=1)
            for i in range(4):
                i_class_index = torch.where(target == i)[0]
                if len(i_class_index) != 0:
                    correct_counts_per_class[i] += torch.sum(pred_classes[i_class_index] == target[i_class_index]).item()
                    class_counts[i] += len(i_class_index)
            correct_counts += torch.sum(pred_classes == target).item()
            loss = loss_fn(pred, target.long())
            loss_sum += loss.item() * target.shape[0]
    return correct_counts / len(loader.dataset), loss_sum / len(loader.dataset), [x / y for x,y in zip(correct_counts_per_class, class_counts)], class_counts

In [11]:
use_cuda = False
device = torch.device("cuda:0" if use_cuda else "cpu") 

mlp = baseline_mlp(100, len(complete_vocab)).to(device)
mlp.embedding.weight.data.copy_(torch.from_numpy(np.array(wvecs)))
max_epoch = 100

optimizer = torch.optim.Adam(mlp.parameters(), lr=5e-4)

loss_fn = nn.CrossEntropyLoss()
loss_fn.to(device)

train_acc_stats = []
val_acc_stats = []
test_acc_stats = []
train_loss_stats = []
val_loss_stats = []
test_loss_stats = []

print('Start training')
for epoch in range(max_epoch):
    mlp.train()
    epoch_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input, target = batch
        input, target = input.to(device), target.to(device)
        pred = mlp(input).squeeze(1)
        loss = loss_fn(pred, target.long())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * target.shape[0]

    mlp.eval()

    val_acc, val_loss, val_acc_per_class, _ = eval(val_loader, mlp)
    train_acc, train_loss, train_acc_per_class, _ = eval(train_loader, mlp)
    test_acc, test_loss, test_acc_per_class, _ = eval(test_loader, mlp)

    val_acc_stats.append(val_acc)
    val_loss_stats.append(val_loss)
    train_acc_stats.append(train_acc)
    train_loss_stats.append(train_loss)
    test_acc_stats.append(test_acc)
    test_loss_stats.append(test_loss)

    print('Epoch:{}\tTrain Loss:{:4f}\tTrain Acc:{:4f}\tVal Loss:{:4f}\tVal Acc:{:4f}\tTest Loss:{:4f}\tTest Acc:{:4f}\t'.format(
        epoch+1, train_loss, train_acc, val_loss, val_acc,test_loss,test_acc))
        
    print('-----------------------------------')


Start training
Epoch:1	Train Loss:1.169547	Train Acc:0.443390	Val Loss:1.156818	Val Acc:0.432570	Test Loss:1.168200	Test Acc:0.453401	
Accuracy of each class on Training set. hazardous wast:0.322549	recyclable waste:0.000000	household food waste:0.000000	residual waste:0.952085
-----------------------------------
Epoch:2	Train Loss:1.041609	Train Acc:0.562302	Val Loss:1.083020	Val Acc:0.549618	Test Loss:1.096558	Test Acc:0.496222	
Accuracy of each class on Training set. hazardous wast:0.694118	recyclable waste:0.015000	household food waste:0.363190	residual waste:0.684117
-----------------------------------
Epoch:3	Train Loss:0.894273	Train Acc:0.639469	Val Loss:1.026546	Val Acc:0.524173	Test Loss:1.024932	Test Acc:0.554156	
Accuracy of each class on Training set. hazardous wast:0.827451	recyclable waste:0.085000	household food waste:0.683436	residual waste:0.535936
-----------------------------------
Epoch:4	Train Loss:0.750438	Train Acc:0.709045	Val Loss:0.991309	Val Acc:0.575064	Tes

Epoch:29	Train Loss:0.063201	Train Acc:0.977862	Val Loss:1.816156	Val Acc:0.643766	Test Loss:1.966788	Test Acc:0.632242	
Accuracy of each class on Training set. hazardous wast:0.984314	recyclable waste:0.980000	household food waste:0.974233	residual waste:0.974268
-----------------------------------
Epoch:30	Train Loss:0.061639	Train Acc:0.977546	Val Loss:1.842854	Val Acc:0.648855	Test Loss:2.002183	Test Acc:0.634761	
Accuracy of each class on Training set. hazardous wast:0.977451	recyclable waste:0.975000	household food waste:0.973006	residual waste:0.981366
-----------------------------------
Epoch:31	Train Loss:0.062124	Train Acc:0.977230	Val Loss:1.871040	Val Acc:0.636132	Test Loss:2.040410	Test Acc:0.642317	
Accuracy of each class on Training set. hazardous wast:0.970588	recyclable waste:0.975000	household food waste:0.973006	residual waste:0.986690
-----------------------------------
Epoch:32	Train Loss:0.063717	Train Acc:0.977230	Val Loss:1.898983	Val Acc:0.641221	Test Loss:2.07

Epoch:57	Train Loss:0.066755	Train Acc:0.977230	Val Loss:2.127520	Val Acc:0.643766	Test Loss:2.118255	Test Acc:0.647355	
Accuracy of each class on Training set. hazardous wast:0.961765	recyclable waste:0.940000	household food waste:0.992638	residual waste:0.986690
-----------------------------------
Epoch:58	Train Loss:0.065053	Train Acc:0.977546	Val Loss:2.109609	Val Acc:0.646310	Test Loss:2.111893	Test Acc:0.644836	
Accuracy of each class on Training set. hazardous wast:0.962745	recyclable waste:0.935000	household food waste:0.992638	residual waste:0.987578
-----------------------------------
Epoch:59	Train Loss:0.067055	Train Acc:0.976913	Val Loss:2.100516	Val Acc:0.648855	Test Loss:2.125758	Test Acc:0.647355	
Accuracy of each class on Training set. hazardous wast:0.962745	recyclable waste:0.925000	household food waste:0.990184	residual waste:0.989352
-----------------------------------
Epoch:60	Train Loss:0.070352	Train Acc:0.976597	Val Loss:2.103404	Val Acc:0.651399	Test Loss:2.14

Epoch:85	Train Loss:0.068605	Train Acc:0.976913	Val Loss:2.039411	Val Acc:0.633588	Test Loss:1.974716	Test Acc:0.644836	
Accuracy of each class on Training set. hazardous wast:0.953922	recyclable waste:0.985000	household food waste:0.980368	residual waste:0.993789
-----------------------------------
Epoch:86	Train Loss:0.069644	Train Acc:0.975965	Val Loss:2.033894	Val Acc:0.633588	Test Loss:1.966826	Test Acc:0.644836	
Accuracy of each class on Training set. hazardous wast:0.950980	recyclable waste:0.985000	household food waste:0.980368	residual waste:0.993789
-----------------------------------
Epoch:87	Train Loss:0.071182	Train Acc:0.974383	Val Loss:2.028983	Val Acc:0.636132	Test Loss:1.960965	Test Acc:0.644836	
Accuracy of each class on Training set. hazardous wast:0.947059	recyclable waste:0.985000	household food waste:0.980368	residual waste:0.992902
-----------------------------------
Epoch:88	Train Loss:0.073283	Train Acc:0.974067	Val Loss:2.025090	Val Acc:0.633588	Test Loss:1.95

In [None]:
import matplotlib.pyplot as plt
plt.figure()
E = np.arange(1, max_epoch+1,1)
plt.title('Learning Curve -- Loss')
plt.plot(E, train_loss_stats, label='Train')
plt.plot(E, val_loss_stats, label='Validation')
plt.plot(E, test_loss_stats, label='Test')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
import matplotlib.pyplot as plt
plt.figure()
E = np.arange(1, max_epoch+1,1)
plt.title('Learning Curve -- Accuracy')
plt.plot(E, train_acc_stats, label='Train')
plt.plot(E, val_acc_stats, label='Validation')
plt.plot(E, test_acc_stats, label='Test')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.show()