In [1]:
import requests, re, time
import torch, torchvision
from torch import nn, optim
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt

In [2]:
#load dataset into directory from drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


According to the second link cited in the proposal, a grayscale transofrm on top of resizing has given the best results when comparing sentiments on faces


In [None]:
torchvision.transforms.Grayscale(num_output_channels=3)
torchvision.transforms.RandomPerspective(distortion_scale=0.5, p=0.5, interpolation=2, fill=0)

In [16]:
xform = transforms.Compose([transforms.Resize((224,224)), transforms.ToTensor(), torchvision.transforms.RandomPerspective(distortion_scale=0.5, p=0.5, interpolation=2, fill=0), torchvision.transforms.Grayscale(num_output_channels=3)])
dataset_full = datasets.ImageFolder('/content/drive/MyDrive/dataset', transform=xform)

  "Argument interpolation should be of type InterpolationMode instead of int. "


In [17]:
dataset_full

Dataset ImageFolder
    Number of datapoints: 2223
    Root location: /content/drive/MyDrive/dataset
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear)
               ToTensor()
               RandomPerspective(p=0.5)
               Grayscale(num_output_channels=3)
           )

In [18]:
dataset_full.class_to_idx

{'angry': 0, 'happy': 1, 'neutral': 2, 'sad': 3}

In [19]:
dataset_full.classes

['angry', 'happy', 'neutral', 'sad']

In [254]:
n_all = len(dataset_full)
n_train = int(0.8 * n_all)
n_test = n_all - n_train
rng = torch.Generator().manual_seed(1549)
dataset_train, dataset_test = torch.utils.data.random_split(dataset_full, [n_train, n_test], rng)
loader_train = torch.utils.data.DataLoader(dataset_train, batch_size = 4, shuffle=True)
loader_test = torch.utils.data.DataLoader(dataset_test, batch_size = 4, shuffle=True) #prepare dataset by splitting, same as usual

In [255]:
model = models.resnet18(pretrained=True)

In [256]:
model.fc = nn.Linear(model.fc.in_features, 4)
torch.nn.init.xavier_uniform_(model.fc.weight)

Parameter containing:
tensor([[-0.0215,  0.0254, -0.0966,  ...,  0.0350,  0.0519, -0.0437],
        [-0.0850, -0.0339, -0.0252,  ...,  0.0672, -0.0673, -0.0990],
        [ 0.0764,  0.0458,  0.0777,  ...,  0.0392, -0.0716,  0.0450],
        [ 0.0526, -0.0382,  0.0939,  ..., -0.0328,  0.0152,  0.0732]],
       requires_grad=True)

In [257]:
torch.cuda.device_count()

1

In [258]:
device = torch.device('cuda:0')
model = model.to(device)

as done in A2, need to modify layers to attain better accuracies later on. could experiment on raw resnet first and modify fc layers later on.

In [259]:
criterion = nn.CrossEntropyLoss()

def run_test(model):
    nsamples_test = len(dataset_test)
    loss, correct = 0, 0
    model.eval()
    with torch.no_grad():
        for samples, labels in loader_test:
            samples = samples.to(device)
            labels = labels.to(device)
            outs = model(samples)
            loss += criterion(outs, labels)
            _, preds = torch.max(outs.detach(), 1)
            correct_mask = preds == labels
            correct += correct_mask.sum(0).item()
    return loss / nsamples_test, correct / nsamples_test

In [260]:
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [261]:
def run_train(model, opt, sched):
    nsamples_train = len(dataset_train)
    loss_sofar, correct_sofar = 0, 0
    model.train()
    with torch.enable_grad():
        for samples, labels in loader_train:
            samples = samples.to(device)
            labels = labels.to(device)
            opt.zero_grad()
            outs = model(samples)
            _, preds = torch.max(outs.detach(), 1)
            loss = criterion(outs, labels)
            loss.backward()
            opt.step()
            loss_sofar += loss.item() * samples.size(0)
            correct_sofar += torch.sum(preds == labels.detach())
    sched.step()
    return loss_sofar / nsamples_train, correct_sofar / nsamples_train

In [262]:
def run_all(model, optimizer, scheduler, n_epochs):
    for epoch in range(n_epochs):
        loss_train, acc_train = run_train(model, optimizer, scheduler)
        loss_test, acc_test = run_test(model)
        print(f"epoch {epoch}: train loss {loss_train:.4f} acc {acc_train:.4f}, test loss {loss_test:.4f} acc {acc_test:.4f}")

In [234]:
run_test(model)

  "Palette images with Transparency expressed in bytes should be "


(tensor(0.4698, device='cuda:0'), 0.26292134831460673)

In [235]:
run_all(model, optimizer, scheduler, 10)

  "Palette images with Transparency expressed in bytes should be "


epoch 0: train loss 1.5754 acc 0.4308, test loss 0.2541 acc 0.5798
epoch 1: train loss 1.1229 acc 0.5804, test loss 0.1876 acc 0.7438
epoch 2: train loss 0.9038 acc 0.6659, test loss 0.1933 acc 0.6989
epoch 3: train loss 0.7383 acc 0.7351, test loss 0.1897 acc 0.7079
epoch 4: train loss 0.6514 acc 0.7688, test loss 0.1707 acc 0.7124
epoch 5: train loss 0.4437 acc 0.8498, test loss 0.1059 acc 0.8337
epoch 6: train loss 0.3737 acc 0.8712, test loss 0.0990 acc 0.8494
epoch 7: train loss 0.3583 acc 0.8768, test loss 0.0962 acc 0.8629
epoch 8: train loss 0.3260 acc 0.8915, test loss 0.0968 acc 0.8472
epoch 9: train loss 0.3285 acc 0.8881, test loss 0.0815 acc 0.8831


Exploring hyperparameters space over 10 epochs:

| learning rate | momentum | step size | gamma | minibatch | test loss | test acc | best epoch |
|---------------|----------|-----------|-------|-----------|-----------|----------|------------|
| 0.0001           | 0.9      | 5       | 0.1   | 4       | 0.1608       | 0.7439      | 8        |
| 0.001           | 0.9      | 5       | 0.1   | 4       | 0.0815       | 0.8831      | 9        |
| 0.01           | 0.9      | 5       | 0.1   | 4       | 0.3889       | 0.2966      | 8       |
| 0.001           | 0.9      | 5       | 0.1   | 8       | 0.0468       | 0.8719      | 9       |
| 0.001           | 0.9      | 5       | 0.1   | 16       | 0.0260       | 0.8539      | 9       |
| 0.001           | 0.7      | 5       | 0.1   | 8       | 0.0592       | 0.8189      | 9       |
| 0.001           | 0.9      | 5       | 0.5   | 8       | 0.0391      | 0.8649      | 9       |
| 0.001           | 0.9      | 5       | 0.8   | 8       | 0.0611      | 0.8472      | 9       |
| 0.001           | 0.9      | 8       | 0.1   | 8       | 0.0593      | 0.8337      | 9       |
| 0.001           | 0.9      | 2       | 0.1   | 8       | 0.0587      | 0.8157      | 9       |



-Using grayscale and perspective data augmentation has increased accuracy by 2%. grayscale seems to be regularly used in facial sentiment analysis over multiple studies. 

-using 0.01 learning rate seems to lead to underfitting and not training over the sample space. 

-replacing the single FC layer with a sequential container of 3 linear layers with 2 non linear layers decreases accuracy by 2-3%.