## Library

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import os
import torch
from tqdm import tqdm
import json
import time
import requests
import torchaudio
import numpy as np
import pickle
import cv2
from PIL import Image
from torchvision import transforms
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import math
import torch.nn as nn
import random
from torch import optim
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
root = '/content/drive/MyDrive/MSc/Thesis'

# Dataset

In [5]:
def get_names(vid, id):
    name = ""
    if id>=0 and id<10:
        name = f"{vid}/0000" + str(id) + ".jpg"
    elif id>=10 and id<100:
        name = f"{vid}/000" + str(id) + ".jpg"
    elif id>=100 and id<1000:
        name = f"{vid}/00" + str(id) + ".jpg"
    elif id>=1000 and id<10000:
        name = f"{vid}/0" + str(id) + ".jpg"
    else:
        name = f"{vid}/" + str(id) + ".jpg"
    return name

In [6]:
class ABAW_dataset1(Dataset):
    def __init__(self, data, iname, dims, task):
        self.data = data
        self.iname = iname
        self.task = task
        self.feature_dims = dims
    def __getitem__(self, index):
        frame = self.iname[index]
        vname = frame.split('/')[0]
        data = self.data[self.task][vname][frame]
        data['frame'] = frame
        data['vid'] = vname
        data['label'] = self.data[self.task][vname][frame]['label']
        return data

    def __len__(self):
            return self.feature_dims

# Metrics

### Compute loss

In [7]:
def compute_EXP_loss(pred, label, weights):
    cri_exp = nn.CrossEntropyLoss(weights)
    cls_loss = cri_exp(pred, label)
    return cls_loss

In [8]:
def compute_AU_loss(pred, label, class_weights):
    class_weights = torch.from_numpy(class_weights).to(device)
    bce = F.binary_cross_entropy_with_logits(pred, label.float(), reduction='none')
    weights = (class_weights[:, 0]**(1 - label)) * (class_weights[:, 1]**label)
    weighted_bce = bce * weights
    loss = torch.mean(weighted_bce)
    return loss

In [9]:
def CCC_loss(x, y):
    x, y = x.view(-1), y.view(-1)
    vx = x - torch.mean(x)
    vy = y - torch.mean(y)
    rho =  torch.sum(vx * vy) / (torch.sqrt(torch.sum(torch.pow(vx, 2))) * torch.sqrt(torch.sum(torch.pow(vy, 2)))+1e-8)
    x_m, y_m = torch.mean(x), torch.mean(y)
    x_s, y_s = torch.std(x), torch.std(y)
    ccc = 2*rho*x_s*y_s/(torch.pow(x_s, 2) + torch.pow(y_s, 2) + torch.pow(x_m - y_m, 2))
    return 1-ccc

In [10]:
def compute_VA_loss(Vout,Aout,label):
    ccc_loss = CCC_loss(Vout[:,0],label[:,0]) + CCC_loss(Aout[:,0],label[:,1])
    mse_loss = nn.MSELoss()(Vout,label[:,0]) + nn.MSELoss()(Aout,label[:,1])
    return mse_loss,ccc_loss

### Compute F1 score

In [11]:
def compute_EXP_F1(pred, target):
    pred_labels = np.argmax(pred, axis=1)
    target_labels = np.argmax(target, axis=1)
    macro_f1 = f1_score(target_labels,pred_labels,average='macro')
    acc = accuracy_score(target_labels, pred_labels)
    return macro_f1, acc

In [12]:
def f1s_max_AU(label, pred, thresh, i=0):
    pred = np.array(pred)
    label = np.array(label)
    label = label[:,i]
    pred = pred[:,i]
    acc = []
    F1 = []
    for i in thresh:
        new_pred = ((pred >= i) * 1).flatten()
        acc.append(accuracy_score(label.flatten(), new_pred))
        F1.append(f1_score(label.flatten(), new_pred))

    F1_MAX = max(F1)
    if F1_MAX < 0 or math.isnan(F1_MAX):
        F1_MAX = 0
        F1_THRESH = 0
        accuracy = 0
    else:
        idx_thresh = np.argmax(F1)
        F1_THRESH = thresh[idx_thresh]
        accuracy = acc[idx_thresh]
    return F1, F1_MAX, F1_THRESH, accuracy

In [13]:
def compute_AU_F1(pred,label,thresh=np.arange(0.1,1,0.1)):
    F1s = []
    F1t = []
    acc = []
    for i in range(12):
        F1, F1_MAX, F1_THRESH, accuracy = f1s_max_AU(label,pred,thresh,i)
        F1s.append(F1_MAX)
        F1t.append(F1_THRESH)
        acc.append(accuracy)
    acc = [round(a,3) for a in acc]
    return np.mean(F1s),np.mean(F1t),acc, F1t

### Concordance Correlation Coefficient

In [14]:
def CCC_score(x, y):
    x = np.array(x)
    y = np.array(y)
    vx = x - np.mean(x)
    vy = y - np.mean(y)
    rho = np.sum(vx * vy) / (np.sqrt(np.sum(vx**2)) * np.sqrt(np.sum(vy**2)))
    x_m = np.mean(x)
    y_m = np.mean(y)
    x_s = np.std(x)
    y_s = np.std(y)
    ccc = 2*rho*x_s*y_s/(x_s**2 + y_s**2 + (x_m - y_m)**2)
    return ccc

In [15]:
def compute_VA_CCC(x,y):
    x = np.array(x)
    y = np.array(y)
    x[x>1] = 1
    x[x<-1] = -1
    ccc1 = CCC_score(x[:,0],y[:,0])
    ccc2 = CCC_score(x[:,1],y[:,1])

    return ccc1,ccc2

# Smooth utils

In [31]:
def smooth_prediction(img, predict):
    cur_ind = 0
    preds_proba = []
    if img:
        for i in range(img[-1]):
            if img[cur_ind] - 1 == i:
                preds_proba.append(predict[cur_ind])
                cur_ind += 1
            else:
                if cur_ind == 0:
                    preds_proba.append(predict[cur_ind])
                else:
                    w = (i - img[cur_ind - 1] + 1) / (img[cur_ind] - img[cur_ind - 1])
                    pred = w * predict[cur_ind - 1] + (1 - w) * predict[cur_ind]
                    preds_proba.append(pred)
        return np.array([p.cpu().detach().numpy() for p in preds_proba]) #np.array(preds_proba)

In [17]:
def slide_window(preds_proba, i, delta, typ):
    i1 = max(i - delta, 0)
    if typ == 'mean':
        proba = np.mean(preds_proba[i1:i+delta+1], axis=0)
    elif typ == 'median':
        proba = np.median(preds_proba[i1:i+delta+1], axis=0)
    else:
        proba = np.mean(preds_proba[i1:i+delta+1:int(typ)], axis=0)
    return np.argmax(proba), proba

# Challenges

In [18]:
task = ['EXPR_Recognition_Challenge','AU_Detection_Challenge','VA_Estimation_Challenge']
split = ['Train_Set', 'Validation_Set']
typ = ['Train','Val','Test']
vis_typ = ['cropped_aligned', 'cropped', 'cropped_aligned_b0']
visual_feat = 'visualfeat_enet_b2_8_best'
visual_feat_1 = 'visualfeat_enet_b0_8_va_mtl'
audio_feat = ['audiofeat_wav2vec2','audiofeat_vggish','nope']
vis_aud = ['visual_wav2vec2','visual_vggish','visual']
batch_size = 32
model_type = ['fusion', 'mlp']

## EXPR Recognition Challenge

### Loading data

In [19]:
# Cropped_aligned images
vis = vis_typ[0]

#### Effnet + wav2vec2

In [20]:
auft = audio_feat[0]
viau = vis_aud[0]

#### Effnet + vggish

In [None]:
auft = audio_feat[1]
viau = vis_aud[1]

#### Effnet

In [None]:
auft = audio_feat[2]
viau = vis_aud[2]

#### Test

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/EXPR/{task[0]}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data3 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[0]
with open(os.path.join(root, f'data/Annotations/{task[0]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data3[task1][vname])
    for img in data3[task1][vname].keys():
        iname.append(img)

100%|██████████| 70/70 [00:00<00:00, 1553.79it/s]


In [None]:
dataset = ABAW_dataset1(data3, iname, dims, task1)
test_loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=False)

### Modeling

#### Transformer encoder

In [None]:
class EXP_fusion(nn.Module):
    def __init__(self, batchsize = batch_size, audio_ft = auft, hidden_size = [512, 128, batch_size]):
        super(EXP_fusion, self).__init__()
        self.batchsize = batchsize
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.hidden_size = hidden_size
        self.feat_fc = nn.Conv1d(self.concat_dim, hidden_size[0], 1, padding=0)
        self.activ = nn.LeakyReLU(0.1)
        self.dropout = nn.Dropout(p=0.3)
        self.conv1 = nn.Conv1d(hidden_size[0], hidden_size[1], 1, padding=0)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size[1], nhead=4, dim_feedforward=hidden_size[1], dropout=0.3)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
        self.head = nn.Sequential(
                nn.Linear(hidden_size[1], hidden_size[2]),
                nn.BatchNorm1d(hidden_size[2]),
                nn.Dropout(p=0.3),
                nn.Linear(hidden_size[2], 8))

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs,dim=1)
        feat = torch.transpose(feat,0,1)
        feat = self.feat_fc(feat)
        feat = self.activ(feat)
        out = self.conv1(feat)
        out = torch.transpose(out,0,1)
        out = self.transformer_encoder(out)
        out = self.head(out)

        return out, torch.softmax(out, dim = 1)

In [None]:
EXP_model = EXP_fusion().to(device)
EXP_model

EXP_fusion(
  (feat_fc): Conv1d(1536, 512, kernel_size=(1,), stride=(1,))
  (activ): LeakyReLU(negative_slope=0.1)
  (dropout): Dropout(p=0.3, inplace=False)
  (conv1): Conv1d(512, 128, kernel_size=(1,), stride=(1,))
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (head): Sequential(
    (0): Linear(in_features=128, out_feature

#### MLP

In [21]:
class MLPModel(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=8):
        super(MLPModel, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.activ = nn.ReLU()
        self.fc1 = nn.Linear(self.concat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs, dim=1)
        feat = self.fc1(feat)
        feat = self.activ(feat)
        out = self.fc2(feat)
        return out, torch.softmax(out, dim=1)

In [22]:
mlp_model = MLPModel().to(device)
mlp_model

MLPModel(
  (activ): ReLU()
  (fc1): Linear(in_features=2176, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=8, bias=True)
)

#### Utils

In [23]:
weights = [0.4260, 4.1334, 5.9334, 6.8653, 0.7257, 0.9578, 2.0518, 0.4572]
weights = torch.tensor(weights).to(device)

In [24]:
def one_hot_transfer(label, class_num):
    one_hot = torch.eye(class_num)
    one_hot = one_hot.to(device)
    return one_hot[label]

### Testing

In [None]:
def evaluate_model(model, data_loader, au_feat, weight):
    model.eval()
    total_loss = []
    all_preds = []
    all_targets = []
    with torch.no_grad():
        iterator = iter(data_loader)
        for EXPR in iterator:
            if au_feat == 'nope':
                vis_feat, y = EXPR[visual_feat], EXPR['label']
                vis_feat, y = vis_feat.to(device), y.to(device)
                aud_feat = None
            else:
                vis_feat, aud_feat, y = EXPR[visual_feat], EXPR[au_feat], EXPR['label']
                vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
            y_onehot = one_hot_transfer(y, 8).to(device)
            pred, exp_pred = model(vis_feat, aud_feat)
            loss = compute_EXP_loss(pred, y_onehot, weight)
            total_loss.append(loss.item())
            all_preds.extend(exp_pred.cpu().tolist())
            all_targets.extend(y_onehot.cpu().tolist())

    f1_scores, acc = compute_EXP_F1(all_preds, all_targets)
    return round(np.mean(total_loss),3), round(f1_scores,3), round(acc,3)

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
print('EXP_model')
print(visual_feat + ' & ' + auft)
EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_model/best_EXPR_fusion_{viau}.pth'))
EXP_model = EXP_fusion().to(device)
EXP_model.load_state_dict(EXP_best_model)
test_loss, f1s, acc = evaluate_model(EXP_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

EXP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score 0.33, accuracy: 0.457
34.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_model/best_EXPR_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model, strict=False)
val_loss, f1s, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score 0.394, accuracy: 0.488
12.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
print('EXP_model')
print(visual_feat + ' & ' + auft)
EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_model/best_EXPR_fusion_{viau}.pth'))
EXP_model = EXP_fusion().to(device)
EXP_model.load_state_dict(EXP_best_model)
test_loss, f1s, acc = evaluate_model(EXP_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

EXP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score 0.316, accuracy: 0.475
33 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_model/best_EXPR_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model, strict=False)
val_loss, f1s, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score 0.379, accuracy: 0.498
11.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet

In [None]:
%%timeit -n 1 -r 1
print('EXP_model')
print(visual_feat + ' & ' + auft)
EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_model/best_EXPR_fusion_{viau}.pth'))
EXP_model = EXP_fusion().to(device)
EXP_model.load_state_dict(EXP_best_model)
test_loss, f1s, acc = evaluate_model(EXP_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

EXP_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score 0.31, accuracy: 0.447
29.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_model/best_EXPR_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model, strict=False)
val_loss, f1s, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

MLP_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score 0.327, accuracy: 0.431
8.93 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Smoothing

In [25]:
tsk = task[0]
vis = vis_typ[0]
viau = vis_aud[0]
auft = audio_feat[0]

In [26]:
class MLPModel(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=8):
        super(MLPModel, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.activ = nn.ReLU()
        self.fc1 = nn.Linear(self.concat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, vis_feat, aud_feat):
        inputs = [vis_feat]
        inputs.append(aud_feat)
        feat = torch.cat(inputs, dim=0)
        feat = self.fc1(feat)
        feat = self.activ(feat)
        out = self.fc2(feat)
        return out, torch.softmax(out, dim=0)

In [27]:
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_model/best_EXPR_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)

<All keys matched successfully>

In [28]:
anno_path = os.path.join(root,f'data/Annotations/{tsk}/{split[1]}')
with open(os.path.join(root, f'data/Annotations/{tsk}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
with open(os.path.join(root,f'models/ABAW6/{vis}/EXPR/{tsk}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data = pickle.load(f)

In [29]:
test_vid = {}
for vname in tqdm(vidnames):
        img, predict, label = [], [], []
        for imgname, val in sorted(data[tsk][vname].items()):
            vis_feat = torch.tensor(val[visual_feat]).to(device)
            if auft == 'nope':
                aud_feat = None
            else:
                aud_feat = torch.tensor(val[auft]).to(device)
            if tsk == task[2]:
                _, _, vpred, apred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor([vpred, apred])
            else:
                _, pred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor(pred)
            ind = int(imgname.split('/')[1][:-4])
            img.append(ind)
            predict.append(preds)
            label.append(data[tsk][vname][imgname]['label'])
        test_vid[vname] = (img, predict, label)

100%|██████████| 70/70 [01:22<00:00,  1.18s/it]


In [32]:
hyperparams=[(isMean,delta) for delta in [0, 5, 15, 30, 60, 100, 200] for isMean in [1,0] if not (isMean==0 and delta==0)]
total_true=[]
total_preds=[[] for _ in range(len(hyperparams))]
timing_results = {(isMean, delta): 0 for isMean, delta in hyperparams}
for videoname,(img, predict, label) in test_vid.items():
    for i,ind in enumerate(img):
        total_true.append(label[i].cpu().numpy())
    preds_proba = smooth_prediction(img, predict)
    for hInd,(isMean,delta) in enumerate(hyperparams):
        preds=[]
        start = time.time()
        for i in range(len(preds_proba)):
            i1=max(i-delta,0)
            if isMean:
                best_ind, proba = slide_window(preds_proba, i, delta, 'mean')
            else:
                best_ind, proba = slide_window(preds_proba, i, delta, 'median')
            preds.append(best_ind)
        for i,ind in enumerate(img):
            if label[i]>=0:
                total_preds[hInd].append(preds[ind-1])
        end = time.time()
        timing_results[(isMean, delta)] += end - start
total_true=np.array(total_true)

In [33]:
for hInd, (isMean, delta) in enumerate(hyperparams):
    preds = np.array(total_preds[hInd])
    accuracy = round((preds == total_true).mean(), 3)
    f1 = round(f1_score(y_true=total_true, y_pred=preds, average='macro'), 3)
    mean_or_median = 'mean' if isMean else 'median'
    time_taken = round(timing_results[(isMean, delta)],3)
    print(f'{mean_or_median}; delta: {delta}; Acc: {accuracy}; F1: {f1}; Time: {time_taken}s')

mean; delta: 0; Acc: 0.488; F1: 0.394; Time: 7.318s
mean; delta: 5; Acc: 0.512; F1: 0.418; Time: 7.51s
median; delta: 5; Acc: 0.509; F1: 0.417; Time: 16.631s
mean; delta: 15; Acc: 0.524; F1: 0.429; Time: 7.611s
median; delta: 15; Acc: 0.523; F1: 0.43; Time: 17.372s
mean; delta: 30; Acc: 0.533; F1: 0.44; Time: 8.184s
median; delta: 30; Acc: 0.533; F1: 0.442; Time: 18.457s
mean; delta: 60; Acc: 0.537; F1: 0.443; Time: 7.855s
median; delta: 60; Acc: 0.538; F1: 0.449; Time: 20.305s
mean; delta: 100; Acc: 0.539; F1: 0.448; Time: 8.482s
median; delta: 100; Acc: 0.541; F1: 0.454; Time: 22.453s
mean; delta: 200; Acc: 0.538; F1: 0.437; Time: 10.033s
median; delta: 200; Acc: 0.544; F1: 0.457; Time: 27.948s


In [None]:
for hInd, (isMean, delta) in enumerate(hyperparams):
    preds = np.array(total_preds[hInd])
    accuracy = round((preds == total_true).mean(), 3)
    f1 = round(f1_score(y_true=total_true, y_pred=preds, average='macro'), 3)
    mean_or_median = 'mean' if isMean else 'median'
    time_taken = round(timing_results[(isMean, delta)],3)
    print(f'{mean_or_median}; delta: {delta}; Acc: {accuracy}; F1: {f1}; Time: {time_taken}s')

mean; delta: 0; Acc: 0.488; F1: 0.394; Time: 9.554s
mean; delta: 15; Acc: 0.524; F1: 0.429; Time: 9.97s
median; delta: 15; Acc: 0.523; F1: 0.43; Time: 20.216s
mean; delta: 30; Acc: 0.533; F1: 0.44; Time: 9.814s
median; delta: 30; Acc: 0.533; F1: 0.442; Time: 20.944s
mean; delta: 60; Acc: 0.537; F1: 0.443; Time: 9.286s
median; delta: 60; Acc: 0.538; F1: 0.449; Time: 22.854s
mean; delta: 100; Acc: 0.539; F1: 0.448; Time: 9.671s
median; delta: 100; Acc: 0.541; F1: 0.454; Time: 25.641s
mean; delta: 200; Acc: 0.538; F1: 0.437; Time: 11.106s
median; delta: 200; Acc: 0.544; F1: 0.457; Time: 31.662s


### Adaptive Frame Rate

In [None]:
delta = 200

In [None]:
anno_path = os.path.join(root,f'data/Annotations/{tsk}/{split[0]}')
with open(os.path.join(root, f'data/Annotations/{tsk}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
with open(os.path.join(root,f'models/ABAW6/{vis}/EXPR/{tsk}_{typ[0]}_{viau}.pkl'), 'rb') as f:
    data = pickle.load(f)
train_vid = {}
for vname in tqdm(vidnames):
        img, predict, label = [], [], []
        for imgname, val in sorted(data[tsk][vname].items()):
            vis_feat = torch.tensor(val[visual_feat]).to(device)
            if auft == 'nope':
                aud_feat = None
            else:
                aud_feat = torch.tensor(val[auft]).to(device)
            if tsk == task[2]:
                _, _, vpred, apred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor([vpred, apred])
            else:
                _, pred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor(pred)
            ind = int(imgname.split('/')[1][:-4])
            img.append(ind)
            predict.append(preds)
            label.append(data[tsk][vname][imgname]['label'])
        train_vid[vname] = (img, predict, label)

100%|██████████| 199/199 [02:23<00:00,  1.39it/s]


In [None]:
stride2scores={}
for stride in [200, 100, 50, 25, 10]:
    total_true, predictions, max_decision_values = [],[],[]
    for vidname, (img, predict, label) in train_vid.items():
        index = []
        for i,ind in enumerate(img):
            total_true.append(label[i].cpu().numpy())
            index.append(ind-1)
        preds_proba = smooth_prediction(img, predict)
        for i in range(len(index)):
            best_ind, proba = slide_window(preds_proba, index[i], delta, stride)
            predictions.append(best_ind)
            max_decision_values.append(proba[best_ind])
    stride2scores[stride] = (np.array(total_true),np.array(predictions),np.array(max_decision_values))

In [None]:
def get_threshold(stride,fpr_corrected):
    (total_true,predictions,max_decision_values) = stride2scores[stride]
    mistakes = max_decision_values[predictions != total_true]
    best_threshold = -1
    for i, threshold in enumerate(sorted(max_decision_values[predictions == total_true])[::-1]):
        tpr = i/len(predictions)
        fpr = (mistakes > threshold).sum()/len(predictions)
        if fpr > fpr_corrected:
            if best_threshold == -1:
                best_threshold = threshold
            print(stride, 'best_threshold', best_threshold, i)
            break
        best_threshold = threshold
    return best_threshold

In [None]:
stride2threshold = {}
for stride in stride2scores:
    fpr_corrected=0.05
    stride2threshold[stride] = get_threshold(stride,fpr_corrected)
stride2threshold[1] = 0
print(stride2threshold)

200 best_threshold 0.5776766 230724
100 best_threshold 0.5594592 241835
50 best_threshold 0.54468054 251611
25 best_threshold 0.538801 255266
10 best_threshold 0.53340304 259316
{200: 0.5776766, 100: 0.5594592, 50: 0.54468054, 25: 0.538801, 10: 0.53340304, 1: 0}


In [None]:
all_strides=[
    [200, 100, 50, 10, 1],
    [50, 25, 1],
    [50, 10, 1],
    [200,50,1],
    [100,50,1],
    [200,1],
    [100,1],
    [50,1]
]
for s in stride2threshold.keys():
    all_strides.append([s])

for strides in all_strides:
    print(strides)
    last_stride=strides[-1]

    total_true=[]
    total_preds=[]
    total_frames_processed,total_frames=0,0
    time_each = []
    start = time.time()
    for videoname, (img, predict, label) in test_vid.items():
        emotional_img=[]
        start1 = time.time()
        for i,ind in enumerate(img):
            total_true.append(label[i].cpu().numpy())
            emotional_img.append(ind-1)
        cur_ind=0
        preds_proba=[]
        for i in range(img[-1]):
            if img[cur_ind]-1==i:
                preds_proba.append(predict[cur_ind])
                cur_ind+=1
            else:
                if cur_ind==0:
                    preds_proba.append(predict[cur_ind])
                else:
                    w=(i-img[cur_ind-1]+1)/(img[cur_ind]-img[cur_ind-1])
                    pred=w*predict[cur_ind-1]+(1-w)*predict[cur_ind]
                    preds_proba.append(pred)

        preds_proba=np.array([p.cpu().numpy() for p in preds_proba])

        preds=-np.ones(len(emotional_img))
        end1 = time.time()
        time_each.append(end1 - start1)
        for stride in strides:
            threshold=stride2threshold[stride]
            for i in range(len(emotional_img)):
                if preds[i]<0:
                    i1=max(emotional_img[i]-delta,0)
                    cur_preds=preds_proba[i1:emotional_img[i]+delta+1:stride]
                    proba=np.median(cur_preds,axis=0)
                    best_ind=np.argmax(proba)
                    if proba[best_ind]>=threshold or stride==last_stride:
                        total_frames_processed+=len(cur_preds)
                        total_frames+=len(preds_proba[i1:emotional_img[i]+delta+1])
                        preds[i]=best_ind
        for p in preds:
            total_preds.append(p)
    end = time.time()
    elapsed_time = end - start - sum(time_each)
    total_true=np.array(total_true)
    preds=np.array(total_preds)
    print('Acc:',round((preds==total_true).mean(),3), 'F1:',round(f1_score(y_true=total_true,y_pred=preds, average="macro"),3))
    print(total_frames_processed,total_frames,round(total_frames_processed/total_frames,3))
    print(f"Time: {elapsed_time:.2f} seconds")

[200, 100, 50, 10, 1]
Acc: 0.543 F1: 0.454
66718832 109811197 0.608
Time: 50.22 seconds
[50, 25, 1]
Acc: 0.544 F1: 0.457
75888291 109811197 0.691
Time: 35.62 seconds
[50, 10, 1]
Acc: 0.544 F1: 0.457
74809429 109811197 0.681
Time: 35.97 seconds
[200, 50, 1]
Acc: 0.543 F1: 0.455
72017266 109811197 0.656
Time: 34.61 seconds
[100, 50, 1]
Acc: 0.544 F1: 0.456
75050361 109811197 0.683
Time: 35.59 seconds
[200, 1]
Acc: 0.543 F1: 0.455
81565887 109811197 0.743
Time: 29.01 seconds
[100, 1]
Acc: 0.544 F1: 0.456
80951438 109811197 0.737
Time: 28.60 seconds
[50, 1]
Acc: 0.544 F1: 0.457
79911794 109811197 0.728
Time: 28.14 seconds
[200]
Acc: 0.498 F1: 0.411
816872 109811197 0.007
Time: 13.12 seconds
[100]
Acc: 0.514 F1: 0.429
1364594 109811197 0.012
Time: 13.06 seconds
[50]
Acc: 0.527 F1: 0.443
2459930 109811197 0.022
Time: 13.21 seconds
[25]
Acc: 0.536 F1: 0.45
4650733 109811197 0.042
Time: 13.65 seconds
[10]
Acc: 0.541 F1: 0.455
11223222 109811197 0.102
Time: 13.91 seconds
[1]
Acc: 0.544 F1: 0.45

## AU Detection Challenge

### Loading data

In [None]:
# Cropped_aligned images
vis = vis_typ[0]
vis1 = vis_typ[2]

#### Effnet + wav2vec2

In [None]:
auft = audio_feat[0]
viau = vis_aud[0]

#### Effnet + vggish

In [None]:
auft = audio_feat[1]
viau = vis_aud[1]

#### Effnet

In [None]:
auft = audio_feat[2]
viau = vis_aud[2]

#### Test

##### EffNet_B2

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/AU/{task[1]}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data3 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[1]
with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data3[task1][vname])
    for img in data3[task1][vname].keys():
        iname.append(img)

100%|██████████| 105/105 [00:00<00:00, 1203.40it/s]


In [None]:
dataset = ABAW_dataset1(data3, iname, dims, task1)
test_loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=False)

##### EffNet_B0

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis1}/AU/{task[1]}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data3 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[1]
with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data3[task1][vname])
    for img in data3[task1][vname].keys():
        iname.append(img)

In [None]:
dataset = ABAW_dataset1(data3, iname, dims, task1)
test_loader_b0 = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=False)

### Modeling

In [None]:
class AU_fusion(nn.Module):
    def __init__(self, batchsize = batch_size, audio_ft = auft, hidden_size = [512, 128, batch_size]):
        super(AU_fusion, self).__init__()
        self.batchsize = batchsize
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.hidden_size = hidden_size
        self.feat_fc = nn.Conv1d(self.concat_dim, hidden_size[0], 1, padding=0)
        self.activ = nn.LeakyReLU(0.1)
        self.dropout = nn.Dropout(p=0.3)
        self.conv1 = nn.Conv1d(hidden_size[0], hidden_size[1], 1, padding=0)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size[1], nhead=4, dim_feedforward=hidden_size[1], dropout=0.3)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
        self.head = nn.Sequential(
                nn.Linear(hidden_size[1], hidden_size[2]),
                nn.BatchNorm1d(hidden_size[2]),
                nn.Linear(hidden_size[2], 12))

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs,dim=1)
        feat = torch.transpose(feat,0,1)
        feat = self.feat_fc(feat)
        feat = self.activ(feat)
        out = self.conv1(feat)
        out = torch.transpose(out,0,1)
        out = self.transformer_encoder(out)
        out = self.head(out)

        return out, torch.sigmoid(out)

In [None]:
AU_model = AU_fusion().to(device)
AU_model

AU_fusion(
  (feat_fc): Conv1d(1408, 512, kernel_size=(1,), stride=(1,))
  (activ): LeakyReLU(negative_slope=0.1)
  (dropout): Dropout(p=0.3, inplace=False)
  (conv1): Conv1d(512, 128, kernel_size=(1,), stride=(1,))
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (head): Sequential(
    (0): Linear(in_features=128, out_features

In [None]:
class MLPModel(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=12):
        super(MLPModel, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.activ = nn.ReLU()
        self.fc1 = nn.Linear(self.concat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs, dim=1)
        feat = self.fc1(feat)
        feat = self.activ(feat)
        out = self.fc2(feat)
        return out, torch.sigmoid(out)

In [None]:
class MLPModel_b0(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=12):
        super(MLPModel_b0, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2048 #1280+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1408 #1280+128
        elif audio_ft == 'nope':
            self.concat_dim = 1280 #1280    #visual only
        self.activ = nn.ReLU()
        self.fc1 = nn.Linear(self.concat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs, dim=1)
        feat = self.fc1(feat)
        feat = self.activ(feat)
        out = self.fc2(feat)
        return out, torch.sigmoid(out)

In [None]:
mlp_model = MLPModel().to(device)
mlp_model

MLPModel(
  (activ): ReLU()
  (fc1): Linear(in_features=1536, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=12, bias=True)
)

#### Utils

In [None]:
weights = np.array([[ 0.57226017, 3.95972043],
 [ 0.52682923,  9.81819492],
 [ 0.59477327,  3.13787445],
 [ 0.67850442,  1.90052551],
 [ 0.82920966,  1.25939448],
 [ 0.75993937,  1.46176273],
 [ 0.65729582,  2.08936204],
 [ 0.51501993, 17.14454686],
 [ 0.5158745,  16.24852424],
 [ 0.51460124, 17.62183135],
 [ 1.33900046,  0.79797362],
 [ 0.54113187,  6.57801166]])

In [None]:
weights1 = torch.tensor([0.54733899, 0.44180561, 0.56990565, 0.61997328, 0.73956417,0.74692377, 0.72684634, 0.33222808, 0.17383676, 0.20608964, 0.83688068, 0.33890931]).to(device)

### Testing

In [None]:
def evaluate_model(model, data_loader, vi_feat, au_feat, weight):
    model.eval()
    total_loss = []
    all_preds = []
    all_targets = []
    with torch.no_grad():
        iterator = iter(data_loader)
        for AU in iterator:
            if au_feat == 'nope':
                vis_feat, y = AU[vi_feat], AU['label']
                vis_feat, y = vis_feat.to(device), y.to(device)
                aud_feat = None
            else:
                vis_feat, aud_feat, y = AU[vi_feat], AU[au_feat], AU['label']
                vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
            pred, au_pred = model(vis_feat, aud_feat)
            loss = compute_AU_loss(pred, y, weight)
            total_loss.append(loss.item())
            all_preds.extend(au_pred.cpu().tolist())
            all_targets.extend(y.cpu().tolist())

    f1_scores, f1_thresh, acc, threshold = compute_AU_F1(all_preds, all_targets)
    return round(np.mean(total_loss),3), round(f1_scores,3), round(f1_thresh,3), acc, threshold

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
print('AU_model')
print(visual_feat + ' & ' + auft)
AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_model/best_AU_fusion_{viau}_loss.pth'))
AU_model = AU_fusion().to(device)
AU_model.load_state_dict(AU_best_model)
test_loss, f1s, f1t, acc, threshold = evaluate_model(AU_model, test_loader, auft, weights1)
print(f'Test set: f1_score: {f1s}, accuracy: {acc}, f1_threshold: {threshold}')

AU_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score: 0.497, accuracy: [0.86, 0.889, 0.857, 0.79, 0.764, 0.8, 0.858, 0.901, 0.938, 0.932, 0.734, 0.803], f1_threshold: [0.2, 0.1, 0.30000000000000004, 0.30000000000000004, 0.5, 0.4, 0.4, 0.1, 0.1, 0.1, 0.4, 0.1]
1min 29s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_model/best_AU_mlp_{viau}.pth'))
mlp_model = MLPModel(num_classes = 12).to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, f1s, f1t, acc, threshold = evaluate_model(mlp_model, test_loader, visual_feat, auft, weights)
print(f'Test set: f1_score: {f1s}, accuracy: {acc}, f1_threshold: {threshold}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score: 0.513, accuracy: [0.875, 0.913, 0.858, 0.813, 0.757, 0.807, 0.863, 0.879, 0.954, 0.926, 0.746, 0.849], f1_threshold: [0.7000000000000001, 0.8, 0.7000000000000001, 0.6, 0.5, 0.6, 0.7000000000000001, 0.6, 0.8, 0.8, 0.30000000000000004, 0.7000000000000001]
50.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
print('AU_model')
print(visual_feat + ' & ' + auft)
AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
AU_model = AU_fusion().to(device)
AU_model.load_state_dict(AU_best_model)
test_loss, f1s, f1t, acc, threshold = evaluate_model(AU_model, test_loader, auft, weights1)
print(f'Test set: f1_score: {f1s}, accuracy: {acc}, f1_threshold: {threshold}')

AU_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score: 0.494, accuracy: [0.886, 0.914, 0.86, 0.79, 0.772, 0.805, 0.867, 0.907, 0.925, 0.935, 0.76, 0.88], f1_threshold: [0.30000000000000004, 0.2, 0.30000000000000004, 0.4, 0.5, 0.4, 0.5, 0.1, 0.1, 0.2, 0.5, 0.2]
1min 26s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_model/best_AU_mlp_{viau}.pth'))
mlp_model = MLPModel(num_classes = 12).to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, f1s, f1t, acc, threshold = evaluate_model(mlp_model, test_loader, visual_feat, auft, weights)
print(f'Test set: f1_score: {f1s}, accuracy: {acc}, f1_threshold: {threshold}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score: 0.52, accuracy: [0.875, 0.913, 0.849, 0.809, 0.773, 0.8, 0.865, 0.897, 0.932, 0.922, 0.759, 0.849], f1_threshold: [0.7000000000000001, 0.8, 0.6, 0.6, 0.5, 0.5, 0.7000000000000001, 0.7000000000000001, 0.7000000000000001, 0.8, 0.30000000000000004, 0.7000000000000001]
48.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet

In [None]:
%%timeit -n 1 -r 1
print('AU_model')
print(visual_feat + ' & ' + auft)
AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_model/best_AU_fusion_{viau}.pth'))
AU_model = AU_fusion().to(device)
AU_model.load_state_dict(AU_best_model)
test_loss, f1s, f1t, acc, threshold = evaluate_model(AU_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, accuracy: {acc}, f1_threshold: {threshold}')

AU_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score: 0.509, accuracy: [0.868, 0.911, 0.854, 0.794, 0.765, 0.793, 0.853, 0.905, 0.943, 0.951, 0.742, 0.838], f1_threshold: [0.7000000000000001, 0.8, 0.5, 0.5, 0.5, 0.5, 0.6, 0.5, 0.7000000000000001, 0.8, 0.30000000000000004, 0.6]
1min 28s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}_f1s.pth'))
mlp_model = MLPModel(num_classes = 12).to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, f1s, f1t, acc, threshold = evaluate_model(mlp_model, test_loader, visual_feat, auft, weights)
print(f'Test set: f1_score: {f1s}, accuracy: {acc}, f1_threshold: {threshold}')

MLP_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score: 0.521, accuracy: [0.878, 0.916, 0.864, 0.791, 0.772, 0.795, 0.86, 0.865, 0.933, 0.919, 0.757, 0.846], f1_threshold: [0.7000000000000001, 0.8, 0.7000000000000001, 0.5, 0.5, 0.5, 0.6, 0.6, 0.7000000000000001, 0.8, 0.30000000000000004, 0.7000000000000001]
52.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Ensembling

#### EffNet_B2 + Wav2Vec2

In [None]:
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_model/best_AU_mlp_{viau}.pth'))
mlp_model_wav = MLPModel().to(device)
mlp_model_wav.load_state_dict(mlp_best_model)

<All keys matched successfully>

In [None]:
iterator = iter(test_loader)
all_preds_wav, all_targets_wav = [], []
img = []
for AU in iterator:
    frame = AU['frame']
    for imgname in frame:
        img.append(imgname)
    vis_feat, aud_feat, y = AU[visual_feat], AU[auft], AU['label']
    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
    pred, au_pred = mlp_model_wav(vis_feat, aud_feat)
    all_preds_wav.extend(au_pred.cpu().tolist())
    all_targets_wav.extend(y.cpu().tolist())

#### EffNet_B2 + Vggish

In [None]:
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_model/best_AU_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)

<All keys matched successfully>

In [None]:
iterator = iter(test_loader)
all_preds, all_targets = [], []
img1 = []
for AU in iterator:
    frame = AU['frame']
    for imgname in frame:
        img1.append(imgname)
    vis_feat, aud_feat, y = AU[visual_feat], AU[auft], AU['label']
    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
    pred, au_pred = mlp_model(vis_feat, aud_feat)
    all_preds.extend(au_pred.cpu().tolist())
    all_targets.extend(y.cpu().tolist())

#### EffNet_B0 + Vggish

In [None]:
mlp_best_model_b0 = torch.load(os.path.join(root,f'models/ABAW6/{vis1}/best_AU_mlp_{viau}_acc.pth'))
mlp_model_b0 = MLPModel_b0(num_classes = 12).to(device)
mlp_model_b0.load_state_dict(mlp_best_model_b0)

<All keys matched successfully>

In [None]:
iterator = iter(test_loader_b0)
all_preds_b0, all_targets_b0 = [], []
img = []
for AU in iterator:
    frame = AU['frame']
    for imgname in frame:
        img.append(imgname)
    vis_feat, aud_feat, y = AU[visual_feat_1], AU[auft], AU['label']
    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
    pred, au_pred = mlp_model_b0(vis_feat, aud_feat)
    all_preds_b0.extend(au_pred.cpu().tolist())
    all_targets_b0.extend(y.cpu().tolist())

#### Ensemble

##### EffNet_B2 + Wav2Vec2 + Vggish

In [None]:
zipped_data = zip(img1, all_preds, all_targets)
zipped_data_wav = zip(img, all_preds_wav, all_targets_wav)

sorted_data = sorted(zipped_data)
sorted_data_wav = sorted(zipped_data_wav)

re_img_wav, re_all_preds_wav, re_all_targets_wav = zip(*sorted_data_wav)
re_img, re_all_preds, re_all_targets = zip(*sorted_data)

In [None]:
f1s, _, _, thresholds = compute_AU_F1(re_all_preds, re_all_targets_wav)
round(f1s,3), thresholds

(0.52,
 [0.7000000000000001,
  0.8,
  0.6,
  0.6,
  0.5,
  0.5,
  0.7000000000000001,
  0.7000000000000001,
  0.7000000000000001,
  0.8,
  0.30000000000000004,
  0.7000000000000001])

In [None]:
for w in np.linspace(0, 1, 11):
    print(f'{w = }')
    w = np.array([w])
    y_ensemble = w * re_all_preds + (1 - w) * re_all_preds_wav
    new_pred = (y_ensemble >= thresholds).astype(int)
    f1_scores, f1_thresh, acc, threshold = compute_AU_F1(re_all_targets, new_pred,thresh=np.arange(0.1,1,0.1))
    print(f'f1 score: {f1_scores:.3f}, accuracy: {acc}')

w = 0.0
f1 score: 0.512, accuracy: [0.875, 0.913, 0.839, 0.813, 0.757, 0.787, 0.863, 0.913, 0.922, 0.926, 0.746, 0.849]
w = 0.1
f1 score: 0.515, accuracy: [0.877, 0.914, 0.843, 0.814, 0.759, 0.79, 0.865, 0.914, 0.926, 0.929, 0.747, 0.851]
w = 0.2
f1 score: 0.517, accuracy: [0.878, 0.915, 0.846, 0.815, 0.762, 0.792, 0.865, 0.915, 0.929, 0.931, 0.749, 0.853]
w = 0.30000000000000004
f1 score: 0.520, accuracy: [0.88, 0.916, 0.848, 0.816, 0.764, 0.794, 0.866, 0.915, 0.932, 0.931, 0.751, 0.854]
w = 0.4
f1 score: 0.521, accuracy: [0.88, 0.917, 0.85, 0.816, 0.766, 0.797, 0.866, 0.915, 0.934, 0.932, 0.752, 0.855]
w = 0.5
f1 score: 0.522, accuracy: [0.88, 0.917, 0.851, 0.816, 0.767, 0.798, 0.866, 0.914, 0.935, 0.931, 0.754, 0.855]
w = 0.6000000000000001
f1 score: 0.523, accuracy: [0.88, 0.917, 0.852, 0.815, 0.769, 0.799, 0.867, 0.912, 0.936, 0.93, 0.755, 0.855]
w = 0.7000000000000001
f1 score: 0.523, accuracy: [0.88, 0.916, 0.852, 0.814, 0.77, 0.8, 0.866, 0.909, 0.936, 0.929, 0.756, 0.854]
w = 0

In [None]:
weight=np.array([0.6])
y_ensemble = weight * re_all_preds + (1 - weight) * re_all_preds_wav
new_pred = ((y_ensemble >= thresholds) * 1)
f1_scores, f1_thresh, acc, threshold = compute_AU_F1(re_all_targets, new_pred,thresh=np.arange(0.1,1,0.1))
print(f'f1 score: {f1_scores:.3f}, accuracy: {acc}, threshold: {threshold}')

f1 score: 0.523, accuracy: [0.88, 0.917, 0.852, 0.815, 0.769, 0.799, 0.867, 0.912, 0.936, 0.93, 0.755, 0.855], threshold: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]


##### EffNet_B2 + EffNet_B0 + Vggish

In [None]:
zipped_data = zip(img1, all_preds, all_targets)
zipped_data_b0 = zip(img, all_preds_b0, all_targets_b0)

sorted_data = sorted(zipped_data)
sorted_data_b0 = sorted(zipped_data_b0)

re_img_b0, re_all_preds_b0, re_all_targets_b0 = zip(*sorted_data_b0)
re_img, re_all_preds, re_all_targets = zip(*sorted_data)

In [None]:
f1s, _, _, thresholds = compute_AU_F1(re_all_preds_b0, re_all_targets)
round(f1s,3), thresholds

(0.52,
 [0.8,
  0.9,
  0.6,
  0.5,
  0.5,
  0.6,
  0.7000000000000001,
  0.7000000000000001,
  0.7000000000000001,
  0.8,
  0.30000000000000004,
  0.7000000000000001])

In [None]:
for w in np.linspace(0, 1, 11):
    print(f'{w = }')
    w = np.array([w])
    y_ensemble = w * re_all_preds + (1 - w) * re_all_preds_b0
    new_pred = (y_ensemble >= thresholds).astype(int)
    f1_scores, f1_thresh, acc, threshold = compute_AU_F1(re_all_targets, new_pred,thresh=np.arange(0.1,1,0.1))
    print(f'f1 score: {f1_scores:.3f}, accuracy: {acc}')

w = 0.0
f1 score: 0.520, accuracy: [0.877, 0.915, 0.853, 0.784, 0.78, 0.807, 0.868, 0.932, 0.948, 0.933, 0.777, 0.849], threshold: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
w = 0.1
f1 score: 0.524, accuracy: [0.883, 0.92, 0.857, 0.787, 0.783, 0.811, 0.87, 0.934, 0.951, 0.935, 0.778, 0.853], threshold: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
w = 0.2
f1 score: 0.528, accuracy: [0.888, 0.923, 0.86, 0.79, 0.786, 0.815, 0.873, 0.936, 0.954, 0.936, 0.779, 0.857], threshold: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
w = 0.30000000000000004
f1 score: 0.531, accuracy: [0.892, 0.925, 0.864, 0.794, 0.788, 0.818, 0.875, 0.938, 0.956, 0.937, 0.778, 0.86], threshold: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
w = 0.4
f1 score: 0.532, accuracy: [0.894, 0.926, 0.867, 0.797, 0.79, 0.821, 0.876, 0.938, 0.958, 0.937, 0.778, 0.863], threshold: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
w = 0.5
f1 score: 0.533, a

In [None]:
weight=np.array([0.5])
y_ensemble = weight * re_all_preds + (1 - weight) * re_all_preds_b0
new_pred = ((y_ensemble >= thresholds) * 1)
f1_scores, f1_thresh, acc, threshold = compute_AU_F1(re_all_targets, new_pred,thresh=np.arange(0.1,1,0.1))
print(f'f1 score: {f1_scores:.3f}, accuracy: {acc}, threshold: {threshold}')

f1 score: 0.533, accuracy: [0.895, 0.926, 0.868, 0.8, 0.791, 0.823, 0.876, 0.938, 0.96, 0.937, 0.777, 0.864], threshold: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]


### Smoothing

In [None]:
tsk = task[1]
vis = vis_typ[0]
viau = vis_aud[1]
auft = audio_feat[1]

In [None]:
index, pred, lab = [], [], []
test_vid = {}
for i, val in enumerate(re_img):
    ind = int(val.split('/')[1][:-4])
    vname = val.split('/')[0]
    if i == 0:
        prename = vname
        index.append(ind)
        pred.append(new_pred[i])
        lab.append(re_all_targets[i])
    else:
        if vname == prename:
            index.append(ind)
            pred.append(new_pred[i])
            lab.append(re_all_targets[i])
        else:
            combined = list(zip(index, pred, lab))
            combined_sorted = sorted(combined, key=lambda x: x[0])
            index_list_sorted, pred_list_sorted, lab_list_sorted = zip(*combined_sorted)
            test_vid[prename] = (list(index_list_sorted), list(pred_list_sorted), list(lab_list_sorted))
            prename = vname
            index, pred, lab = [], [], []
            index.append(ind)
            pred.append(new_pred[i])
            lab.append(re_all_targets[i])

##### EffNet_B2 + Wav2Vec2 + Vggish

In [None]:
thresholds = np.array([0.7000000000000001,
  0.8,
  0.6,
  0.6,
  0.5,
  0.5,
  0.7000000000000001,
  0.7000000000000001,
  0.7000000000000001,
  0.8,
  0.30000000000000004,
  0.7000000000000001])

In [None]:
hyperparams=[(isMean,delta) for delta in [0, 5, 15, 30, 60, 100, 200] for isMean in [1,0] if not (isMean==0 and delta==0)]
total_true=[]
total_preds=[[] for _ in range(len(hyperparams))]
timing_results = {(isMean, delta): 0 for isMean, delta in hyperparams}
for videoname,(img, predict, label) in test_vid.items():
    for i,ind in enumerate(img):
        total_true.append(label[i])
    preds_proba = smooth_prediction(img, predict)
    for hInd,(isMean,delta) in enumerate(hyperparams):
        preds=[]
        start = time.time()
        for i in range(len(preds_proba)):
            i1=max(i-delta,0)
            if isMean:
                _, proba = slide_window(preds_proba, i, delta, 'mean')
            else:
                _, proba = slide_window(preds_proba, i, delta, 'median')
            aus = (proba>=thresholds)*1
            preds.append(aus)
        for i,ind in enumerate(img):
            if label[i][0]>=-1 and label[i][1]>=-1:
                total_preds[hInd].append(preds[ind-1])
        end = time.time()
        timing_results[(isMean, delta)] += end - start
total_true=np.array(total_true)

In [None]:
for hInd, (isMean, delta) in enumerate(hyperparams):
    preds = np.array(total_preds[hInd])
    accuracy = round((preds == total_true).mean(), 3)
    f1 = round(np.mean([f1_score(y_true=total_true[:,i],y_pred=preds[:,i]) for i in range(preds.shape[1])]), 3)
    mean_or_median = 'mean' if isMean else 'median'
    time_taken = round(timing_results[(isMean, delta)],3)
    print(f'{mean_or_median}; delta: {delta}; Acc: {accuracy}; F1: {f1}; Time: {time_taken}s')

mean; delta: 0; Acc: 0.857; F1: 0.523; Time: 7.502s
mean; delta: 5; Acc: 0.869; F1: 0.525; Time: 7.965s
median; delta: 5; Acc: 0.863; F1: 0.532; Time: 16.839s
mean; delta: 15; Acc: 0.871; F1: 0.511; Time: 8.342s
median; delta: 15; Acc: 0.866; F1: 0.534; Time: 17.291s
mean; delta: 30; Acc: 0.87; F1: 0.492; Time: 8.718s
median; delta: 30; Acc: 0.867; F1: 0.528; Time: 17.746s
mean; delta: 60; Acc: 0.868; F1: 0.462; Time: 9.44s
median; delta: 60; Acc: 0.866; F1: 0.515; Time: 19.077s
mean; delta: 100; Acc: 0.864; F1: 0.431; Time: 11.532s
median; delta: 100; Acc: 0.864; F1: 0.502; Time: 21.379s
mean; delta: 200; Acc: 0.857; F1: 0.39; Time: 13.451s
median; delta: 200; Acc: 0.858; F1: 0.472; Time: 25.658s


##### EffNet_B2 + EffNet_B0 + Vggish

In [None]:
thresholds = np.array([0.8,
  0.9,
  0.6,
  0.5,
  0.5,
  0.6,
  0.7000000000000001,
  0.7000000000000001,
  0.7000000000000001,
  0.8,
  0.30000000000000004,
  0.7000000000000001])

In [None]:
hyperparams=[(isMean,delta) for delta in [0, 5, 15, 30, 60, 100, 200] for isMean in [1,0] if not (isMean==0 and delta==0)]
total_true=[]
total_preds=[[] for _ in range(len(hyperparams))]
timing_results = {(isMean, delta): 0 for isMean, delta in hyperparams}
for videoname,(img, predict, label) in test_vid.items():
    for i,ind in enumerate(img):
        total_true.append(label[i])
    preds_proba = smooth_prediction(img, predict)
    for hInd,(isMean,delta) in enumerate(hyperparams):
        preds=[]
        start = time.time()
        for i in range(len(preds_proba)):
            i1=max(i-delta,0)
            if isMean:
                _, proba = slide_window(preds_proba, i, delta, 'mean')
            else:
                _, proba = slide_window(preds_proba, i, delta, 'median')
            aus = (proba>=thresholds)*1
            preds.append(aus)
        for i,ind in enumerate(img):
            if label[i][0]>=-1 and label[i][1]>=-1:
                total_preds[hInd].append(preds[ind-1])
        end = time.time()
        timing_results[(isMean, delta)] += end - start
total_true=np.array(total_true)

In [None]:
for hInd, (isMean, delta) in enumerate(hyperparams):
    preds = np.array(total_preds[hInd])
    accuracy = round((preds == total_true).mean(), 3)
    f1 = round(np.mean([f1_score(y_true=total_true[:,i],y_pred=preds[:,i]) for i in range(preds.shape[1])]), 3)
    mean_or_median = 'mean' if isMean else 'median'
    time_taken = round(timing_results[(isMean, delta)],3)
    print(f'{mean_or_median}; delta: {delta}; Acc: {accuracy}; F1: {f1}; Time: {time_taken}s')

mean; delta: 0; Acc: 0.871; F1: 0.534; Time: 8.697s
mean; delta: 5; Acc: 0.878; F1: 0.524; Time: 9.522s
median; delta: 5; Acc: 0.875; F1: 0.54; Time: 22.173s
mean; delta: 15; Acc: 0.877; F1: 0.499; Time: 9.23s
median; delta: 15; Acc: 0.877; F1: 0.535; Time: 19.396s
mean; delta: 30; Acc: 0.876; F1: 0.472; Time: 9.172s
median; delta: 30; Acc: 0.877; F1: 0.524; Time: 19.412s
mean; delta: 60; Acc: 0.872; F1: 0.437; Time: 9.686s
median; delta: 60; Acc: 0.874; F1: 0.506; Time: 20.104s
mean; delta: 100; Acc: 0.867; F1: 0.41; Time: 10.824s
median; delta: 100; Acc: 0.871; F1: 0.489; Time: 22.654s
mean; delta: 200; Acc: 0.86; F1: 0.368; Time: 14.02s
median; delta: 200; Acc: 0.864; F1: 0.457; Time: 27.107s


## VA Estimation Challenge

### Loading data

In [None]:
# Cropped_aligned images
vis = vis_typ[0]

In [None]:
# Cropped images
vis = vis_typ[1]

#### Effnet + wav2vec2

In [None]:
auft = audio_feat[0]
viau = vis_aud[0]

#### Effnet + vggish

In [None]:
auft = audio_feat[1]
viau = vis_aud[1]

#### Effnet

In [None]:
auft = audio_feat[2]
viau = vis_aud[2]

#### Test

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/VA/{task[2]}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data3 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[2]
with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data3[task1][vname])
    for img in data3[task1][vname].keys():
        iname.append(img)

100%|██████████| 76/76 [00:00<00:00, 1175.35it/s]


In [None]:
dataset = ABAW_dataset1(data3, iname, dims, task1)
test_loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=False)

### Modeling

In [None]:
class VA_fusion(nn.Module):
    def __init__(self, batchsize = batch_size, audio_ft = auft, hidden_size = [512, 128, batch_size]):
        super(VA_fusion, self).__init__()
        self.batchsize = batchsize
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.hidden_size = hidden_size
        self.feat_fc = nn.Conv1d(self.concat_dim, hidden_size[0], 1, padding=0)
        self.activ = nn.LeakyReLU(0.1)
        self.dropout = nn.Dropout(p=0.3)
        self.conv1 = nn.Conv1d(hidden_size[0], hidden_size[1], 1, padding=0)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size[1], nhead=4, dim_feedforward=hidden_size[1], dropout=0.3)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
        self.vhead = nn.Sequential(
                nn.Linear(hidden_size[1], hidden_size[2]),
                nn.BatchNorm1d(hidden_size[2]),
                nn.Linear(hidden_size[2], 1),
                )
        self.ahead = nn.Sequential(
                nn.Linear(hidden_size[1], hidden_size[2]),
                nn.BatchNorm1d(hidden_size[2]),
                nn.Linear(hidden_size[2], 1),
                )

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs,dim=1)
        feat = torch.transpose(feat,0,1)
        feat = self.feat_fc(feat)
        feat = self.activ(feat)
        out = self.conv1(feat)
        out = torch.transpose(out,0,1)
        out = self.transformer_encoder(out)
        vout = self.vhead(out)
        aout = self.ahead(out)

        return vout, aout, torch.tanh(vout), torch.tanh(aout)

In [None]:
VA_model = VA_fusion().to(device)
VA_model

VA_fusion(
  (feat_fc): Conv1d(2176, 512, kernel_size=(1,), stride=(1,))
  (activ): LeakyReLU(negative_slope=0.1)
  (dropout): Dropout(p=0.3, inplace=False)
  (conv1): Conv1d(512, 128, kernel_size=(1,), stride=(1,))
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (vhead): Sequential(
    (0): Linear(in_features=128, out_feature

In [None]:
class MLPModel(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=1):
        super(MLPModel, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.feat_fc = nn.Conv1d(self.concat_dim, 512, 1, padding=0)
        self.vhead = nn.Sequential(
            nn.Linear(512, 128),
            nn.LeakyReLU(0.1),
            nn.BatchNorm1d(128),
            nn.Linear(128, num_classes)
        )

        self.ahead = nn.Sequential(
            nn.Linear(self.concat_dim, 128),
            nn.BatchNorm1d(128),
            nn.Linear(128, num_classes)
        )
    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs, dim=1)
        vfeat = self.feat_fc(torch.transpose(feat,0,1))
        vfeat = torch.transpose(vfeat,0,1)
        vout = self.vhead(vfeat)
        aout = self.ahead(feat)

        return vout, aout, torch.tanh(vout), torch.tanh(aout)

In [None]:
mlp_model = MLPModel().to(device)
mlp_model

MLPModel(
  (feat_fc): Conv1d(1536, 512, kernel_size=(1,), stride=(1,))
  (vhead): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.1)
    (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=128, out_features=1, bias=True)
  )
  (ahead): Sequential(
    (0): Linear(in_features=1536, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [None]:
def evaluate_model(model, data_loader, au_feat):
    model.eval()
    total_loss = []
    all_targets = []
    all_preds = []
    mse = []
    with torch.no_grad():
        iterator = iter(data_loader)
        for i in range(len(data_loader)):
            VA = next(iterator)
            if au_feat == 'nope':
                vis_feat, y = VA[visual_feat], VA['label']
                vis_feat, y = vis_feat.to(device), y.to(device)
                aud_feat = None
            else:
                vis_feat, aud_feat, y = VA[visual_feat], VA[au_feat], VA['label']
                vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
            Vpred, Apred, v_pred, a_pred = model(vis_feat, aud_feat)
            mse_loss, ccc_loss = compute_VA_loss(Vpred, Apred, y)
            total_loss.append(ccc_loss.item())
            mse.append(mse_loss.item())
            preds = torch.cat((v_pred, a_pred), dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_targets.extend(y.cpu().tolist())

    ccc1, ccc2 = compute_VA_CCC(all_preds, all_targets)
    return round(np.mean(total_loss),3), round(np.mean(mse),3), round(ccc1,3), round(ccc2,3)

### Testing

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
print('VA_model')
print(visual_feat + ' & ' + auft)
VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
VA_model = VA_fusion().to(device)
VA_model.load_state_dict(VA_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}, mean CCC: {round((ccc1+ccc2)/2,3)}')

VA_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: CCC_Valence 0.294, CCC_Arousal: 0.495, mean CCC: 0.394
58.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_model/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}, mean CCC: {round((ccc1+ccc2)/2,3)}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: CCC_Valence 0.391, CCC_Arousal: 0.527, mean CCC: 0.459
29 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
print('VA_model')
print(visual_feat + ' & ' + auft)
VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_model/best_VA_fusion_{viau}.pth'))
VA_model = VA_fusion().to(device)
VA_model.load_state_dict(VA_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}, mean CCC: {round((ccc1+ccc2)/2,3)}')

VA_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: CCC_Valence 0.293, CCC_Arousal: 0.458, mean CCC: 0.376
57.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_model/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}, mean CCC: {round((ccc1+ccc2)/2,3)}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: CCC_Valence 0.365, CCC_Arousal: 0.516, mean CCC: 0.44
28.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet

In [None]:
%%timeit -n 1 -r 1
print('VA_model')
print(visual_feat + ' & ' + auft)
VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_model/best_VA_fusion_{viau}.pth'))
VA_model = VA_fusion().to(device)
VA_model.load_state_dict(VA_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}, mean CCC: {round((ccc1+ccc2)/2,3)}')

VA_model
visualfeat_enet_b2_8_best & nope
Test set: CCC_Valence 0.27, CCC_Arousal: 0.377, mean CCC: 0.324
55.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_model/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}, mean CCC: {round((ccc1+ccc2)/2,3)}')

MLP_model
visualfeat_enet_b2_8_best & nope
Test set: CCC_Valence 0.366, CCC_Arousal: 0.464, mean CCC: 0.415
26.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Ensembling

#### EffNet_B2 + Wav2vec2

In [None]:
# Wav2vec2
vis = vis_typ[0]
auft = audio_feat[0]
viau = vis_aud[0]

In [None]:
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_model/best_VA_mlp_{viau}.pth'))
mlp_model_wav = MLPModel().to(device)
mlp_model_wav.load_state_dict(mlp_best_model)
evaluate_model(mlp_model_wav, test_loader, auft)

(1.488, 0.276, 0.391, 0.527)

In [None]:
iterator = iter(test_loader)
all_preds, all_targets = [], []
img1 = []
for i in range(len(test_loader)):
    VA = next(iterator)
    frame = VA['frame']
    for imgname in frame:
        img1.append(imgname)
    vis_feat, aud_feat, y = VA[visual_feat], VA[auft], VA['label']
    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
    Vpred, Apred, v_pred, a_pred = mlp_model_wav(vis_feat, aud_feat)
    preds = torch.cat((v_pred, a_pred), dim=1)
    all_preds.extend(preds.cpu().tolist())
    all_targets.extend(y.cpu().tolist())

In [None]:
compute_VA_CCC(all_preds, all_targets)

(0.3908915320970614, 0.5266098927052195)

#### EffNet_B2 + Vggish

In [None]:
mlp_best_mod = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_model/best_VA_mlp_{viau}.pth'))
mlp_model_vgg = MLPModel().to(device)
mlp_model_vgg.load_state_dict(mlp_best_mod)
evaluate_model(mlp_model_vgg, test_loader, auft)

(1.483, 0.287, 0.354, 0.533)

In [None]:
iterator = iter(test_loader)
all_preds_vgg, all_targets_vgg = [], []
img = []
for i in range(len(test_loader)):
    VA = next(iterator)
    frame = VA['frame']
    for imgname in frame:
        img.append(imgname)
    vis_feat, aud_feat, y = VA[visual_feat], VA[auft], VA['label']
    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
    Vpred, Apred, v_pred, a_pred = mlp_model_vgg(vis_feat, aud_feat)
    preds = torch.cat((v_pred, a_pred), dim=1)
    all_preds_vgg.extend(preds.cpu().tolist())
    all_targets_vgg.extend(y.cpu().tolist())

In [None]:
compute_VA_CCC(all_preds_vgg, all_targets_vgg)

(0.35368417669976915, 0.53293580621636)

#### Ensemble

In [None]:
zipped_data = zip(img1, all_preds, all_targets)
zipped_data_vgg = zip(img, all_preds_vgg, all_targets_vgg)

sorted_data = sorted(zipped_data)
sorted_data_vgg = sorted(zipped_data_vgg)

re_img_vgg, re_all_preds_vgg, re_all_targets_vgg = zip(*sorted_data_vgg)
re_img, re_all_preds, re_all_targets = zip(*sorted_data)

In [None]:
compute_VA_CCC(re_all_preds_vgg, re_all_targets)

(0.3536841766997691, 0.53293580621636)

In [None]:
for w in np.linspace(0, 1, 11):
    print(f'{w = }')
    w = np.array([w])
    y_ensemble = w * re_all_preds + (1 - w) * re_all_preds_vgg
    cccv, ccca = compute_VA_CCC(y_ensemble, re_all_targets)
    print(f'CCCV: {cccv:.3f}, CCCA: {ccca:.3f}, Mean CCCC: {(cccv+ccca)/2:.3f}')

w = 0.0
CCCV: 0.354, CCCA: 0.533, Mean CCCC: 0.443
w = 0.1
CCCV: 0.364, CCCA: 0.538, Mean CCCC: 0.451
w = 0.2
CCCV: 0.374, CCCA: 0.541, Mean CCCC: 0.458
w = 0.30000000000000004
CCCV: 0.382, CCCA: 0.544, Mean CCCC: 0.463
w = 0.4
CCCV: 0.389, CCCA: 0.545, Mean CCCC: 0.467
w = 0.5
CCCV: 0.393, CCCA: 0.545, Mean CCCC: 0.469
w = 0.6000000000000001
CCCV: 0.396, CCCA: 0.544, Mean CCCC: 0.470
w = 0.7000000000000001
CCCV: 0.398, CCCA: 0.541, Mean CCCC: 0.470
w = 0.8
CCCV: 0.397, CCCA: 0.538, Mean CCCC: 0.467
w = 0.9
CCCV: 0.395, CCCA: 0.533, Mean CCCC: 0.464
w = 1.0
CCCV: 0.391, CCCA: 0.527, Mean CCCC: 0.459


In [None]:
test_vid = zip(img1, y_ensemble, re_all_targets)

In [None]:
weight=np.array([0.6])
y_ensemble = weight * re_all_preds + (1 - weight) * re_all_preds_vgg
cccv, ccca = compute_VA_CCC(y_ensemble, re_all_targets)
print(f'CCCV: {cccv:.3f}, CCCA: {ccca:.3f}, Mean CCCC: {(cccv+ccca)/2:.3f}')

CCCV: 0.396, CCCA: 0.544, Mean CCCC: 0.470


### Smoothing

In [None]:
index, pred, lab = [], [], []
test_vid = {}
for i, val in enumerate(re_img):
    ind = int(val.split('/')[1][:-4])
    vname = val.split('/')[0]
    if i == 0:
        prename = vname
        index.append(ind)
        pred.append(y_ensemble[i])
        lab.append(re_all_targets[i])
    else:
        if vname == prename:
            index.append(ind)
            pred.append(y_ensemble[i])
            lab.append(re_all_targets[i])
        else:
            combined = list(zip(index, pred, lab))
            combined_sorted = sorted(combined, key=lambda x: x[0])
            index_list_sorted, pred_list_sorted, lab_list_sorted = zip(*combined_sorted)
            test_vid[prename] = (list(index_list_sorted), list(pred_list_sorted), list(lab_list_sorted))
            prename = vname
            index, pred, lab = [], [], []
            index.append(ind)
            pred.append(y_ensemble[i])
            lab.append(re_all_targets[i])

In [None]:
hyperparams=[(isMean,delta) for delta in [0, 5, 15, 30, 60, 100, 200] for isMean in [1,0] if not (isMean==0 and delta==0)]
total_true=[]
total_preds=[[] for _ in range(len(hyperparams))]
timing_results = {(isMean, delta): 0 for isMean, delta in hyperparams}
for videoname,(img, predict, label) in test_vid.items():
    for i,ind in enumerate(img):
        total_true.append(label[i])
    preds_proba = smooth_prediction(img, predict)
    for hInd,(isMean,delta) in enumerate(hyperparams):
        preds=[]
        start = time.time()
        for i in range(len(preds_proba)):
            i1=max(i-delta,0)
            if isMean:
                best_ind, proba = slide_window(preds_proba, i, delta, 'mean')
            else:
                best_ind, proba = slide_window(preds_proba, i, delta, 'median')
            preds.append(proba)
        for i, ind in enumerate(img):
            if label[i][0]>=-1 and label[i][1]>=-1:
                total_preds[hInd].append(preds[ind-1])
        end = time.time()
        timing_results[(isMean, delta)] += end - start
total_true=np.array(total_true)

In [None]:
hyperparams=[(isMean,delta) for delta in [15, 20, 25, 30, 35, 40] for isMean in [1,0] if not (isMean==0 and delta==0)]
total_true=[]
total_preds=[[] for _ in range(len(hyperparams))]
timing_results = {(isMean, delta): 0 for isMean, delta in hyperparams}
for videoname,(img, predict, label) in test_vid.items():
    for i,ind in enumerate(img):
        total_true.append(label[i])
    preds_proba = smooth_prediction(img, predict)
    for hInd,(isMean,delta) in enumerate(hyperparams):
        preds=[]
        start = time.time()
        for i in range(len(preds_proba)):
            i1=max(i-delta,0)
            if isMean:
                best_ind, proba = slide_window(preds_proba, i, delta, 'mean')
            else:
                best_ind, proba = slide_window(preds_proba, i, delta, 'median')
            preds.append(proba)
        for i, ind in enumerate(img):
            if label[i][0]>=-1 and label[i][1]>=-1:
                total_preds[hInd].append(preds[ind-1])
        end = time.time()
        timing_results[(isMean, delta)] += end - start
total_true=np.array(total_true)

In [None]:
for hInd, (isMean, delta) in enumerate(hyperparams):
    preds = np.array(total_preds[hInd])
    ccc1, ccc2 = compute_VA_CCC(preds, total_true)
    mean_or_median = 'mean' if isMean else 'median'
    time_taken = round(timing_results[(isMean, delta)],3)
    print(f'{mean_or_median}; delta: {delta}; CCCV: {ccc1:.3f}; CCCA: {ccc2:.3f}; Mean CCC: {(ccc1+ccc2)/2:.3f} Time: {time_taken}s')

mean; delta: 0; CCCV: 0.399; CCCA: 0.546; Mean CCC: 0.473 Time: 5.635s
mean; delta: 5; CCCV: 0.411; CCCA: 0.569; Mean CCC: 0.490 Time: 6.035s
median; delta: 5; CCCV: 0.411; CCCA: 0.567; Mean CCC: 0.489 Time: 13.1s
mean; delta: 15; CCCV: 0.418; CCCA: 0.585; Mean CCC: 0.502 Time: 5.745s
median; delta: 15; CCCV: 0.419; CCCA: 0.584; Mean CCC: 0.502 Time: 13.37s
mean; delta: 30; CCCV: 0.421; CCCA: 0.594; Mean CCC: 0.507 Time: 5.922s
median; delta: 30; CCCV: 0.424; CCCA: 0.596; Mean CCC: 0.510 Time: 13.567s
mean; delta: 60; CCCV: 0.414; CCCA: 0.585; Mean CCC: 0.500 Time: 6.428s
median; delta: 60; CCCV: 0.420; CCCA: 0.594; Mean CCC: 0.507 Time: 14.063s
mean; delta: 100; CCCV: 0.402; CCCA: 0.562; Mean CCC: 0.482 Time: 6.961s
median; delta: 100; CCCV: 0.409; CCCA: 0.575; Mean CCC: 0.492 Time: 14.75s
mean; delta: 200; CCCV: 0.377; CCCA: 0.516; Mean CCC: 0.447 Time: 7.423s
median; delta: 200; CCCV: 0.379; CCCA: 0.531; Mean CCC: 0.455 Time: 16.118s
