## Library

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install pydub timm resampy

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting timm
  Downloading timm-0.9.16-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->timm)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->timm)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->timm)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->timm)
  Using cached nvidia_cudnn_

In [None]:
from pydub import AudioSegment

In [None]:
import os
import torch
from tqdm import tqdm
import json
import time
import requests
import torchaudio
import numpy as np
import pickle
import cv2
from PIL import Image
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import math
import torch.nn as nn
import random
from torch import optim
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
root = '/content/drive/MyDrive/MSc/Thesis'

In [None]:
def setup_seed(seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     np.random.seed(seed)
     random.seed(seed)
     torch.backends.cudnn.deterministic = True

In [None]:
setup_seed(20)

# Dataset

In [None]:
#test_list = []
a = []
for d in ['VA_Estimation_Challenge','EXPR_Recognition_Challenge','AU_Detection_Challenge']:
    data_dir=os.path.join(root,'data/Annotations',d)
    for k in ['Train_Set','Validation_Set']:
        data_label=os.path.join(data_dir,k)
        with open(os.path.join(data_dir,f'{k}.txt'), 'w') as f:
            for filename in tqdm(os.listdir(data_label)):
                fn, ext = os.path.splitext(os.path.basename(filename))
                if ext.lower()=='.txt':
                    f.write(fn+'\n')

100%|██████████| 357/357 [00:00<00:00, 218785.29it/s]
100%|██████████| 76/76 [00:00<00:00, 121342.64it/s]
100%|██████████| 248/248 [00:00<00:00, 211119.83it/s]
100%|██████████| 70/70 [00:00<00:00, 226719.14it/s]
100%|██████████| 295/295 [00:00<00:00, 196555.95it/s]
100%|██████████| 105/105 [00:00<00:00, 110737.22it/s]


In [None]:
test_list = []

for d in ['VA_Estimation_Challenge','EXPR_Recognition_Challenge','AU_Detection_Challenge']:
    print(d)
    with open(os.path.join(root,f'data/Annotations/{d}/Train_Set.txt'), 'r') as f:
        files = f.read().splitlines()
    random.shuffle(files)
    ratio = int(len(files)/5) # 20%
    val_set = files[:ratio]
    test_list.extend(val_set)
    train_set = files[ratio:]
    print('Train_set:')
    with open(os.path.join(root,f'data/Annotations/{d}/Train.txt'), 'w') as f:
        for ftrain in tqdm(train_set):
            f.write(ftrain+'\n')
    print('Val_set:')
    with open(os.path.join(root,f'data/Annotations/{d}/Val.txt'), 'w') as f:
        for fval in tqdm(val_set):
            f.write(fval+'\n')

    with open(os.path.join(root,f'data/Annotations/{d}/Validation_Set.txt'), 'r') as f:
        test_set = f.read().splitlines()
    test_list.extend(test_set)
    print('Test_set:')
    with open(os.path.join(root,f'data/Annotations/{d}/Test.txt'), 'w') as f:
        for ftest in tqdm(test_set):
            f.write(ftest+'\n')
with open(os.path.join(root,'data/test_list.txt'), 'w') as f:
    for fn in test_list:
        f.write(fn+'\n')

VA_Estimation_Challenge
Train_set:


100%|██████████| 285/285 [00:00<00:00, 300799.36it/s]


Val_set:


100%|██████████| 71/71 [00:00<00:00, 387250.43it/s]


Test_set:


100%|██████████| 76/76 [00:00<00:00, 78090.91it/s]


EXPR_Recognition_Challenge
Train_set:


100%|██████████| 199/199 [00:00<00:00, 674205.57it/s]


Val_set:


100%|██████████| 49/49 [00:00<00:00, 349525.33it/s]


Test_set:


100%|██████████| 70/70 [00:00<00:00, 460190.09it/s]


AU_Detection_Challenge
Train_set:


100%|██████████| 236/236 [00:00<00:00, 772118.37it/s]


Val_set:


100%|██████████| 59/59 [00:00<00:00, 391556.86it/s]


Test_set:


100%|██████████| 105/105 [00:00<00:00, 522422.21it/s]


In [None]:
def get_names(vid, id):
    name = ""
    if id>=0 and id<10:
        name = f"{vid}/0000" + str(id) + ".jpg"
    elif id>=10 and id<100:
        name = f"{vid}/000" + str(id) + ".jpg"
    elif id>=100 and id<1000:
        name = f"{vid}/00" + str(id) + ".jpg"
    elif id>=1000 and id<10000:
        name = f"{vid}/0" + str(id) + ".jpg"
    else:
        name = f"{vid}/" + str(id) + ".jpg"
    return name

In [None]:
def load_feature_cache(feature_names):

    for feature_name in feature_names:
        print('processing:', feature_name)
        feat_root = os.path.join(root + '/models/ABAW6', feature_name)
        save_root = feat_root+'.pkl'
        if os.path.exists(save_root):
            continue
        filenames = os.listdir(feat_root)[:]
        feat = {}
        for fname in tqdm(filenames):
            vname = fname.split('.')[0]
            if filenames[0].endswith('.npy'):
                fea = np.load(os.path.join(feat_root, fname), allow_pickle=True).tolist()
            elif filenames[0].endswith('.pkl'):
                with open(os.path.join(feat_root, fname), 'rb') as f:
                    fea = pickle.load(f)
            feat[vname] = fea
        with open(save_root, 'wb') as f:
            pickle.dump(feat, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
feature_names = [
        'visualfeat_enet_b2_8_best_cropped',
        'visualfeat_enet_b2_8_best_cropped_aligned',
        'audiofeat_vggish',
        'audiofeat_wav2vec2'
        ]

In [None]:
class ABAW_dataset(Dataset):
    def __init__(self, root, split, typ, task, feature_v, feature_a):
        self.root = root
        self.split = split
        self.typ = typ
        self.task = task
        self.anno_path = os.path.join(self.root,f'data/Annotations/{self.task}/{self.split}')
        self.feature_v, self.feature_a = feature_v, feature_a
        self.feature = [self.feature_v, self.feature_a]
        with open(os.path.join(root, f'data/Annotations/{self.task}/{self.typ}.txt'), 'r') as f:
                self.vidnames = f.read().splitlines()
        self.feature_dims = 0
        self.data = {}
        self.data[self.task] = {}
        self.iname = []
        for feature_name in self.feature:
            if 'visual' in feature_name:
                self.data = self.load_feature_v(feature_name)
            elif 'audio' in feature_name:
                self.data = self.load_feature_a(feature_name)

    def get_names(vid, id):
            name = ""
            if id>=0 and id<10:
                name = f"{vid}/0000" + str(id) + ".jpg"
            elif id>=10 and id<100:
                name = f"{vid}/000" + str(id) + ".jpg"
            elif id>=100 and id<1000:
                name = f"{vid}/00" + str(id) + ".jpg"
            elif id>=1000 and id<10000:
                name = f"{vid}/0" + str(id) + ".jpg"
            else:
                name = f"{vid}/" + str(id) + ".jpg"
            return name

    def load_feature_v(self, feature_v):
            print(f'loading visual feature: {feature_v}')
            feat_root = os.path.join(root + '/models/ABAW6', feature_v)
            filenames = os.listdir(feat_root)[:]
            for vname in tqdm(self.vidnames):
                    feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
                    with open(os.path.join(self.anno_path, f'{vname}.txt')) as f:
                        labels = f.read().splitlines()
                    self.data[self.task][vname] = {}

                    for imgname, val in feature.items():
                        for i,line in enumerate(labels):
                            if i > 0:
                                imname = get_names(vname, i)
                                if imname == imgname:
                                    if self.task == 'AU_Detection_Challenge':
                                        splitted_line=line.split(',')
                                        aus = list(map(int,splitted_line))
                                        if min(aus) >= 0:
                                            labs = torch.tensor(aus)
                                            self.data[self.task][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                                            self.iname.append(imname)
                                    elif self.task == 'VA_Estimation_Challenge':
                                        splitted_line=line.split(',')
                                        valence=float(splitted_line[0])
                                        arousal=float(splitted_line[1])
                                        if valence >= -1 and arousal >= -1:
                                            labs = torch.tensor([valence, arousal])
                                            self.data[self.task][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                                            self.iname.append(imname)
                                    elif self.task == 'EXPR_Recognition_Challenge':
                                        exp = int(line)
                                        if exp >= 0:
                                            labs = torch.tensor(exp)
                                            self.data[self.task][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                                            self.iname.append(imname)
                    self.feature_dims += len(self.data[self.task][vname])
            return self.data

    def load_feature_a(self, feature_a):
            print(f'loading audio feature: {feature_a}')
            feat_root = os.path.join(root + '/models/ABAW6', feature_a)
            filenames = os.listdir(feat_root)[:]
            for vname in tqdm(self.vidnames):
                    feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
                    for imgname, val in feature.items():
                        if imgname in self.data[self.task][vname]:
                            self.data[self.task][vname][imgname].update({f'{feature_a}': val})

                    for img, value in list(self.data[self.task][vname].items()):
                        if len(value) < 3:
                            self.data[self.task][vname].pop(img)
            return self.data

    def __getitem__(self, index):
            frame = self.iname[index]
            vname = frame.split('/')[0]
            data = self.data[self.task][vname][frame]
            data['frame'] = frame
            data['vid'] = vname
            data['label'] = self.data[self.task][vname][frame]['label']
            return data

    def __len__(self):
            return self.feature_dims

In [None]:
class ABAW_dataset1(Dataset):
    def __init__(self, data, iname, dims, task):
        self.data = data
        self.iname = iname
        self.task = task
        self.feature_dims = dims
    def __getitem__(self, index):
        frame = self.iname[index]
        vname = frame.split('/')[0]
        data = self.data[self.task][vname][frame]
        data['frame'] = frame
        data['vid'] = vname
        data['label'] = self.data[self.task][vname][frame]['label']
        return data

    def __len__(self):
            return self.feature_dims

# Convert

## Video to Audio

In [None]:
def vid2aud(video_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for file_name in tqdm(os.listdir(video_folder)):
        video_path = os.path.join(video_folder, file_name)
        audio_file_name = os.path.splitext(file_name)[0] + '.wav'
        audio_file_path = os.path.join(output_folder, audio_file_name)
        if os.path.exists(audio_file_path):
            continue
        if os.path.isfile(video_path) and file_name.lower().endswith(('.mp4', '.mov', '.avi', '.mkv')):
            video_clip = VideoFileClip(video_path)
            audio_clip = video_clip.audio
            audio_clip.write_audiofile(audio_file_path)
            audio_clip.close()
            video_clip.close()

In [None]:
for i in ['batch1','batch2','new_vids']:
    video_folder= os.path.join(root,'data/video', i)
    print(f'\nProcessing {video_folder}')
    output_folder= os.path.join(root, 'data/audio')
    vid2aud(video_folder, output_folder)


Processing /content/drive/MyDrive/MSc/Thesis/data/video/batch1


100%|██████████| 475/475 [00:02<00:00, 192.89it/s]



Processing /content/drive/MyDrive/MSc/Thesis/data/video/batch2


100%|██████████| 73/73 [00:00<00:00, 2263.92it/s]



Processing /content/drive/MyDrive/MSc/Thesis/data/video/new_vids


100%|██████████| 50/50 [00:00<00:00, 540.31it/s]


## Audio to Text

In [None]:
class Fuxi():
    def __init__(self, max_duration=50):
        self.appkey = 'phr-fuxi'
        self.appsecret = '74c72dee-bf9a-4de2-8c1f-96be1a1ecabd'
        self.url = 'http://api-test.vop.netease.com/phone_rec'
        self.max_duration = max_duration

    def run(self, wav_path, lang='en', type="wav",max_duration=50):
        words = []

        audio = AudioSegment.from_file(wav_path, format='mp3')
        duration_ms = len(audio)
        if duration_ms < 500:
            return ''
        chunk_length_ms = max_duration * 1000  # 60s
        text = ''
        long_vid = []
        for i in range(0, duration_ms,chunk_length_ms):
            new_audio = audio[i:min(i+chunk_length_ms,duration_ms)]
            byte_io = BytesIO()
            new_audio.export(byte_io, format="wav")
            speech = byte_io.getvalue()

            curtime = str(int(time.time()))
            hl = hashlib.md5()
            hl.update((self.appkey + curtime).encode(encoding='utf-8'))
            sign = hmac.new(self.appsecret.encode('utf-8'),
                            hl.hexdigest().encode('utf-8'), hashlib.sha1).digest()
            checksum = base64.b64encode(sign)
            params = {'appkey': self.appkey, 'lan': lang}
            headers = {
                'curtime': curtime,
                'checksum': checksum,
                'content-type': 'audio/wav',
                'cuid': 'fuxi-avatarlib'
            }
            response = requests.post(self.url,
                                        params=params,
                                        headers=headers,
                                        data=speech)
            r = response.json()
            if r['ret_code'] != 1:
                #error = r['ret_msg']
                #raise RuntimeError('Hangyan Rec Error: ' + error+f'[{wav_path}]')
                long_vid.append(wav_path)
                pass
            aligned_text = r['result']
            text += ' '.join([w['word'] for w in aligned_text if w['word'] != 'sil'])
        return text

    def split_wave(self, wav_path, save_root, max_duration=50):
        wav_name = wav_path.split('/')[-1].replace('.wav', '')
        fin = wave.open(wav_path, 'rb')
        fs_orig = fin.getframerate()
        audio_length = fin.getnframes() * (1/fs_orig)
        fin.close()

        audio = AudioSegment.from_wav(wav_path)
        n = int(audio_length//max_duration + 1)
        for i in range(n):
            new_audio = audio[i*max_duration*1000:(i+1)*max_duration*1000]
            new_audio.export(os.path.join(save_root, wav_name+f'_{i}.wav'), format='wav')
        return n

    def read_wave(self, wav_path, max_duration=60):
        with open(wav_path, 'rb') as f:
            wav_data = f.read()
        return wav_data

In [None]:
def aud2text():

    speech2text = Fuxi()
    print('loading model')
    root_wav = os.path.join(root, 'data/audio')
    wav_files = os.listdir(root_wav)[::-1]

    print(f'processing {root_wav}')
    save_folder = os.path.join(root, f'models/ABAW6/aud2text')
    os.makedirs(save_folder, exist_ok=True)

    res_dict = {}
    for audio_name in tqdm(wav_files):
        audio_file = os.path.join(root_wav, audio_name)
        save_path = os.path.join(save_folder, audio_name.split('.')[0]+ '.json')
        if os.path.exists(save_path):
            with open(save_path, 'r') as f:
                text = json.load(f)
            continue
        else:
            text = speech2text.run(audio_file)
            with open(save_path, 'w') as f:
                json.dump(text, f)
        res_dict[audio_name] = text.strip()

    with open(os.path.join(save_folder,'alltext.json'), 'w') as f:
        json.dump(res_dict, f, indent=4)

In [None]:
aud2text()

loading model
processing /content/drive/MyDrive/MSc/Thesis/data/audio


100%|██████████| 173/173 [52:28<00:00, 18.20s/it]


# Extract feature

## Visual feature

In [None]:
IMG_SIZE=224
train_transforms = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE,IMG_SIZE)),
        transforms.RandomRotation(45),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.5, contrast=0.5, hue=0.5),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ]
)
test_transforms = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])])

In [None]:
with open('/content/drive/MyDrive/MSc/Thesis/data/test_list.txt', 'r') as f:
      test_list = f.read().splitlines()

In [None]:
def extract_visual_feature(mode_name, typ):
    print('loading model:',mode_name)
    feature_extractor_model = torch.load(os.path.join(root, f'models/EmotiEffNet/enet/{mode_name}.pt'))
    feature_extractor_model.classifier=torch.nn.Identity()
    feature_extractor_model=feature_extractor_model.to(device)
    feature_extractor_model.eval()

    if typ == 'cropped_aligned':
        dir = ['cropped_aligned','cropped_aligned_new_50_vids']
    elif typ == 'cropped':
        dir = ['batch1', 'batch2', 'cropped_new_50_vids']

    for d in dir:
        root_vis = f'/content/data/{d}'
        print(f'processing {root_vis}')
        save_folder = os.path.join(root, f'models/ABAW6/visualfeat_{mode_name}_{typ}')
        os.makedirs(save_folder, exist_ok=True)
        for filename in tqdm(os.listdir(root_vis)):
            X_features=[]
            img_names=[]
            img_feat = {}
            imgs=[]
            frames_dir=os.path.join(root_vis,filename)

            if not os.path.isdir(frames_dir):
                continue
            save_file = os.path.join(save_folder, filename+'.npy')
            if os.path.exists(save_file):
                continue
            else:
                for img_name in os.listdir(frames_dir):
                    if img_name.lower().endswith('.jpg'):
                        img = Image.open(os.path.join(frames_dir,img_name))
                        if filename in test_list:
                            img_tensor = test_transforms(img)
                        else:
                            img_tensor = train_transforms(img)
                        if img.size:
                            img_names.append(filename+'/'+img_name)
                            imgs.append(img_tensor)
                            if len(imgs)>= 64:
                                features = feature_extractor_model(torch.stack(imgs, dim=0).to(device))
                                features = features.data.cpu().numpy()
                                if len(X_features)==0:
                                    X_features=features
                                else:
                                    X_features=np.concatenate((X_features,features),axis=0)
                                imgs=[]

                if len(imgs)>0:
                    features = feature_extractor_model(torch.stack(imgs, dim=0).to(device))
                    features = features.data.cpu().numpy()

                    if len(X_features)==0:
                        X_features=features
                    else:
                        X_features=np.concatenate((X_features,features),axis=0)

                    imgs=[]
                img_feat= {img_name:global_features for img_name,global_features in zip(img_names,X_features)}
                np.save(save_file,img_feat)

In [None]:
# cropped images
extract_visual_feature('enet_b2_8_best','cropped')

loading model: enet_b2_8_best
processing /content/data/batch1


100%|██████████| 355/355 [01:01<00:00,  5.76it/s]


processing /content/data/batch2


100%|██████████| 209/209 [00:31<00:00,  6.53it/s]


processing /content/data/cropped_new_50_vids


100%|██████████| 31/31 [22:04<00:00, 42.72s/it]


In [None]:
# cropped_aligned images
extract_visual_feature('enet_b2_8_best','cropped_aligned')

loading model: enet_b2_8_best
processing /content/data/cropped_aligned


100%|██████████| 564/564 [2:05:17<00:00, 13.33s/it]


processing /content/data/cropped_aligned_new_50_vids


100%|██████████| 31/31 [14:10<00:00, 27.44s/it]


## Audio feature

In [None]:
video2len={}
for i in ['batch1', 'batch2', 'new_vids']:
    d = os.path.join('/content/video',i)
    for filename in os.listdir(d):
        fn, ext = os.path.splitext(os.path.basename(filename))
        vid=os.path.join(d,filename)
        cap = cv2.VideoCapture(vid)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        video2len[fn]=total_frames+1

for filename in os.listdir(os.path.join(root,f'models/ABAW6/visualfeat_enet_b2_8_best_cropped_aligned')):
    if 'left' in filename or 'right' in filename:
        feature_path = os.path.join(root, filename)
        feature = np.load(feature_path, allow_pickle=True).tolist()
        fn = filename.split('.')[0]
        video2len[fn] = len(feature)

with open('/content/drive/MyDrive/MSc/Thesis/data/vid_length.pkl', 'wb') as f:
    pickle.dump(video2len, f)

In [None]:
def extract_audio_feature(mode_name):

    if mode_name == 'wav2vec2':
        bundle = torchaudio.pipelines.WAV2VEC2_BASE
        model = bundle.get_model().to(device)
    elif mode_name == 'vggish':
        model = torch.hub.load('harritaylor/torchvggish', mode_name)
        model.eval().to(device)

    print('loading model:',mode_name)
    root_wav = os.path.join(root, 'data/audio')
    wav_files = os.listdir(root_wav)[::-1]
    print(f'processing {root_wav}')
    save_folder = os.path.join(root, f'models/ABAW6/audiofeat_{mode_name}')
    os.makedirs(save_folder, exist_ok=True)

    with open('/content/drive/MyDrive/MSc/Thesis/data/vid_length.pkl', 'rb') as f:
        video2len = pickle.load(f)

    for nwav, frames_count in tqdm(video2len.items()):
        if nwav.endswith('_left'):
            wav_f = nwav[:-5]
        elif nwav.endswith('_right'):
            wav_f = nwav[:-6]
        else:
            wav_f = nwav
        audio_features = {}
        save_file = os.path.join(save_folder, nwav +'.npy')
        if os.path.exists(save_file):
            continue
        if mode_name == 'vggish':
            with torch.no_grad():
                reps = model.forward(os.path.join(root_wav, wav_f + '.wav'))
                reps = reps.cpu().numpy()/255.
        else:
            wav, rate = torchaudio.load(os.path.join(root_wav, wav_f + '.wav'))
            if rate!= bundle.sample_rate:
                wav = torchaudio.functional.resample(wav, rate, bundle.sample_rate)
            reps = []
            channel, length = wav.shape
            max_length = 2500000
            with torch.no_grad():
                for i in range(length//max_length+1):
                    reps.append(model.extract_features(wav.cuda()[:,i*max_length:(i+1)*max_length])[0][-1])
            reps = torch.concatenate(reps,dim=1)
            if channel !=1:
                reps = torch.mean(reps, dim=0).unsqueeze(dim=0)

            reps = reps.cpu().numpy().squeeze(0)
        audio_scale=len(reps)/frames_count

        for frame_number in range(frames_count):
            ind=int(frame_number*audio_scale)
            nframe = get_names(nwav,frame_number+1)
            audio_features[nframe]= reps[ind]

        np.save(save_file, audio_features)

In [None]:
extract_audio_feature('wav2vec2')

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960.pth
100%|██████████| 360M/360M [00:01<00:00, 207MB/s]


loading model: wav2vec2
processing /content/drive/MyDrive/MSc/Thesis/data/audio


100%|██████████| 627/627 [30:11<00:00,  2.89s/it]


In [None]:
extract_audio_feature('vggish')

Downloading: "https://github.com/harritaylor/torchvggish/zipball/master" to /root/.cache/torch/hub/master.zip
Downloading: "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth" to /root/.cache/torch/hub/checkpoints/vggish-10086976.pth
100%|██████████| 275M/275M [00:01<00:00, 213MB/s]
Downloading: "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish_pca_params-970ea276.pth" to /root/.cache/torch/hub/checkpoints/vggish_pca_params-970ea276.pth
100%|██████████| 177k/177k [00:00<00:00, 9.32MB/s]


loading model: vggish
processing /content/drive/MyDrive/MSc/Thesis/data/audio


100%|██████████| 627/627 [19:59<00:00,  1.91s/it]


# Metrics

### Compute loss

In [None]:
def compute_EXP_loss(pred, label, weights):
    cri_exp = nn.CrossEntropyLoss(weights)
    cls_loss = cri_exp(pred, label)
    return cls_loss

In [None]:
def compute_AU_loss(pred, label, weights):
    cri_AU = nn.BCEWithLogitsLoss(weights)
    cls_loss = cri_AU(pred, label.float())
    return cls_loss

In [None]:
def CCC_loss(x, y):
    x, y = x.view(-1), y.view(-1)
    vx = x - torch.mean(x)
    vy = y - torch.mean(y)
    rho =  torch.sum(vx * vy) / (torch.sqrt(torch.sum(torch.pow(vx, 2))) * torch.sqrt(torch.sum(torch.pow(vy, 2)))+1e-8)
    x_m, y_m = torch.mean(x), torch.mean(y)
    x_s, y_s = torch.std(x), torch.std(y)
    ccc = 2*rho*x_s*y_s/(torch.pow(x_s, 2) + torch.pow(y_s, 2) + torch.pow(x_m - y_m, 2))
    return 1-ccc

In [None]:
def compute_VA_loss(Vout,Aout,label):
    ccc_loss = CCC_loss(Vout[:,0],label[:,0]) + CCC_loss(Aout[:,0],label[:,1])
    mse_loss = nn.MSELoss()(Vout,label[:,0]) + nn.MSELoss()(Aout,label[:,1])
    return mse_loss,ccc_loss

### Compute F1 score

In [None]:
def compute_EXP_F1(pred, target):
    pred_labels = np.argmax(pred, axis=1)
    target_labels = np.argmax(target, axis=1)
    macro_f1 = f1_score(target_labels,pred_labels,average='macro')
    acc = accuracy_score(target_labels, pred_labels)
    return macro_f1, acc

In [None]:
def f1s_max_AU(label, pred, thresh, i=0):
    pred = np.array(pred)
    label = np.array(label)
    label = label[:,i]
    pred = pred[:,i]
    acc = []
    F1 = []
    for i in thresh:
        new_pred = ((pred >= i) * 1).flatten()
        acc.append(accuracy_score(label.flatten(), new_pred))
        F1.append(f1_score(label.flatten(), new_pred))

    F1_MAX = max(F1)
    if F1_MAX < 0 or math.isnan(F1_MAX):
        F1_MAX = 0
        F1_THRESH = 0
        accuracy = 0
    else:
        idx_thresh = np.argmax(F1)
        F1_THRESH = thresh[idx_thresh]
        accuracy = acc[idx_thresh]
    return F1, F1_MAX, F1_THRESH, accuracy

In [None]:
def compute_AU_F1(pred,label,thresh=np.arange(0.1,1,0.1)):
    F1s = []
    F1t = []
    acc = []
    for i in range(12):
        F1, F1_MAX, F1_THRESH, accuracy = f1s_max_AU(label,pred,thresh,i)
        F1s.append(F1_MAX)
        F1t.append(F1_THRESH)
        acc.append(accuracy)
    acc = [round(a,3) for a in acc]
    return np.mean(F1s),np.mean(F1t),acc, F1t

### Concordance Correlation Coefficient

In [None]:
def CCC_score(x, y):
    x = np.array(x)
    y = np.array(y)
    vx = x - np.mean(x)
    vy = y - np.mean(y)
    rho = np.sum(vx * vy) / (np.sqrt(np.sum(vx**2)) * np.sqrt(np.sum(vy**2)))
    x_m = np.mean(x)
    y_m = np.mean(y)
    x_s = np.std(x)
    y_s = np.std(y)
    ccc = 2*rho*x_s*y_s/(x_s**2 + y_s**2 + (x_m - y_m)**2)
    return ccc

In [None]:
def compute_VA_CCC(x,y):
    x = np.array(x)
    y = np.array(y)
    x[x>1] = 1
    x[x<-1] = -1
    ccc1 = CCC_score(x[:,0],y[:,0])
    ccc2 = CCC_score(x[:,1],y[:,1])

    return ccc1,ccc2

### Pearson’s Correlation Coefficient

In [None]:
def PCC(x,y):
    x = np.array(x)
    y = np.array(y)
    x[x>1] = 1
    x[x<0] = 0
    vx = x - np.mean(x)
    vy = y - np.mean(y)
    pcc = np.sum(vx * vy) / (np.sqrt(np.sum(vx**2)) * np.sqrt(np.sum(vy**2)))
    return pcc

In [None]:
def compute_emo_PCC(x,y):
    x = np.array(x)
    y = np.array(y)
    pccs = []
    for i in range(7):
        p = PCC(x[:,i],y[:,i])
        pccs.append(p)
    pccs = np.array(pccs)
    mean_pcc = np.mean(pccs)
    return pccs, mean_pcc

# Challenges

In [None]:
task = ['EXPR_Recognition_Challenge','AU_Detection_Challenge','VA_Estimation_Challenge']
split = ['Train_Set', 'Validation_Set']
typ = ['Train','Val','Test']
vis_typ = ['cropped_aligned', 'cropped']
visual_feat = 'visualfeat_enet_b2_8_best'
audio_feat = ['audiofeat_wav2vec2','audiofeat_vggish','nope']
vis_aud = ['visual_wav2vec2','visual_vggish','visual']
batch_size = 32
model_type = ['fusion', 'mlp']

## EXPR Recognition Challenge

### Loading data

In [None]:
# Cropped_aligned images
vis = vis_typ[0]

In [None]:
# Cropped images
vis = vis_typ[1]

#### Effnet + wav2vec2

In [None]:
auft = audio_feat[0]
viau = vis_aud[0]

#### Effnet + vggish

In [None]:
auft = audio_feat[1]
viau = vis_aud[1]

#### Effnet

In [None]:
auft = audio_feat[2]
viau = vis_aud[2]

#### Train

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/EXPR/{task[0]}_{typ[0]}_{viau}.pkl'), 'rb') as f:
    data1 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[0]
with open(os.path.join(root, f'data/Annotations/{task[0]}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data1[task1][vname])
    for img in data1[task1][vname].keys():
        iname.append(img)

100%|██████████| 199/199 [00:00<00:00, 2542.65it/s]


In [None]:
dataset = ABAW_dataset1(data1, iname, dims, task1)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0, drop_last=True)

#### Val

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/EXPR/{task[0]}_{typ[1]}_{viau}.pkl'), 'rb') as f:
    data2 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[0]
with open(os.path.join(root, f'data/Annotations/{task[0]}/{typ[1]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data2[task1][vname])
    for img in data2[task1][vname].keys():
        iname.append(img)

100%|██████████| 49/49 [00:00<00:00, 2459.18it/s]


In [None]:
dataset = ABAW_dataset1(data2, iname, dims, task1)
val_loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=True)

#### Test

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/EXPR/{task[0]}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data3 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[0]
with open(os.path.join(root, f'data/Annotations/{task[0]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data3[task1][vname])
    for img in data3[task1][vname].keys():
        iname.append(img)

100%|██████████| 70/70 [00:00<00:00, 1183.99it/s]


In [None]:
dataset = ABAW_dataset1(data3, iname, dims, task1)
test_loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=True)

#### Another way (Effnet + wav2vec2)

In [None]:
print(f'{task[0]}')
print(f'loading {typ[0]}')
dataset = ABAW_dataset(root, split[0], typ[0], task[0], feature_v=visual_feat, feature_a=audio_feat)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2, drop_last=True)
torch.save(loader, os.path.join(root,f'models/ABAW6/{task[0]}_{typ[0]}.pth'))

In [None]:
print(f'{task[0]}')
print(f'loading {typ[1]}')
dataset = ABAW_dataset(root, split[0], typ[1], task[0], feature_v=visual_feat, feature_a=audio_feat)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2, drop_last=True)
torch.save(loader, os.path.join(root,f'models/ABAW6/{task[0]}_{typ[1]}.pth'))

In [None]:
print(f'{task[0]}')
print(f'loading {typ[2]}')
dataset = ABAW_dataset(root, split[1], typ[2], task[0], feature_v=visual_feat, feature_a=audio_feat)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2,drop_last=True)
torch.save(loader, os.path.join(root,f'models/ABAW6/{task[0]}_{typ[2]}.pth'))

In [None]:
train_loader = torch.load(os.path.join(root,f'models/ABAW6/EXPR/{task[0]}_{typ[0]}.pth'))

In [None]:
val_loader = torch.load(os.path.join(root,f'models/ABAW6/EXPR/{task[0]}_{typ[1]}.pth'))

In [None]:
test_loader = torch.load(os.path.join(root,f'models/ABAW6/EXPR/{task[0]}_{typ[2]}.pth'))

### Modeling

In [None]:
class EXP_fusion(nn.Module):
    def __init__(self, batchsize = batch_size, audio_ft = auft, hidden_size = [512, 128, batch_size]):
        super(EXP_fusion, self).__init__()
        self.batchsize = batchsize
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.hidden_size = hidden_size
        self.feat_fc = nn.Conv1d(self.concat_dim, hidden_size[0], 1, padding=0)
        self.activ = nn.LeakyReLU(0.1)
        self.dropout = nn.Dropout(p=0.3)
        self.conv1 = nn.Conv1d(hidden_size[0], hidden_size[1], 1, padding=0)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size[1], nhead=4, dim_feedforward=hidden_size[1], dropout=0.3)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
        self.head = nn.Sequential(
                nn.Linear(hidden_size[1], hidden_size[2]),
                nn.BatchNorm1d(hidden_size[2]),
                nn.Dropout(p=0.3),
                nn.Linear(hidden_size[2], 8))

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs,dim=1)
        feat = torch.transpose(feat,0,1)
        feat = self.feat_fc(feat)
        feat = self.activ(feat)
        out = self.conv1(feat)
        out = torch.transpose(out,0,1)
        out = self.transformer_encoder(out)
        out = self.head(out)

        return out, torch.softmax(out, dim = 1)

In [None]:
EXP_model = EXP_fusion().to(device)
EXP_model

EXP_fusion(
  (feat_fc): Conv1d(2176, 512, kernel_size=(1,), stride=(1,))
  (activ): LeakyReLU(negative_slope=0.1)
  (dropout): Dropout(p=0.3, inplace=False)
  (conv1): Conv1d(512, 128, kernel_size=(1,), stride=(1,))
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (head): Sequential(
    (0): Linear(in_features=128, out_feature

In [None]:
class MLPModel(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=8):
        super(MLPModel, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.activ = nn.ReLU()
        self.fc1 = nn.Linear(self.concat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs, dim=1)
        feat = self.fc1(feat)
        feat = self.activ(feat)
        out = self.fc2(feat)
        return out, torch.softmax(out, dim=1)

In [None]:
mlp_model = MLPModel().to(device)
mlp_model

MLPModel(
  (activ): ReLU()
  (fc1): Linear(in_features=1536, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=8, bias=True)
)

#### Utils

In [None]:
y_train = []
iterator = iter(train_loader)
i = 0
while True:
    try:
        EXPR = next(iterator)
        y_train.extend(EXPR['label'].numpy())
    except:
        break

In [None]:
class_weights=compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)

weights=torch.tensor(class_weights,dtype=torch.float).to(device)
print(f'{weights = }')

weights = tensor([0.4260, 4.1334, 5.9334, 6.8653, 0.7257, 0.9578, 2.0518, 0.4572],
       device='cuda:0')


In [None]:
weights = [0.4260, 4.1334, 5.9334, 6.8653, 0.7257, 0.9578, 2.0518, 0.4572]
weights = torch.tensor(weights).to(device)

In [None]:
weights1 = [0.5619842406043042, 0.1331245105716523, 0.4915926179084074, 0.009731543624161074, 0.4858991788569254, 0.3211159481346253, 0.23595084924606013, 0.4854298934682019]
weights1 = torch.tensor(weights1).to(device)

In [None]:
def one_hot_transfer(label, class_num):
    one_hot = torch.eye(class_num)
    one_hot = one_hot.to(device)
    return one_hot[label]

In [None]:
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, EXP_model.parameters()), lr=0.00001, betas=(0.9, 0.999), weight_decay=0.00005)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-5)

In [None]:
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, mlp_model.parameters()), lr=0.00001, betas=(0.9, 0.999), weight_decay=0.00005)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-5)
#EXP_class_names = ["Neutral","Anger","Disgust","Fear","Happiness","Sadness","Surprise","Other"]

### Training

In [None]:
def train(model, mod_type, train_loader, val_loader, epoch, batch_size, optim, au_feat, weight, vi_au):

    model.train(True)
    model.eval()
    best_loss = float('inf')
    f1best, accbest = 0, 0
    loss_value = []
    loss_train = []
    loss_val = []
    all_preds = []
    all_targets = []

    for e in range(epoch):
        print(f'Epoch: {e+1}')
        iterator = iter(train_loader)
        while True:
            try:
                EXPR = next(iterator)
                if au_feat == 'nope':
                    vis_feat, y = EXPR[visual_feat], EXPR['label']
                    vis_feat, y = vis_feat.to(device), y.to(device)
                    aud_feat = None
                else:
                    vis_feat, aud_feat, y = EXPR[visual_feat], EXPR[au_feat], EXPR['label']
                    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
                y_onehot = one_hot_transfer(y, 8).to(device)
                model.zero_grad()
                pred, exp_pred = model(vis_feat, aud_feat)
                loss = compute_EXP_loss(pred, y_onehot, weight)
                loss.backward()
                optim.step()
                loss_value.append(loss.item())
                all_preds.extend(exp_pred.cpu().tolist())
                all_targets.extend(y_onehot.cpu().tolist())
            except:
                break
        avg_loss = round(np.mean(loss_value),3)
        loss_train.append(avg_loss)
        f1_scores, accuracy = compute_EXP_F1(all_preds, all_targets)
        print(f'Train Loss: {avg_loss}, Accuracy: {round(accuracy,3)}')

        val_loss, f1s, acc = evaluate_model(model, val_loader, au_feat, weight)
        loss_val.append(val_loss)

        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_{mod_type}_{vi_au}_loss.pth'))
            # f1best = f1s
            # accbest = acc

        if f1s > f1best:
            #best_loss = val_loss
            torch.save(model.state_dict(), os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_{mod_type}_{vi_au}_f1s.pth'))
            f1best = f1s
            #accbest = acc

        if acc > accbest:
            #best_loss = val_loss
            torch.save(model.state_dict(), os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_{mod_type}_{vi_au}_acc.pth'))
            #f1best = f1s
            accbest = acc

        print(f'Validation Loss: {val_loss}, Accuracy: {acc}')
        scheduler.step(val_loss)
    return loss_train, loss_val, best_loss, f1best, accbest

In [None]:
def evaluate_model(model, data_loader, au_feat, weight):
    model.eval()
    total_loss = []
    all_preds = []
    all_targets = []
    with torch.no_grad():
        iterator = iter(data_loader)
        while True:
            try:
                EXPR = next(iterator)
                if au_feat == 'nope':
                    vis_feat, y = EXPR[visual_feat], EXPR['label']
                    vis_feat, y = vis_feat.to(device), y.to(device)
                    aud_feat = None
                else:
                    vis_feat, aud_feat, y = EXPR[visual_feat], EXPR[au_feat], EXPR['label']
                    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
                y_onehot = one_hot_transfer(y, 8).to(device)
                pred, exp_pred = model(vis_feat, aud_feat)
                loss = compute_EXP_loss(pred, y_onehot, weight)
                total_loss.append(loss.item())
                all_preds.extend(exp_pred.cpu().tolist())
                all_targets.extend(y_onehot.cpu().tolist())
            except:
                break

    f1_scores, acc = compute_EXP_F1(all_preds, all_targets)
    return round(np.mean(total_loss),3), round(f1_scores,3), round(acc,3)

#### Cropped_aligned images

##### Effnet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(EXP_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.68, Accuracy: 0.737
Validation Loss: 1.494, Accuracy: 0.468
Epoch: 2
Train Loss: 0.497, Accuracy: 0.801
Validation Loss: 1.709, Accuracy: 0.459
Epoch: 3
Train Loss: 0.407, Accuracy: 0.833
Validation Loss: 1.757, Accuracy: 0.494
Epoch: 4
Train Loss: 0.351, Accuracy: 0.853
Validation Loss: 1.883, Accuracy: 0.48
Epoch: 5
Train Loss: 0.31, Accuracy: 0.868
Validation Loss: 2.133, Accuracy: 0.484
Epoch: 6
Train Loss: 0.28, Accuracy: 0.879
Validation Loss: 2.186, Accuracy: 0.497
Epoch: 7
Train Loss: 0.256, Accuracy: 0.888
Validation Loss: 2.404, Accuracy: 0.489
Epoch: 8
Train Loss: 0.236, Accuracy: 0.896
Validation Loss: 2.459, Accuracy: 0.471
Epoch: 9
Train Loss: 0.219, Accuracy: 0.902
Validation Loss: 2.515, Accuracy: 0.507
Epoch: 10
Train Loss: 0.205, Accuracy: 0.908
Validation Loss: 2.705, Accuracy: 0.502
28min 25s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
    EXP_model = EXP_fusion().to(device)
    EXP_model.load_state_dict(EXP_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(EXP_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.282, accuracy: 0.468


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 1.274, Accuracy: 0.539
Validation Loss: 1.278, Accuracy: 0.443
Epoch: 2
Train Loss: 1.071, Accuracy: 0.602
Validation Loss: 1.366, Accuracy: 0.447
Epoch: 3
Train Loss: 0.96, Accuracy: 0.64
Validation Loss: 1.414, Accuracy: 0.452
Epoch: 4
Train Loss: 0.885, Accuracy: 0.666
Validation Loss: 1.453, Accuracy: 0.456
Epoch: 5
Train Loss: 0.829, Accuracy: 0.685
Validation Loss: 1.481, Accuracy: 0.456
Epoch: 6
Train Loss: 0.786, Accuracy: 0.701
Validation Loss: 1.524, Accuracy: 0.456
Epoch: 7
Train Loss: 0.751, Accuracy: 0.713
Validation Loss: 1.546, Accuracy: 0.452
Epoch: 8
Train Loss: 0.721, Accuracy: 0.724
Validation Loss: 1.563, Accuracy: 0.456
Epoch: 9
Train Loss: 0.695, Accuracy: 0.733
Validation Loss: 1.586, Accuracy: 0.451
Epoch: 10
Train Loss: 0.673, Accuracy: 0.741
Validation Loss: 1.604, Accuracy: 0.45
7min 57s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.29, accuracy: 0.452


##### Effnet + Vggish

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(EXP_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.668, Accuracy: 0.74
Validation Loss: 1.387, Accuracy: 0.492
Epoch: 2
Train Loss: 0.471, Accuracy: 0.805
Validation Loss: 1.575, Accuracy: 0.481
Epoch: 3
Train Loss: 0.373, Accuracy: 0.84
Validation Loss: 1.922, Accuracy: 0.473
Epoch: 4
Train Loss: 0.311, Accuracy: 0.864
Validation Loss: 2.178, Accuracy: 0.495
Epoch: 5
Train Loss: 0.268, Accuracy: 0.881
Validation Loss: 2.274, Accuracy: 0.467
Epoch: 6
Train Loss: 0.236, Accuracy: 0.894
Validation Loss: 2.43, Accuracy: 0.495
Epoch: 7
Train Loss: 0.211, Accuracy: 0.905
Validation Loss: 2.607, Accuracy: 0.475
Epoch: 8
Train Loss: 0.191, Accuracy: 0.913
Validation Loss: 2.66, Accuracy: 0.488
Epoch: 9
Train Loss: 0.175, Accuracy: 0.92
Validation Loss: 2.806, Accuracy: 0.477
Epoch: 10
Train Loss: 0.161, Accuracy: 0.926
Validation Loss: 2.994, Accuracy: 0.47
29min 23s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
    EXP_model = EXP_fusion().to(device)
    EXP_model.load_state_dict(EXP_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(EXP_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.286, accuracy: 0.492


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(mlp_model,model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 1.417, Accuracy: 0.495
Validation Loss: 1.21, Accuracy: 0.416
Epoch: 2
Train Loss: 1.232, Accuracy: 0.548
Validation Loss: 1.213, Accuracy: 0.454
Epoch: 3
Train Loss: 1.121, Accuracy: 0.585
Validation Loss: 1.218, Accuracy: 0.489
Epoch: 4
Train Loss: 1.043, Accuracy: 0.613
Validation Loss: 1.23, Accuracy: 0.493
Epoch: 5
Train Loss: 0.985, Accuracy: 0.633
Validation Loss: 1.229, Accuracy: 0.501
Epoch: 6
Train Loss: 0.938, Accuracy: 0.649
Validation Loss: 1.24, Accuracy: 0.499
Epoch: 7
Train Loss: 0.9, Accuracy: 0.663
Validation Loss: 1.237, Accuracy: 0.506
Epoch: 8
Train Loss: 0.868, Accuracy: 0.674
Validation Loss: 1.252, Accuracy: 0.496
Epoch: 9
Train Loss: 0.84, Accuracy: 0.683
Validation Loss: 1.237, Accuracy: 0.496
Epoch: 10
Train Loss: 0.816, Accuracy: 0.692
Validation Loss: 1.26, Accuracy: 0.499
7min 35s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.309, accuracy: 0.501


##### Effnet

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(EXP_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.7, Accuracy: 0.729
Validation Loss: 1.413, Accuracy: 0.46
Epoch: 2
Train Loss: 0.529, Accuracy: 0.786
Validation Loss: 1.596, Accuracy: 0.445
Epoch: 3
Train Loss: 0.441, Accuracy: 0.816
Validation Loss: 1.822, Accuracy: 0.478
Epoch: 4
Train Loss: 0.385, Accuracy: 0.837
Validation Loss: 1.869, Accuracy: 0.472
Epoch: 5
Train Loss: 0.345, Accuracy: 0.852
Validation Loss: 2.031, Accuracy: 0.466
Epoch: 6
Train Loss: 0.314, Accuracy: 0.863
Validation Loss: 2.083, Accuracy: 0.434
Epoch: 7
Train Loss: 0.289, Accuracy: 0.873
Validation Loss: 2.298, Accuracy: 0.472
Epoch: 8
Train Loss: 0.268, Accuracy: 0.881
Validation Loss: 2.361, Accuracy: 0.459
Epoch: 9
Train Loss: 0.251, Accuracy: 0.888
Validation Loss: 2.384, Accuracy: 0.455
Epoch: 10
Train Loss: 0.236, Accuracy: 0.894
Validation Loss: 2.432, Accuracy: 0.464
27min 18s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
    EXP_model = EXP_fusion().to(device)
    EXP_model.load_state_dict(EXP_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(EXP_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.262, accuracy: 0.478


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 1.477, Accuracy: 0.467
Validation Loss: 1.214, Accuracy: 0.388
Epoch: 2
Train Loss: 1.325, Accuracy: 0.505
Validation Loss: 1.222, Accuracy: 0.402
Epoch: 3
Train Loss: 1.232, Accuracy: 0.535
Validation Loss: 1.235, Accuracy: 0.412
Epoch: 4
Train Loss: 1.164, Accuracy: 0.558
Validation Loss: 1.25, Accuracy: 0.428
Epoch: 5
Train Loss: 1.11, Accuracy: 0.577
Validation Loss: 1.261, Accuracy: 0.43
Epoch: 6
Train Loss: 1.065, Accuracy: 0.593
Validation Loss: 1.281, Accuracy: 0.429
Epoch: 7
Train Loss: 1.027, Accuracy: 0.606
Validation Loss: 1.278, Accuracy: 0.441
Epoch: 8
Train Loss: 0.995, Accuracy: 0.618
Validation Loss: 1.291, Accuracy: 0.436
Epoch: 9
Train Loss: 0.967, Accuracy: 0.628
Validation Loss: 1.301, Accuracy: 0.439
Epoch: 10
Train Loss: 0.942, Accuracy: 0.637
Validation Loss: 1.315, Accuracy: 0.436
6min 29s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.275, accuracy: 0.441


#### Cropped images

##### Effnet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(EXP_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.742, Accuracy: 0.71
Validation Loss: 1.544, Accuracy: 0.483
Epoch: 2
Train Loss: 0.559, Accuracy: 0.775
Validation Loss: 1.756, Accuracy: 0.477
Epoch: 3
Train Loss: 0.468, Accuracy: 0.807
Validation Loss: 1.852, Accuracy: 0.514
Epoch: 4
Train Loss: 0.409, Accuracy: 0.828
Validation Loss: 2.096, Accuracy: 0.512
Epoch: 5
Train Loss: 0.367, Accuracy: 0.843
Validation Loss: 2.13, Accuracy: 0.509
Epoch: 6
Train Loss: 0.334, Accuracy: 0.855
Validation Loss: 2.323, Accuracy: 0.489
Epoch: 7
Train Loss: 0.308, Accuracy: 0.865
Validation Loss: 2.354, Accuracy: 0.506
Epoch: 8
Train Loss: 0.286, Accuracy: 0.873
Validation Loss: 2.506, Accuracy: 0.511
Epoch: 9
Train Loss: 0.268, Accuracy: 0.88
Validation Loss: 2.579, Accuracy: 0.518
Epoch: 10
Train Loss: 0.252, Accuracy: 0.886
Validation Loss: 2.697, Accuracy: 0.503
28min 16s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
    EXP_model = EXP_fusion().to(device)
    EXP_model.load_state_dict(EXP_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(EXP_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.269, accuracy: 0.518


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 1.372, Accuracy: 0.495
Validation Loss: 1.248, Accuracy: 0.425
Epoch: 2
Train Loss: 1.168, Accuracy: 0.56
Validation Loss: 1.331, Accuracy: 0.452
Epoch: 3
Train Loss: 1.052, Accuracy: 0.6
Validation Loss: 1.379, Accuracy: 0.472
Epoch: 4
Train Loss: 0.973, Accuracy: 0.628
Validation Loss: 1.454, Accuracy: 0.467
Epoch: 5
Train Loss: 0.915, Accuracy: 0.649
Validation Loss: 1.492, Accuracy: 0.465
Epoch: 6
Train Loss: 0.87, Accuracy: 0.665
Validation Loss: 1.535, Accuracy: 0.467
Epoch: 7
Train Loss: 0.833, Accuracy: 0.678
Validation Loss: 1.575, Accuracy: 0.464
Epoch: 8
Train Loss: 0.803, Accuracy: 0.69
Validation Loss: 1.607, Accuracy: 0.463
Epoch: 9
Train Loss: 0.776, Accuracy: 0.699
Validation Loss: 1.635, Accuracy: 0.467
Epoch: 10
Train Loss: 0.754, Accuracy: 0.707
Validation Loss: 1.656, Accuracy: 0.472
7min 25s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.308, accuracy: 0.472


##### Effnet + Vggish

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(EXP_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.779, Accuracy: 0.695
Validation Loss: 1.356, Accuracy: 0.511
Epoch: 2
Train Loss: 0.586, Accuracy: 0.759
Validation Loss: 1.504, Accuracy: 0.512
Epoch: 3
Train Loss: 0.484, Accuracy: 0.794
Validation Loss: 1.683, Accuracy: 0.509
Epoch: 4
Train Loss: 0.416, Accuracy: 0.818
Validation Loss: 1.844, Accuracy: 0.507
Epoch: 5
Train Loss: 0.366, Accuracy: 0.836
Validation Loss: 2.024, Accuracy: 0.501
Epoch: 6
Train Loss: 0.329, Accuracy: 0.85
Validation Loss: 2.264, Accuracy: 0.495
Epoch: 7
Train Loss: 0.298, Accuracy: 0.862
Validation Loss: 2.425, Accuracy: 0.493
Epoch: 8
Train Loss: 0.273, Accuracy: 0.872
Validation Loss: 2.425, Accuracy: 0.5
Epoch: 9
Train Loss: 0.252, Accuracy: 0.881
Validation Loss: 2.808, Accuracy: 0.493
Epoch: 10
Train Loss: 0.235, Accuracy: 0.889
Validation Loss: 2.79, Accuracy: 0.497
28min 26s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
    EXP_model = EXP_fusion().to(device)
    EXP_model.load_state_dict(EXP_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(EXP_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.295, accuracy: 0.511


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(mlp_model,model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 1.533, Accuracy: 0.458
Validation Loss: 1.177, Accuracy: 0.416
Epoch: 2
Train Loss: 1.36, Accuracy: 0.506
Validation Loss: 1.188, Accuracy: 0.46
Epoch: 3
Train Loss: 1.251, Accuracy: 0.541
Validation Loss: 1.207, Accuracy: 0.484
Epoch: 4
Train Loss: 1.172, Accuracy: 0.568
Validation Loss: 1.227, Accuracy: 0.494
Epoch: 5
Train Loss: 1.112, Accuracy: 0.588
Validation Loss: 1.247, Accuracy: 0.499
Epoch: 6
Train Loss: 1.064, Accuracy: 0.605
Validation Loss: 1.265, Accuracy: 0.502
Epoch: 7
Train Loss: 1.025, Accuracy: 0.618
Validation Loss: 1.282, Accuracy: 0.504
Epoch: 8
Train Loss: 0.992, Accuracy: 0.63
Validation Loss: 1.298, Accuracy: 0.505
Epoch: 9
Train Loss: 0.964, Accuracy: 0.639
Validation Loss: 1.312, Accuracy: 0.506
Epoch: 10
Train Loss: 0.939, Accuracy: 0.648
Validation Loss: 1.326, Accuracy: 0.507
7min 37s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.308, accuracy: 0.494


##### Effnet

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(EXP_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.925, Accuracy: 0.644
Validation Loss: 1.389, Accuracy: 0.506
Epoch: 2
Train Loss: 0.745, Accuracy: 0.704
Validation Loss: 1.538, Accuracy: 0.492
Epoch: 3
Train Loss: 0.65, Accuracy: 0.735
Validation Loss: 1.683, Accuracy: 0.485
Epoch: 4
Train Loss: 0.586, Accuracy: 0.757
Validation Loss: 1.792, Accuracy: 0.488
Epoch: 5
Train Loss: 0.539, Accuracy: 0.772
Validation Loss: 1.888, Accuracy: 0.486
Epoch: 6
Train Loss: 0.501, Accuracy: 0.785
Validation Loss: 1.986, Accuracy: 0.487
Epoch: 7
Train Loss: 0.47, Accuracy: 0.796
Validation Loss: 2.051, Accuracy: 0.487
Epoch: 8
Train Loss: 0.443, Accuracy: 0.805
Validation Loss: 2.161, Accuracy: 0.487
Epoch: 9
Train Loss: 0.42, Accuracy: 0.813
Validation Loss: 2.263, Accuracy: 0.477
Epoch: 10
Train Loss: 0.4, Accuracy: 0.82
Validation Loss: 2.335, Accuracy: 0.495
27min 18s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
    EXP_model = EXP_fusion().to(device)
    EXP_model.load_state_dict(EXP_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(EXP_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.281, accuracy: 0.506


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 1.596, Accuracy: 0.408
Validation Loss: 1.162, Accuracy: 0.406
Epoch: 2
Train Loss: 1.466, Accuracy: 0.443
Validation Loss: 1.163, Accuracy: 0.441
Epoch: 3
Train Loss: 1.383, Accuracy: 0.468
Validation Loss: 1.18, Accuracy: 0.461
Epoch: 4
Train Loss: 1.322, Accuracy: 0.489
Validation Loss: 1.205, Accuracy: 0.473
Epoch: 5
Train Loss: 1.272, Accuracy: 0.508
Validation Loss: 1.232, Accuracy: 0.478
Epoch: 6
Train Loss: 1.231, Accuracy: 0.523
Validation Loss: 1.257, Accuracy: 0.479
Epoch: 7
Train Loss: 1.196, Accuracy: 0.536
Validation Loss: 1.28, Accuracy: 0.48
Epoch: 8
Train Loss: 1.166, Accuracy: 0.548
Validation Loss: 1.299, Accuracy: 0.48
Epoch: 9
Train Loss: 1.139, Accuracy: 0.558
Validation Loss: 1.316, Accuracy: 0.48
Epoch: 10
Train Loss: 1.116, Accuracy: 0.567
Validation Loss: 1.331, Accuracy: 0.479
6min 53s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: f1_score {f1best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model, strict=False)
    val_loss, f1s, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

best metric: f1_score 0.298, accuracy: 0.461


### Testing

#### Cropped_aligned images

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
print('EXP_model')
print(visual_feat + ' & ' + auft)
EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
EXP_model = EXP_fusion().to(device)
EXP_model.load_state_dict(EXP_best_model)
test_loss, f1s, acc = evaluate_model(EXP_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

EXP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score 0.33, accuracy: 0.457
34.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model, strict=False)
val_loss, f1s, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score 0.394, accuracy: 0.488
12 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
print('EXP_model')
print(visual_feat + ' & ' + auft)
EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
EXP_model = EXP_fusion().to(device)
EXP_model.load_state_dict(EXP_best_model)
test_loss, f1s, acc = evaluate_model(EXP_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

EXP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score 0.316, accuracy: 0.475
33 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model, strict=False)
val_loss, f1s, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score 0.379, accuracy: 0.498
11.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet

In [None]:
%%timeit -n 1 -r 1
print('EXP_model')
print(visual_feat + ' & ' + auft)
EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
EXP_model = EXP_fusion().to(device)
EXP_model.load_state_dict(EXP_best_model)
test_loss, f1s, acc = evaluate_model(EXP_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

EXP_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score 0.31, accuracy: 0.447
29.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model, strict=False)
val_loss, f1s, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

MLP_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score 0.327, accuracy: 0.431
8.93 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


#### Cropped images

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
print('EXP_model')
print(visual_feat + ' & ' + auft)
EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
EXP_model = EXP_fusion().to(device)
EXP_model.load_state_dict(EXP_best_model)
test_loss, f1s, acc = evaluate_model(EXP_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

EXP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score 0.334, accuracy: 0.472
32.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model, strict=False)
val_loss, f1s, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score 0.364, accuracy: 0.458
10.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
print('EXP_model')
print(visual_feat + ' & ' + auft)
EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
EXP_model = EXP_fusion().to(device)
EXP_model.load_state_dict(EXP_best_model)
test_loss, f1s, acc = evaluate_model(EXP_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

EXP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score 0.292, accuracy: 0.413
32.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}_loss.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model, strict=False)
val_loss, f1s, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score 0.295, accuracy: 0.374
11.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}_f1s.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model, strict=False)
val_loss, f1s, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score 0.342, accuracy: 0.445
11.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet

In [None]:
%%timeit -n 1 -r 1
print('EXP_model')
print(visual_feat + ' & ' + auft)
EXP_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_fusion_{viau}.pth'))
EXP_model = EXP_fusion().to(device)
EXP_model.load_state_dict(EXP_best_model)
test_loss, f1s, acc = evaluate_model(EXP_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

EXP_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score 0.28, accuracy: 0.401
29.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model, strict=False)
val_loss, f1s, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score {round(f1s,3)}, accuracy: {round(acc,3)}')

MLP_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score 0.291, accuracy: 0.374
9.01 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## AU Detection Challenge

### Loading data

In [None]:
# Cropped_aligned images
vis = vis_typ[0]
visft = visual_feat[0]

In [None]:
# Cropped images
vis = vis_typ[1]
visft = visual_feat[1]

#### Effnet + wav2vec2

In [None]:
auft = audio_feat[0]
viau = vis_aud[0]

#### Effnet + vggish

In [None]:
auft = audio_feat[1]
viau = vis_aud[1]

#### Effnet

In [None]:
auft = audio_feat[2]
viau = vis_aud[2]

#### Train

In [None]:
if viau == vis_aud[0]: #Visual+wav2vec2
    with open(os.path.join(root,f'models/ABAW6/{vis}/AU/{task[1]}_{typ[0]}_visual.pkl'), 'rb') as f:
        data1 = pickle.load(f)
    with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[0]}.txt'), 'r') as f:
        vidnames = f.read().splitlines()
    task1 = task[1]
    feature_a = 'audiofeat_wav2vec2'
    feat_root = os.path.join(root + '/models/ABAW6', feature_a)
    filenames = os.listdir(feat_root)[:]
    for vname in tqdm(vidnames):
            feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
            for imgname, val in feature.items():
                if imgname in data1[task1][vname]:
                    data1[task1][vname][imgname].update({f'{feature_a}': val})
            for img, value in list(data1[task1][vname].items()):
                if len(value) < 3:
                    data1[task1][vname].pop(img)
else:
    with open(os.path.join(root,f'models/ABAW6/{vis}/AU/{task[1]}_{typ[0]}_{viau}.pkl'), 'rb') as f:
        data1 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[1]
with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data1[task1][vname])
    for img in data1[task1][vname].keys():
        iname.append(img)

100%|██████████| 236/236 [00:00<00:00, 664.91it/s] 


In [None]:
dataset = ABAW_dataset1(data1, iname, dims, task1)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0, drop_last=True)

#### Val

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/AU/{task[1]}_{typ[1]}_{viau}.pkl'), 'rb') as f:
    data2 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[1]
with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[1]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data2[task1][vname])
    for img in data2[task1][vname].keys():
        iname.append(img)

100%|██████████| 59/59 [00:00<00:00, 1230.69it/s]


In [None]:
dataset = ABAW_dataset1(data2, iname, dims, task1)
val_loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=True)

#### Test

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/AU/{task[1]}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data3 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[1]
with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data3[task1][vname])
    for img in data3[task1][vname].keys():
        iname.append(img)

100%|██████████| 105/105 [00:00<00:00, 1298.66it/s]


In [None]:
dataset = ABAW_dataset1(data3, iname, dims, task1)
test_loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=True)

### Modeling

In [None]:
class AU_fusion(nn.Module):
    def __init__(self, batchsize = batch_size, audio_ft = auft, hidden_size = [512, 128, batch_size]):
        super(AU_fusion, self).__init__()
        self.batchsize = batchsize
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.hidden_size = hidden_size
        self.feat_fc = nn.Conv1d(self.concat_dim, hidden_size[0], 1, padding=0)
        self.activ = nn.LeakyReLU(0.1)
        self.dropout = nn.Dropout(p=0.3)
        self.conv1 = nn.Conv1d(hidden_size[0], hidden_size[1], 1, padding=0)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size[1], nhead=4, dim_feedforward=hidden_size[1], dropout=0.3)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
        self.head = nn.Sequential(
                nn.Linear(hidden_size[1], hidden_size[2]),
                nn.BatchNorm1d(hidden_size[2]),
                nn.Linear(hidden_size[2], 12))

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs,dim=1)
        feat = torch.transpose(feat,0,1)
        feat = self.feat_fc(feat)
        feat = self.activ(feat)
        out = self.conv1(feat)
        out = torch.transpose(out,0,1)
        out = self.transformer_encoder(out)
        out = self.head(out)

        return out, torch.sigmoid(out)

In [None]:
AU_model = AU_fusion().to(device)
AU_model

AU_fusion(
  (feat_fc): Conv1d(1536, 512, kernel_size=(1,), stride=(1,))
  (activ): LeakyReLU(negative_slope=0.1)
  (dropout): Dropout(p=0.3, inplace=False)
  (conv1): Conv1d(512, 128, kernel_size=(1,), stride=(1,))
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (head): Sequential(
    (0): Linear(in_features=128, out_features

In [None]:
class MLPModel(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=12):
        super(MLPModel, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.activ = nn.ReLU()
        self.fc1 = nn.Linear(self.concat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs, dim=1)
        feat = self.fc1(feat)
        feat = self.activ(feat)
        out = self.fc2(feat)
        return out, torch.sigmoid(out)

In [None]:
mlp_model = MLPModel().to(device)
mlp_model

MLPModel(
  (activ): ReLU()
  (fc1): Linear(in_features=1536, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=12, bias=True)
)

#### Utils

In [None]:
weights = torch.tensor([0.54733899, 0.44180561, 0.56990565, 0.61997328, 0.73956417,0.74692377, 0.72684634, 0.33222808, 0.17383676, 0.20608964, 0.83688068, 0.33890931]).to(device)

In [None]:
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, AU_model.parameters()), lr=0.00001, betas=(0.9, 0.999), weight_decay=0.00005)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-5)

In [None]:
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, mlp_model.parameters()), lr=0.00001, betas=(0.9, 0.999), weight_decay=0.00005)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-5)

### Training

In [None]:
def train(model, mod_type, train_loader, val_loader, epoch, batch_size, optim, au_feat, weight, vi_au):
    model.train(True)
    model.eval()
    best_loss = float('inf')
    f1s_best, accbest = 0, 0
    loss_value = []
    loss_train = []
    loss_val = []
    all_preds = []
    all_targets = []

    for e in range(epoch):
        print(f'Epoch: {e+1}')
        torch.manual_seed(2809)
        iterator = iter(train_loader)
        for i in range(len(train_loader)//32):
            try:
                AU = next(iterator)
                if au_feat == 'nope':
                    vis_feat, y = AU[visual_feat], AU['label']
                    vis_feat, y = vis_feat.to(device), y.to(device)
                    aud_feat = None
                else:
                    vis_feat, aud_feat, y = AU[visual_feat], AU[au_feat], AU['label']
                    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
                model.zero_grad()
                pred, au_pred = model(vis_feat, aud_feat)
                loss = compute_AU_loss(pred, y, weight)
                loss.backward()
                optim.step()
                loss_value.append(loss.item())
                all_preds.extend(au_pred.cpu().tolist())
                all_targets.extend(y.cpu().tolist())
            except:
                break
        avg_loss = round(np.mean(loss_value),3)
        loss_train.append(avg_loss)
        f1_scores, f1_thresh, accuracy, threshold = compute_AU_F1(all_preds, all_targets)
        print(f'Train Loss: {avg_loss}, Accuracy of 12 AU classes: {accuracy}')

        val_loss, f1s, f1t, acc, f1_threshold = evaluate_model(model, val_loader, au_feat, weight)
        loss_val.append(val_loss)

        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), os.path.join(root,f'models/ABAW6/{vis}/best_AU_{mod_type}_{vi_au}_loss.pth'))
            # f1best = f1s
            # accbest = acc

        if f1s > f1s_best:
            #best_loss = val_loss
            torch.save(model.state_dict(), os.path.join(root,f'models/ABAW6/{vis}/best_AU_{mod_type}_{vi_au}_f1s.pth'))
            f1s_best = f1s
            f1t_best = f1t
            #accbest = acc

        if np.mean(acc) > accbest:
            #best_loss = val_loss
            torch.save(model.state_dict(), os.path.join(root,f'models/ABAW6/{vis}/best_AU_{mod_type}_{vi_au}_acc.pth'))
            #f1best = f1s
            accbest = np.mean(acc)

        print(f'Validation Loss: {val_loss}, Accuracy of 12 AU classes: {acc}')
        scheduler.step(val_loss)
    return loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest

In [None]:
def evaluate_model(model, data_loader, au_feat, weight):
    model.eval()
    total_loss = []
    all_preds = []
    all_targets = []
    with torch.no_grad():
        iterator = iter(data_loader)
        for i in range(len(data_loader)//32):
          try:
            AU = next(iterator)
            if au_feat == 'nope':
                vis_feat, y = AU[visual_feat], AU['label']
                vis_feat, y = vis_feat.to(device), y.to(device)
                aud_feat = None
            else:
                vis_feat, aud_feat, y = AU[visual_feat], AU[au_feat], AU['label']
                vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
            pred, au_pred = model(vis_feat, aud_feat)
            loss = compute_AU_loss(pred, y, weight)
            total_loss.append(loss.item())
            all_preds.extend(au_pred.cpu().tolist())
            all_targets.extend(y.cpu().tolist())
          except:
            break

    f1_scores, f1_thresh, acc, threshold = compute_AU_F1(all_preds, all_targets)
    return round(np.mean(total_loss),3), round(f1_scores,3), round(f1_thresh,3), acc, threshold

#### Cropped_aligned images

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(AU_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.23, Accuracy of 12 AU classes: [0.851, 0.365, 0.73, 0.687, 0.597, 0.597, 0.78, 0.609, 0.034, 0.04, 0.734, 0.088]
Validation Loss: 0.193, Accuracy of 12 AU classes: [0.909, 0.734, 0.798, 0.882, 0.791, 0.866, 0.89, 0.925, 0.179, 0.082, 0.781, 0.742]
Epoch: 2
Train Loss: 0.208, Accuracy of 12 AU classes: [0.866, 0.904, 0.807, 0.749, 0.741, 0.77, 0.819, 0.78, 0.236, 0.253, 0.761, 0.742]
Validation Loss: 0.171, Accuracy of 12 AU classes: [0.916, 0.924, 0.87, 0.891, 0.809, 0.878, 0.895, 0.984, 0.651, 0.547, 0.762, 0.878]
Epoch: 3
Train Loss: 0.195, Accuracy of 12 AU classes: [0.874, 0.916, 0.837, 0.823, 0.752, 0.782, 0.835, 0.473, 0.386, 0.421, 0.775, 0.797]
Validation Loss: 0.162, Accuracy of 12 AU classes: [0.918, 0.932, 0.872, 0.894, 0.845, 0.882, 0.896, 0.852, 0.677, 0.745, 0.754, 0.9]
Epoch: 4
Train Loss: 0.185, Accuracy of 12 AU classes: [0.879, 0.922, 0.854, 0.833, 0.761, 0.791, 0.868, 0.575, 0.479, 0.522, 0.785, 0.824]
Validation Loss: 0.159, Accuracy of 12 AU 

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
    AU_model = AU_fusion().to(device)
    AU_model.load_state_dict(AU_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(AU_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.538, f1_threshold: 0.258, accuracy: [0.92, 0.934, 0.869, 0.897, 0.844, 0.883, 0.897, 0.884, 0.706, 0.817, 0.768, 0.909]


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 50, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.28, Accuracy of 12 AU classes: [0.128, 0.085, 0.403, 0.516, 0.541, 0.589, 0.612, 0.03, 0.161, 0.313, 0.629, 0.078]
Validation Loss: 0.234, Accuracy of 12 AU classes: [0.168, 0.396, 0.15, 0.864, 0.707, 0.858, 0.872, 0.025, 0.174, 0.373, 0.786, 0.088]
Epoch: 2
Train Loss: 0.25, Accuracy of 12 AU classes: [0.199, 0.447, 0.63, 0.652, 0.635, 0.525, 0.734, 0.37, 0.284, 0.412, 0.658, 0.208]
Validation Loss: 0.205, Accuracy of 12 AU classes: [0.897, 0.902, 0.417, 0.866, 0.784, 0.868, 0.867, 0.988, 0.825, 0.937, 0.727, 0.766]
Epoch: 3
Train Loss: 0.233, Accuracy of 12 AU classes: [0.711, 0.608, 0.701, 0.699, 0.67, 0.59, 0.774, 0.57, 0.511, 0.598, 0.689, 0.392]
Validation Loss: 0.189, Accuracy of 12 AU classes: [0.91, 0.912, 0.648, 0.87, 0.809, 0.87, 0.872, 0.989, 0.826, 0.937, 0.759, 0.849]
Epoch: 4
Train Loss: 0.222, Accuracy of 12 AU classes: [0.749, 0.69, 0.735, 0.723, 0.69, 0.625, 0.793, 0.67, 0.626, 0.692, 0.707, 0.494]
Validation Loss: 0.18, Accuracy of 12 AU classe

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.289, Accuracy of 12 AU classes: [0.209, 0.052, 0.35, 0.386, 0.6, 0.41, 0.54, 0.175, 0.136, 0.082, 0.663, 0.078]
Validation Loss: 0.234, Accuracy of 12 AU classes: [0.773, 0.091, 0.138, 0.85, 0.742, 0.852, 0.874, 0.909, 0.174, 0.063, 0.74, 0.094]
Epoch: 2
Train Loss: 0.254, Accuracy of 12 AU classes: [0.148, 0.312, 0.597, 0.583, 0.662, 0.54, 0.695, 0.307, 0.302, 0.233, 0.695, 0.311]
Validation Loss: 0.203, Accuracy of 12 AU classes: [0.897, 0.9, 0.401, 0.869, 0.77, 0.867, 0.889, 0.959, 0.818, 0.927, 0.734, 0.863]
Epoch: 3
Train Loss: 0.235, Accuracy of 12 AU classes: [0.64, 0.514, 0.677, 0.652, 0.687, 0.599, 0.747, 0.521, 0.521, 0.474, 0.715, 0.484]
Validation Loss: 0.188, Accuracy of 12 AU classes: [0.915, 0.912, 0.681, 0.871, 0.803, 0.869, 0.889, 0.981, 0.826, 0.937, 0.758, 0.891]
Epoch: 4
Train Loss: 0.224, Accuracy of 12 AU classes: [0.699, 0.619, 0.717, 0.688, 0.701, 0.631, 0.824, 0.631, 0.633, 0.599, 0.727, 0.572]
Validation Loss: 0.179, Accuracy of 12 AU cl

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
    mlp_model = MLPModel(num_classes = 12).to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.518, f1_threshold: 0.258, accuracy: [0.917, 0.925, 0.881, 0.885, 0.834, 0.871, 0.895, 0.939, 0.826, 0.937, 0.793, 0.908]


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(AU_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.227, Accuracy of 12 AU classes: [0.801, 0.603, 0.795, 0.737, 0.702, 0.707, 0.846, 0.077, 0.328, 0.031, 0.725, 0.306]
Validation Loss: 0.188, Accuracy of 12 AU classes: [0.911, 0.806, 0.82, 0.899, 0.838, 0.889, 0.899, 0.119, 0.175, 0.063, 0.797, 0.752]
Epoch: 2
Train Loss: 0.206, Accuracy of 12 AU classes: [0.843, 0.742, 0.838, 0.777, 0.737, 0.788, 0.858, 0.349, 0.188, 0.175, 0.753, 0.557]
Validation Loss: 0.168, Accuracy of 12 AU classes: [0.901, 0.878, 0.873, 0.89, 0.849, 0.889, 0.895, 0.785, 0.357, 0.305, 0.795, 0.829]
Epoch: 3
Train Loss: 0.193, Accuracy of 12 AU classes: [0.86, 0.804, 0.855, 0.797, 0.753, 0.796, 0.864, 0.52, 0.359, 0.361, 0.768, 0.659]
Validation Loss: 0.158, Accuracy of 12 AU classes: [0.915, 0.93, 0.873, 0.895, 0.865, 0.885, 0.905, 0.871, 0.642, 0.696, 0.818, 0.866]
Epoch: 4
Train Loss: 0.185, Accuracy of 12 AU classes: [0.869, 0.838, 0.865, 0.81, 0.762, 0.802, 0.869, 0.614, 0.478, 0.487, 0.779, 0.716]
Validation Loss: 0.154, Accuracy of 12

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
    AU_model = AU_fusion().to(device)
    AU_model.load_state_dict(AU_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(AU_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.572, f1_threshold: 0.25, accuracy: [0.92, 0.915, 0.878, 0.894, 0.87, 0.879, 0.908, 0.896, 0.787, 0.858, 0.814, 0.897]


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.289, Accuracy of 12 AU classes: [0.159, 0.052, 0.191, 0.467, 0.57, 0.405, 0.561, 0.194, 0.152, 0.041, 0.629, 0.079]
Validation Loss: 0.236, Accuracy of 12 AU classes: [0.508, 0.09, 0.673, 0.89, 0.814, 0.855, 0.89, 0.979, 0.174, 0.237, 0.746, 0.089]
Epoch: 2
Train Loss: 0.255, Accuracy of 12 AU classes: [0.128, 0.345, 0.496, 0.63, 0.651, 0.537, 0.713, 0.346, 0.285, 0.193, 0.667, 0.413]
Validation Loss: 0.201, Accuracy of 12 AU classes: [0.898, 0.911, 0.883, 0.883, 0.819, 0.854, 0.892, 0.989, 0.826, 0.937, 0.724, 0.91]
Epoch: 3
Train Loss: 0.237, Accuracy of 12 AU classes: [0.628, 0.546, 0.616, 0.683, 0.678, 0.596, 0.76, 0.555, 0.513, 0.452, 0.694, 0.582]
Validation Loss: 0.185, Accuracy of 12 AU classes: [0.919, 0.918, 0.879, 0.88, 0.823, 0.847, 0.89, 0.989, 0.826, 0.937, 0.76, 0.892]
Epoch: 4
Train Loss: 0.226, Accuracy of 12 AU classes: [0.691, 0.646, 0.673, 0.709, 0.694, 0.628, 0.783, 0.659, 0.627, 0.582, 0.709, 0.66]
Validation Loss: 0.176, Accuracy of 12 AU c

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
    mlp_model = MLPModel(num_classes = 12).to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.532, f1_threshold: 0.217, accuracy: [0.916, 0.9, 0.854, 0.892, 0.85, 0.883, 0.892, 0.961, 0.826, 0.937, 0.767, 0.909]


##### EffNet

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(AU_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.229, Accuracy of 12 AU classes: [0.848, 0.751, 0.81, 0.809, 0.694, 0.692, 0.818, 0.882, 0.479, 0.353, 0.749, 0.735]
Validation Loss: 0.192, Accuracy of 12 AU classes: [0.9, 0.735, 0.88, 0.882, 0.852, 0.877, 0.9, 0.92, 0.173, 0.741, 0.766, 0.915]
Epoch: 2
Train Loss: 0.208, Accuracy of 12 AU classes: [0.87, 0.813, 0.844, 0.822, 0.732, 0.737, 0.845, 0.913, 0.716, 0.145, 0.771, 0.561]
Validation Loss: 0.17, Accuracy of 12 AU classes: [0.906, 0.644, 0.859, 0.893, 0.858, 0.879, 0.904, 0.976, 0.188, 0.826, 0.796, 0.904]
Epoch: 3
Train Loss: 0.195, Accuracy of 12 AU classes: [0.879, 0.856, 0.861, 0.831, 0.749, 0.755, 0.855, 0.85, 0.192, 0.301, 0.783, 0.669]
Validation Loss: 0.159, Accuracy of 12 AU classes: [0.906, 0.93, 0.904, 0.895, 0.864, 0.881, 0.896, 0.923, 0.433, 0.863, 0.789, 0.92]
Epoch: 4
Train Loss: 0.185, Accuracy of 12 AU classes: [0.885, 0.879, 0.872, 0.838, 0.761, 0.767, 0.879, 0.872, 0.331, 0.408, 0.791, 0.726]
Validation Loss: 0.155, Accuracy of 12 AU cl

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
    AU_model = AU_fusion().to(device)
    AU_model.load_state_dict(AU_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(AU_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.578, f1_threshold: 0.25, accuracy: [0.913, 0.939, 0.89, 0.88, 0.867, 0.88, 0.903, 0.948, 0.713, 0.804, 0.765, 0.92]


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.306, Accuracy of 12 AU classes: [0.241, 0.074, 0.173, 0.487, 0.553, 0.549, 0.668, 0.448, 0.041, 0.028, 0.689, 0.076]
Validation Loss: 0.248, Accuracy of 12 AU classes: [0.119, 0.178, 0.366, 0.891, 0.815, 0.865, 0.892, 0.768, 0.234, 0.091, 0.607, 0.088]
Epoch: 2
Train Loss: 0.268, Accuracy of 12 AU classes: [0.376, 0.155, 0.462, 0.647, 0.643, 0.66, 0.767, 0.343, 0.232, 0.122, 0.702, 0.082]
Validation Loss: 0.201, Accuracy of 12 AU classes: [0.905, 0.602, 0.888, 0.886, 0.818, 0.865, 0.898, 0.22, 0.177, 0.661, 0.742, 0.103]
Epoch: 3
Train Loss: 0.246, Accuracy of 12 AU classes: [0.547, 0.383, 0.598, 0.604, 0.673, 0.697, 0.803, 0.308, 0.213, 0.359, 0.717, 0.215]
Validation Loss: 0.184, Accuracy of 12 AU classes: [0.922, 0.89, 0.883, 0.889, 0.818, 0.87, 0.894, 0.976, 0.763, 0.884, 0.758, 0.693]
Epoch: 4
Train Loss: 0.233, Accuracy of 12 AU classes: [0.629, 0.521, 0.661, 0.649, 0.689, 0.716, 0.82, 0.473, 0.398, 0.509, 0.726, 0.36]
Validation Loss: 0.175, Accuracy of 12

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 20, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.187, Accuracy of 12 AU classes: [0.864, 0.903, 0.835, 0.79, 0.742, 0.739, 0.874, 0.97, 0.97, 0.971, 0.745, 0.838]
Validation Loss: 0.17, Accuracy of 12 AU classes: [0.913, 0.917, 0.747, 0.889, 0.816, 0.876, 0.893, 0.989, 0.827, 0.937, 0.755, 0.89]
Epoch: 2
Train Loss: 0.185, Accuracy of 12 AU classes: [0.859, 0.911, 0.835, 0.791, 0.744, 0.741, 0.874, 0.969, 0.97, 0.972, 0.749, 0.832]
Validation Loss: 0.166, Accuracy of 12 AU classes: [0.907, 0.918, 0.822, 0.891, 0.825, 0.881, 0.893, 0.988, 0.827, 0.937, 0.765, 0.876]
Epoch: 3
Train Loss: 0.183, Accuracy of 12 AU classes: [0.856, 0.912, 0.836, 0.791, 0.745, 0.743, 0.874, 0.968, 0.97, 0.972, 0.751, 0.826]
Validation Loss: 0.162, Accuracy of 12 AU classes: [0.907, 0.912, 0.84, 0.891, 0.83, 0.884, 0.903, 0.986, 0.827, 0.937, 0.77, 0.868]
Epoch: 4
Train Loss: 0.182, Accuracy of 12 AU classes: [0.854, 0.911, 0.836, 0.791, 0.747, 0.745, 0.874, 0.966, 0.97, 0.972, 0.753, 0.822]
Validation Loss: 0.16, Accuracy of 12 AU cl

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_model/best_AU_mlp_{viau}.pth'))
    mlp_model = MLPModel(num_classes = 12).to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, f1s, f1t, acc, thresh = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.527, f1_threshold: 0.225, accuracy: [0.913, 0.934, 0.862, 0.895, 0.829, 0.888, 0.903, 0.966, 0.827, 0.937, 0.766, 0.914]


#### Cropped images

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(AU_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.248, Accuracy of 12 AU classes: [0.464, 0.616, 0.696, 0.494, 0.59, 0.726, 0.822, 0.821, 0.398, 0.027, 0.698, 0.474]
Validation Loss: 0.201, Accuracy of 12 AU classes: [0.84, 0.875, 0.523, 0.893, 0.783, 0.88, 0.895, 0.906, 0.176, 0.55, 0.755, 0.465]
Epoch: 2
Train Loss: 0.224, Accuracy of 12 AU classes: [0.863, 0.773, 0.788, 0.641, 0.668, 0.754, 0.838, 0.881, 0.228, 0.089, 0.721, 0.633]
Validation Loss: 0.177, Accuracy of 12 AU classes: [0.913, 0.916, 0.754, 0.884, 0.826, 0.877, 0.898, 0.955, 0.176, 0.333, 0.743, 0.734]
Epoch: 3
Train Loss: 0.21, Accuracy of 12 AU classes: [0.872, 0.828, 0.82, 0.698, 0.701, 0.767, 0.847, 0.769, 0.135, 0.3, 0.735, 0.705]
Validation Loss: 0.168, Accuracy of 12 AU classes: [0.91, 0.919, 0.768, 0.888, 0.833, 0.881, 0.892, 0.978, 0.219, 0.754, 0.755, 0.867]
Epoch: 4
Train Loss: 0.2, Accuracy of 12 AU classes: [0.877, 0.944, 0.838, 0.808, 0.72, 0.776, 0.853, 0.812, 0.274, 0.437, 0.746, 0.889]
Validation Loss: 0.164, Accuracy of 12 AU cl

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
    AU_model = AU_fusion().to(device)
    AU_model.load_state_dict(AU_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(AU_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.552, f1_threshold: 0.258, accuracy: [0.92, 0.923, 0.814, 0.894, 0.846, 0.884, 0.88, 0.946, 0.505, 0.846, 0.759, 0.896]


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.287, Accuracy of 12 AU classes: [0.126, 0.109, 0.306, 0.452, 0.535, 0.368, 0.505, 0.146, 0.073, 0.277, 0.63, 0.078]
Validation Loss: 0.24, Accuracy of 12 AU classes: [0.12, 0.577, 0.138, 0.878, 0.683, 0.843, 0.855, 0.933, 0.513, 0.089, 0.645, 0.089]
Epoch: 2
Train Loss: 0.256, Accuracy of 12 AU classes: [0.182, 0.47, 0.557, 0.607, 0.62, 0.491, 0.664, 0.294, 0.466, 0.362, 0.637, 0.24]
Validation Loss: 0.215, Accuracy of 12 AU classes: [0.51, 0.867, 0.409, 0.865, 0.811, 0.851, 0.882, 0.972, 0.821, 0.933, 0.723, 0.837]
Epoch: 3
Train Loss: 0.241, Accuracy of 12 AU classes: [0.244, 0.622, 0.644, 0.662, 0.652, 0.557, 0.717, 0.516, 0.633, 0.564, 0.655, 0.431]
Validation Loss: 0.201, Accuracy of 12 AU classes: [0.651, 0.881, 0.655, 0.891, 0.814, 0.853, 0.879, 0.986, 0.824, 0.936, 0.736, 0.89]
Epoch: 4
Train Loss: 0.23, Accuracy of 12 AU classes: [0.734, 0.699, 0.688, 0.69, 0.67, 0.594, 0.744, 0.629, 0.717, 0.666, 0.67, 0.536]
Validation Loss: 0.192, Accuracy of 12 AU cl

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
    mlp_model = MLPModel(num_classes = 12).to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.502, f1_threshold: 0.258, accuracy: [0.914, 0.908, 0.856, 0.878, 0.837, 0.888, 0.888, 0.933, 0.824, 0.937, 0.764, 0.852]


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(AU_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.237, Accuracy of 12 AU classes: [0.831, 0.388, 0.805, 0.657, 0.687, 0.737, 0.774, 0.873, 0.539, 0.495, 0.702, 0.372]
Validation Loss: 0.19, Accuracy of 12 AU classes: [0.897, 0.593, 0.896, 0.875, 0.819, 0.889, 0.884, 0.989, 0.176, 0.342, 0.781, 0.812]
Epoch: 2
Train Loss: 0.216, Accuracy of 12 AU classes: [0.861, 0.625, 0.844, 0.726, 0.708, 0.757, 0.84, 0.904, 0.324, 0.277, 0.726, 0.63]
Validation Loss: 0.171, Accuracy of 12 AU classes: [0.915, 0.884, 0.819, 0.89, 0.845, 0.883, 0.895, 0.971, 0.446, 0.642, 0.795, 0.909]
Epoch: 3
Train Loss: 0.205, Accuracy of 12 AU classes: [0.873, 0.891, 0.858, 0.754, 0.718, 0.768, 0.846, 0.805, 0.51, 0.491, 0.739, 0.72]
Validation Loss: 0.164, Accuracy of 12 AU classes: [0.912, 0.915, 0.889, 0.877, 0.83, 0.877, 0.897, 0.987, 0.673, 0.941, 0.78, 0.911]
Epoch: 4
Train Loss: 0.197, Accuracy of 12 AU classes: [0.88, 0.906, 0.866, 0.771, 0.725, 0.775, 0.851, 0.838, 0.611, 0.323, 0.727, 0.767]
Validation Loss: 0.161, Accuracy of 12 AU

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
    AU_model = AU_fusion().to(device)
    AU_model.load_state_dict(AU_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(AU_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.57, f1_threshold: 0.258, accuracy: [0.931, 0.919, 0.862, 0.886, 0.833, 0.881, 0.896, 0.961, 0.74, 0.799, 0.782, 0.811]


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.299, Accuracy of 12 AU classes: [0.14, 0.205, 0.18, 0.493, 0.472, 0.39, 0.539, 0.043, 0.031, 0.045, 0.63, 0.12]
Validation Loss: 0.245, Accuracy of 12 AU classes: [0.284, 0.091, 0.51, 0.888, 0.7, 0.848, 0.886, 0.151, 0.176, 0.21, 0.736, 0.721]
Epoch: 2
Train Loss: 0.266, Accuracy of 12 AU classes: [0.126, 0.336, 0.5, 0.633, 0.59, 0.511, 0.686, 0.226, 0.099, 0.189, 0.63, 0.191]
Validation Loss: 0.212, Accuracy of 12 AU classes: [0.131, 0.908, 0.139, 0.879, 0.797, 0.841, 0.881, 0.988, 0.557, 0.829, 0.737, 0.788]
Epoch: 3
Train Loss: 0.248, Accuracy of 12 AU classes: [0.148, 0.541, 0.611, 0.679, 0.632, 0.569, 0.731, 0.474, 0.341, 0.444, 0.661, 0.419]
Validation Loss: 0.196, Accuracy of 12 AU classes: [0.472, 0.909, 0.397, 0.878, 0.818, 0.844, 0.878, 0.989, 0.721, 0.936, 0.746, 0.895]
Epoch: 4
Train Loss: 0.237, Accuracy of 12 AU classes: [0.684, 0.643, 0.664, 0.703, 0.656, 0.601, 0.754, 0.599, 0.488, 0.576, 0.676, 0.533]
Validation Loss: 0.187, Accuracy of 12 AU cla

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
    mlp_model = MLPModel(num_classes = 12).to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.524, f1_threshold: 0.233, accuracy: [0.921, 0.922, 0.856, 0.883, 0.831, 0.89, 0.886, 0.973, 0.824, 0.936, 0.783, 0.873]


##### EffNet

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(AU_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.237, Accuracy of 12 AU classes: [0.747, 0.821, 0.8, 0.698, 0.657, 0.705, 0.8, 0.751, 0.048, 0.053, 0.686, 0.834]
Validation Loss: 0.197, Accuracy of 12 AU classes: [0.903, 0.921, 0.852, 0.891, 0.852, 0.891, 0.888, 0.728, 0.187, 0.759, 0.791, 0.696]
Epoch: 2
Train Loss: 0.217, Accuracy of 12 AU classes: [0.797, 0.885, 0.842, 0.748, 0.703, 0.728, 0.826, 0.604, 0.251, 0.261, 0.717, 0.865]
Validation Loss: 0.177, Accuracy of 12 AU classes: [0.893, 0.903, 0.862, 0.88, 0.859, 0.893, 0.902, 0.956, 0.381, 0.728, 0.804, 0.794]
Epoch: 3
Train Loss: 0.205, Accuracy of 12 AU classes: [0.88, 0.906, 0.859, 0.769, 0.723, 0.74, 0.837, 0.363, 0.411, 0.402, 0.734, 0.876]
Validation Loss: 0.17, Accuracy of 12 AU classes: [0.899, 0.934, 0.89, 0.888, 0.859, 0.893, 0.887, 0.812, 0.567, 0.776, 0.789, 0.817]
Epoch: 4
Train Loss: 0.197, Accuracy of 12 AU classes: [0.883, 0.917, 0.868, 0.781, 0.736, 0.748, 0.844, 0.483, 0.512, 0.493, 0.745, 0.882]
Validation Loss: 0.167, Accuracy of 12 AU

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
    AU_model = AU_fusion().to(device)
    AU_model.load_state_dict(AU_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(AU_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.542, f1_threshold: 0.25, accuracy: [0.897, 0.937, 0.888, 0.881, 0.853, 0.892, 0.892, 0.87, 0.734, 0.845, 0.782, 0.9]


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, f1s_best, f1t_best, accbest = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, auft, weights, viau)

Epoch: 1
Train Loss: 0.313, Accuracy of 12 AU classes: [0.281, 0.339, 0.204, 0.492, 0.504, 0.518, 0.513, 0.568, 0.064, 0.039, 0.627, 0.076]
Validation Loss: 0.259, Accuracy of 12 AU classes: [0.119, 0.286, 0.662, 0.891, 0.806, 0.843, 0.889, 0.877, 0.282, 0.072, 0.711, 0.089]
Epoch: 2
Train Loss: 0.276, Accuracy of 12 AU classes: [0.395, 0.226, 0.296, 0.503, 0.6, 0.498, 0.673, 0.436, 0.05, 0.253, 0.627, 0.086]
Validation Loss: 0.212, Accuracy of 12 AU classes: [0.882, 0.749, 0.833, 0.867, 0.818, 0.847, 0.878, 0.731, 0.194, 0.068, 0.721, 0.094]
Epoch: 3
Train Loss: 0.256, Accuracy of 12 AU classes: [0.558, 0.451, 0.466, 0.587, 0.635, 0.561, 0.636, 0.411, 0.266, 0.235, 0.635, 0.229]
Validation Loss: 0.195, Accuracy of 12 AU classes: [0.474, 0.903, 0.193, 0.872, 0.825, 0.849, 0.88, 0.908, 0.769, 0.898, 0.668, 0.616]
Epoch: 4
Train Loss: 0.244, Accuracy of 12 AU classes: [0.637, 0.574, 0.554, 0.631, 0.532, 0.596, 0.68, 0.544, 0.44, 0.416, 0.648, 0.369]
Validation Loss: 0.185, Accuracy of 12

In [None]:
try:
    print(f'best metric: f1_score {f1s_best}, f1_threshold: {f1t_best}, accuracy: {accbest}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
    mlp_model = MLPModel(num_classes = 12).to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, f1s, f1t, acc = evaluate_model(mlp_model, val_loader, auft, weights)
    print(f'best metric: f1_score {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

best metric: f1_score 0.513, f1_threshold: 0.217, accuracy: [0.923, 0.93, 0.858, 0.863, 0.844, 0.885, 0.888, 0.934, 0.825, 0.937, 0.772, 0.861]


### Testing

#### Cropped_aligned images

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
print('AU_model')
print(visual_feat + ' & ' + auft)
AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_model/best_AU_fusion_{viau}.pth'))
AU_model = AU_fusion().to(device)
AU_model.load_state_dict(AU_best_model)
test_loss, f1s, f1t, acc, threshold = evaluate_model(AU_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {threshold}, accuracy: {acc}')

AU_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score: 0.449, f1_threshold: [0.30000000000000004, 0.30000000000000004, 0.2, 0.30000000000000004, 0.5, 0.30000000000000004, 0.6, 0.1, 0.1, 0.2, 0.6, 0.1], accuracy: [0.844, 0.978, 0.846, 0.875, 0.79, 0.814, 0.916, 0.859, 0.569, 0.961, 0.778, 0.819]
14.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
mlp_model = MLPModel(num_classes = 12).to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, f1s, f1t, acc, threshold = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {threshold}, accuracy: {acc}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score: 0.463, f1_threshold: [0.30000000000000004, 0.2, 0.2, 0.30000000000000004, 0.5, 0.30000000000000004, 0.6, 0.1, 0.1, 0.1, 0.6, 0.30000000000000004], accuracy: [0.89, 0.961, 0.862, 0.881, 0.779, 0.828, 0.922, 0.936, 0.98, 0.667, 0.784, 0.936]
1.58 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
print('AU_model')
print(visual_feat + ' & ' + auft)
AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
AU_model = AU_fusion().to(device)
AU_model.load_state_dict(AU_best_model)
test_loss, f1s, f1t, acc = evaluate_model(AU_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

AU_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score: 0.439, f1_threshold: 0.3, accuracy: [0.823, 0.965, 0.809, 0.816, 0.812, 0.814, 0.898, 0.901, 0.98, 0.783, 0.753, 0.944]
2.72 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
mlp_model = MLPModel(num_classes = 12).to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, f1s, f1t, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score: 0.423, f1_threshold: 0.267, accuracy: [0.866, 0.935, 0.801, 0.754, 0.782, 0.806, 0.923, 0.955, 0.98, 0.982, 0.78, 0.855]
1.53 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet

In [None]:
%%timeit -n 1 -r 1
print('AU_model')
print(visual_feat + ' & ' + auft)
AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_model/best_AU_fusion_{viau}.pth'))
AU_model = AU_fusion().to(device)
AU_model.load_state_dict(AU_best_model)
test_loss, f1s, f1t, acc, threshold = evaluate_model(AU_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {threshold}, accuracy: {acc}')

AU_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score: 0.455, f1_threshold: [0.30000000000000004, 0.30000000000000004, 0.2, 0.2, 0.6, 0.30000000000000004, 0.6, 0.1, 0.1, 0.2, 0.5, 0.2], accuracy: [0.882, 0.97, 0.833, 0.792, 0.834, 0.799, 0.916, 0.832, 0.404, 0.892, 0.75, 0.914]
2.71 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('AU_model')
print(visual_feat + ' & ' + auft)
AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
AU_model = AU_fusion().to(device)
AU_model.load_state_dict(AU_best_model)
test_loss, f1s, f1t, acc = evaluate_model(AU_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

AU_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score: 0.455, f1_threshold: 0.3, accuracy: [0.882, 0.97, 0.833, 0.792, 0.834, 0.799, 0.916, 0.832, 0.404, 0.892, 0.75, 0.914]
2.52 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
mlp_model = MLPModel(num_classes = 12).to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, f1s, f1t, acc, threshold = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {threshold}, accuracy: {acc}')

MLP_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score: 0.443, f1_threshold: [0.30000000000000004, 0.2, 0.2, 0.30000000000000004, 0.5, 0.30000000000000004, 0.6, 0.1, 0.1, 0.1, 0.5, 0.2], accuracy: [0.903, 0.97, 0.805, 0.815, 0.812, 0.819, 0.924, 0.94, 0.98, 0.768, 0.779, 0.93]
1.39 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


#### Cropped images

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
print('AU_model')
print(visual_feat + ' & ' + auft)
AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
AU_model = AU_fusion().to(device)
AU_model.load_state_dict(AU_best_model)
test_loss, f1s, f1t, acc = evaluate_model(AU_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

AU_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score: 0.443, f1_threshold: 0.3, accuracy: [0.83, 0.955, 0.813, 0.771, 0.806, 0.812, 0.928, 0.964, 0.952, 0.74, 0.763, 0.905]
2.94 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, f1s, f1t, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: f1_score: 0.435, f1_threshold: 0.308, accuracy: [0.884, 0.982, 0.828, 0.801, 0.799, 0.838, 0.927, 0.953, 0.979, 0.96, 0.765, 0.948]
1.82 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
print('AU_model')
print(visual_feat + ' & ' + auft)
AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
AU_model = AU_fusion().to(device)
AU_model.load_state_dict(AU_best_model)
test_loss, f1s, f1t, acc = evaluate_model(AU_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

AU_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score: 0.445, f1_threshold: 0.333, accuracy: [0.879, 0.976, 0.802, 0.808, 0.812, 0.825, 0.922, 0.959, 0.898, 0.863, 0.773, 0.793]
2.82 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
mlp_model = MLPModel(num_classes = 12).to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, f1s, f1t, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: f1_score: 0.43, f1_threshold: 0.275, accuracy: [0.903, 0.981, 0.751, 0.755, 0.816, 0.813, 0.925, 0.958, 0.98, 0.985, 0.767, 0.78]
1.61 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet

In [None]:
%%timeit -n 1 -r 1
print('AU_model')
print(visual_feat + ' & ' + auft)
AU_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_fusion_{viau}.pth'))
AU_model = AU_fusion().to(device)
AU_model.load_state_dict(AU_best_model)
test_loss, f1s, f1t, acc = evaluate_model(AU_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

AU_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score: 0.439, f1_threshold: 0.317, accuracy: [0.891, 0.987, 0.836, 0.781, 0.803, 0.827, 0.92, 0.764, 0.754, 0.715, 0.777, 0.851]
2.64 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_mlp_{viau}.pth'))
mlp_model = MLPModel(num_classes = 12).to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, f1s, f1t, acc = evaluate_model(mlp_model, test_loader, auft, weights)
print(f'Test set: f1_score: {f1s}, f1_threshold: {f1t}, accuracy: {acc}')

MLP_model
visualfeat_enet_b2_8_best & nope
Test set: f1_score: 0.427, f1_threshold: 0.275, accuracy: [0.905, 0.978, 0.781, 0.788, 0.815, 0.817, 0.914, 0.95, 0.98, 0.984, 0.754, 0.793]
1.39 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## VA Estimation Challenge

### Loading data

In [None]:
# Cropped_aligned images
vis = vis_typ[0]
visft = visual_feat[0]

In [None]:
# Cropped images
vis = vis_typ[1]
visft = visual_feat[1]

#### Effnet + wav2vec2

In [None]:
auft = audio_feat[0]
viau = vis_aud[0]

#### Effnet + vggish

In [None]:
auft = audio_feat[1]
viau = vis_aud[1]

#### Effnet

In [None]:
auft = audio_feat[2]
viau = vis_aud[2]

#### Train

In [None]:
if auft == audio_feat[0]:
    with open(os.path.join(root,f'models/ABAW6/{vis}/VA/{task[2]}_{typ[0]}_visual.pkl'), 'rb') as f:
        data1 = pickle.load(f)
    with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[0]}.txt'), 'r') as f:
        vidnames = f.read().splitlines()
    task1 = task[2]
    feature_a = 'audiofeat_wav2vec2'
    feat_root = os.path.join(root + '/models/ABAW6', feature_a)
    filenames = os.listdir(feat_root)[:]
    for vname in tqdm(vidnames):
            feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
            for imgname, val in feature.items():
                if imgname in data1[task1][vname]:
                    data1[task1][vname][imgname].update({f'{feature_a}': val})
            for img, value in list(data1[task1][vname].items()):
                if len(value) < 3:
                    data1[task1][vname].pop(img)
elif auft == audio_feat[1]:
    with open(os.path.join(root,f'models/ABAW6/{vis}/VA/{task[2]}_{typ[0]}_visual.pkl'), 'rb') as f:
        data1 = pickle.load(f)
    with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[0]}.txt'), 'r') as f:
        vidnames = f.read().splitlines()
    task1 = task[2]
    feature_a = 'audiofeat_vggish'
    feat_root = os.path.join(root + '/models/ABAW6', feature_a)
    filenames = os.listdir(feat_root)[:]
    for vname in tqdm(vidnames):
            feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
            for imgname, val in feature.items():
                if imgname in data1[task1][vname]:
                    data1[task1][vname][imgname].update({f'{feature_a}': val})
            for img, value in list(data1[task1][vname].items()):
                if len(value) < 3:
                    data1[task1][vname].pop(img)
else:
    with open(os.path.join(root,f'models/ABAW6/{vis}/VA/{task[2]}_{typ[0]}_{viau}.pkl'), 'rb') as f:
        data1 = pickle.load(f)

100%|██████████| 285/285 [00:10<00:00, 26.43it/s]


In [None]:
dims = 0
iname = []
task1 = task[2]
with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data1[task1][vname])
    for img in data1[task1][vname].keys():
        iname.append(img)

100%|██████████| 285/285 [00:00<00:00, 1249.04it/s]


In [None]:
dataset = ABAW_dataset1(data1, iname, dims, task1)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0, drop_last=True)

#### Val

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/VA/{task[2]}_{typ[1]}_{viau}.pkl'), 'rb') as f:
    data2 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[2]
with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[1]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data2[task1][vname])
    for img in data2[task1][vname].keys():
        iname.append(img)

100%|██████████| 71/71 [00:00<00:00, 934.23it/s]


In [None]:
dataset = ABAW_dataset1(data2, iname, dims, task1)
val_loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=True)

#### Test

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/VA/{task[2]}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data3 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[2]
with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
for vname in tqdm(vidnames):
    dims += len(data3[task1][vname])
    for img in data3[task1][vname].keys():
        iname.append(img)

100%|██████████| 76/76 [00:00<00:00, 1159.12it/s]


In [None]:
dataset = ABAW_dataset1(data3, iname, dims, task1)
test_loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=True)

### Modeling

In [None]:
class VA_fusion(nn.Module):
    def __init__(self, batchsize = batch_size, audio_ft = auft, hidden_size = [512, 128, batch_size]):
        super(VA_fusion, self).__init__()
        self.batchsize = batchsize
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.hidden_size = hidden_size
        self.feat_fc = nn.Conv1d(self.concat_dim, hidden_size[0], 1, padding=0)
        self.activ = nn.LeakyReLU(0.1)
        self.dropout = nn.Dropout(p=0.3)
        self.conv1 = nn.Conv1d(hidden_size[0], hidden_size[1], 1, padding=0)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size[1], nhead=4, dim_feedforward=hidden_size[1], dropout=0.3)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
        self.vhead = nn.Sequential(
                nn.Linear(hidden_size[1], hidden_size[2]),
                nn.BatchNorm1d(hidden_size[2]),
                nn.Linear(hidden_size[2], 1),
                )
        self.ahead = nn.Sequential(
                nn.Linear(hidden_size[1], hidden_size[2]),
                nn.BatchNorm1d(hidden_size[2]),
                nn.Linear(hidden_size[2], 1),
                )

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs,dim=1)
        feat = torch.transpose(feat,0,1)
        feat = self.feat_fc(feat)
        feat = self.activ(feat)
        out = self.conv1(feat)
        out = torch.transpose(out,0,1)
        out = self.transformer_encoder(out)
        vout = self.vhead(out)
        aout = self.ahead(out)

        return vout, aout, torch.tanh(vout), torch.tanh(aout)

In [None]:
VA_model = VA_fusion().to(device)
VA_model

VA_fusion(
  (feat_fc): Conv1d(1536, 512, kernel_size=(1,), stride=(1,))
  (activ): LeakyReLU(negative_slope=0.1)
  (dropout): Dropout(p=0.3, inplace=False)
  (conv1): Conv1d(512, 128, kernel_size=(1,), stride=(1,))
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (vhead): Sequential(
    (0): Linear(in_features=128, out_feature

In [None]:
class MLPModel(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=1):
        super(MLPModel, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.activ = nn.ReLU()
        self.fc1 = nn.Linear(self.concat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs, dim=1)
        feat = self.fc1(feat)
        feat = self.activ(feat)
        vout = self.fc2(feat)
        aout = self.fc2(feat)

        return vout, aout, torch.tanh(vout), torch.tanh(aout)

In [None]:
mlp_model = MLPModel().to(device)
mlp_model

MLPModel(
  (activ): ReLU()
  (fc1): Linear(in_features=2176, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)

#### Utils

In [None]:
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, VA_model.parameters()), lr=0.00005, betas=(0.9, 0.999), weight_decay=0.00001)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-5)

In [None]:
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, mlp_model.parameters()), lr=0.00005, betas=(0.9, 0.999), weight_decay=0.00001)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-5)

### Training

In [None]:
def train(model, mod_type, train_loader, val_loader, epoch, batch_size, optim, scheduler, au_feat, vis_aud):

    model.train(True)
    model.eval()
    best_loss, best_mse = float('inf'), float('inf')
    loss_value = []
    loss_train = []
    loss_val = []
    loss_mse = []
    cc1best, cc2best = 0, 0

    for e in range(epoch):
        print(f'Training Epoch: {e+1}')
        torch.manual_seed(2809)
        iterator = iter(train_loader)
        for i in range(len(train_loader)//32):
            try:
                VA = next(iterator)
                if au_feat == 'nope':
                    vis_feat, y = VA[visual_feat], VA['label']
                    vis_feat, y = vis_feat.to(device), y.to(device)
                    aud_feat = None
                else:
                    vis_feat, aud_feat, y = VA[visual_feat], VA[au_feat], VA['label']
                    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
                model.zero_grad()
                Vpred, Apred, v_pred, a_pred = model(vis_feat, aud_feat)
                mse_loss, ccc_loss = compute_VA_loss(Vpred, Apred, y)
                ccc_loss.backward()
                optim.step()
                loss_value.append(ccc_loss.item())
                loss_mse.append(mse_loss.item())
                preds = torch.cat((v_pred, a_pred), dim=1)
            except:
                break

        avg_loss = round(np.mean(loss_value),3)
        loss_train.append(avg_loss)
        print(f'Train Loss: {avg_loss}, mse: {round(np.mean(loss_mse),3)}')

        val_loss, mse, ccc1, ccc2 = evaluate_model(model, val_loader, au_feat)
        loss_val.append(val_loss)

        if val_loss < best_loss:
            best_loss = val_loss
            #best_mse = mse
            torch.save(model.state_dict(), os.path.join(root,f'models/ABAW6/{vis}/best_VA_{mod_type}_{vis_aud}_loss.pth'))
            # cc1best = ccc1
            # cc2best = ccc2

        if mse < best_mse:
            best_mse = mse
            torch.save(model.state_dict(), os.path.join(root,f'models/ABAW6/{vis}/best_VA_{mod_type}_{vis_aud}_mse.pth'))

        if ccc1 > cc1best:
            torch.save(model.state_dict(), os.path.join(root,f'models/ABAW6/{vis}/best_VA_{mod_type}_{vis_aud}_c1.pth'))
            cc1best = ccc1

        if ccc2 > cc2best:
            torch.save(model.state_dict(), os.path.join(root,f'models/ABAW6/{vis}/best_VA_{mod_type}_{vis_aud}_c2.pth'))
            cc2best = ccc2

        print(f'Validation Loss: {val_loss}, mse: {mse}')

        scheduler.step(val_loss)
    return loss_train, loss_val, best_loss, best_mse, cc1best, cc2best

In [None]:
def evaluate_model(model, data_loader, au_feat):
    model.eval()
    total_loss = []
    all_targets = []
    all_preds = []
    mse = []
    with torch.no_grad():
        iterator = iter(data_loader)
        for i in range(len(data_loader)//32):
            VA = next(iterator)
            if au_feat == 'nope':
                vis_feat, y = VA[visual_feat], VA['label']
                vis_feat, y = vis_feat.to(device), y.to(device)
                aud_feat = None
            else:
                vis_feat, aud_feat, y = VA[visual_feat], VA[au_feat], VA['label']
                vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
            Vpred, Apred, v_pred, a_pred = model(vis_feat, aud_feat)
            mse_loss, ccc_loss = compute_VA_loss(Vpred, Apred, y)
            total_loss.append(ccc_loss.item())
            mse.append(mse_loss.item())
            preds = torch.cat((v_pred, a_pred), dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_targets.extend(y.cpu().tolist())

    ccc1, ccc2 = compute_VA_CCC(all_preds, all_targets)
    return round(np.mean(total_loss),3), round(np.mean(mse),3), round(ccc1,3), round(ccc2,3)

#### Cropped_aligned images

##### Effnet + Wav2vec

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss,best_mse, cccv, ccca = train(VA_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 0.624, mse: 0.491
Validation Loss: 1.251, mse: 0.253
Training Epoch: 2
Train Loss: 0.579, mse: 0.489
Validation Loss: 1.265, mse: 0.25
Training Epoch: 3
Train Loss: 0.556, mse: 0.486
Validation Loss: 1.256, mse: 0.252
Training Epoch: 4
Train Loss: 0.539, mse: 0.486
Validation Loss: 1.301, mse: 0.261
Training Epoch: 5
Train Loss: 0.525, mse: 0.485
Validation Loss: 1.33, mse: 0.264
Training Epoch: 6
Train Loss: 0.514, mse: 0.485
Validation Loss: 1.315, mse: 0.249
Training Epoch: 7
Train Loss: 0.504, mse: 0.484
Validation Loss: 1.394, mse: 0.243
Training Epoch: 8
Train Loss: 0.494, mse: 0.484
Validation Loss: 1.259, mse: 0.237
Training Epoch: 9
Train Loss: 0.487, mse: 0.484
Validation Loss: 1.275, mse: 0.222
Training Epoch: 10
Train Loss: 0.48, mse: 0.483
Validation Loss: 1.262, mse: 0.243
3min 16s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
    VA_model = VA_fusion().to(device)
    VA_model.load_state_dict(VA_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.352, CCC_Arousal: 0.48


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss,best_mse, cccv, ccca = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 1.234, mse: 0.47
Validation Loss: 1.221, mse: 0.288
Training Epoch: 2
Train Loss: 1.179, mse: 0.474
Validation Loss: 1.185, mse: 0.274
Training Epoch: 3
Train Loss: 1.154, mse: 0.474
Validation Loss: 1.194, mse: 0.269
Training Epoch: 4
Train Loss: 1.135, mse: 0.473
Validation Loss: 1.175, mse: 0.264
Training Epoch: 5
Train Loss: 1.121, mse: 0.474
Validation Loss: 1.186, mse: 0.268
Training Epoch: 6
Train Loss: 1.112, mse: 0.475
Validation Loss: 1.198, mse: 0.276
Training Epoch: 7
Train Loss: 1.104, mse: 0.475
Validation Loss: 1.193, mse: 0.283
Training Epoch: 8
Train Loss: 1.096, mse: 0.476
Validation Loss: 1.174, mse: 0.264
Training Epoch: 9
Train Loss: 1.089, mse: 0.476
Validation Loss: 1.191, mse: 0.276
Training Epoch: 10
Train Loss: 1.082, mse: 0.476
Validation Loss: 1.181, mse: 0.262
1min 20s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.501, CCC_Arousal: 0.532


##### Effnet + Vggish

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss,best_mse, cccv, ccca = train(VA_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 0.81, mse: 0.483
Validation Loss: 1.33, mse: 0.34
Training Epoch: 2
Train Loss: 0.71, mse: 0.484
Validation Loss: 1.322, mse: 0.307
Training Epoch: 3
Train Loss: 0.657, mse: 0.484
Validation Loss: 1.336, mse: 0.3
Training Epoch: 4
Train Loss: 0.617, mse: 0.484
Validation Loss: 1.334, mse: 0.293
Training Epoch: 5
Train Loss: 0.586, mse: 0.483
Validation Loss: 1.349, mse: 0.288
Training Epoch: 6
Train Loss: 0.559, mse: 0.483
Validation Loss: 1.355, mse: 0.283
Training Epoch: 7
Train Loss: 0.535, mse: 0.482
Validation Loss: 1.382, mse: 0.282
Training Epoch: 8
Train Loss: 0.513, mse: 0.482
Validation Loss: 1.407, mse: 0.275
Training Epoch: 9
Train Loss: 0.493, mse: 0.482
Validation Loss: 1.414, mse: 0.275
Training Epoch: 10
Train Loss: 0.475, mse: 0.481
Validation Loss: 1.426, mse: 0.272
3min 9s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
    VA_model = VA_fusion().to(device)
    VA_model.load_state_dict(VA_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.241, CCC_Arousal: 0.485


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss,best_mse, cccv, ccca = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 1.218, mse: 0.461
Validation Loss: 1.176, mse: 0.29
Training Epoch: 2
Train Loss: 1.178, mse: 0.468
Validation Loss: 1.183, mse: 0.308
Training Epoch: 3
Train Loss: 1.157, mse: 0.469
Validation Loss: 1.195, mse: 0.315
Training Epoch: 4
Train Loss: 1.144, mse: 0.469
Validation Loss: 1.174, mse: 0.308
Training Epoch: 5
Train Loss: 1.135, mse: 0.471
Validation Loss: 1.216, mse: 0.335
Training Epoch: 6
Train Loss: 1.126, mse: 0.471
Validation Loss: 1.234, mse: 0.341
Training Epoch: 7
Train Loss: 1.12, mse: 0.472
Validation Loss: 1.232, mse: 0.341
Training Epoch: 8
Train Loss: 1.113, mse: 0.473
Validation Loss: 1.212, mse: 0.352
Training Epoch: 9
Train Loss: 1.107, mse: 0.474
Validation Loss: 1.218, mse: 0.352
Training Epoch: 10
Train Loss: 1.102, mse: 0.475
Validation Loss: 1.222, mse: 0.37
1min 17s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.416, CCC_Arousal: 0.422


##### Effnet

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss,best_mse, cccv, ccca = train(VA_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 0.84, mse: 0.483
Validation Loss: 1.38, mse: 0.282
Training Epoch: 2
Train Loss: 0.75, mse: 0.486
Validation Loss: 1.404, mse: 0.265
Training Epoch: 3
Train Loss: 0.705, mse: 0.487
Validation Loss: 1.369, mse: 0.279
Training Epoch: 4
Train Loss: 0.673, mse: 0.487
Validation Loss: 1.408, mse: 0.293
Training Epoch: 5
Train Loss: 0.65, mse: 0.486
Validation Loss: 1.445, mse: 0.31
Training Epoch: 6
Train Loss: 0.631, mse: 0.485
Validation Loss: 1.462, mse: 0.307
Training Epoch: 7
Train Loss: 0.615, mse: 0.485
Validation Loss: 1.399, mse: 0.326
Training Epoch: 8
Train Loss: 0.601, mse: 0.485
Validation Loss: 1.404, mse: 0.303
Training Epoch: 9
Train Loss: 0.59, mse: 0.485
Validation Loss: 1.417, mse: 0.321
Training Epoch: 10
Train Loss: 0.579, mse: 0.485
Validation Loss: 1.384, mse: 0.293
3min 1s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
    VA_model = VA_fusion().to(device)
    VA_model.load_state_dict(VA_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.244, CCC_Arousal: 0.375


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, best_mse, cccv, ccca = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 1.281, mse: 0.467
Validation Loss: 1.254, mse: 0.284
Training Epoch: 2
Train Loss: 1.236, mse: 0.466
Validation Loss: 1.245, mse: 0.299
Training Epoch: 3
Train Loss: 1.214, mse: 0.468
Validation Loss: 1.24, mse: 0.313
Training Epoch: 4
Train Loss: 1.197, mse: 0.467
Validation Loss: 1.236, mse: 0.317
Training Epoch: 5
Train Loss: 1.184, mse: 0.469
Validation Loss: 1.231, mse: 0.303
Training Epoch: 6
Train Loss: 1.175, mse: 0.469
Validation Loss: 1.253, mse: 0.308
Training Epoch: 7
Train Loss: 1.169, mse: 0.469
Validation Loss: 1.249, mse: 0.306
Training Epoch: 8
Train Loss: 1.161, mse: 0.471
Validation Loss: 1.245, mse: 0.324
Training Epoch: 9
Train Loss: 1.155, mse: 0.472
Validation Loss: 1.243, mse: 0.325
Training Epoch: 10
Train Loss: 1.15, mse: 0.473
Validation Loss: 1.254, mse: 0.317
1min 9s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.422, CCC_Arousal: 0.427


#### Cropped images

##### Effnet + Wav2vec

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss,best_mse, cccv, ccca = train(VA_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 0.903, mse: 0.504
Validation Loss: 1.389, mse: 0.254
Training Epoch: 2
Train Loss: 0.793, mse: 0.494
Validation Loss: 1.377, mse: 0.243
Training Epoch: 3
Train Loss: 0.74, mse: 0.491
Validation Loss: 1.356, mse: 0.244
Training Epoch: 4
Train Loss: 0.704, mse: 0.49
Validation Loss: 1.324, mse: 0.232
Training Epoch: 5
Train Loss: 0.678, mse: 0.489
Validation Loss: 1.32, mse: 0.206
Training Epoch: 6
Train Loss: 0.657, mse: 0.488
Validation Loss: 1.384, mse: 0.236
Training Epoch: 7
Train Loss: 0.639, mse: 0.488
Validation Loss: 1.384, mse: 0.236
Training Epoch: 8
Train Loss: 0.624, mse: 0.488
Validation Loss: 1.329, mse: 0.22
Training Epoch: 9
Train Loss: 0.611, mse: 0.487
Validation Loss: 1.31, mse: 0.253
Training Epoch: 10
Train Loss: 0.599, mse: 0.487
Validation Loss: 1.295, mse: 0.228
3min 35s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
    VA_model = VA_fusion().to(device)
    VA_model.load_state_dict(VA_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.329, CCC_Arousal: 0.39


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss,best_mse, cccv, ccca = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 1.279, mse: 0.464
Validation Loss: 1.261, mse: 0.315
Training Epoch: 2
Train Loss: 1.231, mse: 0.47
Validation Loss: 1.242, mse: 0.302
Training Epoch: 3
Train Loss: 1.209, mse: 0.472
Validation Loss: 1.202, mse: 0.277
Training Epoch: 4
Train Loss: 1.192, mse: 0.473
Validation Loss: 1.218, mse: 0.275
Training Epoch: 5
Train Loss: 1.179, mse: 0.473
Validation Loss: 1.178, mse: 0.264
Training Epoch: 6
Train Loss: 1.166, mse: 0.474
Validation Loss: 1.139, mse: 0.254
Training Epoch: 7
Train Loss: 1.158, mse: 0.475
Validation Loss: 1.163, mse: 0.262
Training Epoch: 8
Train Loss: 1.15, mse: 0.476
Validation Loss: 1.147, mse: 0.263
Training Epoch: 9
Train Loss: 1.144, mse: 0.477
Validation Loss: 1.153, mse: 0.265
Training Epoch: 10
Train Loss: 1.137, mse: 0.477
Validation Loss: 1.161, mse: 0.257
1min 33s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.501, CCC_Arousal: 0.523


##### Effnet + Vggish

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss,best_mse, cccv, ccca = train(VA_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 0.904, mse: 0.484
Validation Loss: 1.319, mse: 0.281
Training Epoch: 2
Train Loss: 0.808, mse: 0.483
Validation Loss: 1.338, mse: 0.283
Training Epoch: 3
Train Loss: 0.753, mse: 0.482
Validation Loss: 1.357, mse: 0.28
Training Epoch: 4
Train Loss: 0.712, mse: 0.482
Validation Loss: 1.367, mse: 0.275
Training Epoch: 5
Train Loss: 0.678, mse: 0.481
Validation Loss: 1.375, mse: 0.271
Training Epoch: 6
Train Loss: 0.649, mse: 0.481
Validation Loss: 1.383, mse: 0.269
Training Epoch: 7
Train Loss: 0.622, mse: 0.481
Validation Loss: 1.4, mse: 0.269
Training Epoch: 8
Train Loss: 0.599, mse: 0.48
Validation Loss: 1.417, mse: 0.269
Training Epoch: 9
Train Loss: 0.577, mse: 0.48
Validation Loss: 1.434, mse: 0.273
Training Epoch: 10
Train Loss: 0.556, mse: 0.48
Validation Loss: 1.445, mse: 0.272
3min 7s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
    VA_model = VA_fusion().to(device)
    VA_model.load_state_dict(VA_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.215, CCC_Arousal: 0.456


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss,best_mse, cccv, ccca = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 1.286, mse: 0.45
Validation Loss: 1.298, mse: 0.352
Training Epoch: 2
Train Loss: 1.241, mse: 0.458
Validation Loss: 1.312, mse: 0.349
Training Epoch: 3
Train Loss: 1.221, mse: 0.462
Validation Loss: 1.321, mse: 0.349
Training Epoch: 4
Train Loss: 1.207, mse: 0.464
Validation Loss: 1.328, mse: 0.347
Training Epoch: 5
Train Loss: 1.197, mse: 0.465
Validation Loss: 1.333, mse: 0.346
Training Epoch: 6
Train Loss: 1.189, mse: 0.466
Validation Loss: 1.338, mse: 0.345
Training Epoch: 7
Train Loss: 1.182, mse: 0.467
Validation Loss: 1.343, mse: 0.345
Training Epoch: 8
Train Loss: 1.176, mse: 0.468
Validation Loss: 1.346, mse: 0.345
Training Epoch: 9
Train Loss: 1.171, mse: 0.468
Validation Loss: 1.35, mse: 0.345
Training Epoch: 10
Train Loss: 1.166, mse: 0.469
Validation Loss: 1.354, mse: 0.345
1min 18s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.34, CCC_Arousal: 0.337


##### Effnet

In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss,best_mse, cccv, ccca = train(VA_model, model_type[0], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 0.961, mse: 0.488
Validation Loss: 1.517, mse: 0.328
Training Epoch: 2
Train Loss: 0.859, mse: 0.488
Validation Loss: 1.522, mse: 0.31
Training Epoch: 3
Train Loss: 0.805, mse: 0.487
Validation Loss: 1.546, mse: 0.298
Training Epoch: 4
Train Loss: 0.766, mse: 0.487
Validation Loss: 1.558, mse: 0.288
Training Epoch: 5
Train Loss: 0.735, mse: 0.487
Validation Loss: 1.572, mse: 0.282
Training Epoch: 6
Train Loss: 0.708, mse: 0.487
Validation Loss: 1.584, mse: 0.276
Training Epoch: 7
Train Loss: 0.685, mse: 0.487
Validation Loss: 1.599, mse: 0.275
Training Epoch: 8
Train Loss: 0.664, mse: 0.487
Validation Loss: 1.604, mse: 0.273
Training Epoch: 9
Train Loss: 0.644, mse: 0.487
Validation Loss: 1.619, mse: 0.273
Training Epoch: 10
Train Loss: 0.626, mse: 0.486
Validation Loss: 1.632, mse: 0.274
2min 56s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
    VA_model = VA_fusion().to(device)
    VA_model.load_state_dict(VA_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.141, CCC_Arousal: 0.324


In [None]:
%%timeit -n 1 -r 1
loss_train, loss_val, best_loss, best_mse, cccv, ccca = train(mlp_model, model_type[1], train_loader, val_loader, 10, 32, optimizer, scheduler, auft, viau)

Training Epoch: 1
Train Loss: 1.357, mse: 0.469
Validation Loss: 1.34, mse: 0.352
Training Epoch: 2
Train Loss: 1.315, mse: 0.473
Validation Loss: 1.34, mse: 0.336
Training Epoch: 3
Train Loss: 1.296, mse: 0.474
Validation Loss: 1.338, mse: 0.332
Training Epoch: 4
Train Loss: 1.283, mse: 0.475
Validation Loss: 1.333, mse: 0.329
Training Epoch: 5
Train Loss: 1.274, mse: 0.476
Validation Loss: 1.328, mse: 0.326
Training Epoch: 6
Train Loss: 1.266, mse: 0.477
Validation Loss: 1.323, mse: 0.323
Training Epoch: 7
Train Loss: 1.26, mse: 0.477
Validation Loss: 1.318, mse: 0.32
Training Epoch: 8
Train Loss: 1.254, mse: 0.478
Validation Loss: 1.314, mse: 0.318
Training Epoch: 9
Train Loss: 1.248, mse: 0.478
Validation Loss: 1.311, mse: 0.316
Training Epoch: 10
Train Loss: 1.244, mse: 0.478
Validation Loss: 1.309, mse: 0.314
1min 9s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
try:
    print(f'best metric: CCC_Valence {cccv}, CCC_Arousal: {ccca}')
except:
    mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
    mlp_model = MLPModel().to(device)
    mlp_model.load_state_dict(mlp_best_model)
    val_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, val_loader, auft)
    print(f'best metric: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

best metric: CCC_Valence 0.376, CCC_Arousal: 0.366


### Testing

#### Cropped_aligned images

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
print('VA_model')
print(visual_feat + ' & ' + auft)
VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
VA_model = VA_fusion().to(device)
VA_model.load_state_dict(VA_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: CCC_Valence 0.657, CCC_Arousal: 0.465
1.85 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_model/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: CCC_Valence 0.752, CCC_Arousal: 0.542
1.51 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: CCC_Valence 0.752, CCC_Arousal: 0.542
843 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
print('VA_model')
print(visual_feat + ' & ' + auft)
VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
VA_model = VA_fusion().to(device)
VA_model.load_state_dict(VA_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

VA_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: CCC_Valence 0.496, CCC_Arousal: 0.36
1.77 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: CCC_Valence 0.704, CCC_Arousal: 0.511
904 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet

In [None]:
%%timeit -n 1 -r 1
print('VA_model')
print(visual_feat + ' & ' + auft)
VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
VA_model = VA_fusion().to(device)
VA_model.load_state_dict(VA_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

VA_model
visualfeat_enet_b2_8_best & nope
Test set: CCC_Valence 0.514, CCC_Arousal: 0.429
1.73 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

MLP_model
visualfeat_enet_b2_8_best & nope
Test set: CCC_Valence 0.684, CCC_Arousal: 0.525
773 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


#### Cropped images

##### EffNet + Wav2vec2

In [None]:
%%timeit -n 1 -r 1
print('VA_model')
print(visual_feat + ' & ' + auft)
VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
VA_model = VA_fusion().to(device)
VA_model.load_state_dict(VA_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, test_loader, auft)
print(f'Test set: CCC_Valence: {ccc1}, CCC_Arousal: {ccc2}')

VA_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: CCC_Valence: 0.643, CCC_Arousal: 0.421
2.43 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_wav2vec2
Test set: CCC_Valence 0.761, CCC_Arousal: 0.464
1.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet + Vggish

In [None]:
%%timeit -n 1 -r 1
print('VA_model')
print(visual_feat + ' & ' + auft)
VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
VA_model = VA_fusion().to(device)
VA_model.load_state_dict(VA_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

VA_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: CCC_Valence 0.509, CCC_Arousal: 0.321
1.83 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

MLP_model
visualfeat_enet_b2_8_best & audiofeat_vggish
Test set: CCC_Valence 0.699, CCC_Arousal: 0.427
779 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


##### EffNet

In [None]:
%%timeit -n 1 -r 1
print('VA_model')
print(visual_feat + ' & ' + auft)
VA_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_fusion_{viau}.pth'))
VA_model = VA_fusion().to(device)
VA_model.load_state_dict(VA_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(VA_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

VA_model
visualfeat_enet_b2_8_best & nope
Test set: CCC_Valence 0.383, CCC_Arousal: 0.301
1.68 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
print('MLP_model')
print(visual_feat + ' & ' + auft)
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)
test_loss, mse, ccc1, ccc2 = evaluate_model(mlp_model, test_loader, auft)
print(f'Test set: CCC_Valence {ccc1}, CCC_Arousal: {ccc2}')

MLP_model
visualfeat_enet_b2_8_best & nope
Test set: CCC_Valence 0.632, CCC_Arousal: 0.503
745 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Smooth validation prediction (EffNet + Wav2vec2 + MLP)

In [None]:
def smooth_prediction(img, predict):
    cur_ind = 0
    preds_proba = []
    if img:
        for i in range(img[-1]):
            if img[cur_ind] - 1 == i:
                preds_proba.append(predict[cur_ind])
                cur_ind += 1
            else:
                if cur_ind == 0:
                    preds_proba.append(predict[cur_ind])
                else:
                    w = (i - img[cur_ind - 1] + 1) / (img[cur_ind] - img[cur_ind - 1])
                    pred = w * predict[cur_ind - 1] + (1 - w) * predict[cur_ind]
                    preds_proba.append(pred)
        return np.array([p.cpu().detach().numpy() for p in preds_proba])

In [None]:
def slide_window(preds_proba, i, delta, typ):
    i1 = max(i - delta, 0)
    if typ == 'mean':
        proba = np.mean(preds_proba[i1:i+delta+1], axis=0)
    elif typ == 'median':
        proba = np.median(preds_proba[i1:i+delta+1], axis=0)
    else:
        proba = np.mean(preds_proba[i1:i+delta+1:int(typ)], axis=0)
    return np.argmax(proba), proba

## EXPR

In [None]:
tsk = task[0]
vis = vis_typ[0]
viau = vis_aud[0]
auft = audio_feat[0]

In [None]:
class MLPModel(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=8):
        super(MLPModel, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.activ = nn.ReLU()
        self.fc1 = nn.Linear(self.concat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, vis_feat, aud_feat):
        inputs = [vis_feat]
        inputs.append(aud_feat)
        feat = torch.cat(inputs, dim=0)
        feat = self.fc1(feat)
        feat = self.activ(feat)
        out = self.fc2(feat)
        return out, torch.softmax(out, dim=0)

In [None]:
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_EXPR_model/best_EXPR_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)

<All keys matched successfully>

In [None]:
anno_path = os.path.join(root,f'data/Annotations/{tsk}/{split[1]}')
with open(os.path.join(root, f'data/Annotations/{tsk}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
with open(os.path.join(root,f'models/ABAW6/{vis}/EXPR/{tsk}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data = pickle.load(f)

In [None]:
test_vid = {}
for vname in tqdm(vidnames):
        img, predict, label = [], [], []
        for imgname, val in sorted(data[tsk][vname].items()):
            vis_feat = torch.tensor(val[visual_feat]).to(device)
            if auft == 'nope':
                aud_feat = None
            else:
                aud_feat = torch.tensor(val[auft]).to(device)
            if tsk == task[2]:
                _, _, vpred, apred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor([vpred, apred])
            else:
                _, pred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor(pred)
            ind = int(imgname.split('/')[1][:-4])
            img.append(ind)
            predict.append(preds)
            label.append(data[tsk][vname][imgname]['label'])
        test_vid[vname] = (img, predict, label)

100%|██████████| 70/70 [01:53<00:00,  1.62s/it]


In [None]:
hyperparams=[(isMean,delta) for delta in [0, 15, 30, 60, 100, 200] for isMean in [1,0] if not (isMean==0 and delta==0)]
total_true=[]
total_preds=[[] for _ in range(len(hyperparams))]
timing_results = {(isMean, delta): 0 for isMean, delta in hyperparams}
for videoname,(img, predict, label) in test_vid.items():
    for i,ind in enumerate(img):
        total_true.append(label[i].cpu().numpy())
    preds_proba = smooth_prediction(img, predict)
    for hInd,(isMean,delta) in enumerate(hyperparams):
        preds=[]
        start = time.time()
        for i in range(len(preds_proba)):
            i1=max(i-delta,0)
            if isMean:
                best_ind, proba = slide_window(preds_proba, i, delta, 'mean')
            else:
                best_ind, proba = slide_window(preds_proba, i, delta, 'median')
            preds.append(best_ind)
        for i,ind in enumerate(img):
            if label[i]>=0:
                total_preds[hInd].append(preds[ind-1])
        end = time.time()
        timing_results[(isMean, delta)] += end - start
total_true=np.array(total_true)

In [None]:
for hInd, (isMean, delta) in enumerate(hyperparams):
    preds = np.array(total_preds[hInd])
    accuracy = round((preds == total_true).mean(), 3)
    f1 = round(f1_score(y_true=total_true, y_pred=preds, average='macro'), 3)
    mean_or_median = 'mean' if isMean else 'median'
    time_taken = round(timing_results[(isMean, delta)],3)
    print(f'{mean_or_median}; delta: {delta}; Acc: {accuracy}; F1: {f1}; Time: {time_taken}s')

mean; delta: 0; Acc: 0.488; F1: 0.394; Time: 9.554s
mean; delta: 15; Acc: 0.524; F1: 0.429; Time: 9.97s
median; delta: 15; Acc: 0.523; F1: 0.43; Time: 20.216s
mean; delta: 30; Acc: 0.533; F1: 0.44; Time: 9.814s
median; delta: 30; Acc: 0.533; F1: 0.442; Time: 20.944s
mean; delta: 60; Acc: 0.537; F1: 0.443; Time: 9.286s
median; delta: 60; Acc: 0.538; F1: 0.449; Time: 22.854s
mean; delta: 100; Acc: 0.539; F1: 0.448; Time: 9.671s
median; delta: 100; Acc: 0.541; F1: 0.454; Time: 25.641s
mean; delta: 200; Acc: 0.538; F1: 0.437; Time: 11.106s
median; delta: 200; Acc: 0.544; F1: 0.457; Time: 31.662s


## AU

In [None]:
tsk = task[1]
vis = vis_typ[0]
viau = vis_aud[0]
auft = audio_feat[0]

In [None]:
class MLPModel(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=12):
        super(MLPModel, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.activ = nn.ReLU()
        self.fc1 = nn.Linear(self.concat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs, dim=0)
        feat = self.fc1(feat)
        feat = self.activ(feat)
        out = self.fc2(feat)
        return out, torch.sigmoid(out)

In [None]:
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_AU_model/best_AU_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)

<All keys matched successfully>

In [None]:
anno_path = os.path.join(root,f'data/Annotations/{tsk}/{split[1]}')
with open(os.path.join(root, f'data/Annotations/{tsk}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
with open(os.path.join(root,f'models/ABAW6/{vis}/AU/{tsk}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data = pickle.load(f)

In [None]:
test_vid = {}
for vname in tqdm(vidnames):
        img, predict, label = [], [], []
        for imgname, val in sorted(data[tsk][vname].items()):
            vis_feat = torch.tensor(val[visual_feat]).to(device)
            if auft == 'nope':
                aud_feat = None
            else:
                aud_feat = torch.tensor(val[auft]).to(device)
            if tsk == task[2]:
                _, _, vpred, apred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor([vpred, apred])
            else:
                _, pred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor(pred)
            ind = int(imgname.split('/')[1][:-4])
            img.append(ind)
            predict.append(preds)
            label.append(data[tsk][vname][imgname]['label'])
        test_vid[vname] = (img, predict, label)

100%|██████████| 105/105 [03:46<00:00,  2.16s/it]


In [None]:
thresholds = np.array([0.30000000000000004, 0.2, 0.2, 0.30000000000000004, 0.5, 0.30000000000000004, 0.6, 0.1, 0.1, 0.1, 0.6, 0.30000000000000004])
#np.array([0.2, 0.2, 0.2, 0.30000000000000004, 0.6, 0.30000000000000004, 0.6, 0.1, 0.1, 0.1, 0.6, 0.2])
#np.array([0.2, 0.2, 0.2, 0.4, 0.5, 0.4, 0.6, 0.1, 0.1, 0.1, 0.6, 0.2])

In [None]:
hyperparams=[(isMean,delta) for delta in [0, 15, 30, 60, 100, 200] for isMean in [1,0] if not (isMean==0 and delta==0)]
total_true=[]
total_preds=[[] for _ in range(len(hyperparams))]
timing_results = {(isMean, delta): 0 for isMean, delta in hyperparams}
for videoname,(img, predict, label) in test_vid.items():
    for i,ind in enumerate(img):
        total_true.append(label[i].cpu().numpy())
    preds_proba = smooth_prediction(img, predict)
    for hInd,(isMean,delta) in enumerate(hyperparams):
        preds=[]
        start = time.time()
        for i in range(len(preds_proba)):
            i1=max(i-delta,0)
            if isMean:
                _, proba = slide_window(preds_proba, i, delta, 'mean')
            else:
                _, proba = slide_window(preds_proba, i, delta, 'median')
            aus = (proba>=thresholds)*1
            preds.append(aus)
        for i,ind in enumerate(img):
            if label[i][0]>=-1 and label[i][1]>=-1:
                total_preds[hInd].append(preds[ind-1])
        end = time.time()
        timing_results[(isMean, delta)] += end - start
total_true=np.array(total_true)

In [None]:
for hInd, (isMean, delta) in enumerate(hyperparams):
    preds = np.array(total_preds[hInd])
    accuracy = round((preds == total_true).mean(), 3)
    f1 = round(np.mean([f1_score(y_true=total_true[:,i],y_pred=preds[:,i]) for i in range(preds.shape[1])]), 3)
    mean_or_median = 'mean' if isMean else 'median'
    time_taken = round(timing_results[(isMean, delta)],3)
    print(f'{mean_or_median}; delta: {delta}; Acc: {accuracy}; F1: {f1}; Time: {time_taken}s')

mean; delta: 0; Acc: 0.855; F1: 0.487; Time: 15.418s
mean; delta: 15; Acc: 0.86; F1: 0.496; Time: 15.937s
median; delta: 15; Acc: 0.863; F1: 0.495; Time: 29.683s
mean; delta: 30; Acc: 0.858; F1: 0.491; Time: 16.427s
median; delta: 30; Acc: 0.863; F1: 0.491; Time: 33.438s
mean; delta: 60; Acc: 0.854; F1: 0.478; Time: 16.618s
median; delta: 60; Acc: 0.862; F1: 0.477; Time: 35.013s
mean; delta: 100; Acc: 0.85; F1: 0.465; Time: 17.563s
median; delta: 100; Acc: 0.859; F1: 0.459; Time: 39.341s
mean; delta: 200; Acc: 0.842; F1: 0.438; Time: 19.513s
median; delta: 200; Acc: 0.852; F1: 0.429; Time: 50.336s


## VA

In [None]:
tsk = task[2]
vis = vis_typ[0]
viau = vis_aud[0]
auft = audio_feat[0]

In [None]:
class MLPModel(nn.Module):
    def __init__(self, audio_ft = auft, num_classes=1):
        super(MLPModel, self).__init__()
        if audio_ft == 'audiofeat_wav2vec2':
            self.concat_dim = 2176    #1408+768
        elif audio_ft == 'audiofeat_vggish':
            self.concat_dim = 1536    #1408+128
        elif audio_ft == 'nope':
            self.concat_dim = 1408    #visual only
        self.activ = nn.ReLU()
        self.fc1 = nn.Linear(self.concat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, vis_feat, aud_feat):
        if aud_feat == None:
            feat = vis_feat
        else:
            inputs = [vis_feat]
            inputs.append(aud_feat)
            feat = torch.cat(inputs, dim=1)
        feat = self.fc1(feat)
        feat = self.activ(feat)
        vout = self.fc2(feat)
        aout = self.fc2(feat)

        return vout, aout, torch.tanh(vout), torch.tanh(aout)

In [None]:
mlp_best_model = torch.load(os.path.join(root,f'models/ABAW6/{vis}/best_VA_model/best_VA_mlp_{viau}.pth'))
mlp_model = MLPModel().to(device)
mlp_model.load_state_dict(mlp_best_model)

<All keys matched successfully>

In [None]:
img, predict, label = [], [], []
vid = {}
iterator = iter(test_loader)
for i in range(len(test_loader)//32):
    VA = next(iterator)
    images = VA['frame']
    vis_feat, aud_feat, y = VA[visual_feat], VA[auft], VA['label']
    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
    _, _, vpred, apred = mlp_model(vis_feat, aud_feat)
    preds = torch.cat((vpred, apred), dim=1)
    img.extend(images)
    predict.extend(preds)
    label.extend(y)

In [None]:
index, pred, lab = [], [], []
test_vid = {}
for i, val in enumerate(img):
    ind = int(val.split('/')[1][:-4])
    vname = val.split('/')[0]
    if i == 0:
        prename = vname
        index.append(ind)
        pred.append(predict[i])
        lab.append(label[i])
    else:
        if vname == prename:
            index.append(ind)
            pred.append(predict[i])
            lab.append(label[i])
        else:
            combined = list(zip(index, pred, lab))
            combined_sorted = sorted(combined, key=lambda x: x[0])
            index_list_sorted, pred_list_sorted, lab_list_sorted = zip(*combined_sorted)
            test_vid[prename] = (list(index_list_sorted), list(pred_list_sorted), list(lab_list_sorted))
            prename = vname
            index, pred, lab = [], [], []
            index.append(ind)
            pred.append(predict[i])
            lab.append(label[i])

In [None]:
anno_path = os.path.join(root,f'data/Annotations/{tsk}/{split[1]}')
with open(os.path.join(root, f'data/Annotations/{tsk}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
with open(os.path.join(root,f'models/ABAW6/{vis}/VA/{tsk}_{typ[2]}_{viau}.pkl'), 'rb') as f:
    data = pickle.load(f)

In [None]:
test_vid = {}
for vname in tqdm(vidnames):
        img, predict, label = [], [], []
        for imgname, val in sorted(data[tsk][vname].items()):
            vis_feat = torch.tensor(val[visual_feat]).to(device)
            if auft == 'nope':
                aud_feat = None
            else:
                aud_feat = torch.tensor(val[auft]).to(device)
            if tsk == task[2]:
                _, _, vpred, apred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor([vpred, apred])
            else:
                _, pred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor(pred)
            ind = int(imgname.split('/')[1][:-4])
            img.append(ind)
            predict.append(preds)
            label.append(data[tsk][vname][imgname]['label'])
        test_vid[vname] = (img, predict, label)

100%|██████████| 76/76 [03:38<00:00,  2.87s/it]


In [None]:
hyperparams=[(isMean,delta) for delta in [0, 15, 30, 60, 100, 200] for isMean in [1,0] if not (isMean==0 and delta==0)]
total_true=[]
total_preds=[[] for _ in range(len(hyperparams))]
timing_results = {(isMean, delta): 0 for isMean, delta in hyperparams}
for videoname,(img, predict, label) in test_vid.items():
    for i,ind in enumerate(img):
        total_true.append(label[i].cpu().numpy())
    preds_proba = smooth_prediction(img, predict)
    for hInd,(isMean,delta) in enumerate(hyperparams):
        preds=[]
        start = time.time()
        for i in range(len(preds_proba)):
            i1=max(i-delta,0)
            if isMean:
                best_ind, proba = slide_window(preds_proba, i, delta, 'mean')
            else:
                best_ind, proba = slide_window(preds_proba, i, delta, 'median')
            preds.append(proba)
        for i, ind in enumerate(img):
            if label[i][0]>=-1 and label[i][1]>=-1:
                total_preds[hInd].append(preds[ind-1])
        end = time.time()
        timing_results[(isMean, delta)] += end - start
total_true=np.array(total_true)

In [None]:
for hInd, (isMean, delta) in enumerate(hyperparams):
    preds = np.array(total_preds[hInd])
    ccc1, ccc2 = compute_VA_CCC(preds, total_true)
    mean_or_median = 'mean' if isMean else 'median'
    time_taken = round(timing_results[(isMean, delta)],3)
    print(f'{mean_or_median}; delta: {delta}; CCCV: {ccc1:.3f}; CCCA: {ccc2:.3f}; Time: {time_taken}s')

mean; delta: 0; CCCV: 0.755; CCCA: 0.538; Time: 2.09s
mean; delta: 15; CCCV: 0.781; CCCA: 0.574; Time: 1.054s
median; delta: 15; CCCV: 0.780; CCCA: 0.566; Time: 1.343s
mean; delta: 30; CCCV: 0.788; CCCA: 0.582; Time: 1.022s
median; delta: 30; CCCV: 0.790; CCCA: 0.573; Time: 1.333s
mean; delta: 60; CCCV: 0.789; CCCA: 0.576; Time: 1.084s
median; delta: 60; CCCV: 0.793; CCCA: 0.569; Time: 1.389s
mean; delta: 100; CCCV: 0.783; CCCA: 0.559; Time: 1.034s
median; delta: 100; CCCV: 0.788; CCCA: 0.556; Time: 1.374s
mean; delta: 200; CCCV: 0.770; CCCA: 0.519; Time: 1.141s
median; delta: 200; CCCV: 0.771; CCCA: 0.499; Time: 1.45s


# Adaptive Frame Rate (EffNet + Wav2vec2 + MLP)

### EXPR

In [None]:
delta = 100

In [None]:
anno_path = os.path.join(root,f'data/Annotations/{tsk}/{split[0]}')
with open(os.path.join(root, f'data/Annotations/{tsk}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
with open(os.path.join(root,f'models/ABAW6/{vis}/EXPR/{tsk}_{typ[0]}_{viau}.pkl'), 'rb') as f:
    data = pickle.load(f)
train_vid = {}
for vname in tqdm(vidnames):
        img, predict, label = [], [], []
        for imgname, val in sorted(data[tsk][vname].items()):
            vis_feat = torch.tensor(val[visual_feat]).to(device)
            if auft == 'nope':
                aud_feat = None
            else:
                aud_feat = torch.tensor(val[auft]).to(device)
            if tsk == task[2]:
                _, _, vpred, apred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor([vpred, apred])
            else:
                _, pred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor(pred)
            ind = int(imgname.split('/')[1][:-4])
            img.append(ind)
            predict.append(preds)
            label.append(data[tsk][vname][imgname]['label'])
        train_vid[vname] = (img, predict, label)

100%|██████████| 199/199 [03:59<00:00,  1.20s/it]


In [None]:
stride2scores={}
for stride in [200, 100, 50, 25, 10]:
    total_true, predictions, max_decision_values = [],[],[]
    for vidname, (img, predict, label) in train_vid.items():
        index = []
        for i,ind in enumerate(img):
            total_true.append(label[i].cpu().numpy())
            index.append(ind-1)
        preds_proba = smooth_prediction(img, predict)
        for i in range(len(index)):
            best_ind, proba = slide_window(preds_proba, index[i], delta, stride)
            predictions.append(best_ind)
            max_decision_values.append(proba[best_ind])
    stride2scores[stride] = (np.array(total_true),np.array(predictions),np.array(max_decision_values))

In [None]:
def get_threshold(stride,fpr_corrected):
    (total_true,predictions,max_decision_values) = stride2scores[stride]
    mistakes = max_decision_values[predictions != total_true]
    best_threshold = -1
    for i, threshold in enumerate(sorted(max_decision_values[predictions == total_true])[::-1]):
        tpr = i/len(predictions)
        fpr = (mistakes > threshold).sum()/len(predictions)

        if fpr > fpr_corrected:
            if best_threshold == -1:
                best_threshold = threshold
            print(stride, 'best_threshold', best_threshold, i)
            break
        best_threshold = threshold
    return best_threshold

In [None]:
stride2threshold = {}
for stride in stride2scores:
    fpr_corrected=0.05
    stride2threshold[stride] = get_threshold(stride,fpr_corrected)
stride2threshold[1] = 0
print(stride2threshold)

200 best_threshold 0.5828423 220303
100 best_threshold 0.54001206 253009
50 best_threshold 0.52979404 261165
25 best_threshold 0.523032 267088
10 best_threshold 0.51859856 271299
{200: 0.5828423, 100: 0.54001206, 50: 0.52979404, 25: 0.523032, 10: 0.51859856, 1: 0}


In [None]:
all_strides=[
    [200, 100, 50, 10, 1],
    [50, 25, 1],
    [50, 10, 1],
    [200,50,1],
    [100,50,1],
    [200,1],
    [100,1],
    [50,1]
]
for s in stride2threshold.keys():
    all_strides.append([s])

for strides in all_strides:
    print(strides)
    last_stride=strides[-1]

    total_true=[]
    total_preds=[]
    total_frames_processed,total_frames=0,0
    time_each = []
    start = time.time()
    for videoname, (img, predict, label) in test_vid.items():
        emotional_img=[]
        start1 = time.time()
        for i,ind in enumerate(img):
            total_true.append(label[i].cpu().numpy())
            emotional_img.append(ind-1)
        cur_ind=0
        preds_proba=[]
        for i in range(img[-1]):
            if img[cur_ind]-1==i:
                preds_proba.append(predict[cur_ind])
                cur_ind+=1
            else:
                if cur_ind==0:
                    preds_proba.append(predict[cur_ind])
                else:
                    w=(i-img[cur_ind-1]+1)/(img[cur_ind]-img[cur_ind-1])
                    pred=w*predict[cur_ind-1]+(1-w)*predict[cur_ind]
                    preds_proba.append(pred)

        preds_proba=np.array([p.cpu().numpy() for p in preds_proba])

        preds=-np.ones(len(emotional_img))
        end1 = time.time()
        time_each.append(end1 - start1)
        for stride in strides:
            threshold=stride2threshold[stride]
            for i in range(len(emotional_img)):
                if preds[i]<0:
                    i1=max(emotional_img[i]-delta,0)
                    cur_preds=preds_proba[i1:emotional_img[i]+delta+1:stride]
                    proba=np.mean(cur_preds,axis=0)
                    best_ind=np.argmax(proba)
                    if proba[best_ind]>=threshold or stride==last_stride:
                        total_frames_processed+=len(cur_preds)
                        total_frames+=len(preds_proba[i1:emotional_img[i]+delta+1])
                        preds[i]=best_ind
        for p in preds:
            total_preds.append(p)
    end = time.time()
    elapsed_time = end - start - sum(time_each)
    total_true=np.array(total_true)
    preds=np.array(total_preds)
    print('Acc:',round((preds==total_true).mean(),3), 'F1:',round(f1_score(y_true=total_true,y_pred=preds, average="macro"),3))
    print(total_frames_processed,total_frames,round(total_frames_processed/total_frames,3))
    print(f"Time: {elapsed_time:.2f} seconds")

[200, 100, 50, 10, 1]
Acc: 0.532 F1: 0.443
28654087 55522377 0.516
Time: 17.52 seconds
[50, 25, 1]
Acc: 0.539 F1: 0.448
35626909 55522377 0.642
Time: 13.31 seconds
[50, 10, 1]
Acc: 0.539 F1: 0.448
35270322 55522377 0.635
Time: 13.57 seconds
[200, 50, 1]
Acc: 0.532 F1: 0.443
31363350 55522377 0.565
Time: 13.05 seconds
[100, 50, 1]
Acc: 0.538 F1: 0.447
33947035 55522377 0.611
Time: 13.10 seconds
[200, 1]
Acc: 0.532 F1: 0.443
35393313 55522377 0.637
Time: 9.46 seconds
[100, 1]
Acc: 0.538 F1: 0.447
36501721 55522377 0.657
Time: 12.46 seconds
[50, 1]
Acc: 0.539 F1: 0.448
37616958 55522377 0.678
Time: 9.93 seconds
[200]
Acc: 0.472 F1: 0.379
547752 55522377 0.01
Time: 5.42 seconds
[100]
Acc: 0.51 F1: 0.421
826609 55522377 0.015
Time: 5.61 seconds
[50]
Acc: 0.523 F1: 0.434
1378986 55522377 0.025
Time: 5.59 seconds
[25]
Acc: 0.532 F1: 0.441
2483889 55522377 0.045
Time: 5.56 seconds
[10]
Acc: 0.537 F1: 0.446
5798729 55522377 0.104
Time: 5.79 seconds
[1]
Acc: 0.539 F1: 0.448
55522377 55522377 1.0

### AU

In [None]:
delta = 15

In [None]:
anno_path = os.path.join(root,f'data/Annotations/{tsk}/{split[0]}')
with open(os.path.join(root, f'data/Annotations/{tsk}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
with open(os.path.join(root,f'models/ABAW6/{vis}/AU/{tsk}_{typ[0]}_visual.pkl'), 'rb') as f:
        data = pickle.load(f)

task1 = task[1]
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
    feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
    for imgname, val in feature.items():
        if imgname in data[task1][vname]:
            data[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data[task1][vname].items()):
            if len(value) < 3:
                data[task1][vname].pop(img)

100%|██████████| 236/236 [01:50<00:00,  2.14it/s]


In [None]:
train_vid = {}
for vname in tqdm(vidnames):
        img, predict, label = [], [], []
        for imgname, val in sorted(data[tsk][vname].items()):
            vis_feat = torch.tensor(val[visual_feat]).to(device)
            if auft == 'nope':
                aud_feat = None
            else:
                aud_feat = torch.tensor(val[auft]).to(device)
            if tsk == task[2]:
                _, _, vpred, apred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor([vpred, apred])
            else:
                _, pred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor(pred)
            ind = int(imgname.split('/')[1][:-4])
            img.append(ind)
            predict.append(preds)
            label.append(data[tsk][vname][imgname]['label'])
        train_vid[vname] = (img, predict, label)

100%|██████████| 236/236 [00:00<00:00, 2371.61it/s]


In [None]:
thresholds = np.array([0.30000000000000004, 0.2, 0.2, 0.30000000000000004, 0.5, 0.30000000000000004, 0.6, 0.1, 0.1, 0.1, 0.6, 0.30000000000000004])

In [None]:
stride2scores={}
for stride in [200, 100, 50, 25, 10]:
    total_true, predictions, max_decision_values = [],[],[]
    decision_values = [[] for _ in range(8)]
    for vidname, (img, predict, label) in train_vid.items():
        index = []
        for i,ind in enumerate(img):
            total_true.append(label[i].cpu().numpy())
            index.append(ind-1)
        preds_proba = smooth_prediction(img, predict)
        for i in range(len(index)):
            best_ind, proba = slide_window(preds_proba, index[i], delta, stride)
            aus = (proba>=thresholds)*1
            predictions.append(aus)
            for j in range(8):
                if proba[j] > thresholds[j]:
                    decision_values[j].append(proba[j])
                else:
                    decision_values[j].append(thresholds[j])
            max_decision_values.append([max(values) for values in decision_values])
    stride2scores[stride] = (np.array(total_true),np.array(predictions),np.array(max_decision_values))

In [None]:
def get_threshold(stride,fpr_corrected):
    (total_true,predictions,max_decision_values) = stride2scores[stride]
    mistakes = max_decision_values[predictions.any() != total_true.any()]
    best_threshold = -1
    for i, threshold in enumerate(sorted(max_decision_values[predictions.all() == total_true.all()])[::-1]):
        fpr = (mistakes > threshold).sum()/len(predictions)
        if fpr > fpr_corrected:
            if best_threshold == -1:
                best_threshold = threshold
            print(stride, 'best_threshold', best_threshold, i)
            break
        best_threshold = threshold
    return best_threshold

In [None]:
stride2threshold = {}
for stride in stride2scores:
    fpr_corrected=0.05
    stride2threshold[stride] = get_threshold(stride,fpr_corrected)
stride2threshold[1] = 0
print(stride2threshold)

{200: array([[0.3       , 0.2       , 0.2       , ..., 0.3       , 0.6       ,
        0.1       ],
       [0.3       , 0.2       , 0.2       , ..., 0.3       , 0.6       ,
        0.1       ],
       [0.3       , 0.2       , 0.2       , ..., 0.3       , 0.6       ,
        0.1       ],
       ...,
       [0.88672322, 0.70943815, 0.91607934, ..., 0.99227798, 0.99390829,
        0.25974458],
       [0.88672322, 0.70943815, 0.91607934, ..., 0.99227798, 0.99390829,
        0.25974458],
       [0.88672322, 0.70943815, 0.91607934, ..., 0.99227798, 0.99390829,
        0.25974458]]), 100: array([[0.3       , 0.2       , 0.2       , ..., 0.3       , 0.6       ,
        0.1       ],
       [0.3       , 0.2       , 0.2       , ..., 0.3       , 0.6       ,
        0.1       ],
       [0.3       , 0.2       , 0.2       , ..., 0.3       , 0.6       ,
        0.1       ],
       ...,
       [0.88672322, 0.70943815, 0.91607934, ..., 0.99227798, 0.99390829,
        0.25974458],
       [0.88672322, 0.7

In [None]:
all_strides=[
    [200, 100, 50, 10, 1],
    [50, 25, 1],
    [50, 10, 1],
    [200,50,1],
    [100,50,1],
    [200,1],
    [100,1],
    [50,1]
]
for s in stride2threshold.keys():
    all_strides.append([s])

for strides in all_strides:
    start = time.time()
    print(strides)
    last_stride=strides[-1]

    total_true=[]
    total_preds=[]
    total_frames_processed,total_frames=0,0
    time_each = []
    start = time.time()
    for videoname, (img, predict, label) in test_vid.items():
        emotional_img=[]
        start1 = time.time()
        for i,ind in enumerate(img):
            total_true.append(label[i].cpu().numpy())
            emotional_img.append(ind-1)
        cur_ind=0
        preds_proba=[]
        for i in range(img[-1]):
            if img[cur_ind]-1==i:
                preds_proba.append(predict[cur_ind])
                cur_ind+=1
            else:
                if cur_ind==0:
                    preds_proba.append(predict[cur_ind])
                else:
                    w=(i-img[cur_ind-1]+1)/(img[cur_ind]-img[cur_ind-1])
                    pred=w*predict[cur_ind-1]+(1-w)*predict[cur_ind]
                    preds_proba.append(pred)

        preds_proba=np.array([p.cpu().numpy() for p in preds_proba])
        preds=[[-1]*12 for _ in range(len(emotional_img))]
        end1 = time.time()
        time_each.append(end1-start1)
        for stride in strides:
            threshold=stride2threshold[stride]
            for i in range(len(emotional_img)):
                if max(preds[i])<0:
                    i1=max(emotional_img[i]-delta,0)
                    cur_preds=preds_proba[i1:emotional_img[i]+delta+1:stride]
                    proba=np.mean(cur_preds,axis=0)
                    aus=(proba>=thresholds)*1
                    if stride != 1:
                        if proba.all()>=threshold.all() or stride==last_stride:
                            total_frames_processed+=len(cur_preds)
                            total_frames+=len(preds_proba[i1:emotional_img[i]+delta+1])
                            preds[i] = aus
                    else:
                        if proba.all()>=threshold or stride==last_stride:
                            total_frames_processed+=len(cur_preds)
                            total_frames+=len(preds_proba[i1:emotional_img[i]+delta+1])
                            preds[i] = aus
        for p in preds:
            total_preds.append(p)
    end = time.time()
    elapsed_time = end - start - sum(time_each)
    total_true=np.array(total_true)
    pred=np.array(total_preds)
    print('Acc:',round((pred==total_true).mean(),3), 'F1:',round(np.mean([f1_score(y_true=total_true[:,i],y_pred=pred[:,i]) for i in range(pred.shape[1])]),3))
    print(total_frames_processed,total_frames,round(total_frames_processed/total_frames,3))
    print(f"Time: {elapsed_time:.2f} seconds")

[200, 100, 50, 10, 1]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 14.08 seconds
[50, 25, 1]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 12.03 seconds
[50, 10, 1]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 12.65 seconds
[200, 50, 1]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 12.12 seconds
[100, 50, 1]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 12.02 seconds
[200, 1]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 11.22 seconds
[100, 1]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 11.00 seconds
[50, 1]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 11.44 seconds
[200]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 10.24 seconds
[100]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 10.22 seconds
[50]
Acc: 0.84 F1: 0.461
445842 13797139 0.032
Time: 10.23 seconds
[25]
Acc: 0.849 F1: 0.479
889687 13797139 0.064
Time: 10.24 seconds
[10]
Acc: 0.857 F1: 0.492
1779368 13797139 0.129
Time: 10.48 seconds
[1]
Acc: 0.86 F1: 0.496
13797139 13797139 1.0
Time: 10.59 seconds


## VA

In [None]:
anno_path = os.path.join(root,f'data/Annotations/{tsk}/{split[0]}')
with open(os.path.join(root, f'data/Annotations/{tsk}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
with open(os.path.join(root,f'models/ABAW6/{vis}/VA/{tsk}_{typ[0]}_visual.pkl'), 'rb') as f:
        data = pickle.load(f)

task1 = task[2]
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
    feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
    for imgname, val in feature.items():
        if imgname in data[task1][vname]:
            data[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data[task1][vname].items()):
            if len(value) < 3:
                data[task1][vname].pop(img)

100%|██████████| 285/285 [00:42<00:00,  6.70it/s]


In [None]:
img, predict, label = [], [], []
vid = {}
iterator = iter(train_loader)
for i in range(len(train_loader)//32):
    VA = next(iterator)
    images = VA['frame']
    vis_feat, aud_feat, y = VA[visual_feat], VA[auft], VA['label']
    vis_feat, aud_feat, y = vis_feat.to(device), aud_feat.to(device), y.to(device)
    _, _, vpred, apred = mlp_model(vis_feat, aud_feat)
    preds = torch.cat((vpred, apred), dim=1)
    img.extend(images)
    predict.extend(preds)
    label.extend(y)

In [None]:
index, pred, lab = [], [], []
train_vid = {}
for i, val in enumerate(img):
    ind = int(val.split('/')[1][:-4])
    vname = val.split('/')[0]
    if i == 0:
        prename = vname
        index.append(ind)
        pred.append(predict[i])
        lab.append(label[i])
    else:
        if vname == prename:
            index.append(ind)
            pred.append(predict[i])
            lab.append(label[i])
        else:
            combined = list(zip(index, pred, lab))
            combined_sorted = sorted(combined, key=lambda x: x[0])
            index_list_sorted, pred_list_sorted, lab_list_sorted = zip(*combined_sorted)
            train_vid[prename] = (list(index_list_sorted), list(pred_list_sorted), list(lab_list_sorted))
            prename = vname
            index, pred, lab = [], [], []
            index.append(ind)
            pred.append(predict[i])
            lab.append(label[i])

In [None]:
train_vid = {}
for vname in tqdm(vidnames):
        img, predict, label = [], [], []
        for imgname, val in sorted(data[tsk][vname].items()):
            vis_feat = torch.tensor(val[visual_feat]).to(device)
            if auft == 'nope':
                aud_feat = None
            else:
                aud_feat = torch.tensor(val[auft]).to(device)
            if tsk == task[2]:
                _, _, vpred, apred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor([vpred, apred])
            else:
                _, pred = mlp_model(vis_feat, aud_feat)
                preds = torch.tensor(pred)
            ind = int(imgname.split('/')[1][:-4])
            img.append(ind)
            predict.append(preds)
            label.append(data[tsk][vname][imgname]['label'])
        train_vid[vname] = (img, predict, label)

In [None]:
delta = 30

In [None]:
stride2scores={}
for stride in [200, 100, 50, 25, 10]:
    total_true, predictions, max_decision_values, decision_values1, decision_values2 = [],[],[], [], []
    for vidname, (img, predict, label) in train_vid.items():
        index = []
        for i,ind in enumerate(img):
            total_true.append(label[i].cpu().numpy())
            index.append(ind-1)
        preds_proba = smooth_prediction(img, predict)
        for i in range(len(index)):
            best_ind, proba = slide_window(preds_proba, index[i], delta, stride)
            predictions.append(proba)
            decision_values1.append(proba[0])
            decision_values2.append(proba[1])
            max_decision_values.append([max(decision_values1),max(decision_values2)])
    stride2scores[stride] = (np.array(total_true),np.array(predictions),np.array(max_decision_values))

In [None]:
def get_threshold(stride,fpr_corrected):
    (total_true,predictions,max_decision_values) = stride2scores[stride]
    mistakes = max_decision_values[predictions.any() != total_true.any()]
    best_threshold = -1
    for i, threshold in enumerate(sorted(max_decision_values[predictions.any() == total_true.any()])[::-1]):
        tpr = i/len(predictions)
        fpr = (mistakes > threshold).sum()/len(predictions)

        if fpr > fpr_corrected:
            if best_threshold == -1:
                best_threshold = threshold
            print(stride, 'best_threshold', best_threshold, i)
            break
        best_threshold = threshold
    return best_threshold

In [None]:
stride2threshold = {}
for stride in stride2scores:
    fpr_corrected=0.05
    stride2threshold[stride] = get_threshold(stride,fpr_corrected)
stride2threshold[1] = 0
print(stride2threshold)

{200: array([[0.24387085, 0.24387085],
       [0.24387085, 0.24387085],
       [0.24387085, 0.24387085],
       [0.24387085, 0.24387085],
       [0.24387085, 0.24387085],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.4420853 , 0.4420853 ],
       [0.50624746, 0.50624746],
       [0.50624746, 0.50624746],
       [0.50624746, 0.50624746],
       [0.50624746, 0.50624746],
       [0.54399276, 0.54399276],
       [0.54399276, 0.54399276],
       [0.54399276, 0.54399276],
       [0.54399276, 0.54399276],
       [0.54399276, 0.54399276],
       [0.54399276, 0.54399276],
       [0.54399276, 0.54399276],
       [0.54399276, 0.54399276],
    

In [None]:
all_strides=[
    [200, 100, 50, 10, 1],
    [50, 25, 1],
    [50, 10, 1],
    [200,50,1],
    [100,50,1],
    [200,1],
    [100,1],
    [50,1]
]
for s in stride2threshold.keys():
    all_strides.append([s])

for strides in all_strides:
    print(strides)
    last_stride=strides[-1]

    total_true=[]
    total_preds=[]
    total_frames_processed,total_frames=0,0
    time_each = []
    start = time.time()
    for videoname, (img, predict, label) in test_vid.items():
        emotional_img=[]
        start1 = time.time()
        for i,ind in enumerate(img):
            total_true.append(label[i].cpu().numpy())
            emotional_img.append(ind-1)
        cur_ind=0
        preds_proba=[]
        for i in range(img[-1]):
            if img[cur_ind]-1==i:
                preds_proba.append(predict[cur_ind])
                cur_ind+=1
            else:
                if cur_ind==0:
                    preds_proba.append(predict[cur_ind])
                else:
                    w=(i-img[cur_ind-1]+1)/(img[cur_ind]-img[cur_ind-1])
                    pred=w*predict[cur_ind-1]+(1-w)*predict[cur_ind]
                    preds_proba.append(pred)

        preds_proba=np.array([p.cpu().detach().numpy() for p in preds_proba])

        preds=[[1]*2 for _ in range(len(emotional_img))]
        end1 = time.time()
        time_each.append(end1 - start1)
        for stride in strides:
            threshold=stride2threshold[stride]
            for i in range(len(emotional_img)):
                if preds[i][0]>=-1 and preds[i][1]>=-1:
                    i1=max(emotional_img[i]-delta,0)
                    cur_preds=preds_proba[i1:emotional_img[i]+delta+1:stride]
                    proba=np.mean(cur_preds,axis=0)
                    best_ind=np.argmax(proba)
                    if stride == 1:
                        if proba[best_ind]>=threshold or stride==last_stride:
                                total_frames_processed+=len(cur_preds)
                                total_frames+=len(preds_proba[i1:emotional_img[i]+delta+1])
                                preds[i]=proba
                    else:
                        if proba[0].all()>=threshold[0].all() or proba[1].all()>=threshold[1].all() or stride==last_stride:
                                total_frames_processed+=len(cur_preds)
                                total_frames+=len(preds_proba[i1:emotional_img[i]+delta+1])
                                preds[i]=proba

        for p in preds:
            total_preds.append(p)
    end = time.time()
    elapsed_time = end - start - sum(time_each)
    total_true=np.array(total_true)
    preds=np.array(total_preds)
    ccc1, ccc2 = compute_VA_CCC(preds, total_true)
    print(f'CCCV: {ccc1:.3f}; CCCA: {ccc2:.3f}')
    print(total_frames_processed,total_frames,round(total_frames_processed/total_frames,3))
    print(f"Time: {elapsed_time:.2f} seconds")

[200, 100, 50, 10, 1]
CCCV: 0.788; CCCA: 0.582
806494 3416235 0.236
Time: 1.66 seconds
[50, 25, 1]
CCCV: 0.788; CCCA: 0.582
739232 2049741 0.361
Time: 1.07 seconds
[50, 10, 1]
CCCV: 0.788; CCCA: 0.582
783940 2049741 0.382
Time: 1.00 seconds
[200, 50, 1]
CCCV: 0.788; CCCA: 0.582
716878 2049741 0.35
Time: 0.95 seconds
[100, 50, 1]
CCCV: 0.788; CCCA: 0.582
716878 2049741 0.35
Time: 0.98 seconds
[200, 1]
CCCV: 0.788; CCCA: 0.582
694524 1366494 0.508
Time: 0.63 seconds
[100, 1]
CCCV: 0.788; CCCA: 0.582
694524 1366494 0.508
Time: 0.53 seconds
[50, 1]
CCCV: 0.788; CCCA: 0.582
705601 1366494 0.516
Time: 0.62 seconds
[200]
CCCV: 0.755; CCCA: 0.547
11277 683247 0.017
Time: 0.36 seconds
[100]
CCCV: 0.755; CCCA: 0.547
11277 683247 0.017
Time: 0.24 seconds
[50]
CCCV: 0.776; CCCA: 0.562
22354 683247 0.033
Time: 0.30 seconds
[25]
CCCV: 0.783; CCCA: 0.575
33631 683247 0.049
Time: 0.30 seconds
[10]
CCCV: 0.787; CCCA: 0.579
78339 683247 0.115
Time: 0.28 seconds
[1]
CCCV: 0.788; CCCA: 0.582
683247 683247

# Draft

In [None]:
task = ['EXPR_Recognition_Challenge','AU_Detection_Challenge','VA_Estimation_Challenge']
split = ['Train_Set', 'Validation_Set']
typ = ['Train','Val','Test']
visual_feat = 'visualfeat_enet_b2_8_best'
audio_feat = 'audiofeat_wav2vec2'
audio_feat1 = 'audiofeat_vggish'
batch_size = 32

### EXPR

#### Train

In [None]:
feature_v = 'visualfeat_enet_b2_8_best'
task1 = task[0]
feature_dims = 0
data1 = {}
data1[task1] = {}
iname = []
anno_path = os.path.join(root,f'data/Annotations/{task[0]}/{split[0]}')
with open(os.path.join(root, f'data/Annotations/{task[0]}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
feat_root = os.path.join(root + '/models/ABAW6', feature_v)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        with open(os.path.join(anno_path, f'{vname}.txt')) as f:
            labels = f.read().splitlines()
        data1[task1][vname] = {}

        for imgname, val in feature.items():
            for i,line in enumerate(labels):
                if i > 0:
                    imname = get_names(vname, i)
                    if imname == imgname:
                        exp = int(line)
                        if exp >= 0:
                            labs = torch.tensor(exp)
                            data1[task1][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                            iname.append(imname)

100%|██████████| 199/199 [2:29:21<00:00, 45.03s/it]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[0]}_{typ[0]}_visual.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

100%|██████████| 199/199 [05:13<00:00,  1.58s/it]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[0]}_{typ[0]}_visual_wav2vec2.pkl', 'wb') as f:
    pickle.dump(data1, f)

#### Val

In [None]:
feature_v = 'visualfeat_enet_b2_8_best'
task1 = task[0]
feature_dims = 0
data1 = {}
data1[task1] = {}
iname = []
anno_path = os.path.join(root,f'data/Annotations/{task[0]}/{split[0]}')
with open(os.path.join(root, f'data/Annotations/{task[0]}/{typ[1]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
feat_root = os.path.join(root + '/models/ABAW6', feature_v)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        #fname = os.path.join(vname, '.npy')
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        with open(os.path.join(anno_path, f'{vname}.txt')) as f:
            labels = f.read().splitlines()
        data1[task1][vname] = {}

        for imgname, val in feature.items():
            for i,line in enumerate(labels):
                if i > 0:
                    imname = get_names(vname, i)
                    if imname == imgname:
                        exp = int(line)
                        if exp >= 0:
                            labs = torch.tensor(exp)
                            data1[task1][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                            iname.append(imname)

In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[0]}_{typ[1]}_visual.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[0]}_{typ[1]}_visual_wav2vec2.pkl', 'wb') as f:
    pickle.dump(data1, f)

#### Test

In [None]:
feature_v = 'visualfeat_enet_b2_8_best'
task1 = task[0]
feature_dims = 0
data1 = {}
data1[task1] = {}
iname = []
anno_path = os.path.join(root,f'data/Annotations/{task[0]}/{split[1]}')
with open(os.path.join(root, f'data/Annotations/{task[0]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
feat_root = os.path.join(root + '/models/ABAW6', feature_v)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        with open(os.path.join(anno_path, f'{vname}.txt')) as f:
            labels = f.read().splitlines()
        data1[task1][vname] = {}

        for imgname, val in feature.items():
            for i,line in enumerate(labels):
                if i > 0:
                    imname = get_names(vname, i)
                    if imname == imgname:
                        exp = int(line)
                        if exp >= 0:
                            labs = torch.tensor(exp)
                            data1[task1][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                            iname.append(imname)

In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[0]}_{typ[2]}_visual.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[0]}_{typ[2]}_visual_wav2vec2.pkl', 'wb') as f:
    pickle.dump(data1, f)

### AU

#### Train

In [None]:
feature_v = 'visualfeat_enet_b2_8_best'
task1 = task[1]
feature_dims = 0
data1 = {}
data1[task1] = {}
iname = []
anno_path = os.path.join(root,f'data/Annotations/{task[1]}/{split[0]}')
with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
feat_root = os.path.join(root + '/models/ABAW6', feature_v)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        with open(os.path.join(anno_path, f'{vname}.txt')) as f:
            labels = f.read().splitlines()
        data1[task1][vname] = {}

        for imgname, val in feature.items():
            for i,line in enumerate(labels):
                if i > 0:
                    imname = get_names(vname, i)
                    if imname == imgname:
                        splitted_line=line.split(',')
                        aus = list(map(int,splitted_line))
                        if min(aus) >= 0:
                            labs = torch.tensor(aus)
                            data1[task1][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                            iname.append(imname)

100%|██████████| 236/236 [2:19:24<00:00, 35.44s/it]    


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/AU/{task[1]}_{typ[0]}_visual.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

100%|██████████| 236/236 [00:13<00:00, 17.86it/s]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/AU/{task[1]}_{typ[0]}_visual_wav2vec2.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
with open(os.path.join(root,f'models/ABAW6/AU/{task[1]}_{typ[0]}_visual.pkl'), 'rb') as f:
    data1 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[1]
with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()

In [None]:
feature_a = 'audiofeat_vggish'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

100%|██████████| 236/236 [00:20<00:00, 11.35it/s]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/AU/{task[1]}_{typ[0]}_visual_vggish.pkl', 'wb') as f:
    pickle.dump(data1, f)

#### Val

In [None]:
feature_v = 'visualfeat_enet_b2_8_best'
task1 = task[1]
feature_dims = 0
data1 = {}
data1[task1] = {}
iname = []
anno_path = os.path.join(root,f'data/Annotations/{task[1]}/{split[0]}')
with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[1]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
feat_root = os.path.join(root + '/models/ABAW6', feature_v)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        with open(os.path.join(anno_path, f'{vname}.txt')) as f:
            labels = f.read().splitlines()
        data1[task1][vname] = {}

        for imgname, val in feature.items():
            for i,line in enumerate(labels):
                if i > 0:
                    imname = get_names(vname, i)
                    if imname == imgname:
                        splitted_line=line.split(',')
                        aus = list(map(int,splitted_line))
                        if min(aus) >= 0:
                            labs = torch.tensor(aus)
                            data1[task1][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                            iname.append(imname)

100%|██████████| 59/59 [17:09<00:00, 17.45s/it]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/AU/{task[1]}_{typ[1]}_visual.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

100%|██████████| 59/59 [00:02<00:00, 25.22it/s]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/AU/{task[1]}_{typ[1]}_visual_wav2vec2.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
with open(os.path.join(root,f'models/ABAW6/AU/{task[1]}_{typ[1]}_visual.pkl'), 'rb') as f:
    data1 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[1]
with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[1]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()

In [None]:
feature_a = 'audiofeat_vggish'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

100%|██████████| 59/59 [00:07<00:00,  7.95it/s]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/AU/{task[1]}_{typ[1]}_visual_vggish.pkl', 'wb') as f:
    pickle.dump(data1, f)

#### Test

In [None]:
feature_v = 'visualfeat_enet_b2_8_best'
task1 = task[2]
feature_dims = 0
data1 = {}
data1[task1] = {}
iname = []
anno_path = os.path.join(root,f'data/Annotations/{task[2]}/{split[1]}')
with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
feat_root = os.path.join(root + '/models/ABAW6', feature_v)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        #fname = os.path.join(vname, '.npy')
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        with open(os.path.join(anno_path, f'{vname}.txt')) as f:
            labels = f.read().splitlines()
        data1[task1][vname] = {}

        for imgname, val in feature.items():
            for i,line in enumerate(labels):
                if i > 0:
                    imname = get_names(vname, i)
                    if imname == imgname:
                        splitted_line=line.split(',')
                        valence=float(splitted_line[0])
                        arousal=float(splitted_line[1])
                        if valence >= -1 and arousal >= -1:
                            labs = torch.tensor([valence, arousal])
                            data1[task1][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                            iname.append(imname)

100%|██████████| 105/105 [29:38<00:00, 16.94s/it]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/AU/{task[1]}_{typ[2]}_visual.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

100%|██████████| 105/105 [00:05<00:00, 20.62it/s]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/AU/{task[1]}_{typ[2]}_visual_wav2vec2.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
with open(os.path.join(root,f'models/ABAW6/AU/{task[1]}_{typ[2]}_visual.pkl'), 'rb') as f:
    data1 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[1]
with open(os.path.join(root, f'data/Annotations/{task[1]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()

In [None]:
feature_a = 'audiofeat_vggish'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

100%|██████████| 105/105 [00:09<00:00, 11.66it/s]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/AU/{task[1]}_{typ[2]}_visual_vggish.pkl', 'wb') as f:
    pickle.dump(data1, f)

### VA

#### Train

In [None]:
feature_v = 'visualfeat_enet_b2_8_best'
task1 = task[2]
feature_dims = 0
data1 = {}
data1[task1] = {}
iname = []
anno_path = os.path.join(root,f'data/Annotations/{task[2]}/{split[0]}')
with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[0]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
feat_root = os.path.join(root + '/models/ABAW6', feature_v)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        #fname = os.path.join(vname, '.npy')
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        with open(os.path.join(anno_path, f'{vname}.txt')) as f:
            labels = f.read().splitlines()
        data1[task1][vname] = {}

        for imgname, val in feature.items():
            for i,line in enumerate(labels):
                if i > 0:
                    imname = get_names(vname, i)
                    if imname == imgname:
                        splitted_line=line.split(',')
                        valence=float(splitted_line[0])
                        arousal=float(splitted_line[1])
                        if valence >= -1 and arousal >= -1:
                            labs = torch.tensor([valence, arousal])
                            data1[task1][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                            iname.append(imname)

In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[2]}_{typ[0]}_visual.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[2]}_{typ[0]}_visual_wav2vec2.pkl', 'wb') as f:
    pickle.dump(data1, f)

#### Val

In [None]:
feature_v = 'visualfeat_enet_b2_8_best'
task1 = task[2]
feature_dims = 0
data1 = {}
data1[task1] = {}
iname = []
anno_path = os.path.join(root,f'data/Annotations/{task[2]}/{split[0]}')
with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[1]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
feat_root = os.path.join(root + '/models/ABAW6', feature_v)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        with open(os.path.join(anno_path, f'{vname}.txt')) as f:
            labels = f.read().splitlines()
        data1[task1][vname] = {}

        for imgname, val in feature.items():
            for i,line in enumerate(labels):
                if i > 0:
                    imname = get_names(vname, i)
                    if imname == imgname:
                        splitted_line=line.split(',')
                        valence=float(splitted_line[0])
                        arousal=float(splitted_line[1])
                        if valence >= -1 and arousal >= -1:
                            labs = torch.tensor([valence, arousal])
                            data1[task1][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                            iname.append(imname)

In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[2]}_{typ[1]}_visual.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[2]}_{typ[1]}_visual_wav2vec2.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/VA/{task[2]}_{typ[1]}_visual.pkl'), 'rb') as f:
    data1 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[2]
with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[1]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()

In [None]:
feature_a = 'audiofeat_vggish'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

100%|██████████| 71/71 [00:15<00:00,  4.64it/s]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{vis}/VA/{task[2]}_{typ[1]}_visual_vggish.pkl', 'wb') as f:
    pickle.dump(data1, f)

#### Test

In [None]:
feature_v = 'visualfeat_enet_b2_8_best'
task1 = task[2]
feature_dims = 0
data1 = {}
data1[task1] = {}
iname = []
anno_path = os.path.join(root,f'data/Annotations/{task[2]}/{split[1]}')
with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()
feat_root = os.path.join(root + '/models/ABAW6', feature_v)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        #fname = os.path.join(vname, '.npy')
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        with open(os.path.join(anno_path, f'{vname}.txt')) as f:
            labels = f.read().splitlines()
        data1[task1][vname] = {}

        for imgname, val in feature.items():
            for i,line in enumerate(labels):
                if i > 0:
                    imname = get_names(vname, i)
                    if imname == imgname:
                        splitted_line=line.split(',')
                        valence=float(splitted_line[0])
                        arousal=float(splitted_line[1])
                        if valence >= -1 and arousal >= -1:
                            labs = torch.tensor([valence, arousal])
                            data1[task1][vname][imgname] = {f'{feature_v}': val, 'label': labs}
                            iname.append(imname)

100%|██████████| 76/76 [1:03:17<00:00, 49.97s/it]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[2]}_{typ[2]}_visual.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
feature_a = 'audiofeat_wav2vec2'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

100%|██████████| 76/76 [02:47<00:00,  2.20s/it]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{task[2]}_{typ[2]}_visual_wav2vec2.pkl', 'wb') as f:
    pickle.dump(data1, f)

In [None]:
with open(os.path.join(root,f'models/ABAW6/{vis}/VA/{task[2]}_{typ[2]}_visual.pkl'), 'rb') as f:
    data1 = pickle.load(f)

In [None]:
dims = 0
iname = []
task1 = task[2]
with open(os.path.join(root, f'data/Annotations/{task[2]}/{typ[2]}.txt'), 'r') as f:
    vidnames = f.read().splitlines()

In [None]:
feature_a = 'audiofeat_vggish'
feat_root = os.path.join(root + '/models/ABAW6', feature_a)
filenames = os.listdir(feat_root)[:]
for vname in tqdm(vidnames):
        feature = np.load(os.path.join(feat_root, f'{vname}.npy'), allow_pickle=True).tolist()
        for imgname, val in feature.items():
            if imgname in data1[task1][vname]:
                data1[task1][vname][imgname].update({f'{feature_a}': val})
        for img, value in list(data1[task1][vname].items()):
            if len(value) < 3:
                data1[task1][vname].pop(img)

100%|██████████| 76/76 [00:11<00:00,  6.43it/s]


In [None]:
with open(f'/content/drive/MyDrive/MSc/Thesis/models/ABAW6/{vis}/VA/{task[2]}_{typ[2]}_visual_vggish.pkl', 'wb') as f:
    pickle.dump(data1, f)

#### Smooth validation predictions

In [None]:
img, predict, label = [], [], []
vid = {}
iterator = iter(test_loader)
for i in range(len(test_loader)//32):
    AU = next(iterator)
    images = AU['frame']
    vis_feat, y = AU[visual_feat], AU['label']
    vis_feat = vis_feat.to(device)
    aud_feat = None
    _, au_pred = AU_model(vis_feat, aud_feat)
    img.extend(images)
    predict.extend(au_pred)
    label.extend(y)

In [None]:
index, pred, lab = [], [], []
test_vid = {}
for i, val in enumerate(img):
    ind = int(val.split('/')[1][:-4])
    vname = val.split('/')[0]
    if i == 0:
        prename = vname
        index.append(ind)
        pred.append(predict[i])
        lab.append(label[i])
    else:
        if vname == prename:
            index.append(ind)
            pred.append(predict[i])
            lab.append(label[i])
        else:
            combined = list(zip(index, pred, lab))
            combined_sorted = sorted(combined, key=lambda x: x[0])
            index_list_sorted, pred_list_sorted, lab_list_sorted = zip(*combined_sorted)
            test_vid[prename] = (list(index_list_sorted), list(pred_list_sorted), list(lab_list_sorted))
            prename = vname
            index, pred, lab = [], [], []
            index.append(ind)
            pred.append(predict[i])
            lab.append(label[i])