In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!python3 -m pip install einops
!python3 -m pip install facenet-pytorch
!python3 -m pip install face_alignment
!python3 -m pip install self_attention_cv

Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0
Collecting facenet-pytorch
  Downloading facenet_pytorch-2.5.3-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.1->torchvision->facenet-pytorch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.1->torchvision->facenet-pytorch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.1

In [3]:
import cv2
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
from einops.layers.torch import Rearrange
from einops import rearrange
from facenet_pytorch import MTCNN
from self_attention_cv import TransformerEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import face_alignment
import dlib
import requests

In [4]:
def extract_frame(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    mid_frame_index = frame_count // 2  # Index of the frame in the middle of the video
    cap.set(cv2.CAP_PROP_POS_FRAMES, mid_frame_index)
    ret, frame = cap.read()
    if ret:
        cap.release()
        return frame
    else:
        cap.release()
        return None

In [5]:
def detect_face(frame):
    mtcnn = MTCNN()
    boxes, _ = mtcnn.detect(frame)
    if boxes is not None:
        # Assuming only one face in the frame
        box = boxes[0]
        x1, y1, x2, y2 = box
        # Crop the frame to the detected face
        cropped_frame = frame[int(y1):int(y2), int(x1):int(x2)]
        return cropped_frame
    else:
        return None

In [6]:
# Function to download the pretrained face alignment model if it doesn't exist
def download_face_alignment_model(url, save_path):
    if not os.path.exists(save_path):
        print("Downloading pretrained face alignment model...")
        response = requests.get(url)
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print("Download complete.")

# Specify the URL of the pretrained face alignment model
face_alignment_model_url = "https://github.com/1adrianb/face-alignment-models/releases/download/2.0.1/2DFAN4-11f355bf06.pth.tar"

# Download the pretrained face alignment model if it doesn't exist
face_alignment_model_path = os.path.abspath("2DFAN4-11f355bf06.pth.tar")
download_face_alignment_model(face_alignment_model_url, face_alignment_model_path)

# Initialize face alignment model
fa = face_alignment.FaceAlignment(2, flip_input=False)  # 2 corresponds to 2D landmarks

def align_face(frame):
    # Perform face alignment
    aligned_faces = fa.get_landmarks(frame)
    if aligned_faces is not None:
        aligned_face = aligned_faces[0]  # Assuming only one face in the frame
        return aligned_face
    else:
        return None


Downloading pretrained face alignment model...
Download complete.


Downloading: "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth" to /root/.cache/torch/hub/checkpoints/s3fd-619a316812.pth
100%|██████████| 85.7M/85.7M [00:05<00:00, 15.9MB/s]
Downloading: "https://www.adrianbulat.com/downloads/python-fan/3DFAN4-4a694010b9.zip" to /root/.cache/torch/hub/checkpoints/3DFAN4-4a694010b9.zip
100%|██████████| 91.9M/91.9M [00:06<00:00, 15.7MB/s]


In [7]:

# Define the preprocessing functions for video frames and spectrograms
def preprocess_image(frame):
    frame_pil = Image.fromarray(frame.astype('uint8'))
    frame_pil = frame_pil.convert('L')  # Convert to grayscale
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485], std=[0.229]),  # Normalize using ImageNet mean and std
    ])
    frame_tensor = transform(frame_pil)
    return frame_tensor

In [8]:
def preprocess_spectrogram(image_path):
    img = Image.open(image_path).convert('L')  # Convert to grayscale
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize to match ViT input size
        transforms.ToTensor(),           # Convert to tensor
    ])
    img_tensor = transform(img)
    return img_tensor

In [9]:
def load_spectrogram_dataset(input_folder):
    X = []
    y = []
    # List all files in the input folder
    files = os.listdir(input_folder)
    # Iterate over files in the folder
    for filename in files:
        if filename.endswith(".png"):  # Assuming mel spectrograms are stored as PNG files
            input_path = os.path.join(input_folder, filename)
            img_tensor = preprocess_spectrogram(input_path)
            X.append(img_tensor)
            # Extract label from filename (assuming filename is in format "abc_IEO_label_xyz.png")
            label = filename.split("_")[2]
            if label == "HAP":
                y.append(0)
            elif label == "SAD":
                y.append(1)
            elif label == "ANG":
                y.append(2)
            elif label == "DIS":
                y.append(3)
            elif label == "FEA":
                y.append(4)
            elif label == "NEU":
                y.append(5)
    return X, y

In [10]:
def load_dataset(input_folder):
    X = []
    y = []
    video_files = [file for file in os.listdir(input_folder) if file.endswith(".flv")]
    for video_file in tqdm(video_files):
        video_path = os.path.join(input_folder, video_file)
        frame = extract_frame(video_path)
        if frame is not None:
            cropped_face = detect_face(frame)
            if cropped_face is not None:
                preprocessed_face = preprocess_image(cropped_face)
                X.append(preprocessed_face)
                label = video_file.split("_")[2].split(".")[0]  # Adjusted to handle different file extensions
                if label == "HAP":
                    y.append(0)
                elif label == "SAD":
                    y.append(1)
                elif label == "ANG":
                    y.append(2)
                elif label == "DIS":
                    y.append(3)
                elif label == "FEA":
                    y.append(4)
                elif label == "NEU":
                    y.append(5)
            else:
                print(f"No face detected in {video_file}. Skipping.")
        else:
            print(f"Failed to extract frame from {video_file}. Skipping.")
    return X, y

In [11]:
# Define the ConcatDataset class to concatenate video frame and spectrogram tensors
class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, X1, X2, y, modality='multimodal', fullscale=False):
        self.X1 = X1
        self.X2 = X2
        self.y = y
        self.modality = modality
        self.fullscale = fullscale
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        if not self.fullscale:
          img1 = self.X1[idx]
          img2 = self.X2[idx]
          label = self.y[idx]
        else:
          img1 = torch.from_numpy(self.X1[idx]).float()  # Convert numpy array to torch tensor
          img2 = torch.from_numpy(self.X2[idx]).float()  # Convert numpy array to torch tensor
          label = torch.tensor(self.y[idx])  # Convert numpy array to torch tensor

        concatenated_img = torch.cat((img1, img2), dim=0)  # Concatenate along 0 dimension
        if self.modality == 'visual':
          return img1, label
        if self.modality == 'audio':
          return img2, label
        return concatenated_img, label # concatenate modalities

In [12]:
def train_model(model, criterion, optimizer, train_loader, device):
    model.train()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0
    for inputs, labels in tqdm(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    accuracy = correct_preds / total_preds
    return epoch_loss, accuracy

In [13]:
def test_model(model, criterion, test_loader, device):
    model.eval()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)
    epoch_loss = running_loss / len(test_loader.dataset)
    accuracy = correct_preds / total_preds
    return epoch_loss, accuracy

In [14]:
class ViT(nn.Module):
    # ViT architecture adapted from here - https://theaisummer.com/vision-transformer/
    def __init__(self, *,
                 img_dim,
                 in_channels=3,
                 patch_dim=16,
                 num_classes=6, # full-scale CREMA-D
                 dim=512,
                 blocks=6,
                 heads=4,
                 dim_linear_block=1024,
                 dim_head=None,
                 dropout=0, transformer=None, classification=True):
        """
        Args:
            img_dim: the spatial image size
            in_channels: number of img channels
            patch_dim: desired patch dim
            num_classes: classification task classes
            dim: the linear layer's dim to project the patches for MHSA
            blocks: number of transformer blocks
            heads: number of heads
            dim_linear_block: inner dim of the transformer linear block
            dim_head: dim head in case you want to define it. defaults to dim/heads
            dropout: for pos emb and transformer
            transformer: in case you want to provide another transformer implementation
            classification: creates an extra CLS token
        """
        super().__init__()
        assert img_dim % patch_dim == 0, f'patch size {patch_dim} not divisible'
        self.p = patch_dim
        self.classification = classification
        tokens = (img_dim // patch_dim) ** 2
        self.token_dim = in_channels * (patch_dim ** 2)
        self.dim = dim
        self.dim_head = (int(dim / heads)) if dim_head is None else dim_head
        self.project_patches = nn.Linear(self.token_dim, dim)

        self.emb_dropout = nn.Dropout(dropout)
        if self.classification:
            self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
            self.pos_emb1D = nn.Parameter(torch.randn(tokens + 1, dim))
            self.mlp_head = nn.Linear(dim, num_classes)
        else:
            self.pos_emb1D = nn.Parameter(torch.randn(tokens, dim))

        if transformer is None:
            self.transformer = TransformerEncoder(dim, blocks=blocks, heads=heads,
                                                  dim_head=self.dim_head,
                                                  dim_linear_block=dim_linear_block,
                                                  dropout=dropout)
        else:
            self.transformer = transformer

    def expand_cls_to_batch(self, batch):
        """
        Args:
            batch: batch size
        Returns: cls token expanded to the batch size
        """
        return self.cls_token.expand([batch, -1, -1])

    def forward(self, img, mask=None):
        batch_size = img.shape[0]
        img_patches = rearrange(
            img, 'b c (patch_x x) (patch_y y) -> b (x y) (patch_x patch_y c)',
                                patch_x=self.p, patch_y=self.p)
        # project patches with linear layer + add pos emb
        img_patches = self.project_patches(img_patches)

        if self.classification:
            img_patches = torch.cat(
                (self.expand_cls_to_batch(batch_size), img_patches), dim=1)

        patch_embeddings = self.emb_dropout(img_patches + self.pos_emb1D)

        # feed patch_embeddings and output of transformer. shape: [batch, tokens, dim]
        y = self.transformer(patch_embeddings, mask)

        if self.classification:
            # we index only the cls token for classification.
            return self.mlp_head(y[:, 0, :])
        else:
            return y

In [15]:
_fullscale = True # Run fullscale experiment?

# Define input_folder and input_folder_spec
if _fullscale:
  input_folder = '/content/drive/MyDrive/videos_fullscale'
  input_folder_spec = '/content/drive/MyDrive/melspec_fullscale'
else:
  input_folder = '/content/drive/MyDrive/csci535_aashi/videos'
  input_folder_spec = '/content/drive/MyDrive/csci535_aashi/melspec'

# Check if input folder exists
if not os.path.exists(input_folder):
    print("Input folder does not exist.")
    sys.exit(1)
# Check if input folder exists
if not os.path.exists(input_folder_spec):
    print("Input folder does not exist.")
    sys.exit(1)

# Load dataset and split into train and test sets

if not _fullscale:
  X, y = load_dataset(input_folder)
  X_spec, y_spec = load_spectrogram_dataset(input_folder_spec)

else:
  # Load numpy arrays with memory-mapping
  X = np.load('/content/drive/MyDrive/csci535_aashi/numpy_models/X.npy', mmap_mode='r')
  y = np.load('/content/drive/MyDrive/csci535_aashi/numpy_models/y.npy', mmap_mode='r')
  X_spec = np.load('/content/drive/MyDrive/csci535_aashi/numpy_models/X_spec.npy', mmap_mode='r')
  y_spec = np.load('/content/drive/MyDrive/csci535_aashi/numpy_models/y_spec.npy', mmap_mode='r')

# Split the data into train and test sets
print(f"Total number of samples: {len(X)}")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f"Number of train samples (video): {len(X_train)}", f"Number of test samples: {len(X_test)}")
X_train_spec, X_test_spec, y_train_spec, y_test_spec = train_test_split(X_spec, y_spec, test_size=0.3, random_state=42)
print(f"Number of train samples (audio): {len(X_train_spec)}", f"Number of test samples: {len(X_test_spec)}")

Total number of samples: 7442
Number of train samples (video): 5209 Number of test samples: 2233
Number of train samples (audio): 5231 Number of test samples: 2242


In [None]:
# Save X, y, X_spec, y_spec
# np.save('X.npy', np.array(X))
# np.save('y.npy', np.array(y))
# np.save('X_spec.npy', np.array(X_spec))
# np.save('y_spec.npy', np.array(y_spec))
# !cp 'X.npy' '/content/drive/MyDrive/csci535/models/'
# !cp 'y.npy' '/content/drive/MyDrive/csci535/models/'
# !cp 'X_spec.npy' '/content/drive/MyDrive/csci535/models/'
# !cp 'y_spec.npy' '/content/drive/MyDrive/csci535/models/'

In [16]:
def train_ViT(_modality):
  # Adjust input channels as per modality
  if _modality == 'multimodal':
    _input_channels = 2
  else:
    _input_channels = 1

  # Initialize the ViT model
  model = ViT(img_dim=224,  # Image dimension
              in_channels=_input_channels,  # Number of input channels
              patch_dim=16,  # Patch dimension
              num_classes=6,  # 6 classes for HAPPY, SAD, ANGRY, DISGUST, FEAR, NEUTRAL
              dim=768,  # Dimensionality of the token embeddings
              blocks=6,  # Number of transformer blocks
              heads=4,  # Number of attention heads
              dim_linear_block=1024,  # Dimensionality of the linear block
              dropout=0.4,  # Dropout rate
              classification=True)  # Whether or not to include a classification token

  # Define device
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  # Define loss function and optimizer
  _lr = 0.01
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=_lr)

  # Concatenate datasets if multimodal
  train_dataset = ConcatDataset(X_train, X_train_spec, y_train, modality = _modality, fullscale = _fullscale)
  test_dataset = ConcatDataset(X_test, X_test_spec, y_test, modality = _modality, fullscale = _fullscale)

  # Create data loaders
  _bs = 32

  # Create data loaders
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=_bs, shuffle=True)
  test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=_bs)

  print(f"\n\nBatch size: {_bs}", f"lr: {_lr}")

  # Training loop
  num_epochs = 50
  print(f"Training ViT for \"{_modality}\" pipeline ...\n------------------------------------------------\n")
  for epoch in range(num_epochs):
      print("Epoch " + str(epoch))
      train_loss, train_accuracy = train_model(model, criterion, optimizer, train_loader, device)
      test_loss, test_accuracy = test_model(model, criterion, test_loader, device)
      print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

  # Save the model
  if _modality == 'multimodal':
    torch.save(model.state_dict(), 'ViT_audio_video_fullscale_'+str(num_epochs)+'_'+str(_bs)+'_'+str(_lr))
  elif _modality == 'audio':
    torch.save(model.state_dict(), 'ViT_audio_fullscale_'+str(num_epochs)+'_'+str(_bs)+'_'+str(_lr))
  elif _modality == 'visual':
    torch.save(model.state_dict(), 'ViT_video_fullscale_'+str(num_epochs)+'_'+str(_bs)+'_'+str(_lr))
  else:
    print("Improper modality provided!")

  return train_loss, train_accuracy, test_loss, test_accuracy

In [19]:
# Define modalities
_modality_visual = ['visual', 'audio', 'multimodal']

In [20]:
# Train ViT
scores = {}
for _m in _modality_visual:
  train_loss, train_accuracy, test_loss, test_accuracy = train_ViT(_m)
  scores[_m] = [train_loss, train_accuracy, test_loss, test_accuracy]



Batch size: 32 lr: 0.01
Training ViT for "visual" pipeline ...
------------------------------------------------

Epoch 0


100%|██████████| 163/163 [00:54<00:00,  3.01it/s]
100%|██████████| 70/70 [00:07<00:00,  8.80it/s]


Epoch 1/50, Train Loss: 2.3409, Train Accuracy: 0.1615, Test Loss: 2.0024, Test Accuracy: 0.1675
Epoch 1


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:08<00:00,  8.67it/s]


Epoch 2/50, Train Loss: 1.9095, Train Accuracy: 0.1760, Test Loss: 1.8304, Test Accuracy: 0.1675
Epoch 2


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:07<00:00,  8.78it/s]


Epoch 3/50, Train Loss: 1.8386, Train Accuracy: 0.1639, Test Loss: 1.8072, Test Accuracy: 0.1675
Epoch 3


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 4/50, Train Loss: 1.8206, Train Accuracy: 0.1789, Test Loss: 1.8206, Test Accuracy: 0.1764
Epoch 4


100%|██████████| 163/163 [00:52<00:00,  3.11it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 5/50, Train Loss: 1.8195, Train Accuracy: 0.1628, Test Loss: 1.8231, Test Accuracy: 0.1764
Epoch 5


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:07<00:00,  8.76it/s]


Epoch 6/50, Train Loss: 1.8123, Train Accuracy: 0.1670, Test Loss: 1.7943, Test Accuracy: 0.1738
Epoch 6


100%|██████████| 163/163 [00:52<00:00,  3.13it/s]
100%|██████████| 70/70 [00:07<00:00,  8.76it/s]


Epoch 7/50, Train Loss: 1.8118, Train Accuracy: 0.1626, Test Loss: 1.8157, Test Accuracy: 0.1675
Epoch 7


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:07<00:00,  8.78it/s]


Epoch 8/50, Train Loss: 1.8139, Train Accuracy: 0.1714, Test Loss: 1.8268, Test Accuracy: 0.1738
Epoch 8


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:07<00:00,  8.76it/s]


Epoch 9/50, Train Loss: 1.8113, Train Accuracy: 0.1607, Test Loss: 1.8036, Test Accuracy: 0.1738
Epoch 9


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:07<00:00,  8.78it/s]


Epoch 10/50, Train Loss: 1.8076, Train Accuracy: 0.1663, Test Loss: 1.8134, Test Accuracy: 0.1738
Epoch 10


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 11/50, Train Loss: 1.8108, Train Accuracy: 0.1580, Test Loss: 1.7978, Test Accuracy: 0.1764
Epoch 11


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 12/50, Train Loss: 1.8081, Train Accuracy: 0.1689, Test Loss: 1.8076, Test Accuracy: 0.1675
Epoch 12


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:07<00:00,  8.75it/s]


Epoch 13/50, Train Loss: 1.8075, Train Accuracy: 0.1590, Test Loss: 1.7920, Test Accuracy: 0.1675
Epoch 13


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 14/50, Train Loss: 1.8054, Train Accuracy: 0.1670, Test Loss: 1.7973, Test Accuracy: 0.1764
Epoch 14


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 15/50, Train Loss: 1.8024, Train Accuracy: 0.1726, Test Loss: 1.7929, Test Accuracy: 0.1764
Epoch 15


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.75it/s]


Epoch 16/50, Train Loss: 1.8017, Train Accuracy: 0.1709, Test Loss: 1.8102, Test Accuracy: 0.1738
Epoch 16


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.72it/s]


Epoch 17/50, Train Loss: 1.8005, Train Accuracy: 0.1745, Test Loss: 1.8029, Test Accuracy: 0.1711
Epoch 17


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 18/50, Train Loss: 1.8017, Train Accuracy: 0.1716, Test Loss: 1.7921, Test Accuracy: 0.1764
Epoch 18


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 19/50, Train Loss: 1.7991, Train Accuracy: 0.1714, Test Loss: 1.8154, Test Accuracy: 0.1738
Epoch 19


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 20/50, Train Loss: 1.8035, Train Accuracy: 0.1574, Test Loss: 1.8029, Test Accuracy: 0.1639
Epoch 20


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.70it/s]


Epoch 21/50, Train Loss: 1.8009, Train Accuracy: 0.1682, Test Loss: 1.7982, Test Accuracy: 0.1764
Epoch 21


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 22/50, Train Loss: 1.8014, Train Accuracy: 0.1697, Test Loss: 1.7914, Test Accuracy: 0.1711
Epoch 22


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.71it/s]


Epoch 23/50, Train Loss: 1.7999, Train Accuracy: 0.1709, Test Loss: 1.7961, Test Accuracy: 0.1764
Epoch 23


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 24/50, Train Loss: 1.8015, Train Accuracy: 0.1795, Test Loss: 1.8011, Test Accuracy: 0.1711
Epoch 24


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 25/50, Train Loss: 1.8016, Train Accuracy: 0.1691, Test Loss: 1.7953, Test Accuracy: 0.1711
Epoch 25


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:07<00:00,  8.76it/s]


Epoch 26/50, Train Loss: 1.7990, Train Accuracy: 0.1676, Test Loss: 1.7993, Test Accuracy: 0.1738
Epoch 26


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 27/50, Train Loss: 1.7973, Train Accuracy: 0.1724, Test Loss: 1.7925, Test Accuracy: 0.1764
Epoch 27


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:07<00:00,  8.76it/s]


Epoch 28/50, Train Loss: 1.7990, Train Accuracy: 0.1680, Test Loss: 1.7918, Test Accuracy: 0.1764
Epoch 28


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:07<00:00,  8.76it/s]


Epoch 29/50, Train Loss: 1.7981, Train Accuracy: 0.1703, Test Loss: 1.7993, Test Accuracy: 0.1738
Epoch 29


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:08<00:00,  8.75it/s]


Epoch 30/50, Train Loss: 1.7999, Train Accuracy: 0.1674, Test Loss: 1.7989, Test Accuracy: 0.1711
Epoch 30


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:08<00:00,  8.72it/s]


Epoch 31/50, Train Loss: 1.8008, Train Accuracy: 0.1720, Test Loss: 1.7947, Test Accuracy: 0.1675
Epoch 31


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 32/50, Train Loss: 1.7970, Train Accuracy: 0.1653, Test Loss: 1.8011, Test Accuracy: 0.1738
Epoch 32


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:08<00:00,  8.72it/s]


Epoch 33/50, Train Loss: 1.7996, Train Accuracy: 0.1674, Test Loss: 1.7942, Test Accuracy: 0.1711
Epoch 33


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:08<00:00,  8.72it/s]


Epoch 34/50, Train Loss: 1.7972, Train Accuracy: 0.1628, Test Loss: 1.8042, Test Accuracy: 0.1675
Epoch 34


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:08<00:00,  8.71it/s]


Epoch 35/50, Train Loss: 1.8006, Train Accuracy: 0.1693, Test Loss: 1.8045, Test Accuracy: 0.1473
Epoch 35


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:07<00:00,  8.78it/s]


Epoch 36/50, Train Loss: 1.8011, Train Accuracy: 0.1737, Test Loss: 1.8045, Test Accuracy: 0.1711
Epoch 36


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:07<00:00,  8.75it/s]


Epoch 37/50, Train Loss: 1.8038, Train Accuracy: 0.1601, Test Loss: 1.7931, Test Accuracy: 0.1764
Epoch 37


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 38/50, Train Loss: 1.7986, Train Accuracy: 0.1808, Test Loss: 1.8087, Test Accuracy: 0.1639
Epoch 38


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.75it/s]


Epoch 39/50, Train Loss: 1.8033, Train Accuracy: 0.1613, Test Loss: 1.7949, Test Accuracy: 0.1675
Epoch 39


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:07<00:00,  8.78it/s]


Epoch 40/50, Train Loss: 1.7975, Train Accuracy: 0.1615, Test Loss: 1.7933, Test Accuracy: 0.1675
Epoch 40


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 41/50, Train Loss: 1.8049, Train Accuracy: 0.1707, Test Loss: 1.8140, Test Accuracy: 0.1675
Epoch 41


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 42/50, Train Loss: 1.8004, Train Accuracy: 0.1632, Test Loss: 1.8354, Test Accuracy: 0.1675
Epoch 42


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 43/50, Train Loss: 1.8031, Train Accuracy: 0.1691, Test Loss: 1.7947, Test Accuracy: 0.1711
Epoch 43


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:07<00:00,  8.76it/s]


Epoch 44/50, Train Loss: 1.7992, Train Accuracy: 0.1620, Test Loss: 1.8303, Test Accuracy: 0.1675
Epoch 44


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.72it/s]


Epoch 45/50, Train Loss: 1.8023, Train Accuracy: 0.1701, Test Loss: 1.8054, Test Accuracy: 0.1764
Epoch 45


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 46/50, Train Loss: 1.8056, Train Accuracy: 0.1695, Test Loss: 1.8039, Test Accuracy: 0.1711
Epoch 46


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.75it/s]


Epoch 47/50, Train Loss: 1.8031, Train Accuracy: 0.1655, Test Loss: 1.7984, Test Accuracy: 0.1473
Epoch 47


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.78it/s]


Epoch 48/50, Train Loss: 1.8009, Train Accuracy: 0.1609, Test Loss: 1.8395, Test Accuracy: 0.1675
Epoch 48


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 49/50, Train Loss: 1.8030, Train Accuracy: 0.1676, Test Loss: 1.8040, Test Accuracy: 0.1764
Epoch 49


100%|██████████| 163/163 [00:51<00:00,  3.20it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 50/50, Train Loss: 1.8019, Train Accuracy: 0.1695, Test Loss: 1.8093, Test Accuracy: 0.1711


Batch size: 32 lr: 0.01
Training ViT for "audio" pipeline ...
------------------------------------------------

Epoch 0


100%|██████████| 163/163 [00:53<00:00,  3.07it/s]
100%|██████████| 70/70 [00:08<00:00,  8.69it/s]


Epoch 1/50, Train Loss: 2.2918, Train Accuracy: 0.1630, Test Loss: 2.1754, Test Accuracy: 0.1675
Epoch 1


100%|██████████| 163/163 [00:52<00:00,  3.11it/s]
100%|██████████| 70/70 [00:08<00:00,  8.71it/s]


Epoch 2/50, Train Loss: 1.8721, Train Accuracy: 0.1720, Test Loss: 1.8062, Test Accuracy: 0.1675
Epoch 2


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 3/50, Train Loss: 1.8219, Train Accuracy: 0.1674, Test Loss: 1.8011, Test Accuracy: 0.1711
Epoch 3


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:07<00:00,  8.82it/s]


Epoch 4/50, Train Loss: 1.8224, Train Accuracy: 0.1678, Test Loss: 1.8172, Test Accuracy: 0.1738
Epoch 4


100%|██████████| 163/163 [00:51<00:00,  3.15it/s]
100%|██████████| 70/70 [00:08<00:00,  8.68it/s]


Epoch 5/50, Train Loss: 1.8218, Train Accuracy: 0.1615, Test Loss: 1.8106, Test Accuracy: 0.1711
Epoch 5


100%|██████████| 163/163 [00:51<00:00,  3.15it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 6/50, Train Loss: 1.8131, Train Accuracy: 0.1651, Test Loss: 1.7966, Test Accuracy: 0.1738
Epoch 6


100%|██████████| 163/163 [00:51<00:00,  3.15it/s]
100%|██████████| 70/70 [00:08<00:00,  8.71it/s]


Epoch 7/50, Train Loss: 1.8141, Train Accuracy: 0.1684, Test Loss: 1.7999, Test Accuracy: 0.1711
Epoch 7


100%|██████████| 163/163 [00:51<00:00,  3.15it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 8/50, Train Loss: 1.8100, Train Accuracy: 0.1645, Test Loss: 1.8452, Test Accuracy: 0.1675
Epoch 8


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.72it/s]


Epoch 9/50, Train Loss: 1.8119, Train Accuracy: 0.1766, Test Loss: 1.7974, Test Accuracy: 0.1738
Epoch 9


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:07<00:00,  8.83it/s]


Epoch 10/50, Train Loss: 1.8096, Train Accuracy: 0.1624, Test Loss: 1.7997, Test Accuracy: 0.1764
Epoch 10


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:07<00:00,  8.76it/s]


Epoch 11/50, Train Loss: 1.8084, Train Accuracy: 0.1774, Test Loss: 1.7980, Test Accuracy: 0.1675
Epoch 11


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 12/50, Train Loss: 1.8056, Train Accuracy: 0.1728, Test Loss: 1.7992, Test Accuracy: 0.1711
Epoch 12


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 13/50, Train Loss: 1.8066, Train Accuracy: 0.1735, Test Loss: 1.7927, Test Accuracy: 0.1711
Epoch 13


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:07<00:00,  8.76it/s]


Epoch 14/50, Train Loss: 1.8042, Train Accuracy: 0.1661, Test Loss: 1.7960, Test Accuracy: 0.1738
Epoch 14


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.75it/s]


Epoch 15/50, Train Loss: 1.8011, Train Accuracy: 0.1657, Test Loss: 1.8170, Test Accuracy: 0.1639
Epoch 15


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:07<00:00,  8.79it/s]


Epoch 16/50, Train Loss: 1.8025, Train Accuracy: 0.1772, Test Loss: 1.7927, Test Accuracy: 0.1764
Epoch 16


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.70it/s]


Epoch 17/50, Train Loss: 1.7998, Train Accuracy: 0.1664, Test Loss: 1.7931, Test Accuracy: 0.1675
Epoch 17


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:08<00:00,  8.69it/s]


Epoch 18/50, Train Loss: 1.8029, Train Accuracy: 0.1666, Test Loss: 1.8069, Test Accuracy: 0.1675
Epoch 18


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 19/50, Train Loss: 1.8007, Train Accuracy: 0.1707, Test Loss: 1.7962, Test Accuracy: 0.1738
Epoch 19


100%|██████████| 163/163 [00:51<00:00,  3.17it/s]
100%|██████████| 70/70 [00:08<00:00,  8.71it/s]


Epoch 20/50, Train Loss: 1.8000, Train Accuracy: 0.1712, Test Loss: 1.7957, Test Accuracy: 0.1738
Epoch 20


100%|██████████| 163/163 [00:51<00:00,  3.16it/s]
100%|██████████| 70/70 [00:08<00:00,  8.72it/s]


Epoch 21/50, Train Loss: 1.7994, Train Accuracy: 0.1657, Test Loss: 1.8036, Test Accuracy: 0.1738
Epoch 21


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.75it/s]


Epoch 22/50, Train Loss: 1.7975, Train Accuracy: 0.1766, Test Loss: 1.8137, Test Accuracy: 0.1639
Epoch 22


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 23/50, Train Loss: 1.7994, Train Accuracy: 0.1714, Test Loss: 1.7991, Test Accuracy: 0.1764
Epoch 23


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.79it/s]


Epoch 24/50, Train Loss: 1.7981, Train Accuracy: 0.1626, Test Loss: 1.7961, Test Accuracy: 0.1764
Epoch 24


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.75it/s]


Epoch 25/50, Train Loss: 1.7999, Train Accuracy: 0.1687, Test Loss: 1.8022, Test Accuracy: 0.1711
Epoch 25


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:07<00:00,  8.78it/s]


Epoch 26/50, Train Loss: 1.7997, Train Accuracy: 0.1601, Test Loss: 1.7979, Test Accuracy: 0.1639
Epoch 26


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.69it/s]


Epoch 27/50, Train Loss: 1.7972, Train Accuracy: 0.1668, Test Loss: 1.7966, Test Accuracy: 0.1738
Epoch 27


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.83it/s]


Epoch 28/50, Train Loss: 1.7979, Train Accuracy: 0.1595, Test Loss: 1.7986, Test Accuracy: 0.1764
Epoch 28


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.70it/s]


Epoch 29/50, Train Loss: 1.8012, Train Accuracy: 0.1687, Test Loss: 1.8019, Test Accuracy: 0.1675
Epoch 29


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.78it/s]


Epoch 30/50, Train Loss: 1.7993, Train Accuracy: 0.1791, Test Loss: 1.7978, Test Accuracy: 0.1639
Epoch 30


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 31/50, Train Loss: 1.8001, Train Accuracy: 0.1661, Test Loss: 1.7986, Test Accuracy: 0.1675
Epoch 31


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 32/50, Train Loss: 1.7992, Train Accuracy: 0.1718, Test Loss: 1.8059, Test Accuracy: 0.1711
Epoch 32


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:08<00:00,  8.73it/s]


Epoch 33/50, Train Loss: 1.8031, Train Accuracy: 0.1695, Test Loss: 1.7997, Test Accuracy: 0.1675
Epoch 33


100%|██████████| 163/163 [00:51<00:00,  3.18it/s]
100%|██████████| 70/70 [00:07<00:00,  8.75it/s]


Epoch 34/50, Train Loss: 1.8043, Train Accuracy: 0.1709, Test Loss: 1.8088, Test Accuracy: 0.1738
Epoch 34


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.78it/s]


Epoch 35/50, Train Loss: 1.8012, Train Accuracy: 0.1758, Test Loss: 1.7944, Test Accuracy: 0.1675
Epoch 35


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.76it/s]


Epoch 36/50, Train Loss: 1.7979, Train Accuracy: 0.1760, Test Loss: 1.7971, Test Accuracy: 0.1764
Epoch 36


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.78it/s]


Epoch 37/50, Train Loss: 1.8010, Train Accuracy: 0.1670, Test Loss: 1.7959, Test Accuracy: 0.1711
Epoch 37


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:08<00:00,  8.72it/s]


Epoch 38/50, Train Loss: 1.8015, Train Accuracy: 0.1766, Test Loss: 1.7926, Test Accuracy: 0.1738
Epoch 38


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:08<00:00,  8.75it/s]


Epoch 39/50, Train Loss: 1.8013, Train Accuracy: 0.1695, Test Loss: 1.7978, Test Accuracy: 0.1764
Epoch 39


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:08<00:00,  8.74it/s]


Epoch 40/50, Train Loss: 1.8099, Train Accuracy: 0.1622, Test Loss: 1.7994, Test Accuracy: 0.1675
Epoch 40


100%|██████████| 163/163 [00:50<00:00,  3.20it/s]
100%|██████████| 70/70 [00:07<00:00,  8.80it/s]


Epoch 41/50, Train Loss: 1.8020, Train Accuracy: 0.1747, Test Loss: 1.7956, Test Accuracy: 0.1711
Epoch 41


100%|██████████| 163/163 [00:50<00:00,  3.21it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 42/50, Train Loss: 1.8029, Train Accuracy: 0.1703, Test Loss: 1.7949, Test Accuracy: 0.1764
Epoch 42


100%|██████████| 163/163 [00:50<00:00,  3.20it/s]
100%|██████████| 70/70 [00:07<00:00,  8.84it/s]


Epoch 43/50, Train Loss: 1.8083, Train Accuracy: 0.1626, Test Loss: 1.8606, Test Accuracy: 0.1639
Epoch 43


100%|██████████| 163/163 [00:51<00:00,  3.20it/s]
100%|██████████| 70/70 [00:08<00:00,  8.71it/s]


Epoch 44/50, Train Loss: 1.8050, Train Accuracy: 0.1674, Test Loss: 1.8044, Test Accuracy: 0.1738
Epoch 44


100%|██████████| 163/163 [00:50<00:00,  3.21it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 45/50, Train Loss: 1.7990, Train Accuracy: 0.1668, Test Loss: 1.7962, Test Accuracy: 0.1675
Epoch 45


100%|██████████| 163/163 [00:50<00:00,  3.20it/s]
100%|██████████| 70/70 [00:08<00:00,  8.75it/s]


Epoch 46/50, Train Loss: 1.7985, Train Accuracy: 0.1724, Test Loss: 1.8002, Test Accuracy: 0.1639
Epoch 46


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:08<00:00,  8.72it/s]


Epoch 47/50, Train Loss: 1.8020, Train Accuracy: 0.1655, Test Loss: 1.8066, Test Accuracy: 0.1639
Epoch 47


100%|██████████| 163/163 [00:51<00:00,  3.19it/s]
100%|██████████| 70/70 [00:07<00:00,  8.77it/s]


Epoch 48/50, Train Loss: 1.8055, Train Accuracy: 0.1707, Test Loss: 1.8093, Test Accuracy: 0.1711
Epoch 48


100%|██████████| 163/163 [00:50<00:00,  3.20it/s]
100%|██████████| 70/70 [00:07<00:00,  8.79it/s]


Epoch 49/50, Train Loss: 1.8037, Train Accuracy: 0.1668, Test Loss: 1.7944, Test Accuracy: 0.1675
Epoch 49


100%|██████████| 163/163 [00:50<00:00,  3.21it/s]
100%|██████████| 70/70 [00:07<00:00,  8.83it/s]


Epoch 50/50, Train Loss: 1.8029, Train Accuracy: 0.1595, Test Loss: 1.8091, Test Accuracy: 0.1639


Batch size: 32 lr: 0.01
Training ViT for "multimodal" pipeline ...
------------------------------------------------

Epoch 0


100%|██████████| 163/163 [00:53<00:00,  3.06it/s]
100%|██████████| 70/70 [00:08<00:00,  8.60it/s]


Epoch 1/50, Train Loss: 2.2664, Train Accuracy: 0.1758, Test Loss: 2.2589, Test Accuracy: 0.1639
Epoch 1


100%|██████████| 163/163 [00:52<00:00,  3.09it/s]
100%|██████████| 70/70 [00:08<00:00,  8.69it/s]


Epoch 2/50, Train Loss: 1.9450, Train Accuracy: 0.1695, Test Loss: 1.8398, Test Accuracy: 0.1639
Epoch 2


100%|██████████| 163/163 [00:52<00:00,  3.09it/s]
100%|██████████| 70/70 [00:08<00:00,  8.62it/s]


Epoch 3/50, Train Loss: 1.8405, Train Accuracy: 0.1687, Test Loss: 1.8024, Test Accuracy: 0.1764
Epoch 3


100%|██████████| 163/163 [00:52<00:00,  3.09it/s]
100%|██████████| 70/70 [00:08<00:00,  8.65it/s]


Epoch 4/50, Train Loss: 1.8198, Train Accuracy: 0.1801, Test Loss: 1.8005, Test Accuracy: 0.1639
Epoch 4


100%|██████████| 163/163 [00:52<00:00,  3.09it/s]
100%|██████████| 70/70 [00:08<00:00,  8.62it/s]


Epoch 5/50, Train Loss: 1.8189, Train Accuracy: 0.1639, Test Loss: 1.8124, Test Accuracy: 0.1764
Epoch 5


100%|██████████| 163/163 [00:52<00:00,  3.09it/s]
100%|██████████| 70/70 [00:08<00:00,  8.65it/s]


Epoch 6/50, Train Loss: 1.8136, Train Accuracy: 0.1818, Test Loss: 1.7984, Test Accuracy: 0.1711
Epoch 6


100%|██████████| 163/163 [00:52<00:00,  3.09it/s]
100%|██████████| 70/70 [00:08<00:00,  8.65it/s]


Epoch 7/50, Train Loss: 1.8129, Train Accuracy: 0.1745, Test Loss: 1.8461, Test Accuracy: 0.1711
Epoch 7


100%|██████████| 163/163 [00:52<00:00,  3.09it/s]
100%|██████████| 70/70 [00:08<00:00,  8.66it/s]


Epoch 8/50, Train Loss: 1.8126, Train Accuracy: 0.1722, Test Loss: 1.7983, Test Accuracy: 0.1738
Epoch 8


100%|██████████| 163/163 [00:52<00:00,  3.09it/s]
100%|██████████| 70/70 [00:08<00:00,  8.64it/s]


Epoch 9/50, Train Loss: 1.8110, Train Accuracy: 0.1634, Test Loss: 1.7996, Test Accuracy: 0.1639
Epoch 9


100%|██████████| 163/163 [00:52<00:00,  3.10it/s]
100%|██████████| 70/70 [00:08<00:00,  8.62it/s]


Epoch 10/50, Train Loss: 1.8065, Train Accuracy: 0.1651, Test Loss: 1.8009, Test Accuracy: 0.1764
Epoch 10


100%|██████████| 163/163 [00:52<00:00,  3.10it/s]
100%|██████████| 70/70 [00:08<00:00,  8.68it/s]


Epoch 11/50, Train Loss: 1.8062, Train Accuracy: 0.1737, Test Loss: 1.8469, Test Accuracy: 0.1711
Epoch 11


100%|██████████| 163/163 [00:52<00:00,  3.11it/s]
100%|██████████| 70/70 [00:08<00:00,  8.62it/s]


Epoch 12/50, Train Loss: 1.8070, Train Accuracy: 0.1751, Test Loss: 1.8054, Test Accuracy: 0.1711
Epoch 12


100%|██████████| 163/163 [00:52<00:00,  3.10it/s]
100%|██████████| 70/70 [00:08<00:00,  8.64it/s]


Epoch 13/50, Train Loss: 1.8057, Train Accuracy: 0.1739, Test Loss: 1.8171, Test Accuracy: 0.1639
Epoch 13


100%|██████████| 163/163 [00:52<00:00,  3.10it/s]
100%|██████████| 70/70 [00:08<00:00,  8.64it/s]


Epoch 14/50, Train Loss: 1.8056, Train Accuracy: 0.1745, Test Loss: 1.8382, Test Accuracy: 0.1473
Epoch 14


100%|██████████| 163/163 [00:52<00:00,  3.11it/s]
100%|██████████| 70/70 [00:08<00:00,  8.65it/s]


Epoch 15/50, Train Loss: 1.8058, Train Accuracy: 0.1661, Test Loss: 1.8192, Test Accuracy: 0.1639
Epoch 15


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:08<00:00,  8.66it/s]


Epoch 16/50, Train Loss: 1.8021, Train Accuracy: 0.1745, Test Loss: 1.8004, Test Accuracy: 0.1675
Epoch 16


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:08<00:00,  8.65it/s]


Epoch 17/50, Train Loss: 1.8012, Train Accuracy: 0.1718, Test Loss: 1.8162, Test Accuracy: 0.1675
Epoch 17


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:08<00:00,  8.68it/s]


Epoch 18/50, Train Loss: 1.8035, Train Accuracy: 0.1572, Test Loss: 1.7948, Test Accuracy: 0.1639
Epoch 18


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:08<00:00,  8.66it/s]


Epoch 19/50, Train Loss: 1.8009, Train Accuracy: 0.1639, Test Loss: 1.7932, Test Accuracy: 0.1675
Epoch 19


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:08<00:00,  8.62it/s]


Epoch 20/50, Train Loss: 1.7995, Train Accuracy: 0.1707, Test Loss: 1.8017, Test Accuracy: 0.1675
Epoch 20


100%|██████████| 163/163 [00:52<00:00,  3.11it/s]
100%|██████████| 70/70 [00:08<00:00,  8.66it/s]


Epoch 21/50, Train Loss: 1.8016, Train Accuracy: 0.1711, Test Loss: 1.7911, Test Accuracy: 0.1764
Epoch 21


100%|██████████| 163/163 [00:52<00:00,  3.13it/s]
100%|██████████| 70/70 [00:08<00:00,  8.68it/s]


Epoch 22/50, Train Loss: 1.8011, Train Accuracy: 0.1711, Test Loss: 1.7981, Test Accuracy: 0.1738
Epoch 22


100%|██████████| 163/163 [00:52<00:00,  3.13it/s]
100%|██████████| 70/70 [00:08<00:00,  8.70it/s]


Epoch 23/50, Train Loss: 1.7975, Train Accuracy: 0.1743, Test Loss: 1.7972, Test Accuracy: 0.1711
Epoch 23


100%|██████████| 163/163 [00:52<00:00,  3.13it/s]
100%|██████████| 70/70 [00:08<00:00,  8.66it/s]


Epoch 24/50, Train Loss: 1.8024, Train Accuracy: 0.1734, Test Loss: 1.7972, Test Accuracy: 0.1675
Epoch 24


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.68it/s]


Epoch 25/50, Train Loss: 1.7990, Train Accuracy: 0.1638, Test Loss: 1.7954, Test Accuracy: 0.1738
Epoch 25


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:08<00:00,  8.59it/s]


Epoch 26/50, Train Loss: 1.7983, Train Accuracy: 0.1666, Test Loss: 1.7980, Test Accuracy: 0.1675
Epoch 26


100%|██████████| 163/163 [00:52<00:00,  3.12it/s]
100%|██████████| 70/70 [00:08<00:00,  8.65it/s]


Epoch 27/50, Train Loss: 1.7992, Train Accuracy: 0.1726, Test Loss: 1.7995, Test Accuracy: 0.1738
Epoch 27


100%|██████████| 163/163 [00:52<00:00,  3.13it/s]
100%|██████████| 70/70 [00:08<00:00,  8.68it/s]


Epoch 28/50, Train Loss: 1.8023, Train Accuracy: 0.1622, Test Loss: 1.7910, Test Accuracy: 0.1675
Epoch 28


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.65it/s]


Epoch 29/50, Train Loss: 1.7992, Train Accuracy: 0.1601, Test Loss: 1.8096, Test Accuracy: 0.1711
Epoch 29


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.66it/s]


Epoch 30/50, Train Loss: 1.8001, Train Accuracy: 0.1663, Test Loss: 1.8014, Test Accuracy: 0.1639
Epoch 30


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.63it/s]


Epoch 31/50, Train Loss: 1.7991, Train Accuracy: 0.1691, Test Loss: 1.7994, Test Accuracy: 0.1675
Epoch 31


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.69it/s]


Epoch 32/50, Train Loss: 1.7992, Train Accuracy: 0.1747, Test Loss: 1.8031, Test Accuracy: 0.1639
Epoch 32


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.63it/s]


Epoch 33/50, Train Loss: 1.8037, Train Accuracy: 0.1714, Test Loss: 1.8051, Test Accuracy: 0.1711
Epoch 33


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.64it/s]


Epoch 34/50, Train Loss: 1.7979, Train Accuracy: 0.1624, Test Loss: 1.8074, Test Accuracy: 0.1675
Epoch 34


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.63it/s]


Epoch 35/50, Train Loss: 1.8013, Train Accuracy: 0.1618, Test Loss: 1.7960, Test Accuracy: 0.1711
Epoch 35


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.67it/s]


Epoch 36/50, Train Loss: 1.8010, Train Accuracy: 0.1680, Test Loss: 1.8059, Test Accuracy: 0.1675
Epoch 36


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.62it/s]


Epoch 37/50, Train Loss: 1.8010, Train Accuracy: 0.1735, Test Loss: 1.7969, Test Accuracy: 0.1639
Epoch 37


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.63it/s]


Epoch 38/50, Train Loss: 1.8003, Train Accuracy: 0.1711, Test Loss: 1.7945, Test Accuracy: 0.1711
Epoch 38


100%|██████████| 163/163 [00:52<00:00,  3.13it/s]
100%|██████████| 70/70 [00:08<00:00,  8.62it/s]


Epoch 39/50, Train Loss: 1.7996, Train Accuracy: 0.1651, Test Loss: 1.8218, Test Accuracy: 0.1675
Epoch 39


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.66it/s]


Epoch 40/50, Train Loss: 1.8005, Train Accuracy: 0.1666, Test Loss: 1.7926, Test Accuracy: 0.1675
Epoch 40


100%|██████████| 163/163 [00:51<00:00,  3.15it/s]
100%|██████████| 70/70 [00:08<00:00,  8.68it/s]


Epoch 41/50, Train Loss: 1.7961, Train Accuracy: 0.1734, Test Loss: 1.8388, Test Accuracy: 0.1639
Epoch 41


100%|██████████| 163/163 [00:51<00:00,  3.15it/s]
100%|██████████| 70/70 [00:08<00:00,  8.60it/s]


Epoch 42/50, Train Loss: 1.8054, Train Accuracy: 0.1689, Test Loss: 1.7939, Test Accuracy: 0.1738
Epoch 42


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.67it/s]


Epoch 43/50, Train Loss: 1.8001, Train Accuracy: 0.1712, Test Loss: 1.7963, Test Accuracy: 0.1639
Epoch 43


100%|██████████| 163/163 [00:51<00:00,  3.15it/s]
100%|██████████| 70/70 [00:08<00:00,  8.63it/s]


Epoch 44/50, Train Loss: 1.7971, Train Accuracy: 0.1722, Test Loss: 1.8131, Test Accuracy: 0.1711
Epoch 44


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.67it/s]


Epoch 45/50, Train Loss: 1.8042, Train Accuracy: 0.1611, Test Loss: 1.8046, Test Accuracy: 0.1639
Epoch 45


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.65it/s]


Epoch 46/50, Train Loss: 1.8066, Train Accuracy: 0.1755, Test Loss: 1.7940, Test Accuracy: 0.1675
Epoch 46


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.65it/s]


Epoch 47/50, Train Loss: 1.8049, Train Accuracy: 0.1720, Test Loss: 1.7967, Test Accuracy: 0.1675
Epoch 47


100%|██████████| 163/163 [00:51<00:00,  3.15it/s]
100%|██████████| 70/70 [00:08<00:00,  8.63it/s]


Epoch 48/50, Train Loss: 1.8003, Train Accuracy: 0.1684, Test Loss: 1.7965, Test Accuracy: 0.1738
Epoch 48


100%|██████████| 163/163 [00:51<00:00,  3.14it/s]
100%|██████████| 70/70 [00:08<00:00,  8.65it/s]


Epoch 49/50, Train Loss: 1.8040, Train Accuracy: 0.1712, Test Loss: 1.7933, Test Accuracy: 0.1675
Epoch 49


100%|██████████| 163/163 [00:51<00:00,  3.15it/s]
100%|██████████| 70/70 [00:08<00:00,  8.64it/s]


Epoch 50/50, Train Loss: 1.8026, Train Accuracy: 0.1778, Test Loss: 1.7954, Test Accuracy: 0.1764


In [21]:
# Print results
print("\nResults\n---------------\n")
for _m, val in scores.items():
  print(f"Modality: {_m}, Train Loss: {val[0]:.4f}, Train Accuracy: {val[1]:.4f}, Test Loss: {val[2]:.4f}, Test Accuracy: {val[3]:.4f}")


Results
---------------

Modality: visual, Train Loss: 1.8019, Train Accuracy: 0.1695, Test Loss: 1.8093, Test Accuracy: 0.1711
Modality: audio, Train Loss: 1.8029, Train Accuracy: 0.1595, Test Loss: 1.8091, Test Accuracy: 0.1639
Modality: multimodal, Train Loss: 1.8026, Train Accuracy: 0.1778, Test Loss: 1.7954, Test Accuracy: 0.1764


In [22]:
# Copy trained models to GDrive
!cp 'ViT_audio_video_fullscale_50_32_0.01' '/content/drive/MyDrive/csci535_aashi/models'
!cp 'ViT_audio_fullscale_50_32_0.01' '/content/drive/MyDrive/csci535_aashi/models'
!cp 'ViT_video_fullscale_50_32_0.01' '/content/drive/MyDrive/csci535_aashi/models'

In [23]:
# Clear memory
import gc
gc.collect()

8