In [1]:
# Import Relevant Libraries

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
from torch import Tensor
from torch.autograd import Variable
from torchvision.utils import save_image
from PIL import Image
from torch.utils.data import DataLoader
from torch.autograd import Variable
from torchvision.models import vgg16, densenet121, resnet152
import glob
import os
import time
import datetime
from tqdm import tqdm
import sys
import pickle

In [2]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


# Data

In [3]:
# Normalization parameters for pre-trained PyTorch models.
# One value per image channel; both arrays are handed to `transforms.Normalize`
# when the dataset builds its frame transform.
# NOTE(review): these look like the usual ImageNet statistics — confirm against the backbone in use.
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])

# Dataset that loads the extracted frames of each video together with its pre-computed audio features.
# Videos with fewer than `sequence_length` frames are left-padded by repeating their first frame.

class Dataset():
    """Map-style dataset yielding ``(frames, label, audio)`` for one video clip.

    Each clip is a directory of ``<frame number>.jpg`` images located at
    ``<dataset_path>/<activity>/<clip name>``. The train/test split lists and the
    ``classInd.txt`` label file are read from ``split_path``. Audio features are
    read from ``audio_features.pickle`` in the working directory and are keyed by
    ``"<clip name>.wav"``.

    Args:
        dataset_path: Root directory containing the extracted frames.
        split_path: Directory holding ``classInd.txt`` and the
            ``trainlist0N.txt`` / ``testlist0N.txt`` files.
        split_number: Which split to use; must be 1, 2 or 3.
        input_shape: Shape whose last two entries give the size frames are resized to.
        sequence_length: Number of frames returned per clip, or ``None`` to
            return every frame.
        training: If true, use the training list and random temporal sampling /
            horizontal flipping; otherwise sample deterministically.
    """

    def __init__(self, dataset_path, split_path, split_number, input_shape, sequence_length, training):
        super().__init__()
        self.training = training
        self.label_index = self._extract_label_mapping(split_path)
        self.sequences = self._extract_sequence_paths(dataset_path, split_path, split_number, training)
        self.sequence_length = sequence_length
        self.label_names = sorted({self._activity_from_path(seq_path) for seq_path in self.sequences})
        self.num_classes = len(self.label_names)
        # Newer torchvision releases expect an `InterpolationMode` member rather than
        # a PIL integer constant; fall back to the PIL constant where the enum is absent.
        interpolation = getattr(transforms, "InterpolationMode", Image).BICUBIC
        self.transform = transforms.Compose(
            [
                transforms.Resize(input_shape[-2:], interpolation),
                transforms.ToTensor(),
                transforms.Normalize(mean, std),
            ]
        )
        # NOTE(security): unpickling executes arbitrary code — only load a feature
        # file that you produced yourself or otherwise trust.
        with open('audio_features.pickle', 'rb') as handle:
            self.audio_features = pickle.load(handle)

    def _extract_label_mapping(self, split_path="train_test_split"):
        """ Extracts a mapping between activity name and softmax index

        Each non-blank line of ``classInd.txt`` is ``"<1-based label> <activity>"``;
        the returned dict maps activity name to the zero-based label.
        """
        with open(os.path.join(split_path, "classInd.txt")) as file:
            lines = file.read().splitlines()
        label_mapping = {}
        for line in lines:
            if not line.strip():
                # Tolerate blank (e.g. trailing) lines instead of failing to unpack
                continue
            label, action = line.split()
            label_mapping[action] = int(label) - 1
        return label_mapping

    def _extract_sequence_paths(
        self, dataset_path, split_path="train_test_split", split_number=1, training=True
    ):
        """ Extracts paths to sequences given the specified train / test split

        Everything from ``.avi`` onwards is dropped from each line and the rest is
        joined onto ``dataset_path``. Blank lines are skipped.
        """
        assert split_number in [1, 2, 3], "Split number has to be one of {1, 2, 3}"
        fn = f"trainlist0{split_number}.txt" if training else f"testlist0{split_number}.txt"
        split_path = os.path.join(split_path, fn)
        with open(split_path) as file:
            lines = file.read().splitlines()
        sequence_paths = []
        for line in lines:
            if not line.strip():
                # A blank line would otherwise become a bogus path equal to `dataset_path`
                continue
            seq_name = line.split(".avi")[0]
            sequence_paths.append(os.path.join(dataset_path, seq_name))
        return sequence_paths

    def _activity_from_path(self, path):
        """ Extracts activity name (the parent directory name) from filepath """
        # normpath makes this independent of the path separator in use
        return os.path.basename(os.path.dirname(os.path.normpath(path)))

    def _frame_number(self, image_path):
        """ Extracts frame number from filepath (the file name without its extension) """
        return int(os.path.splitext(os.path.basename(image_path))[0])

    def _pad_to_length(self, sequence):
        """ Pads the sequence to required sequence length

        The first element is repeated at the front until the list holds
        ``self.sequence_length`` items. The list is modified in place and also
        returned. Nothing is padded when ``self.sequence_length`` is ``None``.

        Raises:
            ValueError: If ``sequence`` is empty (there is nothing to pad with).
        """
        if not sequence:
            raise ValueError("Cannot pad an empty frame sequence")
        if self.sequence_length is not None:
            missing = self.sequence_length - len(sequence)
            if missing > 0:
                # Slice assignment keeps the in-place semantics with a single insert
                sequence[:0] = [sequence[0]] * missing
        return sequence

    def _get_audio_features(self, sequence):
        """ Get the audio features from the pickle

        Args:
            sequence: Path of the clip directory; its last component plus
                ``".wav"`` is the lookup key.

        Returns:
            The stored features converted with ``torch.Tensor``.

        Raises:
            KeyError: If the pickle holds no entry for this clip.
        """
        key = os.path.basename(os.path.normpath(sequence)) + ".wav"
        try:
            features = self.audio_features[key]
        except KeyError:
            raise KeyError(f"No audio features found for '{key}'") from None
        return torch.Tensor(features)

    def __getitem__(self, index):
        """Return ``(image_sequence, target, audio)`` for the clip at ``index``.

        ``image_sequence`` is the stack of transformed frames, ``target`` the
        zero-based class index and ``audio`` the clip's audio feature tensor.

        Raises:
            FileNotFoundError: If the clip directory contains no ``.jpg`` frames.
            KeyError: If the clip has no audio features.
        """
        sequence_path = self.sequences[index % len(self)]
        audio = self._get_audio_features(sequence_path)
        # Sort frame sequence based on frame number
        image_paths = sorted(glob.glob(os.path.join(sequence_path, "*.jpg")), key=self._frame_number)
        if not image_paths:
            raise FileNotFoundError(f"No .jpg frames found in '{sequence_path}'")
        # Pad frames sequences shorter than `self.sequence_length` to length
        image_paths = self._pad_to_length(image_paths)
        num_frames = len(image_paths)
        if self.sequence_length is None:
            # No fixed length requested: use every frame (no temporal sampling possible)
            start_i = 0
            sample_interval = 1
            flip = bool(self.training) and np.random.random() < 0.5
        elif self.training:
            # Randomly choose sample interval and start frame
            sample_interval = np.random.randint(1, num_frames // self.sequence_length + 1)
            start_i = np.random.randint(0, num_frames - sample_interval * self.sequence_length + 1)
            flip = np.random.random() < 0.5
        else:
            # Start at first frame and sample uniformly over sequence
            start_i = 0
            sample_interval = num_frames // self.sequence_length
            flip = False
        # Select the frames to load, capped at the requested length
        selected_paths = image_paths[start_i::sample_interval]
        if self.sequence_length is not None:
            selected_paths = selected_paths[: self.sequence_length]
        # Extract frames as tensors
        image_sequence = []
        for frame_path in selected_paths:
            # The context manager releases the file handle; RGB conversion guarantees
            # the three channels the normalization step requires.
            with Image.open(frame_path) as frame:
                image_tensor = self.transform(frame.convert("RGB"))
            if flip:
                # Horizontal flip: reverse the last (width) axis
                image_tensor = torch.flip(image_tensor, (-1,))
            image_sequence.append(image_tensor)
        image_sequence = torch.stack(image_sequence)
        target = self.label_index[self._activity_from_path(sequence_path)]

        return image_sequence, target, audio

    def __len__(self):
        """Number of clips in the selected split."""
        return len(self.sequences)

# Model

In [4]:
##############################
#         Encoder
##############################


# encoder for visual frames
class Encoder(nn.Module):
    """Per-frame visual encoder: VGG16 features followed by a linear projection.

    The VGG16 network (everything except its classifier head) is used as a
    feature extractor and is run without gradient tracking, so only the final
    ``Linear`` + ``BatchNorm1d`` projection receives gradients.

    Args:
        latent_dim: Size of the feature vector produced for every frame.
    """

    def __init__(self, latent_dim):
        super(Encoder, self).__init__()
        # Other torchvision backbones (e.g. ResNet, DenseNet) can be substituted here;
        # the input width of `self.final` must then match the flattened backbone output.
        # NOTE(review): newer torchvision releases prefer `weights=...` over `pretrained=True` — confirm
        # against the installed version before changing.
        vgg = vgg16(pretrained=True)
        # Keep every child module except the last one (the classifier head)
        self.feature_extractor = nn.Sequential(*list(vgg.children())[:-1])
        # The first classifier layer consumes the flattened feature map, so its
        # input width (25088 for VGG16) is exactly what `self.final` must accept.
        flat_features = vgg.classifier[0].in_features
        self.final = nn.Sequential(
            nn.Linear(flat_features, latent_dim), nn.BatchNorm1d(latent_dim, momentum=0.01)
        )

    def forward(self, x):
        """Encode a batch of frames.

        Args:
            x: Image batch accepted by the VGG16 feature extractor.

        Returns:
            Tensor with one ``latent_dim``-sized row per input frame.
        """
        # The backbone is frozen: no gradients are recorded for it
        with torch.no_grad():
            x = self.feature_extractor(x)
        # Flatten everything but the batch dimension
        x = torch.flatten(x, 1)
        return self.final(x)

    
##############################
#           LSTM
##############################


class LSTM(nn.Module):
    """Batch-first ``nn.LSTM`` wrapper that remembers its recurrent state between calls.

    The state returned by one forward pass is fed into the next one until
    ``reset_hidden_state`` is called.
    """

    def __init__(self, latent_dim, num_layers, hidden_dim, bidirectional):
        super().__init__()
        self.hidden_state = None
        self.lstm = nn.LSTM(
            input_size=latent_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )

    def reset_hidden_state(self):
        """Forget the stored state so the next forward pass starts from scratch."""
        self.hidden_state = None

    def forward(self, x):
        """Run the LSTM over ``x``, store the resulting state and return the outputs."""
        outputs, new_state = self.lstm(x, self.hidden_state)
        self.hidden_state = new_state
        return outputs
    
    
##############################
#         ConvLSTM
##############################


class ConvLSTM(nn.Module):
    """Audio-visual video classifier: frame encoder -> LSTM -> (attention) -> MLP head.

    Every frame is encoded by ``Encoder``, the per-frame features are run through
    ``LSTM``, the sequence is reduced to one vector (attention-weighted sum or
    last time step), the audio feature vector is concatenated, and a small MLP
    produces the class scores.

    Args:
        num_classes: Number of output classes.
        latent_dim: Size of the per-frame encoder output.
        lstm_layers: Number of stacked LSTM layers.
        hidden_dim: LSTM hidden size; also the width of the head's hidden layer.
        bidirectional: Whether the LSTM is bidirectional (doubles its output width).
        attention: If true, pool time steps with learned attention weights;
            otherwise use the last time step.
        audio_dim: Length of the audio feature vector concatenated to the
            visual features (609 by default).
    """

    def __init__(
        self,
        num_classes,
        latent_dim=512,
        lstm_layers=1,
        hidden_dim=1024,
        bidirectional=True,
        attention=True,
        audio_dim=609,
    ):
        super(ConvLSTM, self).__init__()
        self.encoder = Encoder(latent_dim)
        self.lstm = LSTM(latent_dim, lstm_layers, hidden_dim, bidirectional)
        lstm_out_dim = 2 * hidden_dim if bidirectional else hidden_dim
        # The head emits raw class scores (logits). No Softmax here: nn.CrossEntropyLoss
        # applies log-softmax itself, so a Softmax layer would normalize twice and
        # flatten the gradients. Apply F.softmax to the output when probabilities are needed.
        self.output_layers = nn.Sequential(
            nn.Linear(lstm_out_dim + audio_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim, momentum=0.01),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        )
        self.attention = attention
        self.attention_layer = nn.Linear(lstm_out_dim, 1)

    def forward(self, x, audio):
        """Classify a batch of clips.

        Args:
            x: Frames of shape ``(batch, sequence, channels, height, width)``.
            audio: Audio features concatenated along dim 1 with the pooled visual
                features; expected as ``(batch, audio_dim)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalized class
            scores (logits).
        """
        batch_size, seq_length, c, h, w = x.shape
        # Fold the time axis into the batch so the encoder sees individual frames
        x = x.view(batch_size * seq_length, c, h, w)
        x = self.encoder(x)
        # Restore the (batch, sequence, features) layout for the LSTM
        x = x.view(batch_size, seq_length, -1)
        x = self.lstm(x)
        if self.attention:
            # One weight per time step, normalized over the sequence
            attention_w = F.softmax(self.attention_layer(x).squeeze(-1), dim=-1)
            x = torch.sum(attention_w.unsqueeze(-1) * x, dim=1)
        else:
            x = x[:, -1]
        # concatenate audio features with visual features
        x = torch.cat((x, audio), dim=1)
        return self.output_layers(x)

# Training Model

In [5]:
# Training parameters — the values the following cells read when building the
# datasets, the model and the training loop.

image_shape = (3, 224, 224) # frame shape; the dataset resizes frames to the last two entries
dataset_path = "UCF_49-frames"  # root directory of the extracted video frames
split_path = "train_test_split"  # directory with classInd.txt and the train/test list files
split_number = 1  # which train/test split to use; the dataset accepts 1, 2 or 3
sequence_length = 50  # number of frames per clip (shorter clips are padded with their first frame)
batch_size = 4  # clips per batch for both data loaders
latent_dim = 256  # size of the per-frame feature vector produced by the encoder
num_epochs = 15  # number of passes over the training set
checkpoint_model = ""  # path of a saved state dict to load before training; empty string = none
checkpoint_interval = 1  # a checkpoint is saved whenever epoch % checkpoint_interval == 0

In [6]:
# Arguments shared by the training and the test dataset
dataset_kwargs = dict(
    dataset_path=dataset_path,
    split_path=split_path,
    split_number=split_number,
    input_shape=image_shape,
    sequence_length=sequence_length,
)


def seed_worker(worker_id):
    """Give every DataLoader worker its own NumPy seed.

    The dataset draws its random sampling / flipping decisions from `np.random`.
    Depending on the PyTorch version, worker processes may otherwise start from
    identical NumPy RNG state and produce the same "random" augmentations.
    `torch.initial_seed()` differs per worker, so deriving the NumPy seed from
    it avoids that.
    """
    np.random.seed(torch.initial_seed() % 2**32)


# Define training set
train_dataset = Dataset(training=True, **dataset_kwargs)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    worker_init_fn=seed_worker,
)

# Define test set
test_dataset = Dataset(training=False, **dataset_kwargs)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Classification criterion
cls_criterion = nn.CrossEntropyLoss().to(device)

In [None]:
# Define network
model = ConvLSTM(
    num_classes=train_dataset.num_classes,
    latent_dim=latent_dim,
    lstm_layers=1,
    hidden_dim=1024,
    bidirectional=True,
    attention=True,
)

model = model.to(device)
print(model)
# summary(model, input_size=(50, 3, 224, 224), batch_size=4)

# Add weights from checkpoint model if specified.
# `checkpoint_model` is the path set in the parameter cell; `map_location` lets a
# checkpoint saved on a GPU be restored on a CPU-only machine (and vice versa).
if checkpoint_model:
    model.load_state_dict(torch.load(checkpoint_model, map_location=device))

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [None]:
def test_model(epoch):
    """ Evaluate the model on the test set

    Runs the global `model` over `test_dataloader` in eval mode without gradient
    tracking, then switches the model back to train mode (even if evaluation fails).

    Args:
        epoch: Index of the current epoch. Currently unused; kept so existing
            callers keep working.

    Returns:
        Dict with two lists, ``"loss"`` and ``"acc"``, holding one value per test
        batch (accuracy in percent).
    """
    model.eval()
    test_metrics = {"loss": [], "acc": []}
    try:
        with torch.no_grad():
            for images, labels, audio in tqdm(test_dataloader):
                # Plain tensors suffice; the Variable wrapper is deprecated
                image_sequences = images.to(device)
                audio_sequences = audio.to(device)
                labels = labels.to(device)
                # Reset LSTM hidden state
                model.lstm.reset_hidden_state()
                # Get sequence predictions
                predictions = model(image_sequences, audio_sequences)
                # Compute metrics
                acc = 100 * (predictions.argmax(1) == labels).cpu().numpy().mean()
                loss = cls_criterion(predictions, labels).item()
                # Keep track of loss and accuracy
                test_metrics["loss"].append(loss)
                test_metrics["acc"].append(acc)
    finally:
        # Always hand the model back in training mode
        model.train()
    return test_metrics

In [None]:
# NOTE(review): the log file name mentions densenet161 although the encoder cell
# builds a VGG16 backbone — rename if this log is meant to describe the current model.
logfile = 'log_densenet161_with_audio.txt'
for epoch in range(num_epochs):
    model.train()
    epoch_metrics = {"loss": [], "acc": []}
    # The `with` block closes the file on exit; no explicit close() is needed
    with open(logfile, 'a') as log:
        log.write(f"--- Epoch {epoch} ---\n")
    print(f"--- Epoch {epoch} ---")
    for images, labels, audio in tqdm(train_dataloader):
        # Skip single-sample batches — presumably because the model's BatchNorm1d
        # layers cannot compute batch statistics from one sample in train mode.
        if images.size(0) == 1:
            continue

        # Inputs do not need gradients; only the model parameters are optimized
        image_sequences = images.to(device)
        audio_sequences = audio.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Reset LSTM hidden state
        model.lstm.reset_hidden_state()

        # Get sequence predictions
        predictions = model(image_sequences, audio_sequences)

        # Compute metrics
        loss = cls_criterion(predictions, labels)
        acc = 100 * (predictions.detach().argmax(1) == labels).cpu().numpy().mean()

        loss.backward()
        optimizer.step()

        # Keep track of epoch metrics
        epoch_metrics["loss"].append(loss.item())
        epoch_metrics["acc"].append(acc)

        # Empty cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Evaluate the model on the test set
    test_metrics = test_model(epoch)

    # Each entry shows "<last batch value> (<mean over the epoch>)"
    summary_header = "Epoch: {}/{} - ".format(epoch, num_epochs)
    summary_parts = [
        "Training Loss: {:.3f} ({:.3f})".format(epoch_metrics["loss"][-1], np.mean(epoch_metrics["loss"])),
        "Training Acc: {:.3f} ({:.3f})".format(epoch_metrics["acc"][-1], np.mean(epoch_metrics["acc"])),
        "Test Loss: {:.3f} ({:.3f})".format(test_metrics["loss"][-1], np.mean(test_metrics["loss"])),
        "Test Acc: {:.3f} ({:.3f})".format(test_metrics["acc"][-1], np.mean(test_metrics["acc"])),
    ]

    # Same text goes to the log file and to stdout
    with open(logfile, 'a') as log:
        log.write(summary_header + " ".join(summary_parts) + "\n")

    print(summary_header, *summary_parts)

    # Save model checkpoint
    if epoch % checkpoint_interval == 0:
        os.makedirs("model_checkpoints", exist_ok=True)
        torch.save(model.state_dict(), f"model_checkpoints/{model.__class__.__name__}_{epoch}.pth")

In [None]:
def plot_graph_epoch(train_loss, val_loss, epoch):
    """Plot training and validation loss curves on the current matplotlib figure.

    Args:
        train_loss: Sequence of training loss values.
        val_loss: Sequence of validation loss values.
        epoch: Zero-based epoch index; the title shows ``epoch + 1``.
    """
    title_template = "Epoch {} Training/Validation losses over epoch"
    plt.title(title_template.format(epoch + 1))

    # One line per loss series, labelled for the legend
    curves = ((train_loss, "Training Loss"), (val_loss, 'Validation Loss'))
    for loss_values, curve_label in curves:
        plt.plot(loss_values, label=curve_label)

    axis_font_size = 14
    plt.xlabel('Number of epoch', fontsize=axis_font_size)
    plt.ylabel('Loss', fontsize=axis_font_size)
    plt.legend()