### Libraries installed:
* fvcore: `conda install -c fvcore -c iopath -c conda-forge fvcore`
* moviepy: `conda install -c conda-forge moviepy`
    * Change line 259 of video/io/ffmpeg_reader.py: `infos = error.decode('utf8', errors='ignore')`
* pytorchvideo: `pip install pytorchvideo`
* soundfile: `pip install pysoundfile`

In [11]:
import collections
import copy
import json
import os

from glob import glob

import numpy as np
import soundfile as sf
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import urllib

from decord import cpu, VideoReader
from moviepy.editor import VideoFileClip
from PIL import Image
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import CenterCropVideo, NormalizeVideo
from tqdm.notebook import tqdm

# Prefer the GPU when one is visible, but fall back to the CPU so that later
# `.to(device)` calls still work on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Utils

In [2]:
def extract_audio(video_fname):
    """Write the audio track of a video to a sibling ``.wav`` file.

    The output path is the input path with its extension swapped for
    ``.wav``. The video is not opened at all when that file already exists.

    Parameters
    ----------
    video_fname : str
        Path to the source video file (``.mp4`` files in this notebook).

    Returns
    -------
    None
        A message is printed when the target already exists or when the
        video carries no audio track.
    """
    # Swap only the extension: str.replace('mp4', 'wav') would also rewrite an
    # 'mp4' substring appearing in a directory name or in the base name.
    audio_fname = os.path.splitext(video_fname)[0] + '.wav'

    # Check first so that already-processed videos are never decoded.
    if os.path.exists(audio_fname):
        print(f'{audio_fname} already exists')
        return

    video = VideoFileClip(video_fname)
    try:
        audio = video.audio
        if audio is None:
            # Clips without an audio stream expose `audio` as None.
            print(f'No audio was found for: {os.path.basename(video_fname)}')
        else:
            audio.write_audiofile(audio_fname)
    finally:
        # Release the reader held by the clip.
        video.close()

def sample_video_from_mp4(file, num_frames=16):
    """Uniformly sample frames from an mp4 video.

    Frame indices are spread evenly between the first and the last frame
    (both included) and each selected frame is converted to a PIL Image.

    Parameters
    ----------
    file : str
        path to mp4 video file
    num_frames : int
        how many frames to select using uniform frame sampling.

    Returns
    -------
    images: list of PIL Images
    num_frames: int
        number of frames extracted
    """
    vr = VideoReader(file, ctx=cpu(0))
    total_frames = len(vr)
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    images = [Image.fromarray(vr[seg_ind].asnumpy()) for seg_ind in indices]
    return images, len(images)

In [83]:
video_pattern = '../../Algonauts2021_devkit/AlgonautsVideos268_All_30fpsmax/*.mp4'
audio_pattern = '../../Algonauts2021_devkit/AlgonautsVideos268_All_30fpsmax/*.wav'

# glob() yields files in arbitrary order; sort so that the rows of the saved
# embedding arrays map to the same videos on every run.
video_paths = sorted(glob(video_pattern))
audio_paths = sorted(glob(audio_pattern))

# glob() returns an empty list (never None) when nothing matches, so test for
# emptiness to decide whether the audio tracks still have to be extracted.
if not audio_paths:
    for video_path in video_paths:
        extract_audio(video_path)
    # Re-scan so the count below reflects the files that were just written.
    audio_paths = sorted(glob(audio_pattern))

print(f'Found {len(video_paths)} videos, {len(audio_paths)} audios')

Found 1102 videos, 1068 audios


## Video extractor: action classification

In [4]:
# Hub entry point of the pretrained video network used as feature extractor.
video_model_name = 'x3d_s'

# Fetch the pretrained model from the pytorchvideo hub repo and switch it to
# inference mode right away (`eval()` returns the module itself).
video_model = torch.hub.load(
    'facebookresearch/pytorchvideo',
    video_model_name,
    pretrained=True,
).eval()
# The model is not moved to `device` here:
# video_model = video_model.to(device)

Using cache found in /home/nscheidwasserclow/.cache/torch/hub/facebookresearch_pytorchvideo_master


In [5]:
# Extract features from the previous block: the module at index 5 of
# `video_model.blocks` is swapped for an adaptive average pool with output
# size 1, so the forward pass now ends in pooled features.
# NOTE(review): blocks[5] is presumably the X3D classification head — confirm.

video_model.blocks[5] = nn.AdaptiveAvgPool3d(output_size=1)

### Input transform

In [55]:
# Per-channel normalisation statistics and the frame rate assumed for clips.
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
frames_per_second = 30

# Spatial size and temporal sampling expected by each supported X3D variant.
model_transform_params = dict(
    x3d_xs=dict(side_size=182, crop_size=182, num_frames=4, sampling_rate=12),
    x3d_s=dict(side_size=182, crop_size=182, num_frames=13, sampling_rate=6),
    x3d_m=dict(side_size=256, crop_size=256, num_frames=16, sampling_rate=5),
)

# Settings for the variant selected through `video_model_name`.
transform_params = model_transform_params[video_model_name]

# Preprocessing applied to the "video" entry of a clip dict. The parameters
# are those of the selected X3D variant: temporal subsampling, rescaling of
# pixel values by 1/255, normalisation, resize of the short side, centre crop.
_video_pipeline = Compose([
    UniformTemporalSubsample(transform_params["num_frames"]),
    Lambda(lambda clip: clip / 255.0),
    NormalizeVideo(mean, std),
    ShortSideScale(size=transform_params["side_size"]),
    CenterCropVideo(
        crop_size=(transform_params["crop_size"], transform_params["crop_size"])
    ),
])
transform = ApplyTransformToKey(key="video", transform=_video_pipeline)

# Clip length in seconds covered by one model input; it depends on the variant.
clip_duration = (
    transform_params["num_frames"] * transform_params["sampling_rate"]
) / frames_per_second

In [81]:
def _extract_video_embeddings(video_paths):
    """Compute one embedding per video with ``video_model``.

    For every path, frames are sampled with ``sample_video_from_mp4``,
    stacked into a (channels, time, height, width) float tensor holding raw
    0-255 pixel values, run through the module-level ``transform`` and fed
    to ``video_model`` as a batch of one. The flattened model output is the
    embedding of that video.

    Parameters
    ----------
    video_paths : sequence of str
        Paths of the video files to embed.

    Returns
    -------
    numpy.ndarray
        float32 array with one row per entry of ``video_paths``.
    """
    embeddings = []

    # Inference only: no autograd graph needs to be recorded.
    with torch.no_grad():
        for video_path in tqdm(video_paths, desc='Extracting video features...', total=len(video_paths)):
            frames, _ = sample_video_from_mp4(video_path)

            # Keep the raw 0-255 range here. `transform` already divides by
            # 255, so converting with ToTensor() (which rescales to [0, 1])
            # would scale the pixels twice before normalisation.
            # assumes frames are RGB PIL images, i.e. (H, W, C) arrays
            clip = torch.stack([torch.from_numpy(np.array(frame)) for frame in frames])
            clip = clip.permute(3, 0, 1, 2).float()  # (T, H, W, C) -> (C, T, H, W)

            inputs = transform({'video': clip})['video']

            # Add the batch dimension expected by the model.
            embedding = video_model(inputs[None, ...]).flatten()

            embeddings.append(embedding.cpu().numpy())

    return np.array(embeddings, dtype=np.float32)

In [None]:
# `video_paths` is the list built by the glob cell; the previous argument,
# `videos`, is not defined anywhere in this notebook.
vid_embeddings = _extract_video_embeddings(video_paths)

In [87]:
# Persist the video embeddings as a .npy file in the current working directory.
np.save('x3d_s_video_embeddings.npy', vid_embeddings)

In [None]:
# Sanity check: shape of the array returned by _extract_video_embeddings.
print(vid_embeddings.shape)

## Image extractor: object detection

Motivations for DenseNet:
* https://www.biorxiv.org/content/10.1101/407007v1.full.pdf
* https://doi.org/10.1162/neco_a_01211
* https://openreview.net/forum?id=SkegNmFUIS
   
Motivation for Inceptionv3:
* https://www.jneurosci.org/content/jneuro/early/2018/07/13/JNEUROSCI.0388-18.2018.full.pdf 
    * "From these analyses, we selected the most human-consistent DCNN architecture (Inception-v3, see Behavioral consistency below)"

In [88]:
# Image feature extractor: torchvision's DenseNet-169 with pretrained weights.
image_model = torchvision.models.densenet169(pretrained=True)

In [89]:
# Simple approach: take a single feature vector from just before the
# classifier instead of extracting features at every dense block.
# It is still open whether per-block feature extraction works better; this is
# the simpler solution in the meantime.
# Both the last normalisation layer of `features` and the classifier are
# replaced by identities, so the model outputs features rather than class scores.

image_model.features.norm5 = nn.Identity()
image_model.classifier = nn.Identity()

In [103]:
# Switch to inference mode, then move the model to `device`. Both calls return
# the module itself, so the cell still displays the model summary.
image_model.eval().to(device)

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [104]:
# Block 1 = layer 4
# Block 2 = layer 6
# Block 3 = layer 8
# Block 4 = layer 10

# Block x = layer 2*x + 2

def _extract_features_from_block(model, block_idx, input_):
    """
    Util to extract features from a given DenseNet block.
    """
    layer_idx = block_idx * 2 + 2
    
    copy_model = copy.deepcopy(model)
    
    for i in range(layer_idx + 1, len(model.features)):
        copy_model.features[i] = nn.Identity()
        
    features = copy_model(input_).flatten()

    return features

In [109]:
def _extract_densenet_embeddings(video_paths):
    """Extract one embedding per video from the ImageNet-pretrained ``image_model``.

    Frames are sampled with ``sample_video_from_mp4``, resized to 224x224,
    normalised, and passed through ``image_model`` as a single batch. The
    embedding of a video is the mean of its per-frame feature vectors.

    Future implementation will be more name-specific and include per-block
    feature extraction, e.g.:

    if model_name == 'densenet':
        for block_idx in len(densenet_blocks):
            extract_features_per_densenet_block(model, input)
    ...

    Parameters
    ----------
    video_paths : sequence of str
        Paths of the video files to embed.

    Returns
    -------
    numpy.ndarray
        float32 array with one row per entry of ``video_paths``.
    """
    resize_normalize = torchvision.transforms.Compose([
        torchvision.transforms.Resize((224, 224)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

    embeddings = []

    # Inference only: no autograd graph needs to be recorded.
    with torch.no_grad():
        for video_path in tqdm(video_paths, desc='Extracting video features...', total=len(video_paths)):
            frames, _ = sample_video_from_mp4(video_path)

            # One forward pass for all frames of the video, placed on the same
            # device the model was moved to (instead of a hard-coded .cuda()).
            batch = torch.stack([resize_normalize(img) for img in frames]).to(device)
            frame_embeddings = image_model(batch).flatten(start_dim=1)

            # Average over frames to get a single vector per video.
            embeddings.append(frame_embeddings.mean(0).cpu().numpy())

    return np.array(embeddings, dtype=np.float32)

In [110]:
# Embed every video found by the glob cell with the DenseNet feature extractor.
img_embeddings = _extract_densenet_embeddings(video_paths)

Extracting video features...:   0%|          | 0/1102 [00:00<?, ?it/s]

In [111]:
# Sanity check: shape of the array returned by _extract_densenet_embeddings.
img_embeddings.shape

(1102, 1664)

In [113]:
# Save the array produced above. `embeddings` only exists as a local inside
# _extract_densenet_embeddings, so the module-level name is `img_embeddings`.
np.save('densenet169_img_embeddings.npy', img_embeddings)