# Table of contents


1.   import *libraries*
2.   download data & extract file
3.   Read File Name & Set label & Clean Data
4.   Read Dataset
5.   Image Augmentation
6.   Randomize our data
7.   Using StratifiedKFold to split our data and labels
8.   Create Train and Test Dataloaders



9. Define Model

> * CNN3DModel

> * ResNet 3D

> * DenseNet 3D

> * Encoder Decoder (ConvLSTM)


10. Generating Model

11. Set train model & other parameters

12. Train model

13. visualization loss & accuracy















# ***Import library***

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pnd
from tqdm import tqdm
import sys
import math
import os
import zipfile
import six
import warnings
import random
import gc

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import cv2
import imgaug as ia
from imgaug import augmenters as iaa
from moviepy.editor import VideoFileClip
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch.utils.data import Dataset
import torch.utils.data as Data
from torchvision import transforms
import torchvision.models as models
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data.sampler import SubsetRandomSampler
from scipy.io import loadmat


Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)3407872/45929032 bytes (7.4%)7020544/45929032 bytes (15.3%)10682368/45929032 bytes (23.3%)14385152/45929032 bytes (31.3%)18178048/45929032 bytes (39.6%)21864448/45929032 bytes (47.6%)25591808/45929032 bytes (55.7%)29114368/45929032 bytes (63.4%)32833536/45929032 bytes (71.5%)36626432/45929032 bytes (79.7%)40230912/45929032 bytes (87.6%)43778048/45929032 bytes (95.3%)

In [2]:
# Download the dataset archive to local disk.
# NOTE(review): the URL looks like a session-specific Dropbox download link;
# confirm it is still valid before relying on this cell.
url = "https://uc0442da848516982bf140833a22.dl.dropboxusercontent.com/zip_download_get/AhsvTaMWi9zRKo9z4PzDqQvpvoBHETCw0kS3J1tdHnh-TVKjQxEXHOaFh1YXXvSbsoxU6m5QUbPDPli9nAfwxDdM3j5KTmPGMlU2WcGeJ7d0-w?_download_id=447858557738765268854499448370801617867357168088245854390127950184&_notify_domain=www.dropbox.com&dl=1"
target_path = '/content/brain.zip'
import requests, zipfile, io
response = requests.get(url, stream=True)
try:
    # Fail loudly on an HTTP error instead of saving an error page as a zip.
    response.raise_for_status()
    # The context manager closes the file even if the transfer is interrupted.
    with open(target_path, "wb") as handle:
        # 1 MiB chunks: far fewer Python-level iterations than 100-byte chunks.
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:  # filter out keep-alive new chunks
                handle.write(chunk)
finally:
    response.close()

In [3]:
import zipfile
# Unpack the downloaded archive; the context manager closes the zip file
# even when extraction fails part-way through.
with zipfile.ZipFile('/content/brain.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/home/train')

# **Download Data & Extract File**

# Read File Name & Set label & Clean Data

In [4]:
road = '/content/home/train/brain4cars_data/road_camera/'
face = '/content/home/train/brain4cars_data/face_camera/'
# Sorted so the class-name -> integer-label mapping is reproducible
# (os.listdir returns entries in arbitrary order).
classes = sorted(os.listdir(face)) # face and road have the same classes

face_filename=[]
road_filename=[]
speed_filename=[]
labels=[]


def _clip_duration(video_path):
  """Return the duration reported for a video, or None when it cannot be opened."""
  try:
    clip = VideoFileClip(video_path)
  except Exception:
    # Unreadable or corrupt videos are skipped rather than aborting the scan.
    return None
  try:
    return clip.duration
  finally:
    # Release the reader held by the clip.
    clip.close()


# Keep only recordings that exist for both cameras, have identical durations
# and last longer than 5 (in the units VideoFileClip reports for duration).
for i, class_name in enumerate(classes):
  path_face = face + class_name
  path_road = road + class_name
  road_check = set(os.listdir(path_road))

  for clip_id in os.listdir(path_face):
    if clip_id + '.avi' not in road_check:
      continue
    video_face_path = path_face+'/'+clip_id+'/video_'+clip_id+'.avi'
    video_road_path = path_road+'/'+clip_id+'.avi'
    mat_speed_path = path_face+'/'+clip_id+'/params_'+clip_id+'.mat'

    a = _clip_duration(video_face_path)
    b = _clip_duration(video_road_path) if a is not None else None
    if a is None or b is None:
      continue
    if a==b and a>5:
      face_filename.append(video_face_path)
      road_filename.append(video_road_path)
      x = loadmat(mat_speed_path)
      # Speed entry of the last element of params.frame_data in the .mat file.
      speed_filename.append(x["params"]["frame_data"][0][0][0][-1]["speed"][0][0][0][0])
      labels.append(i)

# Read Dataset

In [14]:
def readframe (file_name, input_size, sample_rate, num_frames):
  """Read up to ``num_frames`` resized frames from a video file.

  Every ``sample_rate``-th frame (starting with the first) is resized to
  ``input_size`` x ``input_size`` and collected.

  Args:
    file_name: path of the video to open with OpenCV.
    input_size: width and height of the returned frames.
    sample_rate: keep one frame out of every ``sample_rate`` frames read.
    num_frames: maximum number of frames to return.

  Returns:
    A list of frames as returned by ``cv2.resize``; it holds fewer than
    ``num_frames`` items (possibly none) when the video is short or cannot
    be opened.
  """
  data = []
  cap = cv2.VideoCapture(file_name)
  if not cap.isOpened():
    print("Unable to connect to camera.")
  try:
    count = 0
    while cap.isOpened() and len(data) < num_frames:
      ret, frame = cap.read()
      if not ret:
        # End of stream or read error: stop instead of spinning forever.
        break
      if count % sample_rate == 0:
        frame = cv2.resize(frame, (input_size, input_size), interpolation = cv2.INTER_AREA)
        data.append(frame)
      # Count every frame read, not only the sampled ones, so that the
      # modulo test above can succeed again.
      count += 1
  finally:
    # Always give the decoder / file handle back.
    cap.release()
  return data

class BrainforCarsDataset(Dataset):
    """Dataset yielding face-camera clips with their speed value and label.

    Each item is a dict with keys ``'image'`` (array of sampled frames),
    ``'speed'`` and ``'label'``; the optional ``transform`` is applied to
    that dict.

    Args:
        face_filename: sequence of face-camera video paths.
        road_filename: sequence of road-camera video paths (stored, but not
            read by ``__getitem__``).
        speed_filename: sequence of per-clip speed values.
        labels: sequence of per-clip class ids.
        input_size: frames are resized to ``input_size`` x ``input_size``.
        sample_rate: frame sampling stride passed to ``readframe``.
        num_frames: maximum number of frames per clip.
        transform: optional callable applied to the sample dict.
    """

    def __init__(self, face_filename, road_filename,speed_filename, labels, input_size, sample_rate, num_frames, transform=None):
        self.face_filename = face_filename
        self.road_filename = road_filename
        self.speed_filename = speed_filename
        self.transform = transform
        self.labels = labels
        self.num_frames = num_frames
        self.sample_rate = sample_rate
        self.input_size = input_size
        self.num_imgs = len(self.face_filename)

    def __len__(self):
        """Return the number of clips."""
        return self.num_imgs

    def __getitem__(self, idx):
        """Return the sample dict for clip ``idx``."""
        data_face = np.array(
            readframe(self.face_filename[idx], self.input_size, self.sample_rate, self.num_frames)
        )

        lm = np.array(self.labels[idx])
        speed = np.array(self.speed_filename[idx])

        # Only the face clip is loaded. The previous code then rebuilt the
        # sample from an undefined `data_road` and raised NameError; to train
        # on the road camera instead, read self.road_filename[idx] here.
        sample = {'image': data_face, "speed": speed, 'label': lm}

        if self.transform:
            sample = self.transform(sample)
        return sample

## Augmentations (none of these augmentations change the labels of the data)
# https://gitee.com/alavaien/imgaug

In [15]:
class ImgAugTransform(object):
  """Callable transform that runs an imgaug pipeline over a sample's frames.

  The pipeline is built once in ``__init__``; ``__call__`` takes a sample
  dict with keys ``'image'``, ``'speed'`` and ``'label'`` and returns a new
  dict in which only ``'image'`` has been augmented.
  """

  def __init__(self):
    # Wrap an augmenter so that it is applied with probability 0.2.
    sometimes = lambda aug: iaa.Sometimes(0.2, aug)
    self.aug = iaa.Sequential(
        [
            # unconditional steps: contrast change with factor 2.0-2.5 and
            # inversion with probability 1
            iaa.LinearContrast((2.0, 2.5)), 
            iaa.Invert(1, per_channel=True), 
            sometimes(iaa.Affine(
                scale={"x": (0.9, 1.1), "y": (0.9, 1.1)}, # scale images to 90-110% of their size, individually per axis
                rotate=(-10, 10), # rotate by -10 to +10 degrees
                order=[0, 1], # use nearest neighbour or bilinear interpolation (fast)
                
                mode=ia.ALL # use any of the available warping modes
            )),
            # execute 0 to 5 of the following (less important) augmenters per image
            # don't execute all of them, as that would often be way too strong
            iaa.SomeOf((0, 5),
                [
                    
                    iaa.OneOf([
                        iaa.GaussianBlur((0, 0.5)), # blur images with a sigma between 0 and 0.5
                        iaa.AverageBlur(k=(1, 3)), # blur image using local means with kernel sizes between 1 and 3
                        iaa.MedianBlur(k=(1, 3)), # blur image using local medians with kernel sizes between 1 and 3
                    ]),
                    iaa.Sharpen(alpha=(.9, 1.0), lightness=(0.5, 1.6)), # sharpen images
                    
                    # edge detection blended with the original image using a
                    # simplex-noise mask
                    iaa.SimplexNoiseAlpha(iaa.OneOf([
                        iaa.EdgeDetect(alpha=(0.0, 0.2)),
                        
                    ])),
                    
                    iaa.OneOf([
                        iaa.Dropout((0.01, 0.03), per_channel=0.5), # randomly remove 1-3% of the pixels                        
                    ]),
                    iaa.Invert(0.01, per_channel=True), # invert color channels (probability 0.01)
                    iaa.Add((-2, 2), per_channel=0.5), # change brightness of images (by -2 to 2 of original value)
                    iaa.AddToHueAndSaturation((-1, 1)), # shift hue and saturation by -1 to 1
                    # either change the brightness of the whole image (sometimes
                    # per channel) or change the brightness of subareas
                    iaa.OneOf([
                        iaa.Multiply((0.9, 1.1), per_channel=0.5),
                        iaa.FrequencyNoiseAlpha( exponent=(-1, 0),first=iaa.Multiply((0.9, 1.1), per_channel=True),  # brightness change blended through a frequency-noise mask
                        second=iaa.ContrastNormalization((0.5, 1.5))
                        )
                    ]),
                    sometimes(iaa.ElasticTransformation(alpha=(0.3, 0.5), sigma=0.2)), # move pixels locally around (with random strengths)
                    sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.02))), # sometimes move parts of the image around
                    sometimes(iaa.PerspectiveTransform(scale=(0.01, 0.05))) # change perspective
                ],
                random_order=True
            )
        ],
        random_order=True
    )
      
  def __call__(self, sample):
    """Return a copy of ``sample`` whose ``'image'`` entry is an augmented torch tensor."""
    img = sample['image']
    img = img.astype(np.uint8)  # imgaug works with np.uint8
    # NOTE(review): presumably `img` is a stack of frames and augment_images
    # treats each one as a separate image - confirm that per-frame
    # (rather than per-clip) augmentation is intended.
    img = torch.from_numpy(self.aug.augment_images(img).copy())


    # speed and label are passed through untouched
    sample_1 = {'image': img, "speed": sample["speed"], 'label':sample['label']}
    return sample_1 


# Randomize our data 

In [16]:
# Fixed seed so the shuffle below produces the same order on every run.
random.seed(1254)

# Zip the four parallel lists together so that one shuffle keeps every
# face path, road path, speed value and label aligned.
combined = list(zip(face_filename, road_filename, speed_filename, labels))
random.shuffle(combined)

# Unzip back into four parallel sequences (tuples from here on).
face_filename, road_filename, speed_filename, labels = zip(*combined)

# Using StratifiedKFold to split our data and labels

In [17]:
# Index of the fold used as the test split.
n_fold = 0
# The folds are not shuffled here (the data were shuffled beforehand), so no
# random_state is passed: scikit-learn rejects a random_state when
# shuffle=False.
skf = StratifiedKFold(n_splits=5, shuffle=False)
face_filename = np.array(face_filename)
road_filename = np.array(road_filename)
speed_filename = np.array(speed_filename)
labels = np.array(labels)

for i ,(train_indices, test_indices) in enumerate(skf.split(face_filename, labels)):
  if i != n_fold:
    continue
  face_filename_train = face_filename[train_indices]
  face_filename_test = face_filename[test_indices]
  road_filename_train = road_filename[train_indices]
  road_filename_test = road_filename[test_indices]
  speed_filename_train = speed_filename[train_indices]
  speed_filename_test = speed_filename[test_indices]
  labels_train = labels[train_indices]
  labels_test = labels[test_indices]
  # The requested fold has been extracted; no need to generate the rest.
  break
else:
  # Without this, an out-of-range n_fold would silently leave the *_train /
  # *_test arrays undefined (or stale from an earlier run).
  raise ValueError('n_fold={} is outside the available folds'.format(n_fold))

# Train and Test Dataloaders

In [18]:
def get_train_loader(input_size, sample_rate, num_frames, batch_size=24, shuffle=False):
    """Build the DataLoader over the training split with augmentation enabled.

    Args:
        input_size: side length frames are resized to.
        sample_rate: frame sampling stride.
        num_frames: maximum number of frames per clip.
        batch_size: clips per batch (default 24, the previous fixed value).
        shuffle: reshuffle the training split every epoch (default False,
            the previous fixed value).

    Returns:
        A DataLoader yielding augmented sample dicts from the training split.
    """
    ImgAug = ImgAugTransform()
    composed = transforms.Compose([ImgAug])
    train_data = BrainforCarsDataset(face_filename_train,road_filename_train, speed_filename_train, labels_train, input_size, sample_rate, num_frames, transform=composed)
    train_loader = Data.DataLoader(train_data, batch_size=batch_size, shuffle=shuffle, num_workers=0)
    return train_loader


def get_test_loader(input_size, sample_rate, num_frames, batch_size=24):
    """Build the DataLoader over the test split (no augmentation, fixed order).

    Args:
        input_size: side length frames are resized to.
        sample_rate: frame sampling stride.
        num_frames: maximum number of frames per clip.
        batch_size: clips per batch (default 24, the previous fixed value).

    Returns:
        A DataLoader yielding un-augmented sample dicts from the test split.
    """
    test_data = BrainforCarsDataset(face_filename_test, road_filename_test, speed_filename_test, labels_test, input_size, sample_rate, num_frames, transform=None)
    test_loader = Data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=0)
    return test_loader


# Build both loaders with input_size=224, sample_rate=9 and num_frames=16.
train_loader = get_train_loader(224, 9, 16)
test_loader = get_test_loader(224, 9, 16)

# Define Model

> CNN3DModel

> ResNet 3D

> DenseNet 3D

> Encoder Decoder (ConvLSTM)







In [19]:
# Public names of the 3D DenseNet definitions that follow.
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264']


def get_fine_tuning_parameters(model, ft_begin_index):
    """Select which parameters of ``model`` are fine-tuned.

    Args:
        model: module whose ``named_parameters()`` are inspected.
        ft_begin_index: first dense block to fine-tune. ``0`` means
            "train everything".

    Returns:
        ``model.parameters()`` when ``ft_begin_index`` is 0. Otherwise a list
        of optimizer parameter groups: parameters whose name contains
        ``denseblock<i>`` / ``transition<i>`` for ``ft_begin_index <= i < 5``,
        ``norm5`` or ``classifier`` keep the optimizer's default learning
        rate; every other parameter is placed in a group with ``lr`` 0.0.
    """
    if ft_begin_index == 0:
        return model.parameters()

    ft_module_names = []
    for i in range(ft_begin_index, 5):
        # Use the loop index: formatting ft_begin_index here selected only a
        # single block, no matter how many blocks were requested.
        ft_module_names.append('denseblock{}'.format(i))
        ft_module_names.append('transition{}'.format(i))
    ft_module_names.append('norm5')
    ft_module_names.append('classifier')

    parameters = []
    for k, v in model.named_parameters():
        if any(ft_module in k for ft_module in ft_module_names):
            parameters.append({'params': v})
        else:
            # A zero learning rate freezes the parameter inside the optimizer.
            parameters.append({'params': v, 'lr': 0.0})

    return parameters


class _DenseLayer(nn.Sequential):
    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
        super(_DenseLayer, self).__init__()
        self.add_module('norm1', nn.BatchNorm3d(num_input_features))
        self.add_module('relu1', nn.ReLU(inplace=True))
        self.add_module('conv1', nn.Conv3d(num_input_features, bn_size * growth_rate,
                                            kernel_size=1, stride=1, bias=False))
        self.add_module('norm2', nn.BatchNorm3d(bn_size * growth_rate))
        self.add_module('relu2', nn.ReLU(inplace=True))
        self.add_module('conv2', nn.Conv3d(bn_size * growth_rate, growth_rate,
                                            kernel_size=3, stride=1, padding=1, bias=False))
        self.drop_rate = drop_rate

    def forward(self, x):
        new_features = super(_DenseLayer, self).forward(x)
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
        return torch.cat([x, new_features], 1)


class _DenseBlock(nn.Sequential):
    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
        super(_DenseBlock, self).__init__()
        for i in range(num_layers):
            layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
            self.add_module('denselayer%d' % (i + 1), layer)


class _Transition(nn.Sequential):
    def __init__(self, num_input_features, num_output_features):
        super(_Transition, self).__init__()
        self.add_module('norm', nn.BatchNorm3d(num_input_features))
        self.add_module('relu', nn.ReLU(inplace=True))
        self.add_module('conv', nn.Conv3d(num_input_features, num_output_features,
                                          kernel_size=1, stride=1, bias=False))
        self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2))


class DenseNet(nn.Module):
    """3D Densenet-BC model class
    Args:
        sample_size (int) - height/width of the input frames; used to size the
          final spatial average pool
        sample_duration (int) - number of frames per input clip; used to size
          the final temporal average pool
        growth_rate (int) - how many filters to add each layer (k in paper)
        block_config (list of 4 ints) - how many layers in each pooling block
        num_init_features (int) - the number of filters to learn in the first convolution layer
        bn_size (int) - multiplicative factor for number of bottle neck layers
          (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate after each dense layer
        num_classes (int) - number of classification classes
        last_fc (bool) - when True the classifier is applied and class scores
          are returned; when False the pooled, flattened features are returned
    """
    def __init__(self, sample_size, sample_duration, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, last_fc=True):

        super(DenseNet, self).__init__()

        self.last_fc = last_fc

        self.sample_size = sample_size
        self.sample_duration = sample_duration

        # First convolution. The stem takes 4 input channels, not 3.
        # NOTE(review): presumably RGB plus one extra constant plane - confirm
        # against the code that assembles the input batches.
        # The stem modules are unnamed on purpose: they stay registered as
        # '0'..'3', which keeps existing state_dict keys valid.
        self.features = nn.Sequential(
            nn.Conv3d(4, num_init_features, kernel_size=7,
                                stride=(1, 2, 2), padding=(3, 3, 3), bias=False),
            nn.BatchNorm3d(num_init_features),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=3, stride=2, padding=1),
        )

        # Each denseblock
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):
            block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
                                bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
            self.features.add_module('denseblock%d' % (i + 1), block)
            num_features = num_features + num_layers * growth_rate
            if i != len(block_config) - 1:
                # Every transition halves the channel count (and pools by 2).
                trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
                self.features.add_module('transition%d' % (i + 1), trans)
                num_features = num_features // 2

        # Final batch norm
        self.features.add_module('norm5', nn.BatchNorm3d(num_features))

        # Linear layer
        self.classifier = nn.Linear(num_features, num_classes)

    def forward(self, x):
        """Return class scores (``last_fc=True``) or pooled features for ``x``."""
        features = self.features(x)
        out = F.relu(features, inplace=True)
        # With the default four blocks the time axis is reduced 16x (max pool
        # plus three transitions) and the spatial axes 32x (strided stem conv,
        # max pool, three transitions). The old code divided sample_duration
        # by itself, which always produced a temporal kernel of 1.
        last_duration = int(math.ceil(self.sample_duration / 16))
        last_size = int(math.floor(self.sample_size / 32))
        out = F.avg_pool3d(out, kernel_size=(last_duration, last_size, last_size)).view(features.size(0), -1)
        if self.last_fc:
            out = self.classifier(out)
        return out

def densenet121(**kwargs):
    """Build a DenseNet with block configuration (6, 12, 24, 16); extra kwargs go to DenseNet."""
    return DenseNet(block_config=(6, 12, 24, 16), growth_rate=32,
                    num_init_features=64, **kwargs)


def densenet169(**kwargs):
    """Build a DenseNet with block configuration (6, 12, 32, 32); extra kwargs go to DenseNet."""
    return DenseNet(block_config=(6, 12, 32, 32), growth_rate=32,
                    num_init_features=64, **kwargs)


def densenet201(**kwargs):
    """Build a DenseNet with block configuration (6, 12, 48, 32); extra kwargs go to DenseNet."""
    return DenseNet(block_config=(6, 12, 48, 32), growth_rate=32,
                    num_init_features=64, **kwargs)


def densenet264(**kwargs):
    """Build a DenseNet with block configuration (6, 12, 64, 48); extra kwargs go to DenseNet."""
    return DenseNet(block_config=(6, 12, 64, 48), growth_rate=32,
                    num_init_features=64, **kwargs)


In [11]:

def conv2D_output_size(img_size, padding, kernel_size, stride):
    """Return the (height, width) produced by a 2D convolution.

    Each argument is indexable with one entry per spatial axis (index 0 and
    index 1). The result is a tuple of two NumPy integers.
    """
    def _along(axis):
        # floor((size + 2*pad - (kernel - 1) - 1) / stride + 1)
        covered = img_size[axis] + 2 * padding[axis] - (kernel_size[axis] - 1) - 1
        return np.floor(covered / stride[axis] + 1).astype(int)

    return (_along(0), _along(1))

class ResCNNEncoder(nn.Module):
    """Per-frame encoder: ResNet-152 trunk followed by three linear layers.

    The trunk is the pretrained torchvision ResNet-152 without its final
    ``fc`` layer and with its first module replaced by a freshly created
    4-input-channel convolution.
    """

    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        """Load the pretrained ResNet-152, swap its stem conv and drop its fc layer."""
        super().__init__()

        self.fc_hidden1 = fc_hidden1
        self.fc_hidden2 = fc_hidden2
        self.drop_p = drop_p

        backbone = models.resnet152(pretrained=True)
        trunk = list(backbone.children())
        trunk.pop()  # discard the classification head
        # NOTE(review): this replacement conv is newly initialised, and
        # forward() runs the whole trunk under no_grad, so it appears never to
        # receive gradients - confirm this is intended.
        trunk[0] = nn.Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.resnet = nn.Sequential(*trunk)
        self.fc1 = nn.Linear(backbone.fc.in_features, fc_hidden1)
        # NOTE(review): bn1 / bn2 are created but not used in forward().
        self.bn1 = nn.BatchNorm1d(fc_hidden1)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)

    def forward(self, x_3d):
        """Encode every time step of ``x_3d`` (time is dim 1).

        Returns a tensor of shape (batch, time_step, CNN_embed_dim).
        """
        per_step = []
        for step in range(x_3d.size(1)):
            frame = x_3d[:, step, :, :, :]

            # The convolutional trunk is evaluated without autograd tracking.
            with torch.no_grad():
                trunk_out = self.resnet(frame)
                trunk_out = trunk_out.view(trunk_out.size(0), -1)

            # Fully connected head (this part is trained).
            hidden = F.relu(self.fc1(trunk_out))
            hidden = F.relu(self.fc2(hidden))
            hidden = F.dropout(hidden, p=self.drop_p, training=self.training)
            per_step.append(self.fc3(hidden))

        # Stack along time, then swap to (batch, time_step, embed_dim).
        stacked = torch.stack(per_step, dim=0)
        return stacked.transpose(0, 1)


class DecoderRNN(nn.Module):
    """LSTM over a sequence of frame embeddings, followed by a two-layer classifier.

    Input to ``forward`` is (batch, time_step, CNN_embed_dim); the output is
    (batch, num_classes), computed from the LSTM output at the last time step.
    """

    def __init__(self, CNN_embed_dim=300, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=50):
        super().__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # number of stacked LSTM layers
        self.h_RNN = h_RNN                 # hidden units per LSTM layer
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        # batch_first=True: tensors are laid out as (batch, time_step, feature)
        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,
            num_layers=h_RNN_layers,
            batch_first=True,
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)

    def forward(self, x_RNN):
        # Omitting the initial state starts the LSTM from zeros; only the
        # per-step outputs are needed here.
        sequence_out, _final_state = self.LSTM(x_RNN)

        # Classify from the output at the last time step.
        last_step = sequence_out[:, -1, :]
        hidden = F.relu(self.fc1(last_step))
        hidden = F.dropout(hidden, p=self.drop_p, training=self.training)
        return self.fc2(hidden)

# Generating Model

In [20]:
def generate_model(model_name='densenet',n_classes=5,model_depth=121,sample_duration=16,sample_size=224,mode='score'):
    """Build one of the 3D DenseNet variants.

    Args:
        model_name: architecture family; only ``'densenet'`` is supported.
        n_classes: number of output classes.
        model_depth: one of 121, 169, 201, 264. The default is 121 (the old
            default, 161, is not a supported depth and always failed).
        sample_duration: number of frames per clip.
        sample_size: frame height/width.
        mode: ``'score'`` builds the model with its final classifier,
            ``'feature'`` without it.

    Returns:
        The constructed model.

    Raises:
        AssertionError: if ``mode`` or ``model_depth`` is not supported.
        ValueError: if ``model_name`` is not supported.
    """
    assert mode in ['score', 'feature'], "mode must be 'score' or 'feature'"
    last_fc = mode == 'score'

    if model_name != 'densenet':
        # Previously this path ended in UnboundLocalError at the return.
        raise ValueError('unsupported model_name: {!r}'.format(model_name))

    assert model_depth in [121, 169, 201, 264], 'model_depth must be 121, 169, 201 or 264'

    builders = {
        121: densenet121,
        169: densenet169,
        201: densenet201,
        264: densenet264,
    }
    model = builders[model_depth](num_classes=n_classes,
                                  sample_size=sample_size, sample_duration=sample_duration,
                                  last_fc=last_fc)
    return model

# Set train model & other parameters

In [21]:
######______________DenseNet 3D  ___________#############

# Where the training cell stores the best-scoring weights.
model_save_location_and_name = '/content/torch_model.pth'
 
num_epochs = 150
# 5-class DenseNet-121 for 16-frame clips of 224x224 frames, with classifier head.
model = generate_model(model_name='densenet', n_classes=5, model_depth=121, sample_duration=16, sample_size=224, mode='score')
# Moves the model to the GPU; a CUDA device is required from here on.
model.cuda()


# Cross Entropy Loss 
error = nn.CrossEntropyLoss()


# SGD Optimizer (plain SGD: no momentum or weight decay is passed)
learning_rate = 0.0005
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
######______________Encoder Decoder(ConvLSTM)___________#############


# Where the training cell stores the best-scoring weights.
model_save_location_and_name_encoder = '/content/encoder.pth'
model_save_location_and_name_decoder = '/content/decoder.pth'


num_epochs = 150


CNN_fc_hidden1, CNN_fc_hidden2 = 1024, 768
CNN_embed_dim = 512      # latent dim extracted by 2D CNN
img_x, img_y = 224, 224  # resize video 2d frame size
dropout_p = 0.0          # dropout probability

# DecoderRNN architecture
RNN_hidden_layers = 3
RNN_hidden_nodes = 512
RNN_FC_dim = 256


# dropout_p is now passed to both modules; previously it was defined but
# never used, so both silently ran with their default drop_p of 0.3.
cnn_encoder = ResCNNEncoder(fc_hidden1=CNN_fc_hidden1, fc_hidden2=CNN_fc_hidden2, drop_p=dropout_p, CNN_embed_dim=CNN_embed_dim)
rnn_decoder = DecoderRNN(CNN_embed_dim=CNN_embed_dim, h_RNN_layers=RNN_hidden_layers, h_RNN=RNN_hidden_nodes, h_FC_dim=RNN_FC_dim, drop_p=dropout_p, num_classes=5)

cnn_encoder.cuda()
rnn_decoder.cuda()


# Cross Entropy Loss 
error = nn.CrossEntropyLoss()
# One optimizer over the parameters of both modules.
crnn_params = list(cnn_encoder.parameters()) + list(rnn_decoder.parameters())
# SGD Optimizer
learning_rate = 0.0005
optimizer = torch.optim.SGD(crnn_params, lr=learning_rate)

# Training model


> DenseNet 3D or ResNet 3D  

> Simple CNN 3D

> Encoder Decoder(ConvLSTM)



In [None]:
######_______________DenseNet 3D----- FACE___________#############

def _prepare_face_batch(samples):
    """Convert one collated batch into (clips, labels) on the GPU.

    Assumes ``samples['image']`` is laid out as
    (batch, frames, height, width, channels), which is what stacking resized
    OpenCV frames produces. The clip is permuted to
    (batch, channels, frames, height, width) and a constant plane holding the
    clip's speed is appended as one extra channel.
    """
    frames = torch.as_tensor(samples['image']).float()
    # permute() moves the axes; view() would only reinterpret the memory and
    # scramble pixels across channels and frames.
    clips = frames.permute(0, 4, 1, 2, 3)
    batch_size, _, n_frames, height, width = clips.shape
    # One speed value per clip, broadcast over frames and pixels. Using the
    # real batch size also lets a smaller final batch through.
    speed = torch.as_tensor(samples['speed']).float().reshape(batch_size, 1, 1, 1, 1)
    speed_plane = speed.expand(batch_size, 1, n_frames, height, width)
    clips = torch.cat((clips, speed_plane), dim=1)
    labels = torch.as_tensor(samples['label']).long()
    return clips.cuda(), labels.cuda()


def _evaluate_face_model():
    """Return the accuracy (percent, as a Python float) of ``model`` on ``test_loader``."""
    correct = 0
    total = 0
    # eval() stops BatchNorm from updating its statistics on test data and
    # no_grad() avoids building autograd graphs during evaluation.
    model.eval()
    with torch.no_grad():
        for test_samples in test_loader:
            test_clips, test_labels = _prepare_face_batch(test_samples)
            predicted = torch.max(model(test_clips), 1)[1]
            total += test_labels.size(0)
            correct += (predicted == test_labels).sum().item()
    model.train()
    return 100.0 * correct / total if total else 0.0


temp_accuracy = 0
count = 0
loss_list = []
iteration_list = []
accuracy_list = []
for epoch in range(num_epochs):
    for i, samples in enumerate(train_loader):

        imgs_face, lms = _prepare_face_batch(samples)
        optimizer.zero_grad()
        # Forward propagation
        outputs = model(imgs_face)

        # Calculate softmax and cross entropy loss
        loss = error(outputs, lms)
        # Calculating gradients
        loss.backward()
        # Update parameters
        optimizer.step()

        count += 1
        if count % 10 == 0:
            # Evaluate on the test split every 10 iterations.
            accuracy = _evaluate_face_model()
            if temp_accuracy<accuracy:
                # Keep the weights of the best-scoring model seen so far.
                temp_accuracy = accuracy
                torch.save(model.state_dict(), model_save_location_and_name )
            # Store plain floats so the history can be plotted directly.
            loss_value = loss.item()
            loss_list.append(loss_value)
            iteration_list.append(count)
            accuracy_list.append(accuracy)
            # Print Loss
            print('Iteration: {}  Loss: {}  Accuracy: {} %'.format(count, loss_value, accuracy))

In [None]:
######______________Encoder Decoder(ConvLSTM) -------road_data_________#############

# NOTE(review): this cell reads samples['image'] from train_loader/test_loader;
# confirm the loaders were built over the road-camera clips before running it.

def _prepare_road_batch(samples):
    """Convert one collated batch into (clips, labels) on the GPU.

    Assumes ``samples['image']`` is laid out as
    (batch, frames, height, width, channels), which is what stacking resized
    OpenCV frames produces. The clip is permuted to
    (batch, frames, channels, height, width) and a constant plane holding the
    clip's speed is appended as one extra channel.
    """
    frames = torch.as_tensor(samples['image']).float()
    # permute() moves the axes; view() would only reinterpret the memory and
    # scramble pixels across channels.
    clips = frames.permute(0, 1, 4, 2, 3)
    batch_size, n_frames, _, height, width = clips.shape
    # One speed value per clip, broadcast over frames and pixels. Using the
    # real batch size also lets a smaller final batch through.
    speed = torch.as_tensor(samples['speed']).float().reshape(batch_size, 1, 1, 1, 1)
    speed_plane = speed.expand(batch_size, n_frames, 1, height, width)
    clips = torch.cat((clips, speed_plane), dim=2)
    labels = torch.as_tensor(samples['label']).long()
    return clips.cuda(), labels.cuda()


def _evaluate_encoder_decoder():
    """Return the accuracy (percent, as a Python float) of the encoder/decoder on ``test_loader``."""
    correct = 0
    total = 0
    # eval() disables dropout and freezes BatchNorm statistics; no_grad()
    # avoids building autograd graphs during evaluation.
    cnn_encoder.eval()
    rnn_decoder.eval()
    with torch.no_grad():
        for test_samples in test_loader:
            test_clips, test_labels = _prepare_road_batch(test_samples)
            test_outputs = rnn_decoder(cnn_encoder(test_clips))
            predicted = torch.max(test_outputs, 1)[1]
            total += test_labels.size(0)
            correct += (predicted == test_labels).sum().item()
    cnn_encoder.train()
    rnn_decoder.train()
    return 100.0 * correct / total if total else 0.0


temp_accuracy=0
count = 0
loss_list = []
iteration_list = []
accuracy_list = []
for epoch in range(num_epochs):
    for i, samples in enumerate(train_loader):

        imgs_road, lms = _prepare_road_batch(samples)
        optimizer.zero_grad()

        outputs = rnn_decoder(cnn_encoder(imgs_road))
        # Calculate softmax and cross entropy loss
        loss = error(outputs, lms)
        # Calculating gradients
        loss.backward()
        # Update parameters
        optimizer.step()

        count += 1
        if count % 10 == 0:
            # Evaluate on the test split every 10 iterations.
            accuracy = _evaluate_encoder_decoder()

            if temp_accuracy<accuracy:
                # Keep the weights of the best-scoring pair seen so far.
                temp_accuracy = accuracy
                torch.save(cnn_encoder.state_dict(), model_save_location_and_name_encoder )
                torch.save(rnn_decoder.state_dict(), model_save_location_and_name_decoder )
            # Store plain floats so the history can be plotted directly.
            loss_value = loss.item()
            loss_list.append(loss_value)
            iteration_list.append(count)
            accuracy_list.append(accuracy)
            # Print Loss
            print('Iteration: {}  Loss: {}  Accuracy: {} %'.format(count, loss_value, accuracy))

KeyboardInterrupt: ignored

# visualization loss & accuracy

In [None]:
# Plot the recorded loss and accuracy histories against the iteration count,
# one figure each (default line colour for loss, red for accuracy).
for _history, _quantity, _line_style in (
    (loss_list, "Loss", {}),
    (accuracy_list, "Accuracy", {"color": "red"}),
):
    plt.plot(iteration_list, _history, **_line_style)
    plt.xlabel("Number of iteration")
    plt.ylabel(_quantity)
    plt.title("CNN: " + _quantity + " vs Number of iteration")
    plt.show()

# Feature Fusion



> load model


> class feature fusion


> train 








In [None]:
class BrainforCarsDataset(Dataset):
    """Dataset yielding paired face/road clips with their speed value and label.

    Each item is a dict with keys ``'image_face'``, ``'image_road'``,
    ``'speed'`` and ``'label'``; the optional ``transform`` is applied to
    that dict.

    Args:
        face_filename: sequence of face-camera video paths.
        road_filename: sequence of road-camera video paths.
        speed_filename: sequence of per-clip speed values.
        labels: sequence of per-clip class ids.
        input_size: frames are resized to ``input_size`` x ``input_size``.
        sample_rate: keep one frame out of every ``sample_rate`` frames read.
        num_frames: stop after this many frames have been collected.
        transform: optional callable applied to the sample dict.
    """

    def __init__(self, face_filename, road_filename,speed_filename, labels, input_size, sample_rate, num_frames, transform=None):
        self.face_filename = face_filename
        self.road_filename = road_filename
        self.speed_filename = speed_filename
        self.transform = transform
        self.labels = labels
        self.num_frames = num_frames
        self.sample_rate = sample_rate
        self.input_size = input_size
        self.num_imgs = len(self.face_filename)

    def __len__(self):
        """Return the number of clips."""
        return self.num_imgs

    def _read_frames(self, file_name):
        """Return the sampled, resized frames of one video as a NumPy array."""
        frames = []
        cap = cv2.VideoCapture(file_name)
        if not cap.isOpened():
            print("Unable to connect to camera.")
        try:
            count = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if ret and count % self.sample_rate == 0:
                    frame = cv2.resize(frame, (self.input_size, self.input_size), interpolation = cv2.INTER_AREA)
                    frames.append(frame)
                # Stop at end of stream or once enough frames are collected.
                if not ret or len(frames) == self.num_frames:
                    break
                count += 1
        finally:
            # Release the decoder / file handle; it used to stay open.
            cap.release()
        return np.array(frames)

    def __getitem__(self, idx):
        """Return the sample dict for clip ``idx``."""
        data_face = self._read_frames(self.face_filename[idx])
        data_road = self._read_frames(self.road_filename[idx])

        lm = np.array(self.labels[idx])
        speed = np.array(self.speed_filename[idx])
        sample = {'image_face': data_face,'image_road': data_road, "speed": speed, 'label': lm}
        if self.transform:
            sample = self.transform(sample)
        return sample

In [None]:
class ImgAugTransform(object):
  """Sample transform that runs one imgaug pipeline over the face and road frame stacks.

  The instance is callable: it takes a sample dict with the keys
  'image_face', 'image_road', "speed" and 'label' and returns a new dict in
  which the two image entries are augmented and converted to torch tensors.
  """
  def __init__(self):
    # Wrap an augmenter so that it is only applied with probability 0.2.
    sometimes = lambda aug: iaa.Sometimes(0.2, aug)
    self.aug = iaa.Sequential(
        [
            # these two augmenters are listed unconditionally (not wrapped in `sometimes`)
            iaa.LinearContrast((2.0, 2.5)), 
            iaa.Invert(1, per_channel=True), 
            sometimes(iaa.Affine(
                scale={"x": (0.9, 1.1), "y": (0.9, 1.1)}, # scale images to 90-110% of their size, individually per axis
                rotate=(-10, 10), # rotate by -10 to +10 degrees
                order=[0, 1], # use nearest neighbour or bilinear interpolation (fast)
                
                mode=ia.ALL # use any of the available warping modes
            )),
            # execute 0 to 5 of the following (less important) augmenters per image
            # don't execute all of them, as that would often be way too strong
            iaa.SomeOf((0, 5),
                [
                    
                    iaa.OneOf([
                        iaa.GaussianBlur((0, 0.5)), # blur images with a sigma between 0 and 0.5
                        iaa.AverageBlur(k=(1, 3)), # blur image using local means with kernel sizes between 1 and 3
                        iaa.MedianBlur(k=(1, 3)), # blur image using local medians with kernel sizes between 1 and 3
                    ]),
                    iaa.Sharpen(alpha=(.9, 1.0), lightness=(0.5, 1.6)), # sharpen images
                    
                    # detect edges (the only option in this OneOf) and
                    # blend the result with the original image using a simplex-noise mask
                    iaa.SimplexNoiseAlpha(iaa.OneOf([
                        iaa.EdgeDetect(alpha=(0.0, 0.2)),
                        
                    ])),
                    
                    iaa.OneOf([
                        iaa.Dropout((0.01, 0.03), per_channel=0.5), # randomly remove 1% to 3% of the pixels
                    ]),
                    iaa.Invert(0.01, per_channel=True), # invert color channels (probability 0.01)
                    iaa.Add((-2, 2), per_channel=0.5), # change brightness of images (by -2 to 2 of original value)
                    iaa.AddToHueAndSaturation((-1, 1)), # shift hue and saturation by -1 to 1
                    # either change the brightness of the whole image (sometimes
                    # per channel) or change the brightness of subareas
                    iaa.OneOf([
                        iaa.Multiply((0.9, 1.1), per_channel=0.5),
                        iaa.FrequencyNoiseAlpha( exponent=(-1, 0),first=iaa.Multiply((0.9, 1.1), per_channel=True),  # brightness change blended with a contrast change
                        second=iaa.ContrastNormalization((0.5, 1.5))
                        )
                    ]),
                    sometimes(iaa.ElasticTransformation(alpha=(0.3, 0.5), sigma=0.2)), # move pixels locally around (with random strengths)
                    sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.02))), # sometimes move parts of the image around
                    sometimes(iaa.PerspectiveTransform(scale=(0.01, 0.05))) # change perspective
                ],
                random_order=True
            )
        ],
        random_order=True
    )
      
  def __call__(self, sample):
    """Augment both image stacks of `sample` and return a new sample dict.

    'image_face' and 'image_road' are cast to uint8, passed through the
    pipeline and wrapped as torch tensors; "speed" and 'label' are passed
    through unchanged.
    """
    # NOTE(review): the two stacks are augmented by separate calls, so presumably
    # the random parameters are drawn independently for face and road - confirm this is intended.
    img_face = sample['image_face']
    img_face = img_face.astype(np.uint8)  # imgaug works with np.uint8
    img_face = torch.from_numpy(self.aug.augment_images(img_face).copy())

    img_road = sample['image_road']
    img_road = img_road.astype(np.uint8)
    img_road = torch.from_numpy(self.aug.augment_images(img_road).copy())

    sample_1 = {'image_face': img_face, 'image_road': img_road, "speed": sample["speed"], 'label':sample['label']}
    return sample_1 

In [None]:
def get_train_loader(input_size, sample_rate, num_frames):
    """Build the DataLoader over the training split, with imgaug augmentation.

    Args:
        input_size: side length frames are resized to.
        sample_rate: keep one frame out of every ``sample_rate`` read.
        num_frames: maximum number of frames kept per video.

    Returns:
        A DataLoader yielding one un-shuffled sample per batch.
    """
    augmentation = transforms.Compose([ImgAugTransform()])
    dataset = BrainforCarsDataset(
        face_filename_train,
        road_filename_train,
        speed_filename_train,
        labels_train,
        input_size,
        sample_rate,
        num_frames,
        transform=augmentation,
    )
    return Data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)


def get_test_loader(input_size, sample_rate, num_frames):
    """Build the DataLoader over the test split, without any transform.

    Args:
        input_size: side length frames are resized to.
        sample_rate: keep one frame out of every ``sample_rate`` read.
        num_frames: maximum number of frames kept per video.

    Returns:
        A DataLoader yielding one un-shuffled sample per batch.
    """
    dataset = BrainforCarsDataset(
        face_filename_test,
        road_filename_test,
        speed_filename_test,
        labels_test,
        input_size,
        sample_rate,
        num_frames,
        transform=None,
    )
    return Data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)


# Both loaders use the same clip settings: 224x224 frames,
# every 5th frame kept, at most 30 frames per video.
clip_settings = dict(input_size=224, sample_rate=5, num_frames=30)
train_loader = get_train_loader(**clip_settings)
test_loader = get_test_loader(**clip_settings)

In [None]:
# ---- Checkpoint locations -------------------------------------------------
model_save_location_and_name = '/content/torch_model.pth'
model_save_location_and_name_encoder = '/content/encoder.pth'
model_save_location_and_name_decoder = '/content/decoder.pth'

# ---- Encoder (2D CNN) settings ---------------------------------------------
CNN_fc_hidden1 = 1024
CNN_fc_hidden2 = 768
CNN_embed_dim = 512      # latent dim extracted by 2D CNN
dropout_p = 0.0          # dropout probability (defined here, not passed to the models below)

# ---- DecoderRNN architecture -----------------------------------------------
RNN_hidden_layers, RNN_hidden_nodes, RNN_FC_dim = 3, 512, 256

# ---- Face branch: 3D DenseNet restored from its checkpoint -------------------
model_face = generate_model(
    model_name='densenet',
    n_classes=5,
    model_depth=121,
    sample_duration=30,
    sample_size=224,
    mode='score',
)
model_face.load_state_dict(torch.load(model_save_location_and_name))

# ---- Road branch: CNN encoder + RNN decoder restored from their checkpoints --
model_encoder = ResCNNEncoder(
    fc_hidden1=CNN_fc_hidden1,
    fc_hidden2=CNN_fc_hidden2,
    CNN_embed_dim=CNN_embed_dim,
)
model_decoder = DecoderRNN(
    CNN_embed_dim=CNN_embed_dim,
    h_RNN_layers=RNN_hidden_layers,
    h_RNN=RNN_hidden_nodes,
    h_FC_dim=RNN_FC_dim,
    num_classes=5,
)

for _module, _checkpoint in (
    (model_encoder, model_save_location_and_name_encoder),
    (model_decoder, model_save_location_and_name_decoder),
):
    _module.load_state_dict(torch.load(_checkpoint))

In [None]:
class Fusion(nn.Module):
    """Late-fusion classifier over a face-video branch and a road-video branch.

    The face model is used without its last child module and the decoder
    without its last two child modules; the remaining decoder part must
    return ``(sequence_output, state)`` when called. Face features and the
    last time step of the decoder output are concatenated and classified by
    three fully connected layers with dropout in between.

    Args:
        model_face: module for the face clip; its last child is dropped.
        model_encoder: module applied to the road clip before the decoder.
        model_decoder: recurrent decoder; its last two children are dropped.
        fused_dim: size of the concatenated feature vector fed to ``fc1``
            (default 50688, the value previously hard-coded).
        num_classes: number of output logits (default 5).
    """

    def __init__(self, model_face, model_encoder, model_decoder,
                 fused_dim=50688, num_classes=5):
        # The original called super(Cnn, self), which names a different class
        # and fails before any submodule can be registered.
        super().__init__()

        # Keep everything but the final child of the face model.
        self.model_face = nn.Sequential(*list(model_face.children())[:-1])

        self.model_encoder = model_encoder
        # Keep everything but the last two children of the decoder.
        self.model_decoder = nn.Sequential(*list(model_decoder.children())[:-2])

        # NOTE(review): there is no non-linearity between fc1/fc2/fc3, so the
        # head is a composition of linear maps (plus dropout) - confirm intended.
        self.fc1 = nn.Linear(fused_dim, 2048)
        self.drop_1 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(2048, 1024)
        self.drop_2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(1024, num_classes)

    def forward(self, face, road):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            face: input batch for the face branch.
            road: input batch for the encoder of the road branch.
        """
        face_features = self.model_face(face)
        face_features = face_features.reshape(face_features.size(0), -1)

        # The attribute is `model_encoder`; `self.encoder` never existed.
        road_embedding = self.model_encoder(road)
        rnn_out, _state = self.model_decoder(road_embedding)
        # Use the decoder output at the last time step (index -1 on dim 1).
        road_features = rnn_out[:, -1, :]
        road_features = road_features.reshape(road_features.size(0), -1)

        out = torch.cat((face_features, road_features), dim=1)
        out = self.drop_1(self.fc1(out))
        out = self.drop_2(self.fc2(out))
        return self.fc3(out)


In [None]:
# Run settings for the fusion training.
num_epochs = 50
learning_rate = 0.0001

# Assemble the fusion network from the three restored branches and move it to the GPU.
Fusionmodel = Fusion(model_face, model_encoder, model_decoder)
Fusionmodel.cuda()

# Cross-entropy loss over the class logits.
error = nn.CrossEntropyLoss()

# Plain SGD over every parameter of the fusion network.
optimizer = torch.optim.SGD(params=Fusionmodel.parameters(), lr=learning_rate)

In [None]:
######______________Feature fusion _________#############

# Where the best-scoring fusion weights are written. The original saved
# `cnn_encoder` / `rnn_decoder` (not the model trained here) over the
# pretrained encoder/decoder checkpoints.
model_save_location_and_name_fusion = '/content/fusion_model.pth'

# Evaluate on the test loader every this many training iterations.
eval_every = 10


def _prepare_batch(samples):
    """Turn one DataLoader sample dict into (face, road, labels) on the GPU.

    Sizes are taken from the batch itself instead of the previously
    hard-coded 24 / 16 / 224, which did not match loaders built with
    batch_size=1 and up to 30 frames. A constant "speed" channel is appended
    to both clips as a fourth channel.

    NOTE(review): as in the original, the channel axis is moved with a
    reshape (memory reinterpretation), not a permute. Frames are read as
    (frames, height, width, channel), so this does not transpose the data -
    confirm this matches how the pretrained branches were trained before
    switching to `permute`.
    """
    speed = samples["speed"].float()

    face = samples['image_face'].float()
    batch, frames, height, width = face.shape[:4]

    # Face branch layout: (batch, channel, frames, height, width).
    face = face.reshape(batch, 3, frames, height, width)
    speed_plane = torch.zeros((batch, 1, frames, height, width)) + speed
    face = torch.cat((face, speed_plane), dim=1).cuda()

    road = samples['image_road'].float()
    batch, frames, height, width = road.shape[:4]

    # Road branch layout: (batch, frames, channel, height, width).
    road = road.reshape(batch, frames, 3, height, width)
    speed_plane = torch.zeros((batch, frames, 1, height, width)) + speed
    road = torch.cat((road, speed_plane), dim=2).cuda()

    labels = samples['label'].long().cuda()
    return face, road, labels


def _evaluate(model, loader):
    """Return the top-1 accuracy (percent, as a Python float) of `model` on `loader`.

    Runs in eval mode (dropout disabled) and without gradient tracking, then
    restores the previous training mode.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for samples in loader:
            face, road, labels = _prepare_batch(samples)
            predicted = model(face, road).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if was_training:
        model.train()
    return 100.0 * correct / total if total else 0.0


temp_accuracy = 0
count = 0
loss_list = []
iteration_list = []
accuracy_list = []
for epoch in range(num_epochs):
    for samples in train_loader:
        imgs_face, imgs_road, lms = _prepare_batch(samples)

        optimizer.zero_grad()
        outputs = Fusionmodel(imgs_face, imgs_road)

        # Calculate softmax and cross entropy loss
        loss = error(outputs, lms)
        # Calculating gradients
        loss.backward()
        # Update parameters
        optimizer.step()

        count += 1
        if count % eval_every == 0:
            accuracy = _evaluate(Fusionmodel, test_loader)
            # Plain floats keep the lists plottable and do not pin GPU memory.
            loss_value = loss.item()

            # Keep the weights of the best model seen so far.
            if temp_accuracy < accuracy:
                temp_accuracy = accuracy
                torch.save(Fusionmodel.state_dict(), model_save_location_and_name_fusion)

            # store loss and iteration
            loss_list.append(loss_value)
            iteration_list.append(count)
            accuracy_list.append(accuracy)

            # Print Loss
            print('Iteration: {}  Loss: {}  Accuracy: {} %'.format(count, loss_value, accuracy))

torch.Size([1, 50688])
torch.Size([1, 50688])
torch.Size([1, 50688])
torch.Size([1, 50688])
torch.Size([1, 50688])
torch.Size([1, 50688])
torch.Size([1, 50688])
torch.Size([1, 50688])


KeyboardInterrupt: ignored

In [None]:
# The logged values may be tensors living on the GPU, which matplotlib cannot
# convert to numpy; coerce every entry to a plain float first (a no-op for
# values that already are floats).
loss_values = [float(value) for value in loss_list]
accuracy_values = [float(value) for value in accuracy_list]

# visualization loss
plt.plot(iteration_list, loss_values)
plt.xlabel("Number of iteration")
plt.ylabel("Loss")
plt.title("CNN: Loss vs Number of iteration")
plt.show()

# visualization accuracy
plt.plot(iteration_list, accuracy_values, color="red")
plt.xlabel("Number of iteration")
plt.ylabel("Accuracy")
plt.title("CNN: Accuracy vs Number of iteration")
plt.show()