## Load all required libraries

In [0]:
import torchvision
from skimage.io import imread
import os
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
import torch.optim as optim
import os
import numpy as np
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torch.utils.data as data
import torchvision
from torch.autograd import Variable
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd
from skimage.io import imread
from skimage import io, transform
from PIL import Image 

In [0]:
# ---- Encoder (CNN) settings ----
CNN_fc_hidden1 = 1024
CNN_fc_hidden2 = 768
CNN_embed_dim = 512   # size of the per-frame embedding produced by the 2D CNN
res_size = 224        # image side length fed to the ResNet
dropout_p = 0.3       # dropout probability

# ---- Decoder (RNN) settings ----
RNN_hidden_layers = 3
RNN_hidden_nodes = 512
RNN_FC_dim = 256

# ---- Training settings ----
k = 5                 # number of target categories
epochs = 10           # training epochs
batch_size = 40
learning_rate = 1e-3
log_interval = 10     # batches between training progress messages

# ---- Frame selection inside each video ----
begin_frame = 1
end_frame = 29
skip_frame = 1

# Run on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [0]:
from model import *

### Model definitions

In [0]:
class Flatten(nn.Module):
    """Collapse every dimension after the first (batch) dimension into one."""

    def forward(self, input):
        batch = input.size(0)
        # view() reuses the input's storage instead of copying it.
        return input.view(batch, -1)
      
class MyResnet(nn.Module):
    """ResNet-50 trunk with concatenated avg/max pooling and a two-layer sigmoid head.

    Args:
        inp: channel count produced by the trunk; the pooled feature is ``2 * inp`` wide.
        h1: width of the hidden fully connected layer.
        out: number of outputs.
        d: dropout probability applied before each fully connected layer.
    """

    def __init__(self, inp = 2048, h1=1024, out = 5, d=0.30):
        super().__init__()
        pooled_width = inp * 2
        backbone = torchvision.models.resnet50()
        # Keep every child module of the ResNet except the last two.
        # NOTE: ResCNNEncoder in this notebook slices children() by position,
        # so the attribute registration order below must stay as it is.
        self.resnet = nn.Sequential(*list(backbone.children())[:-2])
        self.ap = nn.AdaptiveAvgPool2d((1, 1))
        self.mp = nn.AdaptiveMaxPool2d((1, 1))
        self.fla = Flatten()
        self.bn0 = nn.BatchNorm1d(pooled_width, eps=1e-05, momentum=0.1, affine=True)
        self.dropout0 = nn.Dropout(d)
        self.fc1 = nn.Linear(pooled_width, h1)
        self.bn1 = nn.BatchNorm1d(h1, eps=1e-05, momentum=0.1, affine=True)
        self.dropout1 = nn.Dropout(d)
        self.fc2 = nn.Linear(h1, out)

        # Kaiming-normal weights and zero biases for every Linear layer.
        linear_layers = [m for m in self.modules() if isinstance(m, nn.Linear)]
        for layer in linear_layers:
            nn.init.kaiming_normal_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return sigmoid scores of shape (batch, out) for a batch of images."""
        feature_map = self.resnet(x)
        # Global average and global max pooling, concatenated along channels.
        pooled = torch.cat((self.ap(feature_map), self.mp(feature_map)), dim=1)
        flat = self.bn0(self.fla(pooled))
        hidden = F.relu(self.fc1(self.dropout0(flat)))
        hidden = self.dropout1(self.bn1(hidden))
        return torch.sigmoid_(self.fc2(hidden))

In [0]:
class DecoderRNN(nn.Module):
    """LSTM decoder that scores every time step and averages the scores over time.

    Args:
        CNN_embed_dim: size of each per-frame input embedding (LSTM input size).
        h_RNN_layers: number of stacked LSTM layers.
        h_RNN: LSTM hidden size.
        h_FC_dim: width of the hidden fully connected layer.
        drop_p: dropout probability applied after the hidden FC layer.
        num_classes: number of outputs per sample.
    """

    def __init__(self, CNN_embed_dim=300, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=5):
        super(DecoderRNN, self).__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,
            num_layers=h_RNN_layers,
            batch_first=True,       # input and output are (batch, time_step, feature)
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)

    def forward(self, x_RNN):
        """Map (batch, time_step, CNN_embed_dim) embeddings to (batch, num_classes) scores.

        Each time step is passed through FC -> ReLU -> dropout -> FC -> sigmoid,
        and the resulting per-step scores are averaged over the time dimension.
        """
        self.LSTM.flatten_parameters()
        # None selects a zero initial hidden/cell state.
        # RNN_out has shape (batch, time_step, hidden_size).
        RNN_out, _ = self.LSTM(x_RNN, None)

        # nn.Linear acts on the last dimension, so all time steps are scored
        # in one batched pass instead of a Python loop over time.
        step_scores = F.relu(self.fc1(RNN_out))
        step_scores = F.dropout(step_scores, p=self.drop_p, training=self.training)
        step_scores = torch.sigmoid(self.fc2(step_scores))

        # Average the per-step scores over time -> (batch, num_classes).
        return step_scores.mean(dim=1)

In [0]:
class ResCNNEncoder(nn.Module):
    """Per-frame CNN encoder: a backbone taken from ``modelA`` plus a trainable FC head.

    Args:
        modelA: module whose children, minus the last nine, form the convolutional
            backbone. The head assumes the backbone emits 2048 channels.
        fc_hidden1: width of the first fully connected layer.
        fc_hidden2: width of the second fully connected layer.
        drop_p: dropout probability applied before the final projection.
        CNN_embed_dim: size of the embedding produced for every frame.
    """

    def __init__(self,modelA, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        super(ResCNNEncoder, self).__init__()
        # Drop the last nine children of modelA and keep the rest as the backbone.
        modules=list(modelA.children())[:-9]
        self.model=nn.Sequential(*modules)
        self.ap = nn.AdaptiveAvgPool2d((1,1))
        self.mp = nn.AdaptiveMaxPool2d((1,1))
        # Parameter-free flatten over all non-batch dimensions.
        self.fla = nn.Flatten()
        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p
        self.fc1 = nn.Linear(2048*2, fc_hidden1)   # avg-pooled + max-pooled channels
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)

    def forward(self, x_3d):
        """Encode a clip tensor (batch, time_step, 3, 224, 224) frame by frame.

        Returns:
            Tensor of shape (batch, time_step, CNN_embed_dim).
        """
        # Run on whatever device the head lives on instead of assuming CUDA,
        # so the encoder also works on a CPU-only runtime.
        head_device = self.fc1.weight.device
        cnn_embed_seq = []
        for t in range(x_3d.size(1)):
            # The backbone is used as a fixed feature extractor (no gradients).
            with torch.no_grad():
                images = x_3d[:, t, :, :, :].view(-1, 3, 224, 224)
                images = images.to(device=head_device, dtype=torch.float32)
                x = self.model(images)
            # FC head on concatenated global average / max pooled features
            x = torch.cat((self.ap(x), self.mp(x)), dim=1)
            x = self.fla(x)
            x = F.relu(self.bn1(self.fc1(x)))
            x = F.relu(self.bn2(self.fc2(x)))
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq.append(x)

        # Stack over time, then swap to (batch, time_step, CNN_embed_dim).
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)

        return cnn_embed_seq

In [0]:
class Audio(nn.Module):
    """MLP over the flattened audio features: 26 -> 256 -> 128.

    Every Linear layer is followed by BatchNorm1d, Dropout(0.3) and ReLU,
    in that order.
    """

    def __init__(self):
        super().__init__()

        widths = [26, 256, 128]
        self.fc = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        )
        self.bn = nn.ModuleList([nn.BatchNorm1d(n_out) for n_out in widths[1:]])
        self.dropout = nn.Dropout(0.3)

        # Kaiming-normal weights and zero biases for every Linear layer.
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.kaiming_normal_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return a (batch, 128) feature tensor for the given audio features."""
        out = x.view(x.size(0), -1)   # flatten everything after the batch dim
        for linear, norm in zip(self.fc, self.bn):
            out = torch.relu(self.dropout(norm(linear(out))))
        return out
    

In [0]:
class MyEnsemble(nn.Module):
    """Fuse a video branch (modelB -> modelC) with an audio branch (modelA).

    The two branch outputs are concatenated (133 features expected) and passed
    through a 133 -> 128 -> 64 -> 5 MLP that ends in a sigmoid.
    """

    def __init__(self, modelA, modelB,modelC):
        super(MyEnsemble, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        self.modelC = modelC

        widths = [133, 128, 64, 5]
        self.fc = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        )
        # Batch norm only follows the hidden layers, not the output layer.
        self.bn = nn.ModuleList([nn.BatchNorm1d(n_hidden) for n_hidden in widths[1:-1]])
        self.dropout = nn.Dropout(0.3)

    def forward(self, x1, x2):
        """Return (batch, 5) sigmoid scores; x1 feeds modelB/modelC, x2 feeds modelA."""
        video_out = self.modelC(self.modelB(x1))
        audio_out = self.modelA(x2)
        fused = torch.cat((video_out, audio_out), dim=1)
        fused = fused.view(fused.size(0), -1)

        *hidden_layers, head = list(self.fc)
        for linear, norm in zip(hidden_layers, self.bn):
            fused = torch.relu(self.dropout(norm(linear(fused))))

        return torch.sigmoid(head(fused))

### Helper classes and functions

In [0]:
class CombineDataset(data.Dataset):
    """Dataset pairing the frames of a clip with its audio features and targets.

    Every CSV row is read as: column 0 = clip name (frame files are named
    ``<name>_<i>.jpg`` for i = 1..FRAMES_PER_CLIP), columns 1-26 = audio
    features, columns 27 onward = target values (reshaped to rows of five).
    """

    # Number of frame images read for every clip.
    FRAMES_PER_CLIP = 15

    def __init__(self, csv_file, root_dir,frame, transform=None, root_dir2=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with the frame images.
            frame: Stored on the instance as ``self.frame``; it is not used
                when loading (FRAMES_PER_CLIP frames are always read).
            transform (callable, optional): Optional transform applied to
                every frame image.
            root_dir2 (string, optional): Second image directory, searched
                when a frame file does not exist under ``root_dir``.
        """
        self.landmarks_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.root_dir2 = root_dir2
        self.transform = transform
        self.frame=frame

    def __len__(self):
        return len(self.landmarks_frame)

    def _frame_path(self, clip_name, index):
        """Return the path of frame ``index`` of a clip, preferring ``root_dir``."""
        suffix = '_' + str(index) + '.jpg'
        primary = str(os.path.join(self.root_dir, clip_name)) + suffix
        if self.root_dir2 is not None and not os.path.exists(primary):
            return str(os.path.join(self.root_dir2, clip_name)) + suffix
        return primary

    def __getitem__(self, idx):
        """Return ``(frames, targets, audio)`` for the clip at row ``idx``.

        ``frames`` stacks the transformed frame images along a new first
        dimension; ``targets`` is a float array of shape (-1, 5) and ``audio``
        a float array of shape (-1, 26).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        clip_name = self.landmarks_frame.iloc[idx, 0]
        frames = []
        for i in range(1, self.FRAMES_PER_CLIP + 1):
            # Load eagerly and close the file handle right away; converting to
            # RGB guarantees the three channels the normalisation expects.
            with Image.open(self._frame_path(clip_name, i)) as frame_file:
                image = frame_file.convert('RGB')
            if self.transform is not None:
                image = self.transform(image)
            frames.append(image)

        X = torch.stack(frames, dim=0)
        landmarks = self.landmarks_frame.iloc[idx, 27:]
        landmarks = np.array(landmarks)
        landmarks = landmarks.astype('float').reshape(-1, 5)
        audio = self.landmarks_frame.iloc[idx, 1:27]
        audio = np.array(audio)
        audio = audio.astype('float').reshape(-1, 26)

        return X,landmarks,audio

In [0]:
def train(model, device, train_loader, optimizer, epochs, log_interval=10):
    """Train ``model`` with an L1 loss, checkpointing and validating after each epoch.

    Args:
        model: module called as ``model(x1=frames, x2=audio)``.
        device: device on which the model and every batch are placed.
        train_loader: iterable yielding ``(frames, targets, audio)`` batches.
        optimizer: optimizer over the model's trainable parameters.
        epochs: number of passes over ``train_loader``.
        log_interval: number of batches between progress messages.

    Returns:
        ``(loss_over_time, average_over_time, valid_loss, valid_accuracy)``:
        four lists with one entry per epoch. The two training entries are
        per-batch means over the whole epoch (NaN if the loader was empty);
        the two validation entries are whatever ``evaluate`` returns.
        "Accuracy" here means ``mean(1 - |output - target|)``.
    """
    model.to(device)
    criterion = torch.nn.L1Loss()

    loss_over_time = []      # mean training loss per epoch
    average_over_time = []   # mean training accuracy per epoch
    valid_accuracy = []
    valid_loss = []
    for epoch in range(epochs):
        # evaluate() leaves the model in eval mode, so switch back every epoch.
        model.train()
        window_loss, window_acc = 0.0, 0.0   # sums since the last progress message
        epoch_loss, epoch_acc = 0.0, 0.0     # sums over the whole epoch
        n_batches = 0
        for batch_idx, (X, y, data1) in enumerate(train_loader):
            # Move the batch to the target device; targets and audio features
            # are cast to float32 to match the model output.
            X = X.to(device)
            y = y.to(device=device, dtype=torch.float32).view(y.size(0), -1)
            data1 = data1.to(device=device, dtype=torch.float32)

            optimizer.zero_grad()
            output = model(x1=X, x2=data1)   # (batch, number of outputs)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            batch_acc = torch.mean(1 - torch.abs(output.detach() - y)).item()
            window_loss += batch_loss
            window_acc += batch_acc
            epoch_loss += batch_loss
            epoch_acc += batch_acc
            n_batches += 1

            # Progress message every `log_interval` batches.
            if log_interval and (batch_idx + 1) % log_interval == 0:
                print('Epoch: {}, Batch: {}, Avg. Loss: {}, average accuracy: {}'.format(
                    epoch + 1, batch_idx + 1, window_loss / log_interval, window_acc / log_interval))
                window_loss, window_acc = 0.0, 0.0

        # Average over every batch of the epoch, so short epochs (fewer than
        # `log_interval` batches) no longer divide by zero.
        mean_loss = epoch_loss / n_batches if n_batches else float('nan')
        mean_acc = epoch_acc / n_batches if n_batches else float('nan')
        print('Epoch: {}, Avg. Loss: {}, average accuracy: {}'.format(epoch + 1, mean_loss, mean_acc))
        loss_over_time.append(mean_loss)
        average_over_time.append(mean_acc)

        torch.save(model.state_dict(), "model_ensemble"+str(epoch)+".pt")
        val_acc, val_loss = evaluate(model)
        print("evaluated")
        valid_accuracy.append(val_acc)
        valid_loss.append(val_loss)

    return loss_over_time, average_over_time,valid_loss,valid_accuracy

In [0]:
def evaluate(net, use_gpu=True, loader=None):
    """Evaluate ``net`` with an L1 loss and print overall and per-output accuracy.

    "Accuracy" is ``mean(1 - |output - target|)``, computed per batch and then
    averaged over batches.

    Args:
        net: module called as ``net(x1=frames, x2=audio)``.
        use_gpu: kept for backward compatibility; not used. Batches are moved
            to the device of ``net``'s parameters.
        loader: iterable of ``(frames, targets, audio)`` batches. Defaults to
            the notebook-level ``testloader``.

    Returns:
        ``(accuracy, loss)`` as Python floats; both are NaN when the loader
        yields no batches.
    """
    if loader is None:
        loader = testloader

    # Follow the model's own device rather than assuming CUDA.
    try:
        target_device = next(net.parameters()).device
    except StopIteration:
        target_device = torch.device("cpu")

    # set to evaluation mode
    net.eval()
    criterion = torch.nn.L1Loss()
    ordinals = ("first", "second", "third", "fourth", "fifth")

    batch_losses = []
    overall_sum = 0.0       # sum of per-batch mean accuracies
    per_output_sum = None   # sum of per-batch, per-output mean accuracies
    n_batches = 0

    with torch.no_grad():
        for X, y, data1 in loader:
            X = X.to(target_device)
            y = y.to(device=target_device, dtype=torch.float32).view(y.size(0), -1)
            data1 = data1.to(device=target_device, dtype=torch.float32)

            output = net(x1=X, x2=data1)
            batch_losses.append(criterion(output, y).item())

            closeness = (1 - torch.abs(output - y)).cpu()
            overall_sum += closeness.mean().item()
            column_means = closeness.mean(dim=0)
            per_output_sum = column_means if per_output_sum is None else per_output_sum + column_means
            n_batches += 1

    if n_batches == 0:
        print("evaluate: loader yielded no batches")
        return float("nan"), float("nan")

    accuracy = overall_sum / n_batches
    print('Accuracy = {:.2f}%'.format(100*accuracy))
    for position, value in enumerate((per_output_sum / n_batches).tolist()):
        label = ordinals[position] if position < len(ordinals) else "#" + str(position + 1)
        print("Accuracy " + label + " item: " + str(value))

    return accuracy, float(np.average(batch_losses))

### Implementation

Data loaders for testing and training

In [0]:
from torch.utils.data import Dataset, DataLoader

# Validation clips: frames are converted to tensors and normalised per channel.
dataset = CombineDataset(csv_file='/content/drive/My Drive/audio_validation.csv',
                                    root_dir='/content/ImageData_validation/validation_15',frame=7,
                                    transform = transforms.Compose([
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]))
device = "cuda" if torch.cuda.is_available() else "cpu"
# Fixed order: evaluate() averages per-batch means, so shuffling would let the
# reported metrics change from run to run whenever the last batch is partial.
testloader = DataLoader(dataset, batch_size=15,
                        shuffle=False, num_workers=8)

In [0]:
# Training clips; frames may live in either of the two image directories.
# NOTE(review): the CSV file name is spelled 'audui' - confirm it matches the file on Drive.
dataset = CombineDataset(
    csv_file='/content/drive/My Drive/audui_training_15.csv',
    root_dir='/content/ImageData_training/training_15frame',
    root_dir2='/content/ImageData_training-part3/training_15frame',
    frame=15,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [0]:
# Shuffled mini-batches of 15 clips for training.
trainloader = DataLoader(
    dataset,
    batch_size=15,
    shuffle=True,
    num_workers=8,
)

Build the models

In [0]:
# Frame-level ResNet whose trunk is reused by the video encoder below.
model=MyResnet()
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime too.
_ = model.load_state_dict(torch.load("/content/drive/My Drive/My_ResNet15_frame (1).pt", map_location=device))

In [0]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# Video branch: per-frame CNN encoder followed by the LSTM decoder.
cnn_encoder = ResCNNEncoder(model,fc_hidden1=CNN_fc_hidden1, fc_hidden2=CNN_fc_hidden2, drop_p=0.5, CNN_embed_dim=CNN_embed_dim).to(device)
rnn_decoder = DecoderRNN(CNN_embed_dim=CNN_embed_dim, h_RNN_layers=RNN_hidden_layers, h_RNN=RNN_hidden_nodes, 
                         h_FC_dim=RNN_FC_dim, drop_p=0.5, num_classes=k).to(device)
PATH= "/content/drive/My Drive/MyResnetLSTMOnly(New_RNN)_20epochcnn_encoder.pt"
PATH2="/content/drive/My Drive/MyResnetOnlyLSTM(New_RNN)_20epoch_rnn_decoder.pt"
# map_location lets checkpoints saved on a GPU load on a CPU-only runtime too.
cnn_encoder.load_state_dict(torch.load(PATH, map_location=device))
rnn_decoder.load_state_dict(torch.load(PATH2, map_location=device))

In [0]:
# Audio branch of the ensemble.
model_audio = Audio()

In [0]:
# Ensemble: modelA = audio branch, modelB -> modelC = video encoder -> decoder.
model_ensemble = MyEnsemble(
    modelA=model_audio,
    modelB=cnn_encoder,
    modelC=rnn_decoder,
)

Freeze the pretrained encoder and decoder layers

In [0]:
# Turn off gradients for the video encoder and decoder so that only the
# remaining ensemble parameters are updated during training.
for frozen_module in (cnn_encoder, rnn_decoder):
    for param in frozen_module.parameters():
        param.requires_grad = False

In [0]:
# Adam over the ensemble's parameters.
optimizer = torch.optim.Adam(
    model_ensemble.parameters(),
    lr=0.001,
    weight_decay=1e-5,
)

In [0]:
# Train the ensemble and keep the per-epoch training / validation histories.
(
    hist_loss_model_ensemble,
    hist_corrects_model_ensemble,
    hist_loss_evaluate_model_ensemble,
    hist_corrects_evaluate_model_ensemble,
) = train(
    model=model_ensemble,
    device=device,
    train_loader=trainloader,
    optimizer=optimizer,
    epochs=10,
)