In [0]:
import torch 
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import time
import cv2 
import argparse
import os 
import os.path as osp
import pickle as pkl
import pandas as pd
import random
import torchvision.models as models
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import skimage
import pickle
#import bcolz
# ---- Model dimensions ------------------------------------------------------
# Width of each word-embedding vector (second argument of nn.Embedding below).
embedding_dim = 300
# The decoders below run for `max_caption_length - 1` steps.
max_caption_length = 25
# Input width of the `img_encode` linear layer (per-frame image feature size).
dim_image = 4096
# Hidden sizes of the first and second LSTM.
dim_hidden1 = 1000
dim_hidden2 = 1000
# Input sizes of the first and second LSTM. The second LSTM consumes a
# 300-wide vector concatenated with a 1000-wide vector, hence 1300.
dim_input1 = 1000
dim_input2 = 1300

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ---- File locations ---------------------------------------------------------
# The active values are relative Google Drive paths; the trailing comments keep
# the local "./data" / "./videos" alternatives.
yolo_weights_path = "drive/My Drive/end_to_end/data/yolov3.weights"#"./data/yolov3.weights"
coco_names_path =  "drive/My Drive/end_to_end/data/coco.names"       #"./data/coco.names"
yolo_config_path =  "drive/My Drive/end_to_end/data/yolov3.cfg"              #"./data/yolov3.cfg"
embedding_path =  "drive/My Drive/end_to_end/data/embedding.npy"  #"./data/embedding.npy"
wordtoindex_path =     "drive/My Drive/end_to_end/data/wordtoindex.pickle"                   #"./data/wordtoindex.pickle"
indextoword_path =    "drive/My Drive/end_to_end/data/indextoword.pickle"                  #"./data/indextoword.pickle"
video_path =    "drive/My Drive/end_to_end/videos/dance.mp4"    #"./videos/0hyZ__3YhZc_485_490.avi"
# NOTE(review): unlike the paths above this one is absolute ("/content/...")
# and points at a different Drive folder ("main") - confirm this is intended.
model_save_path = "/content/drive/My Drive/main/teacher_model"
# Numeric suffix of the checkpoint file that `generate` loads ("model<N>.pt").
model_num = 170

In [2]:
print("loading embeddings and vocab.....")
def save_dict(path , dct):
    """Serialise `dct` to the file at `path` using the highest pickle protocol.

    The file is opened in binary write mode, so existing content is replaced.
    """
    payload = pickle.dumps(dct, protocol=pickle.HIGHEST_PROTOCOL)
    with open(path, "wb") as sink:
        sink.write(payload)
def load_dict(path):
    """Read one pickled object from the file at `path` and return it.

    Unpickling can execute arbitrary code, so only use this on files you
    created yourself (e.g. via `save_dict`).
    """
    with open(path, "rb") as source:
        return pickle.load(source)

# Embedding matrix read from the .npy file, moved to `device` and cast to
# float64 (the caption model is also cast with .double() further down).
embedding = torch.tensor(np.load(embedding_path)).to(device).double()

# Pickled vocabulary lookup; indexed below with words (e.g. "<bos>").
wordtoindex = load_dict(wordtoindex_path)

# Pickled reverse lookup; indexed below with predicted token ids.
indextoword = load_dict(indextoword_path)




loading embeddings and vocab.....


In [0]:
def parse_cfg(cfgfile):
    """
    Parse a Darknet-style configuration file.

    Args:
        cfgfile: path of the configuration file.

    Returns:
        A list of dictionaries, one per ``[section]`` in file order. Each
        dictionary holds the section name under the key ``"type"`` plus one
        string entry per ``key=value`` line of that section.

    Raises:
        ValueError: if a non-comment, non-header line has no ``=``.
    """
    # `with` guarantees the handle is closed even if reading fails.
    with open(cfgfile, 'r') as cfg:
        raw_lines = cfg.read().split('\n')

    # Strip first, then filter: whitespace-only lines and indented comments
    # are dropped instead of reaching the parser below.
    lines = [x.strip() for x in raw_lines]
    lines = [x for x in lines if x and x[0] != '#']

    block = {}
    blocks = []

    for line in lines:
        if line[0] == "[":               # start of a new section
            if len(block) != 0:          # flush the section collected so far
                blocks.append(block)
                block = {}
            block["type"] = line[1:-1].rstrip()
        else:
            # Split on the first '=' only so values may themselves contain '='.
            key, value = line.split("=", 1)
            block[key.rstrip()] = value.lstrip()
    blocks.append(block)

    return blocks

In [0]:
def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = False):
    """Turn a raw detection feature map into a flat table of box predictions.

    Args:
        prediction: tensor indexed as (batch, channels, grid, grid) where
            channels == len(anchors) * (5 + num_classes).
        inp_dim: network input size; the stride is ``inp_dim // grid``.
        anchors: sequence of (width, height) anchor pairs.
        num_classes: number of class scores per box.
        CUDA: kept for backward compatibility. Helper tensors are now created
            on ``prediction.device`` with ``prediction.dtype``, so the flag no
            longer has to agree with where the prediction actually lives.

    Returns:
        Tensor of shape (batch, grid*grid*len(anchors), 5 + num_classes).
        Columns 0-1 are box centres and 2-3 box sizes, all multiplied by the
        stride; column 4 and the class columns are passed through a sigmoid.
    """
    batch_size = prediction.size(0)
    stride =  inp_dim // prediction.size(2)
    grid_size = inp_dim // stride
    bbox_attrs = 5 + num_classes
    num_anchors = len(anchors)
    # Every helper tensor follows the prediction itself (device and dtype).
    tensor_opts = {"dtype": prediction.dtype, "device": prediction.device}

    # (batch, attrs*anchors, g, g) -> (batch, g*g*anchors, attrs)
    prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size)
    prediction = prediction.transpose(1,2).contiguous()
    prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs)
    # Anchors are expressed in grid-cell units while the offsets are applied.
    anchors = [(a[0]/stride, a[1]/stride) for a in anchors]

    # Sigmoid the centre x, centre y and the object confidence.
    prediction[:,:,0] = torch.sigmoid(prediction[:,:,0])
    prediction[:,:,1] = torch.sigmoid(prediction[:,:,1])
    prediction[:,:,4] = torch.sigmoid(prediction[:,:,4])

    # Add the per-cell offsets to the centre coordinates.
    grid = np.arange(grid_size)
    a,b = np.meshgrid(grid, grid)

    x_offset = torch.as_tensor(a, **tensor_opts).reshape(-1,1)
    y_offset = torch.as_tensor(b, **tensor_opts).reshape(-1,1)

    x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0)

    prediction[:,:,:2] += x_y_offset

    # Width/height: exponentiate and scale by the matching anchor.
    anchors = torch.as_tensor(anchors, **tensor_opts)
    anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0)
    prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors

    # Independent sigmoid per class score.
    prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes]))

    # Back from grid-cell units to the scale of `inp_dim`.
    prediction[:,:,:4] *= stride

    return prediction
    

In [0]:
class EmptyLayer(nn.Module):
    """Parameter-free placeholder module.

    `create_modules` inserts it for "route" and "shortcut" blocks; the actual
    work for those blocks is done in `Darknet.forward`.
    """

    def __init__(self):
        super().__init__()
        

class DetectionLayer(nn.Module):
    """Holder for the anchor boxes of one "yolo" block.

    The anchors are stored unchanged on `self.anchors` and read back by
    `Darknet.forward`.
    """

    def __init__(self, anchors):
        super().__init__()
        self.anchors = anchors
                

In [0]:
def create_modules(blocks):
    """Build one ``nn.Sequential`` per configuration block.

    Args:
        blocks: list of dictionaries as returned by `parse_cfg`. ``blocks[0]``
            is treated as network-level information; every later entry
            becomes one module.

    Returns:
        ``(net_info, module_list)`` where ``net_info`` is ``blocks[0]`` and
        ``module_list`` is an ``nn.ModuleList`` aligned with ``blocks[1:]``.

    Side effects:
        For "route" blocks the ``"layers"`` entry is replaced in place by a
        list of strings, because `Darknet.forward` iterates over that list.
    """
    net_info = blocks[0]     # network-level information (input size etc.)
    module_list = nn.ModuleList()
    prev_filters = 3
    # Defined up front so a leading non-convolutional block cannot hit an
    # unbound name when the channel count is recorded at the end of the loop.
    filters = prev_filters
    output_filters = []

    for index, x in enumerate(blocks[1:]):
        module = nn.Sequential()

        if x["type"] == "convolutional":
            activation = x["activation"]
            try:
                batch_normalize = int(x["batch_normalize"])
            except (KeyError, ValueError):
                batch_normalize = 0
            # The conv carries its own bias exactly when no batch norm follows
            # (this also covers an explicit "batch_normalize=0").
            bias = not batch_normalize

            filters = int(x["filters"])
            padding = int(x["pad"])
            kernel_size = int(x["size"])
            stride = int(x["stride"])

            pad = (kernel_size - 1) // 2 if padding else 0

            conv = nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias = bias)
            module.add_module("conv_{0}".format(index), conv)

            if batch_normalize:
                bn = nn.BatchNorm2d(filters)
                module.add_module("batch_norm_{0}".format(index), bn)

            # "leaky" adds a LeakyReLU; any other activation adds nothing.
            if activation == "leaky":
                activn = nn.LeakyReLU(0.1, inplace = True)
                module.add_module("leaky_{0}".format(index), activn)

        elif x["type"] == "upsample":
            # Nearest-neighbour upsampling by the configured stride
            # (previously the stride was parsed but a factor of 2 was hard-coded).
            stride = int(x["stride"])
            upsample = nn.Upsample(scale_factor = stride, mode = "nearest")
            module.add_module("upsample_{}".format(index), upsample)

        elif x["type"] == "route":
            # Keep the list form that forward() relies on, but only split once
            # so calling this function again on the same blocks still works.
            if isinstance(x["layers"], str):
                x["layers"] = x["layers"].split(',')
            layers = x["layers"]
            start = int(layers[0])
            end = int(layers[1]) if len(layers) > 1 else 0
            # Positive values are absolute indices; make them relative to here.
            if start > 0:
                start = start - index
            if end > 0:
                end = end - index
            route = EmptyLayer()
            module.add_module("route_{0}".format(index), route)
            if end < 0:
                # Two sources are concatenated along the channel axis.
                filters = output_filters[index + start] + output_filters[index + end]
            else:
                filters = output_filters[index + start]

        # shortcut corresponds to a skip connection
        elif x["type"] == "shortcut":
            shortcut = EmptyLayer()
            module.add_module("shortcut_{}".format(index), shortcut)

        # yolo is the detection layer
        elif x["type"] == "yolo":
            mask = [int(m) for m in x["mask"].split(",")]

            flat = [int(a) for a in x["anchors"].split(",")]
            pairs = [(flat[i], flat[i+1]) for i in range(0, len(flat), 2)]
            anchors = [pairs[i] for i in mask]

            detection = DetectionLayer(anchors)
            module.add_module("Detection_{}".format(index), detection)

        module_list.append(module)
        prev_filters = filters
        output_filters.append(filters)
    return (net_info, module_list)

In [0]:
class Darknet(nn.Module):
    """Detector assembled from a Darknet configuration file.

    `self.blocks` holds the parsed configuration, `self.net_info` its first
    block and `self.module_list` one module per remaining block.
    """

    def __init__(self, cfgfile):
        super(Darknet, self).__init__()
        self.blocks = parse_cfg(cfgfile)
        self.net_info, self.module_list = create_modules(self.blocks)

    def forward(self, x, CUDA):
        """Run the network and return all detections concatenated.

        Args:
            x: input batch.
            CUDA: forwarded unchanged to `predict_transform`.

        Returns:
            The outputs of every "yolo" block, transformed by
            `predict_transform` and concatenated along dimension 1.
        """
        modules = self.blocks[1:]
        outputs = {}   # cache of every block's output for route/shortcut lookups

        write = 0      # becomes 1 once the first detection tensor is stored
        for i, module in enumerate(modules):
            module_type = (module["type"])

            if module_type == "convolutional" or module_type == "upsample":
                x = self.module_list[i](x)

            elif module_type == "route":
                # "layers" was turned into a list of strings by create_modules.
                layers = [int(a) for a in module["layers"]]

                # Positive entries are absolute indices; make them relative.
                if (layers[0]) > 0:
                    layers[0] = layers[0] - i

                if len(layers) == 1:
                    x = outputs[i + (layers[0])]

                else:
                    if (layers[1]) > 0:
                        layers[1] = layers[1] - i

                    map1 = outputs[i + layers[0]]
                    map2 = outputs[i + layers[1]]
                    x = torch.cat((map1, map2), 1)

            elif  module_type == "shortcut":
                from_ = int(module["from"])
                x = outputs[i-1] + outputs[i+from_]

            elif module_type == 'yolo':
                anchors = self.module_list[i][0].anchors
                # Get the input dimensions
                inp_dim = int (self.net_info["height"])

                # Get the number of classes
                num_classes = int (module["classes"])

                # Transform (on the raw data, outside autograd)
                x = x.data
                x = predict_transform(x, inp_dim, anchors, num_classes, CUDA)
                if not write:              # no collector has been initialised yet
                    detections = x
                    write = 1

                else:
                    detections = torch.cat((detections, x), 1)

            outputs[i] = x

        return detections

    def load_weights(self , weightfile):
        """Copy weights from a binary Darknet weight file into the modules.

        The file is read as five int32 header values followed by float32
        values. For each "convolutional" block the values are consumed in the
        order: batch-norm bias, weight, running mean, running variance (or the
        conv bias when the block has no batch norm), then the conv weights.

        Args:
            weightfile: path of the weight file.
        """
        # Read everything up front so the handle is closed before the copy loop.
        with open(weightfile , "rb") as fp:
            header = np.fromfile(fp , dtype = np.int32 , count = 5)
            weights = np.fromfile(fp , dtype = np.float32)
        self.header = torch.tensor(header)
        self.seen = self.header[3]

        ptr = 0   # read position inside `weights`
        for i in range(len(self.module_list)):
            module_type = self.blocks[i + 1]["type"]
            if(module_type == "convolutional"):
                model = self.module_list[i]
                try:
                    batch_normalize = int(self.blocks[i + 1]["batch_normalize"])
                except (KeyError, ValueError):
                    batch_normalize = 0
                conv = model[0]
                if (batch_normalize):
                    bn = model[1]

                    # All four batch-norm vectors have the same length.
                    num_bn_biases = bn.bias.numel()

                    bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases])
                    ptr += num_bn_biases

                    bn_weights = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr  += num_bn_biases

                    bn_running_mean = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr  += num_bn_biases

                    bn_running_var = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr  += num_bn_biases

                    # Reshape to the shapes of the module's own tensors.
                    bn_biases = bn_biases.view_as(bn.bias.data)
                    bn_weights = bn_weights.view_as(bn.weight.data)
                    bn_running_mean = bn_running_mean.view_as(bn.running_mean)
                    bn_running_var = bn_running_var.view_as(bn.running_var)

                    # Copy the data into the model.
                    bn.bias.data.copy_(bn_biases)
                    bn.weight.data.copy_(bn_weights)
                    bn.running_mean.copy_(bn_running_mean)
                    bn.running_var.copy_(bn_running_var)

                else:
                    num_biases = conv.bias.numel()

                    conv_biases = torch.from_numpy(weights[ptr: ptr + num_biases])
                    ptr = ptr + num_biases
                    conv_biases = conv_biases.view_as(conv.bias.data)
                    conv.bias.data.copy_(conv_biases)

                # Convolution weights follow in every case.
                num_weights = conv.weight.numel()

                conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights])
                ptr = ptr + num_weights

                conv_weights = conv_weights.view_as(conv.weight.data)
                conv.weight.data.copy_(conv_weights)


In [8]:
print("loading object detector...")
# Build the detector from the config file, move it to `device` and switch it
# to evaluation mode, then fill it from the weight file.
model = Darknet(yolo_config_path).to(device).eval()
model.load_weights(yolo_weights_path)


loading object detector...


In [0]:
def unique(tensor):
    """Return the sorted distinct values of `tensor`.

    The values are computed with NumPy on the CPU; the result is a new tensor
    with the same dtype and device as the input.
    """
    distinct = np.unique(tensor.cpu().numpy())
    result = torch.empty(distinct.shape, dtype=tensor.dtype, device=tensor.device)
    result.copy_(torch.from_numpy(distinct))
    return result

In [0]:
def bbox_iou(box1 , box2):
    """Intersection-over-union of corner-format boxes.

    Both arguments are 2-D tensors whose first four columns are
    (x1, y1, x2, y2); the two are broadcast against each other. Widths and
    heights are computed inclusively (``x2 - x1 + 1``).
    """
    ax1, ay1, ax2, ay2 = (box1[:, k] for k in range(4))
    bx1, by1, bx2, by2 = (box2[:, k] for k in range(4))

    # Overlap extent per axis, clamped so disjoint boxes contribute zero.
    overlap_w = torch.clamp(torch.min(ax2, bx2) - torch.max(ax1, bx1) + 1, min=0)
    overlap_h = torch.clamp(torch.min(ay2, by2) - torch.max(ay1, by1) + 1, min=0)
    inter_area = overlap_w * overlap_h

    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)

    return inter_area / (area_a + area_b - inter_area)

In [0]:
def write_results(prediction , confidence , num_classes , nms_conf = .4):
    """Threshold detections and apply per-class non-maximum suppression.

    Args:
        prediction: tensor of shape (batch, boxes, 5 + num_classes) with
            columns (centre x, centre y, width, height, objectness, class
            scores...).
        confidence: rows whose objectness is not above this are discarded.
        num_classes: number of class-score columns.
        nms_conf: within a class, a box is suppressed when its IoU with a
            higher-objectness kept box is not below this value.

    Returns:
        A tensor with one row per kept detection and 8 columns: batch index,
        x1, y1, x2, y2, objectness, best class score, best class index.
        Returns the integer ``0`` when nothing survives.
    """
    conf_mask = (prediction[: , : , 4] > confidence).float().unsqueeze(2)
    prediction = prediction*conf_mask

    # Convert (centre, size) boxes to corner format.
    box_corner = prediction.new(prediction.shape)
    box_corner[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2)
    box_corner[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2)
    box_corner[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2)
    box_corner[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2)
    prediction[:,:,:4] = box_corner[:,:,:4]

    kept = []   # one tensor per (image, class) that still has detections
    for ind in range(prediction.size(0)):
        image_pred = prediction[ind]

        # Replace the class-score columns by (best score, best class index).
        max_conf , max_conf_score  = torch.max(image_pred[: , 5:5 + num_classes] , 1)
        max_conf = max_conf.float().unsqueeze(1)
        max_conf_score = max_conf_score.float().unsqueeze(1)
        image_pred = torch.cat((image_pred[:,:5] , max_conf , max_conf_score) , 1)

        # Rows zeroed by the confidence mask carry no detection.
        non_zero_ind = torch.nonzero(image_pred[:,4])
        if non_zero_ind.numel() == 0:
            continue
        image_pred_ = image_pred[non_zero_ind.squeeze(1) , :].view(-1,7)

        for cls in unique(image_pred_[:,-1]):
            cls_mask = image_pred_*(image_pred_[:,-1] == cls).float().unsqueeze(1)
            class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze()
            image_pred_class = image_pred_[class_mask_ind].view(-1, 7)

            # Highest objectness first.
            conf_sort_index = torch.sort(image_pred_class[:,4] , descending = True)[1]
            image_pred_class = image_pred_class[conf_sort_index]

            # Greedy NMS. The tensor shrinks as boxes are suppressed, so the
            # bound is re-evaluated each round; the last row has nothing
            # after it to compare against and is therefore skipped.
            i = 0
            while i + 1 < image_pred_class.size(0):
                ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:])
                iou_mask = (ious < nms_conf).float().unsqueeze(1)
                image_pred_class[i+1:] = image_pred_class[i+1:]*iou_mask
                survivors = torch.nonzero(image_pred_class[: , 4])
                image_pred_class = image_pred_class[survivors].view(-1,7)
                i += 1

            batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
            kept.append(torch.cat((batch_ind, image_pred_class) , 1))

    if not kept:
        # Integer sentinel that callers test with `type(res) is int`.
        return 0
    return torch.cat(kept)
                
    
        


In [0]:
def load_classes(namesfile):
    """Read class names, one per line, from `namesfile`.

    Only the empty string produced by a trailing newline is removed, so the
    last name is kept whether or not the file ends with a newline. Blank
    lines elsewhere are preserved to keep list positions unchanged.

    Returns:
        List of names in file order.
    """
    with open(namesfile, "r") as fp:
        names = fp.read().split("\n")
    if names and names[-1] == "":
        names.pop()
    return names

In [13]:
print("loading object classes....")
num_classes = 80    #For COCO
# Class names in file order; indexed later with the detector's class index.
classes = load_classes(coco_names_path)

loading object classes....


In [14]:
print("loading feature extractor...")
# NOTE(review): newer torchvision releases replace `pretrained=` with a
# `weights=` argument - confirm against the installed version.
vgg16 = models.vgg16(pretrained=True)
# Keep only the first four layers of the classifier head, so the network
# outputs an intermediate feature vector instead of class scores.
vgg16.classifier = nn.Sequential(*[vgg16.classifier[i] for i in range(4)])
# The extractor is used for inference only: freeze every parameter.
for p in vgg16.parameters():
    p.requires_grad = False
vgg16 = vgg16.to(device).eval()
# Number of frames sampled from a video when it has more than that.
num_frames = 80
# Width of the per-frame object vector; same value as `embedding_dim` above.
dim_embedding = 300    

loading feature extractor...


In [0]:
def preprocess_frame(image, target_height=224, target_width=224):
    """Convert a frame to float32 and resize it with a centre crop.

    2-D (single-channel) input is tiled to three channels; for 4-D input only
    index 0 of the last axis is kept. Square frames are resized directly.
    Other frames are first scaled so the shorter side matches the target, the
    longer side is cropped symmetrically, and a final resize removes any
    rounding mismatch.

    NOTE(review): the size tuples handed to cv2.resize are written as
    (target_height, target_width). With the default square target that is
    indistinguishable from (width, height) - confirm before using a
    non-square target.
    """
    rank = len(image.shape)
    if rank == 2:
        image = np.tile(image[:, :, None], 3)
    elif rank == 4:
        image = image[:, :, :, 0]

    image = skimage.img_as_float(image).astype(np.float32)
    height, width, _channels = image.shape
    final_size = (target_height, target_width)

    if width == height:
        resized_image = cv2.resize(image, final_size)
    elif height < width:
        # Landscape: scale, then trim the same margin from left and right.
        scaled_width = int(width * float(target_height) / height)
        resized_image = cv2.resize(image, (scaled_width, target_width))
        margin = int((resized_image.shape[1] - target_height) / 2)
        resized_image = resized_image[:, margin:resized_image.shape[1] - margin]
    else:
        # Portrait: scale, then trim the same margin from top and bottom.
        scaled_height = int(height * float(target_width) / width)
        resized_image = cv2.resize(image, (target_height, scaled_height))
        margin = int((resized_image.shape[0] - target_width) / 2)
        resized_image = resized_image[margin:resized_image.shape[0] - margin, :]

    return cv2.resize(resized_image, final_size)

In [0]:
# Detector class names that are replaced by a single word before the
# vocabulary lookup; any name not listed here is used unchanged.
_CLASS_NAME_ALIASES = {
    "traffic light": "traffic",
    "fire hydrant": "extinguisher",
    "stop sign": "signboard",
    "parking meter": "meter",
    "sports ball": "ball",
    "baseball bat": "bat",
    "pottedplant": "plant",
    "baseball glove": "glove",
    "tennis racket": "racket",
    "wine glass": "glass",
    "hot dog": "food",
    "diningtable": "table",
    "tvmonitor": "tv",
    "cell phone": "phone",
    "teddy bear": "toy",
    "hair drier": "drier",
    "aeroplane": "plane",
}


def convert_names(name):
    """Map a detector class name to its single-word replacement.

    Returns the replacement when `name` has one, otherwise `name` itself.
    """
    return _CLASS_NAME_ALIASES.get(name, name)

In [17]:
print("Reading your video...(hope i enjoy it)")
# Open the input video; the frames are read from `cap` in the next cell.
cap = cv2.VideoCapture(video_path)

Reading your video...(hope i enjoy it)


In [18]:
frame_count = 0
frame_list = []

# Read every frame of the video and convert it from BGR to RGB.
while True:
    ret , frame = cap.read()
    if(ret == False):
        break
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame_list.append(frame)
    frame_count = frame_count + 1
# The capture is exhausted at this point; free the underlying handle.
cap.release()
if frame_count == 0:
    # A capture that cannot be opened simply yields no frames; fail here with
    # a clear message instead of deep inside the feature extractor.
    raise RuntimeError("no frames could be read from " + video_path)

frame_list = np.array(frame_list)
# Longer videos are sub-sampled to `num_frames` evenly spaced frames.
if frame_count > num_frames:
    frame_indices = np.linspace(0, frame_count, num=num_frames, endpoint=False).astype(int)
    frame_list = frame_list[frame_indices]

# Preprocess every frame and reorder the axes from (H, W, C) to (C, H, W).
cropped_array = np.stack([preprocess_frame(frame).transpose(2,0,1) for frame in frame_list])
# Two independent copies: the VGG copy is normalised in place below.
vgg_tensor = torch.tensor(cropped_array).to(device)

yolo_tensor  = torch.tensor(cropped_array).to(device)
norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
for i in range(vgg_tensor.size(0)):
    vgg_tensor[i] = norm(vgg_tensor[i])

# One embedding row per frame; rows stay zero when nothing is detected.
object_list = np.zeros((yolo_tensor.size(0) , dim_embedding))
cuda = torch.cuda.is_available()
# Inference only: no autograd graph is needed for either network.
with torch.no_grad():
    for yolo_frame in range(yolo_tensor.size(0)):
        inp = yolo_tensor[yolo_frame:yolo_frame+1]
        pred = model(inp , cuda)
        res = write_results(pred , .65 , num_classes)

        # write_results returns the integer 0 when there is no detection.
        if(type(res) is int):
            continue
        # Keep the detection with the highest value in column 5 and use the
        # word embedding of its class as the frame's object feature.
        _, index = torch.sort(res[: , 5] , 0 , descending = True)
        res = res[index][0].to("cpu").numpy()
        class_index = int(res[-1].item())
        obj  = classes[class_index]
        object_list[yolo_frame] = embedding[wordtoindex[convert_names(obj)]].to("cpu").numpy()
    vgg_features  = vgg16(vgg_tensor).detach().to("cpu").numpy()
print("yaaaaawn .... That was a boring video :3")

yaaaaawn .... That was a boring video :3


In [19]:
# Move both feature arrays to `device` as float64 tensors and add a leading
# dimension of size 1 with unsqueeze(0).
object_features = torch.tensor(object_list).to(device).double().unsqueeze(0)
vgg_features = torch.tensor(vgg_features).to(device).double().unsqueeze(0)
print("Loading caption model")

Loading caption model


In [0]:
class Video_captioner(nn.Module):
    """Two-LSTM captioning model without attention.

    The same two LSTMs are used for an encoding pass over the video features
    and then for a decoding pass that starts from the encoder's final states.
    Reads the module-level `embedding`, `wordtoindex`, `device` and the
    `dim_*` / `max_caption_length` constants.
    """
    def __init__(self):
        super().__init__()
        # Embedding table initialised from the global `embedding` and frozen.
        self.embedding = nn.Embedding(len(wordtoindex), embedding_dim)
        self.embedding.weight = nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False
        self.lstm1 = nn.LSTM(dim_input1 , dim_hidden1, batch_first = True)
        self.lstm2 = nn.LSTM(dim_input2 , dim_hidden2 , batch_first = True)
        # Projects image features to the input size of the first LSTM.
        self.img_encode = nn.Linear(dim_image , dim_input1)
        # Projects second-LSTM outputs to one score per vocabulary entry.
        self.embed_word = nn.Linear(dim_hidden2 ,len(wordtoindex))
    def forward(self  , video , objects , feed , teacher = True ):
        """Encode the video and decode a caption.

        Args:
            video: image features, passed through `img_encode`.
            objects: object features, concatenated with the first LSTM's
                output along dimension 2.
            feed: token indices looked up in the embedding table.
            teacher: when True the embedded `feed` sequence is used as decoder
                input; otherwise each step is fed the embedding of the
                previous step's argmax word.

        Returns:
            ``(scores, meaning_out)``: the vocabulary scores reshaped to
            ``(-1, len(wordtoindex))`` and the scores multiplied by the global
            `embedding` matrix.
        """
        emb = self.embedding(feed)
        out  = self.img_encode(video)
        #encoding
        out11 , state11 = self.lstm1(out)
        lstm2_inp = torch.cat( (objects , out11) , dim = 2   )
        out12 , state12 = self.lstm2(lstm2_inp)
        #decoding
        if(teacher == True):
            # The first LSTM receives zeros, one step per fed token.
            padding = torch.zeros(video.size(0) , feed.size(1) , dim_input1).to(device).double()
            out21 , state21 = self.lstm1(padding, state11)
            lstm2_inp2 = torch.cat( (emb , out21)  , dim = 2 )
            out22 , state22 = self.lstm2(lstm2_inp2 , state12)
        else:
            padding = torch.zeros(video.size(0) , max_caption_length-1 , dim_input1).to(device).double()
            out21 , state21 = self.lstm1(padding , state11)
            # Collects the second LSTM's output for every decoding step.
            out22 = torch.zeros(video.size(0) , max_caption_length  -1, dim_hidden2).to(device).double()
            state__ = state12
            for i in range(max_caption_length -1):
                # `emb` is concatenated with a length-1 slice, so it must have
                # length 1 along dimension 1 here.
                inp_lstm2 = torch.cat((emb , out21[: , i:i+1, :]) , dim = 2)
                out__ , state__ = self.lstm2(inp_lstm2 , state__)
                out22[: , i:i+1 , :] = out__
                # Pick the next input word greedily, outside autograd.
                with torch.no_grad():
                    out__  = self.embed_word(out__)
                    out__ = torch.argmax(out__ , dim = 2)                    
                    out__ = self.embedding(out__)
                    emb = out__
                    
                    
                
                
        out22 = out22.contiguous()    
        soft  = self.embed_word(out22)
        meaning_out = torch.matmul(soft , embedding)
        return soft.view(-1, len(wordtoindex)) , meaning_out 
        
        

In [0]:
class Attention_model(nn.Module):
    """Two-LSTM captioning model with attention over the encoder outputs.

    Reads the module-level `embedding`, `wordtoindex`, `device` and the
    `dim_*` / `max_caption_length` constants. Attribute names are part of the
    checkpoint format (`load_state_dict`) and must not be renamed.
    """
    def __init__(self ):
        super().__init__()
        # Embedding table initialised from the global `embedding` and frozen.
        self.embedding = nn.Embedding(len(wordtoindex), embedding_dim)
        self.embedding.weight = nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False
        self.img_encode = nn.Linear(dim_image , dim_input1)
        self.lstm1 = nn.LSTM(dim_input1 , dim_hidden1, batch_first = True)
        self.lstm2 = nn.LSTM(dim_input2 , dim_hidden2 , batch_first = True)
        self.embed_word = nn.Linear(dim_hidden2 ,len(wordtoindex))
        self.attn = nn.Linear(dim_hidden1  , dim_hidden2)
        self.tanh = nn.Tanh()
        self.context_inp = nn.Linear(dim_hidden1 , dim_hidden1)

    def calculate_attention(self, prev_out , encoder_out):
        """Softmax attention weights of `prev_out` over the encoder steps.

        `encoder_out` is projected with `self.attn`, multiplied with
        `prev_out` via `bmm`, and normalised over the last dimension.
        """
        keys = self.attn(encoder_out).transpose(1,2)
        att_energy = torch.bmm(prev_out , keys)
        return F.softmax(att_energy , dim = 2)

    def _decode(self , encoder_out , emb , decoder_state , teacher_forcing):
        """Shared decoding loop used by both public decoder methods.

        Runs `max_caption_length - 1` steps. With `teacher_forcing` the input
        word of step i is `emb[:, i:i+1, :]`; otherwise it is `emb` for the
        first step and afterwards the embedding of the previous step's argmax
        word (computed without gradient tracking).
        """
        batch = encoder_out.size(0)
        steps = max_caption_length - 1
        prev_out = torch.zeros(batch , 1 , dim_hidden2).to(device).double()
        out22 = torch.zeros(batch , steps , dim_hidden2).to(device).double()
        step_emb = emb
        for i in range(steps):
            if teacher_forcing:
                step_emb = emb[: , i:i+1 , :]
            attention_energy = self.calculate_attention(prev_out , encoder_out)
            context = torch.bmm(attention_energy , encoder_out)
            context = self.context_inp(context)
            lstm2_inp2 = torch.cat((step_emb , context ), dim = 2)
            decoder_out , decoder_state = self.lstm2(lstm2_inp2 , decoder_state)
            out22[: , i:i+1 , :] = decoder_out
            prev_out = decoder_out
            if not teacher_forcing:
                with torch.no_grad():
                    word_scores = self.embed_word(decoder_out)
                    step_emb = self.embedding(torch.argmax(word_scores , dim = 2))
        return out22

    def decoder_teacher(self , encoder_out , emb , decoder_state):
        """Decode using the embedded ground-truth sequence `emb` as input."""
        return self._decode(encoder_out , emb , decoder_state , True)

    def decoder_non_teacher(self , encoder_out , emb , decoder_state):
        """Decode greedily, starting from the single embedded token `emb`."""
        return self._decode(encoder_out , emb , decoder_state , False)

    def forward(self  , video , objects , feed , teacher = True):
        """Encode the video and decode a caption.

        Args:
            video: image features, passed through `img_encode`.
            objects: object features, concatenated with the first LSTM's
                output along dimension 2.
            feed: token indices looked up in the embedding table.
            teacher: selects `decoder_teacher` when equal to True, otherwise
                `decoder_non_teacher`.

        Returns:
            ``(scores, meaning_out)``: the vocabulary scores reshaped to
            ``(-1, len(wordtoindex))`` and the scores multiplied by the global
            `embedding` matrix.
        """
        emb = self.embedding(feed)
        video  = self.img_encode(video)
        # encode
        out11  , state11 = self.lstm1(video)
        lstm2_inp = torch.cat( (objects , out11) , dim = 2 )
        out12  , decoder_state  = self.lstm2(lstm2_inp)
        encoder_out = out11
        # decode
        if(teacher == True):
            out22  = self.decoder_teacher(encoder_out , emb , decoder_state)
        else:
            out22 = self.decoder_non_teacher(encoder_out , emb , decoder_state)

        out22 = out22.contiguous()
        soft  = self.embed_word(out22)
        meaning_out = torch.matmul(soft , embedding)
        return soft.view(-1, len(wordtoindex)) , meaning_out

In [0]:
# Caption model on `device`, cast to float64 and put in evaluation mode.
video_model = Attention_model().to(device).double().eval()
#video_model =  Video_captioner().to(device).double().eval()
# Index of the "<bos>" token, reshaped from a scalar to shape (1, 1).
bos = torch.tensor(wordtoindex["<bos>"]).to(device)
bos = bos.unsqueeze(0).unsqueeze(0)

In [0]:
def generate(model_num):
    """Load checkpoint `model_num` into `video_model` and print a caption.

    The checkpoint is read from ``model_save_path/model<model_num>.pt``. The
    caption is decoded without teacher forcing from the module-level
    `vgg_features`, `object_features` and `bos`, cut at the first "<eos>"
    and printed.
    """
    checkpoint = os.path.join(model_save_path , "model" + str(model_num) + ".pt")
    # map_location=device covers both the GPU and the CPU-only case.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    video_model.load_state_dict(torch.load(checkpoint , map_location=device))

    video , yolo = vgg_features , object_features
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        out_ = video_model(video , yolo , bos , False)
    token_ids = torch.argmax(out_[0] , dim = 1).to("cpu").numpy()

    sent = []
    for i in token_ids:
        w = indextoword[i]
        if(w == "<eos>"):
            break
        sent.append(w)
    sent = " ".join(sent)

    print(" ")
    print(" ")
    print(">> " , sent , " <<")
        


In [24]:
print("The caption is......drum rollllll")
# Load the checkpoint selected by `model_num` and print the caption.
generate(model_num)

The caption is......drum rollllll
 
 
>>  a woman is washing a box  <<
