In [0]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import pickle
import torch 
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import random
import matplotlib.pyplot as plt
import re
import torch.nn.functional as F
# ---------------------------------------------------------------------------
# Input locations (shared "common" folder on the mounted drive)
# ---------------------------------------------------------------------------
corpus_path = "drive/My Drive/common/video_corpus.csv"
vgg_path = "drive/My Drive/common/cnn_features"
yolo_path = "drive/My Drive/common/object_features"
wordtoindex_path = "drive/My Drive/common/wordtoindex.pickle"
indextoword_path = "drive/My Drive/common/indextoword.pickle"
embedding_path = "drive/My Drive/common/embedding.npy"
train_names_path = "drive/My Drive/common/train_names.npy"
valid_names_path = "drive/My Drive/common/valid_names.npy"
test_names_path = "drive/My Drive/common/test_names.npy"
sentence_model_path = "drive/My Drive/common/sentence_compare_model"

# ---------------------------------------------------------------------------
# Checkpoint and result locations ("main" folder)
# ---------------------------------------------------------------------------
model_save_path1 = "drive/My Drive/main/model_loss1"
model_save_path2 = "drive/My Drive/main/model_loss2"
model_save_path3 = "drive/My Drive/main/model_loss3"
model_save_prob = "drive/My Drive/main/model_prob"
model_save_teach = "drive/My Drive/main/teacher_model"
result_path = "drive/My Drive/main/results_teach"

# ---------------------------------------------------------------------------
# Runtime device: GPU when one is available, CPU otherwise
# ---------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------------------------------------------------------------------
# Model / data dimensions
# ---------------------------------------------------------------------------
max_caption_length = 25   # captions are decoded for max_caption_length - 1 steps
embedding_dim = 300       # width of one row of the word-embedding matrix
dim_image = 4096          # input width of the image-feature projection layer
dim_input1 = 1000         # input width of the first LSTM
dim_hidden1 = 1000        # hidden width of the first LSTM
dim_input2 = 1300         # input width of the second LSTM (a 300-wide and a 1000-wide part concatenated)
dim_hidden2 = 1000        # hidden width of the second LSTM



In [0]:
def save_dict(path , dct):
    """Pickle ``dct`` into the file at ``path``.

    The file is opened in binary write mode (an existing file is
    overwritten) and the object is written with the highest pickle
    protocol supported by the running interpreter.
    """
    with open(path , "wb") as sink:
        writer = pickle.Pickler(sink , protocol = pickle.HIGHEST_PROTOCOL)
        writer.dump(dct)
def load_dict(path):
    """Return the object unpickled from the file at ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so this
    must only be pointed at files that come from a trusted source.
    """
    with open(path, "rb") as source:
        return pickle.load(source)

In [0]:
# Vocabulary lookups stored as pickles (word -> index and index -> word).
wordtoindex = load_dict(wordtoindex_path)
indextoword = load_dict(indextoword_path)

# Embedding matrix: read from disk, cast to float32, then turned into a
# float64 tensor on `device` (the models in this notebook run in double).
embedding = torch.tensor(np.load(embedding_path).astype(np.float32)).to(device).double()

In [0]:
def clean(s):
    """Turn a raw caption string into a list of vocabulary indices.

    Every character that is not an ASCII letter, a space or a double quote
    is removed (the character class of the pattern contains the ``"``
    character, so quotes survive). The result is stripped, lower-cased and
    split on single spaces. Tokens that are not keys of ``wordtoindex`` are
    dropped silently; the remaining indices are wrapped in the ``<bos>`` and
    ``<eos>`` indices.
    """
    stripped = re.compile('[^a-zA-Z" "]').sub("" , s)
    tokens = stripped.strip().lower().split(" ")
    known = [wordtoindex[token] for token in tokens if token in wordtoindex]
    return [wordtoindex["<bos>"], *known, wordtoindex["<eos>"]]
        

In [0]:
def get_data(tp):
    """Load the captions of one dataset split, grouped per video.

    The corpus CSV at ``corpus_path`` is restricted to English rows. For
    every row a feature file name ``<VideoID>_<Start>_<End>.avi.npy`` is
    derived and stored in ``video_name``; ``video_path`` and ``yolo_path``
    hold that name joined onto ``vgg_path`` and ``yolo_path``. Only rows
    whose ``video_name`` is listed in the split's names file and whose
    ``Description`` is a string are kept.

    For ``"train"`` and ``"valid"`` the descriptions are additionally
    converted to index lists with ``clean`` and only captions with
    ``3 < len < max_caption_length`` are kept; the number of remaining rows
    is printed. For ``"test"`` the descriptions are returned as raw strings.

    Parameters
    ----------
    tp : str
        One of ``"train"``, ``"test"`` or ``"valid"``.

    Returns
    -------
    pandas DataFrameGroupBy
        The remaining rows grouped by ``["video_path", "yolo_path"]``.

    Raises
    ------
    ValueError
        If ``tp`` is not one of the three known split names.
    """
    split_name_files = {
        "train": train_names_path,
        "test": test_names_path,
        "valid": valid_names_path,
    }
    # Fail early and clearly instead of hitting an unbound `names` later on.
    if tp not in split_name_files:
        raise ValueError(
            "tp must be one of %s, got %r" % (sorted(split_name_files), tp)
        )

    video_data = pd.read_csv(corpus_path, sep=',')
    # .copy(): the column assignments below must write to an independent
    # frame, not to a filtered view of the CSV frame.
    video_data = video_data[video_data['Language'] == 'English'].copy()

    # Build the feature file name once (vectorised) and reuse it for all
    # three derived columns.
    file_names = (
        video_data['VideoID']
        + '_' + video_data['Start'].astype(int).astype(str)
        + '_' + video_data['End'].astype(int).astype(str)
        + '.avi.npy'
    )
    video_data['video_name'] = file_names
    video_data['video_path'] = file_names.map(lambda name: os.path.join(vgg_path, name))
    video_data['yolo_path'] = file_names.map(lambda name: os.path.join(yolo_path, name))

    # NOTE: allow_pickle=True can execute code from the file; the names
    # files must come from a trusted source.
    names = np.load(split_name_files[tp], allow_pickle=True).ravel().tolist()
    video_data = video_data[video_data['video_name'].isin(names)]

    # .astype(bool) keeps the mask a boolean indexer even for an empty frame.
    is_text = video_data['Description'].map(lambda x: isinstance(x, str)).astype(bool)
    video_data = video_data[is_text]

    if tp != "test":
        video_data = video_data.copy()
        video_data["Description"] = video_data["Description"].map(clean)
        length_ok = video_data['Description'].map(
            lambda x: 3 < len(x) < max_caption_length
        ).astype(bool)
        video_data = video_data[length_ok]
        print(len(video_data))

    return video_data.groupby(["video_path" , "yolo_path"])

In [0]:
# Captions of the test split, grouped by (video_path, yolo_path); iterating
# `df` yields ((video_path, yolo_path), rows) pairs.
df = get_data("test")

In [0]:
class Video_captioner(nn.Module):
    """Two-LSTM encoder/decoder captioner without attention.

    Encoding: projected frame features run through ``lstm1``; its outputs are
    concatenated with ``objects`` and run through ``lstm2``. Decoding reuses
    both LSTMs, starting from the encoder's final states: ``lstm1`` is fed
    zeros and ``lstm2`` is fed word embeddings concatenated with the
    ``lstm1`` outputs.

    The embedding table is initialised from the module-level ``embedding``
    tensor and frozen.
    """
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(len(wordtoindex), embedding_dim)
        self.embedding.weight = nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False
        self.lstm1 = nn.LSTM(dim_input1 , dim_hidden1, batch_first = True)
        self.lstm2 = nn.LSTM(dim_input2 , dim_hidden2 , batch_first = True)
        self.img_encode = nn.Linear(dim_image , dim_input1)
        self.embed_word = nn.Linear(dim_hidden2 ,len(wordtoindex))

    def forward(self  , video , objects , feed , teacher = True ):
        """Run the captioner.

        Parameters
        ----------
        video : tensor (batch, frames, dim_image)
            Frame features; projected by ``img_encode``.
        objects : tensor (batch, frames, k)
            Concatenated with the ``lstm1`` outputs along the last axis, so
            ``k + dim_hidden1`` must equal ``dim_input2``.
        feed : integer tensor
            Word indices. With ``teacher`` true: (batch, steps), one decoder
            step per column. Otherwise: (batch, 1), the start token only.
        teacher : bool
            True decodes the whole ``feed`` in one pass (teacher forcing);
            False decodes ``max_caption_length - 1`` steps greedily, feeding
            back the embedding of each step's arg-max word.

        Returns
        -------
        (soft, emb)
            ``soft``: vocabulary scores flattened to
            (batch * steps, len(wordtoindex)). ``emb``: the unflattened
            scores multiplied with the embedding matrix,
            (batch, steps, embedding_dim).
        """
        batch_size = video.size(0)
        emb = self.embedding(feed)
        frames = self.img_encode(video)

        # encoding
        out11 , state11 = self.lstm1(frames)
        _ , state12 = self.lstm2(torch.cat((objects , out11) , dim = 2))

        # decoding
        # Work tensors are created with new_zeros so they follow the dtype and
        # device of the model's activations instead of a hard-wired
        # global device / float64.
        if teacher:
            padding = frames.new_zeros(batch_size , feed.size(1) , dim_input1)
            out21 , _ = self.lstm1(padding , state11)
            out22 , _ = self.lstm2(torch.cat((emb , out21) , dim = 2) , state12)
        else:
            steps = max_caption_length - 1
            padding = frames.new_zeros(batch_size , steps , dim_input1)
            out21 , _ = self.lstm1(padding , state11)
            out22 = frames.new_zeros(batch_size , steps , dim_hidden2)
            state = state12
            for i in range(steps):
                step_inp = torch.cat((emb , out21[: , i:i+1 , :]) , dim = 2)
                step_out , state = self.lstm2(step_inp , state)
                out22[: , i:i+1 , :] = step_out
                # Choosing the next input word is not differentiated.
                with torch.no_grad():
                    next_word = torch.argmax(self.embed_word(step_out) , dim = 2)
                    emb = self.embedding(next_word)

        soft = self.embed_word(out22.contiguous())
        # Use the module's own embedding weight (the one that is moved, cast
        # and restored from checkpoints with the model), not the global tensor.
        emb = torch.matmul(soft , self.embedding.weight)
        return soft.view(-1, len(wordtoindex)) , emb
        
        
        

        
        

In [0]:
class Attention_model(nn.Module):
    """Two-LSTM captioner whose decoder attends over the ``lstm1`` outputs.

    Encoding: projected frame features run through ``lstm1``; its outputs
    (the attention memory) are concatenated with ``objects`` and run through
    ``lstm2``, whose final state initialises the decoder. Each decoder step
    computes attention weights from the previous decoder output, builds a
    context vector from the ``lstm1`` outputs and feeds the word embedding
    concatenated with the projected context into ``lstm2``.

    The embedding table is initialised from the module-level ``embedding``
    tensor and frozen.
    """
    def __init__(self ):
        super().__init__()
        self.embedding = nn.Embedding(len(wordtoindex), embedding_dim)
        self.embedding.weight = nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False
        self.img_encode = nn.Linear(dim_image , dim_input1)
        self.lstm1 = nn.LSTM(dim_input1 , dim_hidden1, batch_first = True)
        self.lstm2 = nn.LSTM(dim_input2 , dim_hidden2 , batch_first = True)
        self.embed_word = nn.Linear(dim_hidden2 ,len(wordtoindex))
        self.attn = nn.Linear(dim_hidden1  , dim_hidden2)
        # Not used by any method of this class; kept so the attribute set is unchanged.
        self.tanh = nn.Tanh()
        self.context_inp = nn.Linear(dim_hidden1 , dim_hidden1)

    def calculate_attention(self, prev_out , encoder_out):
        """Return soft-max attention weights of ``prev_out`` over ``encoder_out``.

        ``encoder_out`` (batch, frames, dim_hidden1) is projected by ``attn``
        and multiplied with ``prev_out`` (batch, 1, dim_hidden2); the result
        (batch, 1, frames) is normalised over the frame axis.
        """
        projected = self.attn(encoder_out).transpose(1 , 2)
        return F.softmax(torch.bmm(prev_out , projected) , dim = 2)

    def _decode(self , encoder_out , emb , decoder_state , teacher):
        """Shared decoder loop for the teacher-forced and the greedy mode.

        Runs ``max_caption_length - 1`` steps and returns the ``lstm2``
        outputs stacked to (batch, steps, dim_hidden2). With ``teacher`` true
        step ``i`` consumes ``emb[:, i]``; otherwise ``emb`` is the (batch, 1,
        embedding_dim) start embedding and every later step consumes the
        embedding of the previous step's arg-max word.
        """
        steps = max_caption_length - 1
        if teacher and emb.size(1) < steps:
            # Otherwise the slice below is empty and torch.cat fails with an
            # opaque size-mismatch error.
            raise ValueError(
                "teacher forcing needs at least %d feed tokens, got %d"
                % (steps , emb.size(1))
            )
        batch_size = encoder_out.size(0)
        # new_zeros: follow the dtype/device of the activations instead of a
        # hard-wired global device / float64.
        prev_out = encoder_out.new_zeros(batch_size , 1 , dim_hidden2)
        out22 = encoder_out.new_zeros(batch_size , steps , dim_hidden2)
        word_emb = emb
        for i in range(steps):
            if teacher:
                word_emb = emb[: , i:i+1 , :]
            attention_energy = self.calculate_attention(prev_out , encoder_out)
            context = self.context_inp(torch.bmm(attention_energy , encoder_out))
            lstm2_inp2 = torch.cat((word_emb , context) , dim = 2)
            decoder_out , decoder_state = self.lstm2(lstm2_inp2 , decoder_state)
            out22[: , i:i+1 , :] = decoder_out
            prev_out = decoder_out
            if not teacher:
                # Choosing the next input word is not differentiated.
                with torch.no_grad():
                    predicted = torch.argmax(self.embed_word(decoder_out) , dim = 2)
                    word_emb = self.embedding(predicted)
        return out22

    def decoder_teacher(self , encoder_out , emb , decoder_state):
        """Decode with teacher forcing; ``emb`` holds one embedding per step.

        Raises ``ValueError`` when ``emb`` has fewer than
        ``max_caption_length - 1`` steps.
        """
        return self._decode(encoder_out , emb , decoder_state , True)

    def decoder_non_teacher(self , encoder_out , emb , decoder_state):
        """Decode greedily starting from the single start embedding ``emb``."""
        return self._decode(encoder_out , emb , decoder_state , False)

    def forward(self  , video , objects , feed , teacher = True):
        """Run the captioner.

        Parameters
        ----------
        video : tensor (batch, frames, dim_image)
            Frame features; projected by ``img_encode``.
        objects : tensor (batch, frames, k)
            Concatenated with the ``lstm1`` outputs along the last axis, so
            ``k + dim_hidden1`` must equal ``dim_input2``.
        feed : integer tensor
            Word indices. With ``teacher`` true: (batch, n) with
            ``n >= max_caption_length - 1`` (only the first
            ``max_caption_length - 1`` columns are used). Otherwise:
            (batch, 1), the start token only.
        teacher : bool
            Selects ``decoder_teacher`` (true) or ``decoder_non_teacher``.

        Returns
        -------
        (soft, meaning_out)
            ``soft``: vocabulary scores flattened to
            (batch * steps, len(wordtoindex)). ``meaning_out``: soft-max of
            the scores multiplied with the embedding matrix,
            (batch, steps, embedding_dim).
        """
        emb = self.embedding(feed)
        video  = self.img_encode(video)
        # encode
        out11 , _ = self.lstm1(video)
        _ , decoder_state = self.lstm2(torch.cat((objects , out11) , dim = 2))
        encoder_out = out11
        # decode
        if teacher:
            out22 = self.decoder_teacher(encoder_out , emb , decoder_state)
        else:
            out22 = self.decoder_non_teacher(encoder_out , emb , decoder_state)

        soft  = self.embed_word(out22.contiguous())
        # Use the module's own embedding weight (the one that is moved, cast
        # and restored from checkpoints with the model), not the global tensor.
        meaning_out = torch.matmul(F.softmax(soft , dim = 2) , self.embedding.weight)
        return soft.view(-1, len(wordtoindex)) , meaning_out

In [0]:
# Alternative model without attention, left disabled:
#video_model = Video_captioner().to(device).double().eval()
# Model used for caption generation: moved to `device`, cast to float64 and
# put in eval mode. Its weights are loaded per checkpoint inside generate().
video_model = Attention_model().to(device).double().eval()


In [0]:
# Start-of-sentence index as a (1, 1) tensor on `device`; passed as the
# `feed` argument when decoding without teacher forcing.
bos = torch.tensor(wordtoindex["<bos>"]).to(device).view(1 , 1)


In [0]:
# In-memory caches of the per-video feature arrays, keyed by file path, so
# that generate() does not re-read the .npy files for every checkpoint.
video_dict , yolo_dict = {} , {}

for (vgg_file , yolo_file) , _rows in df:
    video_dict[vgg_file] = torch.tensor(np.load(vgg_file , allow_pickle = False))
    yolo_dict[yolo_file] = torch.tensor(np.load(yolo_file , allow_pickle = False))


In [0]:

def generate(model_num):
    """Caption every video in ``df`` with one checkpoint and save the result.

    Loads ``model<model_num>.pt`` from ``model_save_teach`` into the global
    ``video_model``, decodes each video without teacher forcing starting from
    ``bos``, and turns the arg-max word indices into a sentence that stops
    before the first ``<eos>``. The reference captions of each video are
    normalised with the same character filter used elsewhere in this
    notebook (strip, lower-case).

    The output ``<model_num>.npy`` in ``result_path`` is a 3-element object
    array ``[generated, truth, n_videos]`` where ``generated`` and ``truth``
    are lists of ``{'image_id': int, 'caption': str}`` dicts. It has to be
    read back with ``np.load(..., allow_pickle=True)``.

    Parameters
    ----------
    model_num : int
        Checkpoint number; used in the checkpoint and the result file names.
    """
    checkpoint = os.path.join(model_save_teach , "model" + str(model_num) + ".pt")
    # map_location lets checkpoints saved on a GPU load on a CPU-only host.
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    video_model.load_state_dict(torch.load(checkpoint , map_location = device))
    truth_dict_list = []
    gen_dict_list = []
    img_id = 0
    regex = re.compile('[^a-zA-Z" "]')
    # Pure inference: do not record an autograd graph for the LSTM steps.
    with torch.no_grad():
        for (video_key , yolo_key) , captions in df:
            video = video_dict[video_key].unsqueeze(0).double().to(device)
            yolo = yolo_dict[yolo_key].unsqueeze(0).double().to(device)
            scores = video_model(video , yolo , bos , False)[0]
            word_ids = torch.argmax(scores , dim = 1).to("cpu").tolist()
            sent = []
            for word_id in word_ids:
                w = indextoword[word_id]
                if(w == "<eos>"):
                    break
                sent.append(w)
            gen_dict_list.append({'image_id': img_id, 'caption': " ".join(sent)})
            for j in captions.Description.values:
                j = regex.sub("" , j).strip().lower()
                truth_dict_list.append({'image_id': img_id, 'caption': j})
            img_id = img_id + 1

    # The three entries have different lengths, so the container must be an
    # explicit object array; recent NumPy refuses to build one implicitly
    # from a ragged nested list.
    results = np.empty(3 , dtype = object)
    results[0] = gen_dict_list
    results[1] = truth_dict_list
    results[2] = img_id
    os.makedirs(result_path , exist_ok = True)
    np.save(os.path.join(result_path , str(model_num) + ".npy") , results)
        


In [0]:
# Generate captions for checkpoints 10, 20, ..., 410, printing each
# checkpoint number before it is processed. The output recorded below ends
# at 260, i.e. the saved run did not print the rest of the range (the reason
# is not visible in this notebook).
for i in range(10,420,10):
    print(i)
    generate(i)

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
