In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function

import os
from torch.utils.data import Dataset
import numpy as np
import pickle
from dataloaders.rawvideo_util import RawVideoExtractor

ModuleNotFoundError: No module named 'dataloaders'

In [2]:
class MSVD_DataLoader(Dataset):
    """MSVD dataset loader."""
    def __init__(self, subset, data_path,features_path,tokenizer,max_words=30,feature_framerate=1.0,max_frames=100,image_resolution=224,frame_order=0,slice_framepos=0,):
        self.data_path = data_path
        self.features_path = features_path,
        self.feature_framerate = feature_framerate 
        self.max_words = max_words 
        self.max_frames = max_frames 
        self.tokenizer = tokenizer 

        # 0 : 일반적인 순서 1 : 역순서 2 무작위 순서 
        self.frame_order = frame_order 
        assert self.frame_order in [0,1,2]

        # 0 : cut from head freams; 1 : cut from tail frames, 2 : extract frames uniformly.
        self.slice_framepos = slice_framepos 
        assert self.slice_framepos in[0,1,2]

        self.subset = subset 
        assert self.subset in ["train","val","test"]

        #오 이게 좀 중요해보이는구나.
        video_id_path_dict={}

        video_id_path_dict["train"]= os.path.join(self.data_path,"train_list.txt")
        video_id_path_dict["val"] = os.path.join(self.data_path,"val_list.txt")
        video_id_path_dict["test"] = os.path.join(self.data_path,"test_list.txt")

        caption_file = os.path.join(self.data_path,"raw-captions.pkl")

        #self.subset은 train val test 중 하나임
        #txt파일을 하나씩 읽어와서 한줄로 나열한다. 좌우 공백을 전부 없앤다. 
        with open(video_id_path_dict[self.subset],'r') as fp:
            video_ids=[itm.strip() for itm in fp.readlines()]
            print("current video_ids ",video_ids)

        with open(caption_file,'rb') as f:
            captions = pickle.load(f)
            print("current captions ",captions)

        video_dict={}
        #features_path 는 모든 비디오의 내용이 들어가 있음. 
        with open(self.features_path,'r') as features:
            features = os.path.join(self.features_path,"all_videos")
        
        print("features!! :",features)

        for root,dub_dir,video_files in os.walk(self.features_path):
            for video_file in video_files:
                video_id_ = ".".join(video_file.split(".")[:-1])
                if video_id_ not in video_ids:
                    continue #걍 안전코딩 
            print("done!",video_id_)
            file_path_ = os.path.join(root,video_file)
            video_dict[video_id_]=file_path_
        self.video_dict = video_dict 
        print('done video_dict :',self.video_dict)

        self.sample_len=0
        self.sentences_dict={}
        self.cut_off_points=[]
        for video_id in video_ids:
            assert video_id in captions 
            for cap in captions[video_id]:
                cap_txt = " ".join(cap)
                self.sentences_dict[len(self.sentences_dict)]=(video_id,cap_txt)
            self.cut_off_points.append(len(self.sentences_dict))

        #self.sentences_dict는 각 video_id에 대하여 모든 caption을 일렬로 이어붙인 텍스트를 value로 갖는 딕셔너리
        
        self.multi_sentence_per_video = True # !!! important tag for eval
        
        if self.subset == "val" or self.subset == "test":
            self.sentence_num = len(self.sentences_dict)
            self.video_num = len(video_ids)

            assert len(self.cut_off_points) == self.video_num 
            print("For {}, sentence number : {}".format(self.subset,self.sentence_num))
            print("For {}, video number : {}".format(self.subset,self.video_num))
        
        print("Video number : {}".format(len(self.video_dict)))
        print("Total Paire : {}".format(len(self.sentences_dict)))

        self.sample_len = len(self.sentences_dict)

        #RawVideoExtractor는 말그대로 정제를 하는 역할을 한다.
        self.rawVideoExtractor = RawVideoExtractor(framerate=feature_framerate,size=image_resolution)
        self.SPECIAL_TOKEN = {"CLS_TOKEN": "<|startoftext|>", "SEP_TOKEN": "<|endoftext|>",
                              "MASK_TOKEN": "[MASK]", "UNK_TOKEN": "[UNK]", "PAD_TOKEN": "[PAD]"}
    
    def __len__(self):
        return self.sample_len 
    
    #BERT에 들어갈 수 있도록 조작함.
    def _get_text(self,video_id,caption):
        k=1 
        choice_video_ids = [video_id]
        #전체 길이만큼 쭉 zeros를 만든다.
        pairs_text = np.zeros((k,self.max_words), dtype=np.long)
        pairs_mask = np.zeros((k,self.max_words),dtype=np.long)
        pairs_segment = np.zeros((k,self.max_words),dtype=np.long)

        for i, video_id in enumerate(choice_video_ids):
            #토크나이즈를 하고난 뒤
            words = self.tokenizer.tokenize(caption)
            #CLS 토큰을 앞에다가 붙인다.
            words = [self.SPECIAL_TOKEN["CLS_TOKEN"]] + words

            total_length_with_CLS = self.max_words -1 
            if len(words) > total_length_with_CLS:
                words = words[:total_length_with_CLS]
            #그런 다음 뒤에 SEP 토큰을 붙인다. 문장 사이의 구분을 위해서.
            words = words + [self.SPECIAL_TOKEN["SEP_TOKEN"]] 

            input_ids = self.tokenizer.convert_tokens_to_ids(words)
            input_mask = [1] * len(input_ids)
            segment_ids = [0] * len(input_ids)
            while len(input_ids) < self.max_words:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
            
            assert len(input_ids) == self.max_words
            assert len(input_mask) == self.max_words
            assert len(segment_ids) == self.max_words 

            pairs_text[i]=np.array(input_ids)
            pairs_mask[i] = np.array(input_mask)
            pairs_segment[i] = np.array(segment_ids)
        
        return pairs_text, pairs_mask, pairs_segment, choice_video_ids 

    
    def _get_rawvideo(self,choice_video_ids):
        video_mask = np.zeros((len(choice_video_ids),self.max_frames),dtype=np.long)
        print("before video_mask :",video_mask)
        max_video_length = [0] * len(choice_video_ids)

        # Pair x L x T x 3 x H x W 
        video = np.zeros((len(choice_video_ids),self.max_frames,1,3,self.rawVideoExtractor.size,self.rawVideoExtractor.size),dtype = np.float)
        print("raw video shape :",video.shape)
        print("len(choice_video_ids) :",len(choice_video_ids))
        print("self.max_frames :",self.max_frames)

        for i,video_id in enumerate(choice_video_ids):
            #video_dict는 video_id가 key이고, 해당 video의 경로가 value인 객체입니다.
            video_path = self.video_dict[video_id]

            #raw_video_data는 정제가 되어서 tensor로 바뀌어버린 것임 video -> tensor
            raw_video_data = self.rawVideoExtractor.get_video_data(video_path)
            print("raw_video_data ",raw_video_data)
            print("raw_video_data shape :",raw_video_data.shape)
            """
                def get_video_data(self, video_path, start_time=None, end_time=None):
                    image_input = self.video_to_tensor(video_path, self.transform, sample_fp=self.framerate, start_time=start_time, end_time=end_time)
                    return image_input
            """
            


            raw_video_data = raw_video_data['video']

            if len(raw_video_data.shape) > 3:
                raw_video_data_clip = raw_video_data 

            # L x T x 3 x H x W 
            # T : max_frames 
            # if self.max_frames < raw_video_slice.shape[0]:
            # 조건문으로 인해서 L이 max_frame이 됩니다.
                raw_video_slice = self.rawVideoExtractor.process_raw_data(raw_video_data_clip)
                tensor_size = raw_video_data_clip.size()
                tensor = raw_video_data_clip.view(-1,1,tensor_size[-3],tensor_size[-2],tensor_size[-1])
                raw_video_slice_ = tensor 
                
                if self.max_frames < raw_video_slice.shape[0]:
                    if self.slice_framepos == 0:
                        video_slice = raw_video_slice[:self.max_frames, ...]
                    elif self.slice_framepos == 1:
                        video_slice = raw_video_slice[-self.max_frames:, ...]
                    else:
                        sample_indx = np.linspace(0, raw_video_slice.shape[0] - 1, num=self.max_frames, dtype=int)
                        video_slice = raw_video_slice[sample_indx, ...]
                else:
                    video_slice = raw_video_slice

                video_slice = self.rawVideoExtractor.process_frame_order(video_slice, frame_order=self.frame_order)

                slice_len = video_slice.shape[0]
                max_video_length[i] = max_video_length[i] if max_video_length[i] > slice_len else slice_len
                if slice_len < 1:
                    pass
                else:
                    video[i][:slice_len, ...] = video_slice
            else:
                print("video path: {} error. video id: {}".format(video_path, video_id))

        for i, v_length in enumerate(max_video_length):
            video_mask[i][:v_length] = [1] * v_length

        return video, video_mask

    def __getitem__(self,idx):
        #키와 밸류를 각각 가져옵니다.
        video_id, caption = self.sentences_dict[idx]
        print("extracted video id from sentences_dict :",video_id)
        print("extracted caption in one line from sentences_dict :",caption)

        #각각 get_text와 get_rawvideo를 통해 산출
        pairs_text,pairs_mask,pairs_segment,choice_video_ids = self._get_text(video_id,caption)
        video,video_mask = self._get_rawvideo(choice_video_ids)
        
        return pairs_text,pairs_mask,pairs_segment, video,video_mask


In [3]:
def process_raw_data(self, raw_video_data):
    tensor_size = raw_video_data.size()
    #뭐야 그대로 반환하는데? 
    tensor = raw_video_data.view(-1,1,tensor_size[-3],tensor_size[-2],tensor_size[-1])
    return tensor

In [4]:
import torch
from torch.utils.data import DataLoader
from dataloaders.dataloader_msrvtt_retrieval import MSRVTT_DataLoader
from dataloaders.dataloader_msrvtt_retrieval import MSRVTT_TrainDataLoader
from dataloaders.dataloader_msvd_retrieval import MSVD_DataLoader
from dataloaders.dataloader_lsmdc_retrieval import LSMDC_DataLoader
from dataloaders.dataloader_activitynet_retrieval import ActivityNet_DataLoader
from dataloaders.dataloader_didemo_retrieval import DiDeMo_DataLoader
from modules.tokenization_clip import SimpleTokenizer as ClipTokenizer
from dataloaders.data_dataloaders import DATALOADER_DICT

ModuleNotFoundError: No module named 'dataloaders'

In [5]:
import easydict 
DATA_PATH = "/home/key2317/CLIP4Clip/msvd_data"
FEATURES_PATH = "/home/key2317/CLIP4Clip/msvd_data/MSVD_Videos"
args = easydict.EasyDict({
    "data_path":DATA_PATH,
    "features_path":FEATURES_PATH,
    "max_words":30,
    "feature_framerate":1,
    "max_frames":100,
    "image_resolution":224,
    "frame_order":0,
    "slice_framepos":0,
    "train_frame_order":0, #default 0, choice = [0,1,2]
    "batch_size":256,
    "n_gpu":torch.cuda.device_count(), #default :1
    "num_thread_reader":1,
    "datatype":"msvd",
    "eval_frame_order":0, #choices = [0, 1, 2]
    "batch_size_val":3500,
})

print(args.__dict__)
tokenizer = ClipTokenizer()
#train_dataloader, train_length, train_sampler = DATALOADER_DICT[args.datatype]["train"](args, tokenizer)

ModuleNotFoundError: No module named 'easydict'

In [6]:
def dataloader_msvd_train(args,tokenizer):
    msvd_dataset=MSVD_DataLoader(
        subset = "train",
        data_path = args.data_path,
        features_path = args.features_path,
        max_words = args.max_words,
        feature_framerate=args.feature_framerate,
        tokenizer = tokenizer,
        max_frames=args.max_frames,
        frame_order = args.train_frame_order, 
        slice_framepos = args.slice_framepos
    )

    #train_sampler = torch.utils.data.distributed.DistributedSampler(msvd_dataset)
    train_sampler = 0
    dataloader = DataLoader(
        msvd_dataset,
        batch_size = args.batch_size // args.n_gpu, 
        num_workers = args.num_thread_reader,
        pin_memory=False, 
        shuffle = (train_sampler is None), 
        sampler = train_sampler, 
        drop_last=True,

        
    )

    return msvd_dataset, dataloader, len(msvd_dataset),train_sampler

In [7]:
def dataloader_msvd_test(args, tokenizer, subset="test"):
    msvd_testset = MSVD_DataLoader(
        subset=subset,
        data_path=args.data_path,
        features_path=args.features_path,
        max_words=args.max_words,
        feature_framerate=args.feature_framerate,
        tokenizer=tokenizer,
        max_frames=args.max_frames,
        frame_order=args.eval_frame_order,
        slice_framepos=args.slice_framepos,
    )
    dataloader_msvd = DataLoader(
        msvd_testset,
        batch_size=args.batch_size_val,
        num_workers=args.num_thread_reader,
        shuffle=False,
        drop_last=False,
    )
    return msvd_testset, dataloader_msvd, len(msvd_testset)

In [8]:
msvd_dataset,train_dataloader,len_of_msvdtrain,train_sampler = dataloader_msvd_train(args,tokenizer)
msvd_testset, test_dataloader, len_of_msvdtest = dataloader_msvd_test(args,tokenizer,subset="test")

NameError: name 'args' is not defined

In [9]:
print("DATA PATH :",DATA_PATH)
print("FEATURE PATH :",FEATURES_PATH)

NameError: name 'DATA_PATH' is not defined

In [10]:
video_id_path_dict = {}
video_id_path_dict["train"] = os.path.join(DATA_PATH, "train_list.txt")
video_id_path_dict["val"] = os.path.join(DATA_PATH, "val_list.txt")
video_id_path_dict["test"] = os.path.join(DATA_PATH, "test_list.txt")
caption_file = os.path.join(DATA_PATH, "raw-captions.pkl")

NameError: name 'DATA_PATH' is not defined

In [11]:
with open(caption_file,'rb') as f:
    captions = pickle.load(f)

with open(video_id_path_dict["train"], 'r') as fp:
    video_ids = [itm.strip() for itm in fp.readlines()]

NameError: name 'caption_file' is not defined

In [12]:
video_dict = {} 
for root, dub_dir,video_files in os.walk(args.features_path):
    for video_file in video_files:
        video_id_ = ".".join(video_file.split(".")[:-1])
        if video_id_ not in video_ids:
            continue
        file_path_ = os.path.join(root,video_file)
        video_dict[video_id_] = file_path_ 

#print(video_dict)

NameError: name 'args' is not defined

In [13]:
sentences_dict = {} 
cut_off_points=[] 
for video_id in video_ids:
    assert video_id in captions 
    for cap in captions[video_id]:
        cap_txt = " ".join(cap)
        sentences_dict[len(sentences_dict)] = (video_id,cap_txt)
    cut_off_points.append(len(sentences_dict))
#print(sentences_dict)
#print(cut_off_points)

NameError: name 'video_ids' is not defined

In [14]:
len(sentences_dict) #total paire에 해당

0

In [15]:
idx = 1 
video_id,caption = sentences_dict[idx]
pairs_text, pairs_mask, pairs_segment, choice_video_ids  = msvd_dataset._get_text(video_id,caption)

print(pairs_text.shape)
print(pairs_mask.shape)
print(pairs_segment.shape)
print(choice_video_ids)


KeyError: 1

In [None]:
for idx in range(len(sentences_dict)):
    video_id,caption = sentences_dict[idx]
    pairs_text, pairs_mask, pairs_segment, choice_video_ids  = msvd_dataset._get_text(video_id,caption)
    print("pairs_text shape : {} pairs_mask shape : {} pair_segment shape : {} choice_video_ids : {}".format(pairs_text.shape,pairs_mask.shape,pairs_segment.shape,choice_video_ids))
    video, video_mask = msvd_dataset._get_rawvideo(choice_video_ids)
    print("video shape : {} video_mask shape: {}".format(video.shape,video_mask.shape))
    