## Environment

In [None]:
# Mount Google Drive at /content/drive; every path used in this notebook lives
# under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Optional: change into your own working directory first, e.g. `%cd /your/working/dir`.
# The following cell expects the clone at /content/drive/MyDrive/src/AIC2023/InternVideo.
# NOTE(review): git refuses to clone into an existing non-empty directory, so
# re-running this cell after the first clone reports an error.
!git clone https://github.com/OpenGVLab/InternVideo.git

In [None]:
# Work from the repository root: the relative paths used in the following cells
# (e.g. requirements.txt) are resolved against this directory.
%cd /content/drive/MyDrive/src/AIC2023/InternVideo

/content/drive/MyDrive/src/AIC2023/InternVideo


In [None]:
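# Download the pretrained checkpoint into the folder shown below (uncomment to run).
# `-P` sets the download directory; `-d` would only turn on wget's debug output.
# !wget -P ./Pretrain/Multi-Modalities-Pretraining/models https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/internvideo/pretrain/InternVideo-MM-L-14.ckpt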


In [None]:
!pip install -r requirements.txt



## Data preparation

### File list

In [None]:
import glob
import json
import os

In [None]:
class File4Faiss:
    """Write the id -> video-path lookup file that accompanies the FAISS index.

    Args:
        root_database: Directory that directly contains the video files.
    """

    def __init__(self, root_database: str):
        self.root_database = root_database

    def write_json_file(self, json_out_path, extension: str = "avi"):
        """Write ``videos_id_path.json`` mapping sequential ids to video paths.

        The videos found directly inside ``root_database`` are sorted by path
        and numbered from 0. JSON object keys are strings, so the ids are
        stored as "0", "1", ...

        Args:
            json_out_path: Directory that receives ``videos_id_path.json``.
                It is created (including parents) when it does not exist.
            extension: File extension, without the dot, of the videos to list.
                Defaults to "avi".
        """
        # Without this, open() below raises FileNotFoundError for a missing directory.
        os.makedirs(json_out_path, exist_ok=True)
        des_path = os.path.join(json_out_path, "videos_id_path.json")

        # Sorting makes the id assignment deterministic across runs.
        videos_path = sorted(glob.glob(f'{self.root_database}/*.{extension}'))
        dict_videos_id_path = dict(enumerate(videos_path))

        with open(des_path, 'w') as f:
            json.dump(dict_videos_id_path, f)
        print("Save videos id-path dict.")


In [None]:
# Point the helper at the directory that holds the MSVD video clips.
file_creator = File4Faiss('/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos')

In [None]:
# Write videos_id_path.json (id -> video path) into this directory.
file_creator.write_json_file('/content/drive/MyDrive/src/AIC2023/InternVideo/Pretrain/Multi-Modalities-Pretraining/dict')

Save videos id-path dict.


In [None]:
# Sanity check: print the first 100000 bytes of the generated videos_id_path.json.
!head --bytes 100000 /content/drive/MyDrive/src/AIC2023/InternVideo/Pretrain/Multi-Modalities-Pretraining/dict/videos_id_path.json

{"0": "/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos/-4wsuPCjDBc_5_15.avi", "1": "/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos/-7KMZQEsJW4_205_208.avi", "2": "/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos/-8y1Q0rA3n8_108_115.avi", "3": "/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos/-8y1Q0rA3n8_95_102.avi", "4": "/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos/-9CUm-2cui8_39_44.avi", "5": "/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos/-AwoiGR6c8M_10_14.avi", "6": "/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos/-Cv5LsqKUXc_17_25.avi", "7": "/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos/-Cv5LsqKUXc_71_76.avi", "8": "/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos/-DKuLXYoY3g_14_20.avi", "9": "/content/drive/MyDrive/src/AIC2023/CLIP4Clip/msvd_data/MSVD_Videos/-DRy7rBg0IQ_31_37.avi", "10": "/content/drive/MyD

### Save extracted features

In [None]:
import torch
import InternVideo
import faiss
import faiss.contrib.torch_utils

In [None]:
# Destination file for the serialized FAISS index.
bin_path = '/content/drive/MyDrive/src/AIC2023/InternVideo/Pretrain/Multi-Modalities-Pretraining/dict/faiss_cosine.bin'

# Same glob pattern and ordering as File4Faiss.write_json_file, so position i in
# this list matches key i in videos_id_path.json.
videos = glob.glob(f'{file_creator.root_database}/*.avi')
videos.sort()

# NOTE(review): this relative checkpoint path is resolved against the current
# working directory, while the download cell stores the checkpoint under
# ./Pretrain/Multi-Modalities-Pretraining/models - confirm the cwd when running.
model = InternVideo.load_model("./models/InternVideo-MM-L-14.ckpt")
model = model.cuda()

In [None]:
# GPU resources handle shared by the GPU index objects created below.
res = faiss.StandardGpuResources()
# Dimensionality of the vectors stored in the index.
# NOTE(review): presumably the embedding size of InternVideo-MM-L-14 - confirm against the model.
d = 768
# Flat (exhaustive) inner-product index living on the GPU.
index = faiss.GpuIndexFlatIP(res, d)

In [None]:
# Encode the demo subset of videos, add each embedding to the GPU index, then
# copy the index to the CPU and persist it at `bin_path`.
with torch.no_grad():
    # Only get 1000 vids for demo
    for video_path in videos[:1000]:
        clip = InternVideo.load_video(video_path).cuda()
        # unsqueeze(0) adds a leading axis before the clip is passed to the encoder
        clip_embedding = model.encode_video(clip.unsqueeze(0))
        # Normalize along dim 1 so the inner product used by the index acts as
        # cosine similarity
        video_features = torch.nn.functional.normalize(clip_embedding, dim=1)

        index.add(video_features)
        print('feature added to index')

    # `index` is rebound to a CPU copy here; later cells rely on that.
    index = faiss.index_gpu_to_cpu(index)
    print('writing')
    faiss.write_index(index, bin_path)
    print('written')

In [None]:
# Number of feature vectors stored in the index (one is added per encoded video).
index.ntotal

1000

In [None]:
# Free-text query to retrieve videos for.
text_cand = 'a man in black shirt and blue cap eating a spoon of something'

# Tokenize and embed the query, then normalize it the same way as the video
# features (along dim 1) so both sides are compared on the same scale.
with torch.no_grad():
    text = InternVideo.tokenize(text_cand).cuda()
    query_embedding = model.encode_text(text)
    text_features = torch.nn.functional.normalize(query_embedding, dim=1)

In [None]:
# Copy the CPU index onto GPU device 0 for searching.
# NOTE(review): `index` is rebound from the CPU index to a GPU index here, so
# this cell is presumably not safe to run twice in a row - confirm.
index = faiss.index_cpu_to_gpu(res, 0, index)

In [None]:
# The import was previously commented out, which made `time.time()` below raise
# NameError on a fresh kernel.
import time

# Retrieve the 9 best-scoring entries for the text query and time the search.
start = time.time()
scores, idx_image = index.search(text_features, k=9)
end = time.time()

# Collapse the returned ids to a 1-D sequence.
idx_image = idx_image.flatten()

print(f"Time: {end - start}")

Time: 0.0015559196472167969


In [None]:
# Ids of the retrieved entries, as returned by index.search.
# NOTE(review): presumably these are insertion positions, i.e. indices into
# `videos` / keys of videos_id_path.json - confirm.
idx_image

tensor([ 80,  79, 391, 476, 123, 344, 214, 807, 186], device='cuda:0')

## Run Demo

In [None]:
# Switch to the Multi-Modalities-Pretraining directory before launching the demo script.
%cd /content/drive/MyDrive/src/AIC2023/InternVideo/Pretrain/Multi-Modalities-Pretraining

/content/drive/MyDrive/src/AIC2023/InternVideo/Pretrain/Multi-Modalities-Pretraining


In [None]:
# Run the repository's demo script; the output below lists a probability for each candidate caption.
!python /content/drive/MyDrive/src/AIC2023/InternVideo/Pretrain/Multi-Modalities-Pretraining/demo.py

  x.storage().data_ptr() + x.storage_offset() * 4)
features written
Label probs: 
an airplane is taking off     : 0.9562
an airplane is flying         : 0.0438
a dog is chasing a ball       : 0.0000
