In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The paths used further down this notebook all live under /content/drive/MyDrive.
from google.colab import drive
# force_remount=True requests a fresh mount even if /content/drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%%capture
!pip install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

In [3]:
import glob
import re
import pandas as pd
import subprocess
import torch
import clip
from PIL import Image
import os
import time
from itertools import islice

In [4]:
def atoi(text):
    """Convert a purely numeric string to ``int``; return anything else unchanged.

    Args:
        text: A string fragment, typically one piece of a split file name.

    Returns:
        ``int(text)`` when every character is a decimal digit, otherwise
        ``text`` itself (including the empty string).
    """
    # isdecimal() matches exactly the characters int() can parse. isdigit() is
    # broader: it is True for characters such as superscript '²', for which
    # int() raises ValueError.
    if text.isdecimal():
        return int(text)
    return text

def natural_keys(text):
    """Return a sort key that orders embedded numbers by value, not by character.

    The string is split on runs of digits (the runs themselves are kept), and
    every piece is passed through ``atoi`` so digit runs become integers.
    Intended for use as ``key=`` in ``list.sort`` / ``sorted``.
    """
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))

def extract_frames(dest_folder_path, input_video, fps=0.5):
    """Sample frames from a video with ffmpeg and return the PNG paths in natural order.

    Args:
        dest_folder_path: Directory that receives the frames; created if missing.
        input_video: Path of the video file handed to ffmpeg.
        fps: Value for ffmpeg's ``fps`` video filter. The default of 0.5 is
            the rate that was previously hard-coded.

    Returns:
        Every ``*.png`` path found in ``dest_folder_path`` after ffmpeg has
        finished, sorted with ``natural_keys``. PNG files that were already in
        the folder before the call are included as well.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    os.makedirs(dest_folder_path, exist_ok=True)

    # Pass the arguments as a list (no shell) so paths containing spaces or
    # shell metacharacters reach ffmpeg intact and cannot inject commands.
    command = [
        "ffmpeg",
        "-i", input_video,
        "-pix_fmt", "rgb24",
        "-vf", "fps=" + str(fps),
        dest_folder_path + "/img_%06d.png",
    ]
    # run() blocks until ffmpeg exits, so no sleep is needed before listing
    # the output folder. stdout is captured (and discarded) as before.
    result = subprocess.run(command, stdout=subprocess.PIPE)
    if result.returncode != 0:
        # Keep going (best effort) but make the failure visible.
        print("ffmpeg exited with code %d for %s" % (result.returncode, input_video))

    # Escape the folder so characters like '[' in its name are matched
    # literally instead of being treated as glob syntax.
    frames = glob.glob(glob.escape(dest_folder_path) + '/*.png')
    frames.sort(key=natural_keys)
    return frames

In [5]:
# Text prompts scored against every extracted frame by CLIP.
# Each entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python, which previously merged 'people falling' with the
# prompt on the next line into a single bogus label.
classes = ['a person is talking to someone',
           'aircraft', 'alcohol', 'animals',
           'campfire', 'celebration', 'changing cloths', 'chopping meat and vegetables', 'cleaning',
           'people climbing', 'cooking', 'dancing', 'discrimination',
           'doing experiment in a lab', 'drugs', 'eating and dining', 'exercise',
           'fixing instruments', 'fixing vehicles', 'people talking', 'having sex',
           'horror', 'hugging baby', 'in the park',
           'in the zoo', 'jogging and running', 'kids playing', 'people kissing', 'listening', 'nature',
           'nudity', 'operation is going on in an operation theater', 'driving vehicle',
           'people gathering', 'playing games in a casino', 'playing music instrument',
           'playing sport', 'prayer', 'reading or writing', 'riding a bike',
           'riding an animal', 'saluting', 'sexuality', 'school and college',
           'ship', 'shopping', 'singing', 'smiling and laugh',
           'smoking', 'stunt', 'swimming', 'taking bath', 'people falling',
           'two persons are talking to third person who is not in the picture',
           'vehicle racing', 'violence', 'watching', 'people working']

In [10]:
# Classify sampled frames of every test video with CLIP and write one CSV per video.
#
# For each video: extract frames, score every frame against the `classes`
# prompts, zero out scores below the per-class threshold, and store the five
# best (label, probability) pairs per frame. The CSV index is 1-based and
# follows the naturally sorted frame order.

# --- Configuration ---
VIDEO_GLOB = '/content/drive/MyDrive/clip_test_video/*'
FRAMES_ROOT = '/content/drive/MyDrive/frames/'
OUTPUT_DIR = '/content/drive/MyDrive/TestData/'
DEFAULT_THRESHOLD = 0.20                 # minimum probability for a label to count
CLASS_THRESHOLDS = {'sexuality': 0.40}   # labels that need a stricter minimum

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
text = clip.tokenize(classes).to(device)
df_columns = ['file_name', 'm1', 'm2', 'm3', 'm4', 'm5']
TOP_K = len(df_columns) - 1              # one column per retained label

# to_csv does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for video in sorted(glob.glob(VIDEO_GLOB)):
    start_time = time.time()
    print(video)
    video_file = os.path.basename(video)
    # splitext strips only the extension, so 'a.part1.mp4' and 'a.part2.mp4'
    # no longer collapse onto the same frames folder and CSV name.
    video_stem = os.path.splitext(video_file)[0]
    frames = extract_frames(FRAMES_ROOT + video_stem, video)

    rows = []
    for image_name in frames:
        start_time_frame = time.time()
        # Close the file handle as soon as the frame has been preprocessed.
        with Image.open(image_name) as frame_image:
            image = preprocess(frame_image).unsqueeze(0).to(device)
        with torch.no_grad():
            logits_per_image = model(image, text)[0]
            probs = logits_per_image.softmax(dim=-1).cpu().numpy().tolist()[0]
        print("--- %s seconds ---" % (time.time() - start_time_frame))

        # Compare the probability (not the class index) against the class's
        # own threshold; anything below it is recorded as 0.0.
        scores = {}
        for label, prob in zip(classes, probs):
            threshold = CLASS_THRESHOLDS.get(label, DEFAULT_THRESHOLD)
            scores[label] = prob if prob >= threshold else 0.0

        # sorted() is stable, so ties keep the order of `classes`.
        top_matches = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:TOP_K]
        rows.append(dict(zip(df_columns, [video_file] + top_matches)))

    # Build the frame once instead of appending row by row.
    df = pd.DataFrame(rows, columns=df_columns)
    df.index += 1
    df.to_csv(OUTPUT_DIR + video_stem + '.csv')
    print("--- %s seconds ---" % (time.time() - start_time))


/content/drive/MyDrive/clip_test_video/3.mp4
--- 0.8550646305084229 seconds ---
--- 0.687201976776123 seconds ---
--- 0.6926555633544922 seconds ---
--- 0.6845121383666992 seconds ---
--- 0.6835830211639404 seconds ---
--- 0.6797914505004883 seconds ---
--- 0.6811280250549316 seconds ---
--- 0.6656239032745361 seconds ---
--- 0.6841657161712646 seconds ---
--- 0.6875026226043701 seconds ---
--- 0.6812524795532227 seconds ---
--- 0.6665093898773193 seconds ---
--- 0.6802287101745605 seconds ---
--- 0.6802592277526855 seconds ---
--- 0.6790826320648193 seconds ---
--- 0.6804869174957275 seconds ---
--- 0.6784002780914307 seconds ---
--- 0.6793267726898193 seconds ---
--- 0.6807963848114014 seconds ---
--- 0.6805281639099121 seconds ---
--- 0.6805214881896973 seconds ---
--- 0.6740372180938721 seconds ---
--- 0.6757843494415283 seconds ---
--- 0.6862413883209229 seconds ---
--- 0.6861660480499268 seconds ---
--- 0.6853628158569336 seconds ---
--- 0.6739957332611084 seconds ---
--- 0.67752