In [1]:
import torch
import open_clip
import whisper
from pathlib import Path
import cv2
from PIL import Image
from video import cap, delay_ms, video_path, runVideo

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Transcribe the video's audio with Whisper's "base" checkpoint.
# word_timestamps=True additionally requests per-word timing; only the
# per-segment fields are printed in this cell.
model = whisper.load_model("base")
result = model.transcribe(
    str(video_path),
    word_timestamps=True,
)

# One line per transcript segment: start, end, text (space-separated).
for segment in result["segments"]:
    print(*(segment[field] for field in ("start", "end", "text")))



1.4400000000000015 6.58  Hi there, I'm going to show you one of the most popular ways to tie a tie, using the
6.58 14.7  Windsor knot. Also known as a double Windsor, or a full Windsor. Start with your
14.7 18.68  collar up in the tie around your neck. If you right-hand it on the wide end in
18.68 24.92  your right hand and the narrow end in your left hand. Now it's important to remember
25.34 28.88  that the longer you make the wide end below the tie will hang when you finished.
29.68 33.82  Basically you want to line up your tie like this. If you remember this you'll
33.82 35.38  save yourself a lot of time later on.
38.36 43.9  Now cross each end over, wide end over the top, and leave about this much on the narrow end.
44.64 46.98  That's about four inches or ten centimeters.
50.8 56.04  With your left hand take the wide end, thread it through the back of this opening from behind.
56.839999999999996 59.08  Thread it all the way through like this.
66.80000000000001 69.62  Now take th

In [3]:
def ms_to_timestamp(ms):
    """Format a millisecond offset as a zero-padded ``MM:SS`` string.

    Any sub-second remainder is floored away. Minutes are not wrapped into
    hours, so an offset of an hour or more renders as e.g. ``"61:01"``.

    Args:
        ms: Offset in milliseconds (int or float).

    Returns:
        The offset rendered as ``"MM:SS"``.
    """
    whole_seconds = int(ms // 1000)
    mins, secs = divmod(whole_seconds, 60)
    return f"{mins:02d}:{secs:02d}"

In [6]:
# Load CLIP (ViT-B-32, OpenAI weights) together with its image preprocessing
# transform and the matching tokenizer.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='openai')
# open_clip hands the model back in train mode; switch to eval mode so that
# inference does not run with training-only behaviour enabled.
model.eval()
tokenizer = open_clip.get_tokenizer('ViT-B-32')

# Candidate captions that every sampled frame is scored against.
text = ["the person is tying his tie", "water is blue", "fire earth mars", "his shirt is white"]
text_tokens = tokenizer(text)

# Only every FRAME_STRIDE-th frame is scored; the frames in between are skipped.
FRAME_STRIDE = 10

with torch.no_grad():
    # Encode the captions once and scale each embedding to unit L2 norm.
    text_features = model.encode_text(text_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    for i, frame in enumerate(runVideo(wait=False)):
        if i % FRAME_STRIDE != 0:
            continue
        # NOTE(review): assumes delay_ms is the per-frame interval in
        # milliseconds, so delay_ms * i is the frame's offset - confirm in
        # the `video` module.
        print("[", ms_to_timestamp(delay_ms * i), "]: ", sep="", end="")
        # The frame is converted from BGR to RGB channel order before it is
        # wrapped as a PIL image for the CLIP preprocessing transform.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        image_input = preprocess(image).unsqueeze(0)  # add a leading batch dimension
        image_features = model.encode_image(image_input)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        # Both sides are unit-normalised, so this dot product is the cosine
        # similarity between the frame and each caption, in `text` order.
        similarity = (image_features @ text_features.T).squeeze(0)
        print(similarity)

[00:00]: tensor([0.3219, 0.1723, 0.1773, 0.2622])
[00:00]: tensor([0.3239, 0.1800, 0.1805, 0.2666])
[00:00]: tensor([0.3220, 0.1760, 0.1791, 0.2640])
[00:01]: tensor([0.3205, 0.1724, 0.1771, 0.2624])
[00:01]: tensor([0.3255, 0.1795, 0.1833, 0.2673])
[00:02]: tensor([0.3303, 0.1873, 0.1872, 0.2699])
[00:02]: tensor([0.3293, 0.2016, 0.1972, 0.2698])
[00:02]: tensor([0.3313, 0.1963, 0.1914, 0.2690])
[00:03]: tensor([0.3334, 0.2011, 0.1945, 0.2764])
[00:03]: 

KeyboardInterrupt: 