## Install Dependencies

In [None]:
%%capture
! pip install -q pose-format mediapipe

## Download some videos

`North` and `Norway` should be more similar to eachother than `fine` or `formal`, and vice versa.

In [33]:
from google.colab import drive
import shutil
import os

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Create the target directory if it doesn't exist
os.makedirs('videos', exist_ok=True)

# Step 3: List of videos to copy
videos_to_copy = [
    "/content/drive/MyDrive/FinalProject/videos/Hello_short_1.mp4",
    "/content/drive/MyDrive/FinalProject/videos/Hello_short_2.mp4",
    "/content/drive/MyDrive/FinalProject/videos/Thank_you_1.mp4",
    "/content/drive/MyDrive/FinalProject/videos/Thank_you_short_2.mp4"
]

# Step 4: Copy each video to the target directory
for video_path in videos_to_copy:
    shutil.copy(video_path, 'videos')

print("Specified videos have been successfully copied to videos")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Specified videos have been successfully copied to videos


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## Run pose estimation on each video

In [34]:
%%capture
! mkdir -p poses
! video_to_pose -i videos/Hello_short_1.mp4 --format mediapipe -o poses/Hello_1.pose
! video_to_pose -i videos/Hello_short_2.mp4 --format mediapipe -o poses/Hello_2.pose
! video_to_pose -i videos/Thank_you_1.mp4 --format mediapipe -o poses/Thank_you_1.pose
! video_to_pose -i videos/Thank_you_short_2.mp4 --format mediapipe -o poses/Thank_you_2.pose

## Add some utilities to embed poses and text

In [35]:
import requests
import json
import io
import base64
from pose_format import Pose
import numpy as np

def query_embedding(data):
  modality = 'text' if 'text' in data.keys() else 'pose'

  # url = f"http://172.23.63.59:3030/api/embed/{modality}" # UZH internal IP
  url = f"https://pub.cl.uzh.ch/demo/sign_clip/{modality}" # public

  headers = {
    'Content-Type': 'application/json'
  }
  payload = json.dumps(data)

  response = requests.request("GET", url, headers=headers, data=payload)
  result = json.loads(response.text)

  return np.asarray(result['embeddings'])

def embed_text(text, model_name='default'):
  return query_embedding({
      "text": text,
      "model_name": model_name,
  })

def read_pose(pose_path):
  with open(pose_path, "rb") as f:
    buffer = f.read()
    pose = Pose.read(buffer)
  return pose

def embed_pose(pose, model_name='default'):
  poses = pose if type(pose) == list else [pose]
  poses = [pose if isinstance(pose, Pose) else read_pose(pose) for pose in poses]

  pose_data = []

  for pose in poses:
    memory_file = io.BytesIO()
    pose.write(memory_file)
    encoded = base64.b64encode(memory_file.getvalue())
    pose_data.append(encoded.decode('ascii'))

  return query_embedding({
      "pose": pose_data,
      "model_name": model_name,
  })

## Actually load the data nad embed text

In [36]:
names = ["Hello_1", "Hello_2", "Thank_you_1", "Thank_you_2"]
data = {f"<en> <ase> {name}": read_pose(f"poses/{name}.pose") for name in names}

texts = list(data.keys())
poses = list(data.values())

poses_embeddings = np.array([embed_pose(pose)[0] for pose in poses])
texts_embeddings = embed_text(texts)

## Visualize text/pose similarity

In [37]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity_matrix = cosine_similarity(texts_embeddings, poses_embeddings)

df = pd.DataFrame(cosine_similarity_matrix, index=texts, columns=names)

for text in df.index:
  closest_video = df.loc[text].idxmax()
  print(f"The closest video to the text '{text}' is '{closest_video}'.")

print("Full similarity map. Rows are texts, columns are poses")
display(df)

The closest video to the text '<en> <ase> Hello_1' is 'Hello_2'.
The closest video to the text '<en> <ase> Hello_2' is 'Thank_you_1'.
The closest video to the text '<en> <ase> Thank_you_1' is 'Thank_you_1'.
The closest video to the text '<en> <ase> Thank_you_2' is 'Thank_you_1'.
Full similarity map. Rows are texts, columns are poses


Unnamed: 0,Hello_1,Hello_2,Thank_you_1,Thank_you_2
<en> <ase> Hello_1,0.563589,0.567994,0.542725,0.513053
<en> <ase> Hello_2,0.515768,0.519417,0.521788,0.476232
<en> <ase> Thank_you_1,0.528627,0.542519,0.559549,0.512074
<en> <ase> Thank_you_2,0.480484,0.493666,0.536721,0.488226


# Visualize pose/pose similarity

In [38]:
cosine_similarity_matrix = cosine_similarity(poses_embeddings, poses_embeddings)

df = pd.DataFrame(cosine_similarity_matrix, index=names, columns=names)

for text in df.index:
  similarity_series = df.loc[text].drop(text)
  closest_video = similarity_series.idxmax()
  print(f"The closest video to '{text}' is '{closest_video}'.")

print("Full similarity map. Rows & columns are poses")
display(df)

The closest video to 'Hello_1' is 'Hello_2'.
The closest video to 'Hello_2' is 'Hello_1'.
The closest video to 'Thank_you_1' is 'Thank_you_2'.
The closest video to 'Thank_you_2' is 'Thank_you_1'.
Full similarity map. Rows & columns are poses


Unnamed: 0,Hello_1,Hello_2,Thank_you_1,Thank_you_2
Hello_1,1.0,0.852533,0.526551,0.563474
Hello_2,0.852533,1.0,0.563397,0.603265
Thank_you_1,0.526551,0.563397,1.0,0.712864
Thank_you_2,0.563474,0.603265,0.712864,1.0


In [None]:
# Now this is fast.

dictionary_files list(Path("embeddings").glob("*.npy"))
dictionary_embedding = np.array([np.load(f) for f in dictionary_files])
candidate_embedding = np.load("candidate.npy")
# this is the similarity calculation. now we have a "distance" between the candidate and each of the dictionary items
similarity = dictionary_embedding.dot(candidate_embedding)
most_similar_id = np.argmax(similarity)
most_similar_file_name = dictionary_files[most_similar_id].name
print(f"The most similar file is {most_similar_file_name}")

SyntaxError: invalid syntax (<ipython-input-8-2631b76f695c>, line 3)