# Create Video Recommendation Function

In [14]:
from typing import Callable, Tuple

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import DistanceMetric

### Define Utility Functions

In [17]:
def _flatten_scores(raw) -> np.ndarray:
    """Convert the output of a scoring function into a flat NumPy array."""
    # Tensor-like results (anything exposing ``detach``) are converted
    # explicitly instead of relying on NumPy's implicit fallback conversion.
    if hasattr(raw, "detach"):
        raw = raw.detach().cpu().numpy()
    return np.asarray(raw).ravel()


def returnSearchResults(query : str,
                        df_embedding : Tuple[pd.DataFrame,pd.DataFrame],
                        model : "SentenceTransformer",
                        dist : Callable,
                        threshold : float=0.5, top_k : int=5) -> np.ndarray:
    """
    Return the row positions of the videos that best match ``query``.

    The query is embedded with ``model``, scored against both embedding
    frames with ``dist``, and the two scores are summed per row. Rows whose
    combined score exceeds ``threshold`` are ranked from highest to lowest
    score and the first ``top_k`` positions are returned.

    NOTE: despite its name, ``dist`` is treated as a *similarity*: larger
    values rank first and only values above ``threshold`` are kept. Passing a
    true distance metric would therefore rank the farthest rows first.

    Parameters
    ----------
    query : text to search for.
    df_embedding : pair of frames (indexed as ``[0]`` and ``[1]``) holding one
        embedding per row; both are scored against the same query embedding.
    model : object whose ``encode(query)`` returns the query embedding.
    dist : callable invoked as ``dist(embeddings, query_embedding)`` that
        yields one score per row (shape ``(N,)`` or ``(N, 1)``).
    threshold : minimum *combined* score (sum of the two ``dist`` results)
        a row must exceed to be considered.
    top_k : maximum number of positions to return.

    Returns
    -------
    np.ndarray of integer row positions, best match first. Empty when no row
    exceeds ``threshold``.
    """

    # Embed query as a single-row 2-D array
    query_embedding = model.encode(query).reshape(1,-1)

    # Score the query against each embedding frame; flatten each result first
    # so that (N,) and (N, 1) outputs add element-wise instead of broadcasting.
    scores = (_flatten_scores(dist(df_embedding[0].values, query_embedding))
              + _flatten_scores(dist(df_embedding[1].values, query_embedding)))

    # Keep only rows whose combined score clears the threshold
    idx_above_threshold = np.flatnonzero(scores > threshold)

    # Rank the survivors from highest to lowest score; the stable sort keeps
    # tied rows in their original order so results are deterministic.
    idx_sorted = np.argsort(-scores[idx_above_threshold], kind="stable")

    # return indexes of top k search results
    return idx_above_threshold[idx_sorted][:top_k]


### Load the Video Index and Run an Example Search

In [18]:
# Read the pre-computed video index from disk
df = pd.read_parquet(path='data/video-index.parquet')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,video_id,datetime,title,transcript,title_embedding-0,title_embedding-1,title_embedding-2,title_embedding-3,title_embedding-4,title_embedding-5,...,transcript_embedding-758,transcript_embedding-759,transcript_embedding-760,transcript_embedding-761,transcript_embedding-762,transcript_embedding-763,transcript_embedding-764,transcript_embedding-765,transcript_embedding-766,transcript_embedding-767
0,S2rqRaL1P8w,2024-12-03 02:48:10,What Was It Like Boxing After So Many Muay Tha...,howdy welcome to my run cast I'm so excited be...,0.013207,0.011925,0.009281,0.044673,0.03389,0.012841,...,0.036485,0.009928,0.034232,0.020771,-0.008311,-0.027323,-0.039688,0.020193,-0.0024,0.006136
1,jquPsuLggI0,2024-11-30 07:52:58,"Post Fight Update - 1st ""Official"" Boxing Fight",hey so I am outside of the Tai payak gym here ...,0.015482,0.047861,0.016647,0.024766,-0.012244,0.007547,...,-0.000794,0.037912,0.004202,0.006043,-0.035546,0.070538,0.007746,0.027256,0.021039,-0.014314
2,J2b5ziNb2Mg,2024-11-29 03:17:38,Fight 177 - Sylvie vs Nantida Sitweerachat wit...,this is a fight up in nong bua cope which is k...,-0.019959,0.033178,0.023692,0.051658,-0.045184,0.015801,...,0.043578,-0.005331,-0.043642,-0.008313,-0.017888,0.062756,-0.045954,0.037095,-0.006452,-0.027445
3,5kFbuiLXvsU,2024-11-20 09:56:40,"Vlog - dogs, treats, and my favorite old lady.",so I just had to explain to my little dog frie...,0.017771,0.007655,-0.004522,-0.00287,0.011465,0.016122,...,0.02117,0.049695,-0.013841,-0.049253,0.024609,-0.054462,0.001284,0.012855,0.066325,0.00354
4,lswgCMqcOZQ,2024-11-12 03:54:44,"Boxing, Sparring, Breast Tissue Health, Gettin...",howdy howdy welcome to my runcast it's been a ...,0.031034,0.039797,0.002997,-0.001654,0.020791,0.044946,...,0.015878,-0.012307,0.068392,-0.023493,-0.031925,-0.031144,0.01589,0.005355,-0.037606,-0.032439


In [19]:
# Similarity metric: cosine similarity helper from sentence_transformers.util
metric = util.cos_sim

# Sentence-embedding model used to encode search queries
model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

In [20]:
query = "footwork"

# Column names of the title and transcript embeddings stored in the index
title_cols = [f'title_embedding-{i}' for i in range(768)]
transcript_cols = [f'transcript_embedding-{i}' for i in range(768)]

# The threshold is compared against the sum of the title and transcript scores
idx_result = returnSearchResults(
    query,
    (df[title_cols], df[transcript_cols]),
    model,
    metric,
    threshold=0.3,
)

# Show id and title of the matched videos, best match first
df[['video_id','title']].iloc[idx_result]

Unnamed: 0,video_id,title
202,oa10R8AfbIM,Sylvie's Technique Vlog - Your Ambient Footwo...
76,G8YLQcgjmYY,Sylvie's Technique Vlog - Improving Your Foot...
380,5qylRsFk6lE,Learning Yodkhunpon's Sublime Pressuring Footw...
286,K5N2n13UruQ,Sylvie's Technique Vlog - The Strongly Rising ...
343,uCquDekZ_2w,Basic Boxing Weight Transfer Footwork - Chatch...
