# Experimentation Notebook
In this notebook we carry out experiments to find the best model to use for our semantic search app

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import DistanceMetric

from utils import evalTrueRankings

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
df = pd.read_parquet('data/video-transcripts.parquet')
df_eval = pd.read_csv('data/eval-raw.csv')

In [3]:
df_eval

Unnamed: 0,query,video_id
0,ai consulting,INlCLmWlojY
1,fine tuning llm,eC6Hd1hFvos
2,When do you recommend fine tunning and when do...,eC6Hd1hFvos
3,llm from scratch,ZLbVdvOoTKM
4,"What if you could make a small language model,...",ZLbVdvOoTKM
...,...,...
59,Fat tails python,15Kd9OPn7tw
60,do more with less,poGxnBR3hEU
61,data science projects,03x2oYg9oME
62,how to become a data scientist,W6TkOTsI7vM


## Embed Titles and Transcripts

In [10]:
# Define 'parameters'
column_to_embed_list = ['title', 'transcript']
model_name_list = ["all-MiniLM-L6-v2", "multi-qa-distilbert-cos-v1", "multi-qa-mpnet-base-dot-v1"]

In [11]:
# Generate embeddings for each possible combination of columns and model

# Initialize dictionary to keep track of all text embeddings
text_embedding_dict = {}

for model_name in model_name_list:

    # Define Embedding Model
    model = SentenceTransformer(model_name)

    for column_name in column_to_embed_list:
        # Define text embedding identifier
        key_name = model_name + "_" + column_name
        print(key_name)

        # Generate text embeddings for text under column_name
        %time embedding_arr = model.encode(df[column_name].to_list())
        print('')

        # Append Embeddings to dictionary
        text_embedding_dict[key_name] = embedding_arr

all-MiniLM-L6-v2_title
CPU times: user 408 ms, sys: 164 ms, total: 572 ms
Wall time: 4.84 s

all-MiniLM-L6-v2_transcript
CPU times: user 1.21 s, sys: 190 ms, total: 1.4 s
Wall time: 1.56 s

multi-qa-distilbert-cos-v1_title
CPU times: user 208 ms, sys: 50.7 ms, total: 258 ms
Wall time: 595 ms

multi-qa-distilbert-cos-v1_transcript
CPU times: user 1.06 s, sys: 376 ms, total: 1.43 s
Wall time: 4.33 s

multi-qa-mpnet-base-dot-v1_title
CPU times: user 276 ms, sys: 151 ms, total: 427 ms
Wall time: 2.24 s

multi-qa-mpnet-base-dot-v1_transcript
CPU times: user 3.16 s, sys: 356 ms, total: 3.52 s
Wall time: 8.49 s



## Embed Queries in the Evaluation Dataset

In [13]:
# Initialize dictionary to keep track of query embeddings
query_embedding_dict = {}

for model_name in model_name_list:

    # Define Embedding Model
    model = SentenceTransformer(model_name)
    print(model_name)

    %time embedding_arr = model.encode(df_eval['query'].to_list())
    print('')

    query_embedding_dict[model_name] = embedding_arr

all-MiniLM-L6-v2
CPU times: user 200 ms, sys: 262 ms, total: 461 ms
Wall time: 1.6 s

multi-qa-distilbert-cos-v1
CPU times: user 148 ms, sys: 244 ms, total: 392 ms
Wall time: 599 ms

multi-qa-mpnet-base-dot-v1
CPU times: user 581 ms, sys: 377 ms, total: 958 ms
Wall time: 1.76 s



## Evaluate Semantic Search Methods

In [14]:
# Initialize distance metrics to experiment with
dist_name_list = ['euclidean', 'manhattan', 'chebyshev']
sim_name_list = ['cos_sim', 'dot_score']

In [15]:
# Evaluate all possible combinations of model, column to embed, and distance metric

# Initialize list in which to store results
eval_results = []

# Loop through embedding models
for model_name in model_name_list:
    query_embedding = query_embedding_dict[model_name]   # Get query embeddings

    # Loop through text columns
    for column_name in column_to_embed_list:
        embedding_arr = text_embedding_dict[model_name + "_" + column_name]   # Get text embeddings

        # Loop through distance metrics
        for dist_name in dist_name_list:
            # Compute distance between video text and query
            dist = DistanceMetric.get_metric(dist_name)
            dist_arr = dist.pairwise(embedding_arr, query_embedding)
            dist_arr_sorted = np.argsort(dist_arr, axis=0)  # Sort indexes of distance array from smallest -> largest

            # Define label for search method
            method_name = "_".join([model_name, column_name, dist_name])

            # Evaluate the ranking of the ground truth label - query distance for this method_name
            truth_rank_arr = evalTrueRankings(dist_arr_sorted, df, df_eval)
            eval_list = [method_name] + truth_rank_arr.tolist()[0]
            eval_results.append(eval_list)

        # Loop through sbert similarity scores
        for sim_name in sim_name_list:
            # Apply similarity score from sbert
            cmd = "dist_arr = -util." + sim_name + "(embedding_arr, query_embedding)"
            exec(cmd)
            dist_arr_sorted = np.argsort(dist_arr, axis=0)   # Sort indexes of distance array 

            # Define label for search method
            method_name = "_".join([model_name, column_name, sim_name.replace('_','-')])

            # Evaluate the ranking of the ground truth label - query distance for this method_name
            truth_rank_arr = evalTrueRankings(dist_arr_sorted, df, df_eval)
            eval_list = [method_name] + truth_rank_arr.tolist()[0]
            eval_results.append(eval_list)

In [16]:
# Compute rankings for title + transcript embedding
for model_name in model_name_list:

    # Generate text & query embeddings
    embedding_arr1 = text_embedding_dict[model_name+"_title"]
    embedding_arr2 = text_embedding_dict[model_name+"_transcript"]
    query_embedding = query_embedding_dict[model_name]

    # Loop through distance metrics
    for dist_name in dist_name_list:
        # Compute distance between video text and query
        dist = DistanceMetric.get_metric(dist_name)
        dist_arr = dist.pairwise(embedding_arr1, query_embedding) + dist.pairwise(embedding_arr2, query_embedding)
        dist_arr_sorted = np.argsort(dist_arr, axis=0)  # Sort indexes of distance array from smallest -> largest

        # Define label for search method
        method_name = "_".join([model_name, "title-transcript", dist_name])

        # Evaluate the ranking of the ground truth label - query distance for this method_name
        truth_rank_arr = evalTrueRankings(dist_arr_sorted, df, df_eval)
        eval_list = [method_name] + truth_rank_arr.tolist()[0]
        eval_results.append(eval_list)

    # Loop through sbert similarity scores
    for sim_name in sim_name_list:
        # Apply similarity score from sbert
        cmd = "dist_arr = -util." + sim_name + "(embedding_arr1, query_embedding) - util." + sim_name + "(embedding_arr2, query_embedding)"
        exec(cmd)
        dist_arr_sorted = np.argsort(dist_arr, axis=0)   # Sort indexes of distance array 

        # Define label for search method
        method_name = "_".join([model_name, column_name, sim_name.replace("_","-")])

        # Evaluate the ranking of the ground truth label - query distance for this method_name
        truth_rank_arr = evalTrueRankings(dist_arr_sorted, df, df_eval)
        eval_list = [method_name] + truth_rank_arr.tolist()[0]
        eval_results.append(eval_list)
        

In [40]:
# Store results as a pandas dataframe

data = {'method_name' : [eval_results[i][0] for i in range(len(eval_results))]}

for method_num in range(len(eval_results)):
    for query_num in range(len(eval_results[0])-1):
        if method_num == 0:
            data['rank-query-'+str(query_num)] = []
            data['rank-query-'+str(query_num)].append(eval_results[method_num][query_num+1])
        else:
            data['rank-query-'+str(query_num)].append(eval_results[method_num][query_num+1])

df_results = pd.DataFrame(data)
print(df_results.shape)
df_results.head()


In [74]:
# Calculate mean rank of ground truth for each method
df_results['rank-query-mean'] = df_results[[f'rank-query-{i}' for i in range(64)]].mean(axis=1)

# Calculate number of ground truth results which appear in top 3
for i in [1,3]:
    df_results[f'num_in_top-{i}'] = (df_results[[f'rank-query-{j}' for j in range(64)]] < i).sum(axis=1)


df_results.head()

Unnamed: 0,method_name,rank-query-0,rank-query-1,rank-query-2,rank-query-3,rank-query-4,rank-query-5,rank-query-6,rank-query-7,rank-query-8,...,rank-query-57,rank-query-58,rank-query-59,rank-query-60,rank-query-61,rank-query-62,rank-query-63,rank-query-mean,num_in_top-1,num_in_top-3
0,all-MiniLM-L6-v2_title_euclidean,0.0,1.0,22.0,0.0,16.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.53125,42,56
1,all-MiniLM-L6-v2_title_manhattan,0.0,1.0,9.0,0.0,14.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.265625,41,56
2,all-MiniLM-L6-v2_title_chebyshev,0.0,3.0,51.0,0.0,79.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,8.40625,36,46
3,all-MiniLM-L6-v2_title_cos-sim,0.0,1.0,22.0,0.0,16.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.53125,42,56
4,all-MiniLM-L6-v2_title_dot-score,0.0,1.0,22.0,0.0,16.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.53125,42,56


In [77]:
df_summary = df_results[['method_name', 'rank-query-mean','num_in_top-1', 'num_in_top-3']]
df_summary.sort_values(by='rank-query-mean').head()

Unnamed: 0,method_name,rank-query-mean,num_in_top-1,num_in_top-3
44,multi-qa-mpnet-base-dot-v1_transcript_dot-score,1.140625,37,57
31,all-MiniLM-L6-v2_title-transcript_manhattan,1.1875,37,55
30,all-MiniLM-L6-v2_title-transcript_euclidean,1.25,37,56
1,all-MiniLM-L6-v2_title_manhattan,1.265625,41,56
34,all-MiniLM-L6-v2_transcript_dot-score,1.34375,37,55


In [78]:
df_summary.sort_values(by='num_in_top-1', ascending=False).head()

Unnamed: 0,method_name,rank-query-mean,num_in_top-1,num_in_top-3
0,all-MiniLM-L6-v2_title_euclidean,1.53125,42,56
10,multi-qa-distilbert-cos-v1_title_euclidean,2.296875,42,58
3,all-MiniLM-L6-v2_title_cos-sim,1.53125,42,56
4,all-MiniLM-L6-v2_title_dot-score,1.53125,42,56
41,multi-qa-mpnet-base-dot-v1_title-transcript_ma...,2.296875,42,54


In [79]:
df_summary.sort_values(by='num_in_top-3', ascending=False).head()

Unnamed: 0,method_name,rank-query-mean,num_in_top-1,num_in_top-3
36,multi-qa-distilbert-cos-v1_title-transcript_ma...,2.3125,41,59
35,multi-qa-distilbert-cos-v1_title-transcript_eu...,2.109375,41,59
14,multi-qa-distilbert-cos-v1_title_dot-score,2.296875,42,58
13,multi-qa-distilbert-cos-v1_title_cos-sim,2.296875,42,58
10,multi-qa-distilbert-cos-v1_title_euclidean,2.296875,42,58


In [None]:
final_model_choice = "all-MiniLM-L6-v2_title_cos-sim"