# Execute an Example of the Semantic Search

## Import Libraries

In [40]:
import gradio as gr
import numpy as np
import polars as pl
from sentence_transformers import SentenceTransformer
from  sklearn.metrics import DistanceMetric

## Load Data, Model, and Measurement Metrics

In [41]:
# Path to the parquet file that holds the video index.
# NOTE(review): this is an absolute, machine-specific path — update it before running elsewhere.
video_index_file_path = "/Users/lancehester/Documents/semantic_search_yt/data/video_index_full.parquet"

In [42]:
# Scan the parquet index lazily; downstream cells call .collect() when they need actual rows
df = pl.scan_parquet(video_index_file_path)

In [43]:
# Sentence-transformer model used to embed the user's query text
transformer_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(transformer_name)

In [44]:
# Distance metric used to compare the query embedding against the indexed embeddings
dist_measure = 'manhattan'
dist = DistanceMetric.get_metric(dist_measure)

# Make the Search Method/Function

In [45]:
def return_search_results(
    query: str,
    index: pl.LazyFrame,
    threshold: float = 40,
    top_k: int = 5,
) -> np.ndarray:
    """
    Method to return the indices of top search results

        Args:
            query (str): The user query string to search for videos
            index (pl.LazyFrame): Lazy video index. The first 4 columns are skipped;
                the remaining columns are treated as two consecutive embedding blocks
                (presumably title then transcript — confirm against the index builder).
            threshold (float): Maximum combined distance for a video to count as a match.
                The default of 40 was eyeballed for the manhattan distance.
            top_k (int): Maximum number of results to return.

        Returns:
            np.ndarray: row positions of the matching videos, closest first
    """
    # Embed the query. reshape(1, -1) turns the 1-D embedding into a single-row 2-D array
    # (one sample, many features) so it can be compared pairwise against the index rows.
    query_embedding = model.encode(query).reshape(1, -1)

    # collect_schema() resolves the column names from the lazy query plan without
    # materializing the data (and without the schema-resolution performance warning).
    # ref: https://realpython.com/polars-lazyframe/
    column_names = index.collect_schema().names()

    # Split the embedding columns into two blocks of the query embedding's width.
    # With the 384-dimensional model used here this is columns [4:388] and [388:].
    n_leading_cols = 4
    split_at = n_leading_cols + query_embedding.shape[1]
    first_block_cols = column_names[n_leading_cols:split_at]
    second_block_cols = column_names[split_at:]

    # Combined distance per video: sum of the distances from the query to each block
    distances = (
        dist.pairwise(index.select(first_block_cols).collect(), query_embedding)
        + dist.pairwise(index.select(second_block_cols).collect(), query_embedding)
    ).flatten()

    # Keep only videos closer than the threshold ...
    index_below_threshold = np.flatnonzero(distances < threshold)
    # ... order them from closest to farthest ...
    index_sorted = np.argsort(distances[index_below_threshold])

    # ... and return at most top_k row positions
    return index_below_threshold[index_sorted][:top_k]


In [46]:
# Try a sample query
query = "LLM"
index_result = return_search_results(query, df)

# Materialize only the two display columns, then pick the rows at the returned positions
print(df.select(['video_id', 'title']).collect()[index_result])

shape: (5, 2)
┌─────────────┬─────────────────────────────────┐
│ video_id    ┆ title                           │
│ ---         ┆ ---                             │
│ str         ┆ str                             │
╞═════════════╪═════════════════════════════════╡
│ ytmK_ErTWss ┆ LLMs EXPLAINED in 60 seconds #… │
│ ZLbVdvOoTKM ┆ How to Build an LLM from Scrat… │
│ Ylz779Op9Pw ┆ How to Improve LLMs with RAG (… │
│ tFHeUSJAYbE ┆ A Practical Introduction to La… │
│ eC6Hd1hFvos ┆ Fine-tuning Large Language Mod… │
└─────────────┴─────────────────────────────────┘


In [47]:
# Same lookup converted to a plain dict of lists keyed by column name
df.select(['title', 'video_id']).collect()[index_result].to_dict(as_series=False)

{'title': ['LLMs EXPLAINED in 60 seconds #ai',
  'How to Build an LLM from Scratch | An Overview',
  'How to Improve LLMs with RAG (Overview + Python Code)',
  'A Practical Introduction to Large Language Models (LLMs)',
  'Fine-tuning Large Language Models (LLMs) | w/ Example Code'],
 'video_id': ['ytmK_ErTWss',
  'ZLbVdvOoTKM',
  'Ylz779Op9Pw',
  'tFHeUSJAYbE',
  'eC6Hd1hFvos']}

In [48]:
# Confirm that the converted result is a built-in dict
type(df.select(['title', 'video_id']).collect()[index_result].to_dict(as_series=False))

dict

--

## Now Let's Build an Interface

--

In [49]:
def return_top_five(query: str) -> dict[str, list[str]]:
    """
    Method to look up the best-matching videos for a query

        Args:
            query (str): The user query string to search for videos

        Returns:
            dict[str, list[str]]: Column-oriented results with keys: title and video_id
    """
    # row positions of the matching videos in the index
    hit_positions = return_search_results(query, df)

    # materialize just the two display columns, then keep the matching rows
    title_and_id = df.select(['title', 'video_id']).collect()
    return title_and_id[hit_positions].to_dict(as_series=False)

In [50]:
def format_result_text(title: str, video_id: str) -> str:
    """
    Method to format search result text

        Args:
            title (str): The YouTube video title.
            video_id (str): The unique id of the YouTube video.

        Returns:
            str: Markdown text with the title as a heading followed by a link to the video
    """
    # The literal below is left-aligned on purpose: indentation inside the
    # triple-quoted string would become part of the rendered markdown.
    return f"""<br> <br>
# {title}<br>

🔗 [Video Link](https://youtu.be/{video_id})"""

In [51]:
def format_video_embed(video_id: str) -> str:
    """
    Method to build the HTML that embeds a playable video in the results

        Args:
            video_id (str): The unique id of the YouTube video.

        Returns:
            str: An <iframe> tag (576x324) pointing at the YouTube embed URL
    """
    # An alternative considered was a clickable thumbnail
    # (https://img.youtube.com/vi/<video_id>/0.jpg) linking to the video;
    # the iframe is used so the video can be played in place.
    embed_url = f"https://www.youtube.com/embed/{video_id}"
    return f'<iframe width="576" height="324" src="{embed_url}"></iframe>'

In [52]:
# Dry run: build the Gradio components for the sample query outside of any UI
response = return_top_five(query)
output_list = []
for title, video_id in zip(response['title'], response['video_id']):
    embed = gr.HTML(value=format_video_embed(video_id), visible=True)
    text = gr.Markdown(value=format_result_text(title, video_id), visible=True)

    print(f"tile = {title}, video_id = {video_id}")
    print(f"embed = {embed}, text = {text}")

    # each result contributes an (embed, text) pair, in that order
    output_list.extend([embed, text])


# Sanity-check the iframe template with a placeholder id
test = "99999"
text = f"""<iframe width="576" height="324" src="https://www.youtube.com/embed/{test}"></iframe>"""

print(text)



tile = LLMs EXPLAINED in 60 seconds #ai, video_id = ytmK_ErTWss
embed = <gradio.components.html.HTML object at 0x391aa6710>, text = <gradio.components.markdown.Markdown object at 0x388f3f110>
tile = How to Build an LLM from Scratch | An Overview, video_id = ZLbVdvOoTKM
embed = <gradio.components.html.HTML object at 0x391aa6fd0>, text = <gradio.components.markdown.Markdown object at 0x391b28a50>
tile = How to Improve LLMs with RAG (Overview + Python Code), video_id = Ylz779Op9Pw
embed = <gradio.components.html.HTML object at 0x391b28b90>, text = <gradio.components.markdown.Markdown object at 0x391b28910>
tile = A Practical Introduction to Large Language Models (LLMs), video_id = tFHeUSJAYbE
embed = <gradio.components.html.HTML object at 0x391b28cd0>, text = <gradio.components.markdown.Markdown object at 0x391b28e10>
tile = Fine-tuning Large Language Models (LLMs) | w/ Example Code, video_id = eC6Hd1hFvos
embed = <gradio.components.html.HTML object at 0x391b28f50>, text = <gradio.compone

In [53]:
def get_search_results(query: str) -> list:
    """
    Method to run a search and build the Gradio components that display it

        Args:
            query (str): The user query string to search for videos

        Returns:
            list: 10 Gradio components, i.e. 5 (gr.HTML, gr.Markdown) pairs in slot order.
                Slots with a result show the video embed and its text; unused slots are
                hidden, except that the first slot shows a "No results" message when the
                search returned nothing.
    """
    # number of result slots in the interface (one HTML + one Markdown each)
    num_slots = 5

    # pseudo API call
    response = return_top_five(query)
    titles = response['title']
    video_ids = response['video_id']

    # initialize list of outputs
    output_list = []

    # display search results
    for title, video_id in zip(titles, video_ids):
        output_list.append(gr.HTML(value=format_video_embed(video_id), visible=True))
        output_list.append(gr.Markdown(value=format_result_text(title, video_id), visible=True))

    # compute number of null search results (out of num_slots)
    num_empty_results = num_slots - len(titles)

    # make null search result slots invisible
    for i in range(num_empty_results):
        # an unused slot never shows a video, so its HTML component is hidden too
        output_list.append(gr.HTML(visible=False))

        if num_empty_results == num_slots and i == 0:
            # no search results at all: use the first slot's text for a hint
            output_list.append(
                gr.Markdown(value="No results. Try rephrasing your query.", visible=True)
            )
        else:
            output_list.append(gr.Markdown(visible=False))

    return output_list

# Okay Let us Demo this now

In [54]:
# demo
output_list = []

with gr.Blocks() as demo:
    gr.Markdown("# YouTube Search")

    with gr.Row():
        inp = gr.Textbox(placeholder="What are you looking for?", label="Query", scale=3)
        btn = gr.Button("Search")

    # five result rows, each holding an (HTML embed, Markdown text) pair
    for i in range(5):
        with gr.Row():
            output_list.append(gr.HTML())
            output_list.append(gr.Markdown())

    # Register the events only after every output component exists, so that both the
    # button click and pressing Enter in the textbox target the full list of outputs.
    btn.click(fn=get_search_results, inputs=inp, outputs=output_list)
    inp.submit(fn=get_search_results, inputs=inp, outputs=output_list)


# Note: open the local URL printed by launch() in a browser to play the videos;
# playback inside the notebook has issues.
demo.launch()

* Running on local URL:  http://127.0.0.1:7867
* To create a public link, set `share=True` in `launch()`.


