In order to run the following noteboooks, if you haven't done yet, you need to set the openai key inside .env file as `OPENAI_API_KEY`

In [8]:
import os
from openai import OpenAI
import numpy as np
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

API_KEY = os.getenv("DASHSCOPE_API_KEY")
assert API_KEY, "ERROR: API Key is missing"
client = OpenAI(
    api_key=API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
model = "text-embedding-v3"

SIMILARITIES_RESULTS_THRESHOLD = 0.75
DATASET_NAME = "../embedding_index_3m.json"

Next, we are going to load the Embedding Index into a Pandas Dataframe. The Embedding Index is stored in a JSON file called `embedding_index_3m.json`. The Embedding Index contains the Embeddings for each of the YouTube transcripts up until late Oct 2023.

In [44]:
def load_dataset(source: str, num_rows: int) -> pd.core.frame.DataFrame:
    # Load the video session index
    pd_vectors = pd.read_json(source)
    pdv = pd_vectors.drop(columns=["text"], errors="ignore").fillna("")
    return pdv[:num_rows] if num_rows > 0 else pdv

In [47]:
pdv = load_dataset(DATASET_NAME, 10)
len(pdv),pdv.head()

(10,
                                  speaker  \
 0  Seth Juarez, Josh Lovejoy, Sarah Bird   
 1  Seth Juarez, Josh Lovejoy, Sarah Bird   
 2  Seth Juarez, Josh Lovejoy, Sarah Bird   
 3  Seth Juarez, Josh Lovejoy, Sarah Bird   
 4  Seth Juarez, Josh Lovejoy, Sarah Bird   
 
                                                title      videoId     start  \
 0  You're Not Solving the Problem You Think You'r...  -tJQm4mSh1s  00:00:00   
 1  You're Not Solving the Problem You Think You'r...  -tJQm4mSh1s  00:03:07   
 2  You're Not Solving the Problem You Think You'r...  -tJQm4mSh1s  00:06:13   
 3  You're Not Solving the Problem You Think You'r...  -tJQm4mSh1s  00:09:21   
 4  You're Not Solving the Problem You Think You'r...  -tJQm4mSh1s  00:12:24   
 
    seconds                                            summary  \
 0        0  Join Seth Juarez as he discusses ethical conce...   
 1      187  In this video, the speaker discusses the chall...   
 2      373  The video discusses the limita

Next, we are going to create a function called `get_videos` that will search the Embedding Index for the query. The function will return the top 5 videos that are most similar to the query. The function works as follows:

1. First, a copy of the Embedding Index is created.
2. Next, the Embedding for the query is calculated using the OpenAI Embedding API.
3. Then a new column is created in the Embedding Index called `similarity`. The `similarity` column contains the cosine similarity between the query Embedding and the Embedding for each video segment.
4. Next, the Embedding Index is filtered by the `similarity` column. The Embedding Index is filtered to only include videos that have a cosine similarity greater than or equal to 0.75.
5. Finally, the Embedding Index is sorted by the `similarity` column and the top 5 videos are returned.

In [None]:
# original
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def get_videos(
    query: str, dataset: pd.core.frame.DataFrame, rows: int,
    embeddings_column: str = "ada_v2",
) -> pd.core.frame.DataFrame:
    # create a copy of the dataset
    video_vectors = dataset.copy()

    # get the embeddings for the query    
    query_embeddings = client.embeddings.create(input=query, model=model).data[0].embedding

    # create a new column with the calculated similarity for each row
    video_vectors["similarity"] = video_vectors[embeddings_column].apply(
        lambda x: cosine_similarity(np.array(query_embeddings), np.array(x))
    )

    # filter the videos by similarity
    mask = video_vectors["similarity"] >= SIMILARITIES_RESULTS_THRESHOLD
    video_vectors = video_vectors[mask].copy()

    # sort the videos by similarity
    video_vectors = video_vectors.sort_values(by="similarity", ascending=False).head(
        rows
    )

    # return the top rows
    return video_vectors.head(rows)

In [None]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def get_videos(
    query: str, dataset: pd.core.frame.DataFrame, rows: int,
    embeddings: list
) -> pd.core.frame.DataFrame:
    # create a copy of the dataset
    video_vectors = dataset.copy()

    # get the embeddings for the query    
    query_embeddings = client.embeddings.create(input=query, model=model).data[0].embedding

    # create a new column with the calculated similarity for each row
    video_vectors["similarity"] = embeddings.apply(
        lambda x: cosine_similarity(np.array(query_embeddings), np.array(x))
    )

    # filter the videos by similarity
    mask = video_vectors["similarity"] >= SIMILARITIES_RESULTS_THRESHOLD
    video_vectors = video_vectors[mask].copy()

    # sort the videos by similarity
    video_vectors = video_vectors.sort_values(by="similarity", ascending=False).head(
        rows
    )

    # return the top rows
    return video_vectors.head(rows)

In [49]:
pd_vectors = load_dataset(DATASET_NAME,10)
pd_vectors.head(5)

Unnamed: 0,speaker,title,videoId,start,seconds,summary,ada_v2
0,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:00:00,0,Join Seth Juarez as he discusses ethical conce...,"[0.004357332363724, -0.028409153223037, 0.0111..."
1,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:03:07,187,"In this video, the speaker discusses the chall...","[-0.0038613036740570003, -0.004626247566193000..."
2,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:06:13,373,The video discusses the limitations of general...,"[0.00287682027556, -0.012365541420876001, 0.02..."
3,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:09:21,561,The video discusses the importance of consider...,"[0.015913352370262, 0.000721095071639, 0.02349..."
4,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:12:24,744,The video discusses the importance of understa...,"[5.447878720588051e-06, -0.011837740428745, 0...."


In [None]:
# gemini
import time
from IPython.display import display
import pandas as pd

# This cell assumes:
# 1. 'pd_vectors' DataFrame is loaded (e.g., from cell 'fbe19342' or '4cb9b31a').
# 2. 'client' (OpenAI client for DashScope) is initialized (from cell 'b8348a13').
# 3. 'model' variable (e.g., "text-embedding-v3") is defined (from cell 'b8348a13').

def create_qwen_embeddings_for_column(
    dataframe: pd.DataFrame,
    num_texts: int = -1,
    text_column: str = 'summary',
    new_embedding_column: str = 'qwen',
    model_name: str = model, # Uses the global 'model' variable by default
    batch_size: int = 5  # Number of texts to process in each API call
) -> pd.DataFrame:
    """
    Creates embeddings for a specified text column in the DataFrame using the Qwen model.

    Args:
        dataframe: Pandas DataFrame containing the text column.
        num_texts: num of texts to process. If -1, processes all texts.
        text_column: Name of the column with text to embed.
        new_embedding_column: Name for the new column to store embeddings.
        model_name: Name of the embedding model to use (from DashScope).
        batch_size: Size of batches for API calls.

    Returns:
        A new Pandas DataFrame with an added column for embeddings.
    """
    result_df = dataframe.copy()
    # Initialize the new column with None to store list objects (embeddings)
    result_df[new_embedding_column] = pd.Series([None] * len(result_df), dtype=object)

    if num_texts < 0:
        num_texts = len(result_df)
    print(f"Starting embedding generation for {num_texts} texts in column '{text_column}' using model '{model_name}'.")

    for i in range(0, num_texts, batch_size):
        batch_slice = result_df.iloc[i:i + batch_size]
        
        texts_to_embed = []
        # Keep track of original DataFrame indices for texts in the current batch that are valid
        valid_original_indices_in_batch = [] 

        for original_idx, text_content in zip(batch_slice.index, batch_slice[text_column]):
            if isinstance(text_content, str) and text_content.strip():
                texts_to_embed.append(text_content)
                valid_original_indices_in_batch.append(original_idx)
        
        if not texts_to_embed:
            # print(f"Batch starting at index {i}: No valid texts to embed.")
            continue  # Skip if no valid text in this batch
            
        try:
            response = client.embeddings.create(
                input=texts_to_embed,
                model=model_name
            )
            # Assign embeddings back to the correct rows in result_df using original indices
            for embedding_idx, original_df_idx in enumerate(valid_original_indices_in_batch):
                embedding_value = response.data[embedding_idx].embedding
                # Ensure the embedding is a list to handle potential NumPy array from client
                # and to ensure correct assignment if original_df_idx is duplicated.
                if hasattr(embedding_value, 'tolist'): # Check if it's like a NumPy array
                    result_df.loc[original_df_idx, new_embedding_column] = embedding_value.tolist()
                else: # Assume it's already a list or compatible
                    result_df.iloc[original_df_idx, new_embedding_column] = embedding_value
            
            print(f"Processed batch covering original indices {i} to {min(i + batch_size - 1, num_texts - 1)}. "
                  f"{len(texts_to_embed)} embeddings generated in this batch.")            
        except Exception as e:
            print(f"Error processing batch covering original indices {i} to {min(i + batch_size - 1, num_texts - 1)}: {e}")
            # Embeddings for this batch will remain None for the affected rows if an error occurred
        
        if i + batch_size < num_texts: # Small delay between batches if not the last one
             time.sleep(0.5) # API politeness: adjust as needed

    print(f"Embedding generation finished for column '{text_column}'.")
    return result_df

# Check if 'pd_vectors' DataFrame exists in the environment
if 'pd_vectors' in locals() or 'pd_vectors' in globals():
    # Generate embeddings for the 'summary' column
    # The result is a new DataFrame. You can choose to overwrite pd_vectors if desired.
    print(f"Processing the first 10 summaries from 'pd_vectors'.")
    pd_vectors_with_summary_embeddings = create_qwen_embeddings_for_column(
        pd_vectors,
        num_texts=10,  # Process only the first 10 summaries for demonstration
        text_column='summary',
        new_embedding_column='summary_embedding_qwen', # Name of the new column
        model_name=model # Uses the global 'model' variable ("text-embedding-v3")
    )

    # Display information about the generated embeddings
    if not pd_vectors_with_summary_embeddings.empty and \
       'summary_embedding_qwen' in pd_vectors_with_summary_embeddings.columns:
        
        num_successfully_embedded = pd_vectors_with_summary_embeddings['summary_embedding_qwen'].notna().sum()
        total_texts = len(pd_vectors_with_summary_embeddings)
        print(f"\nSuccessfully generated embeddings for {num_successfully_embedded} out of {total_texts} summaries.")

        # Check the length of the first valid embedding
        first_valid_embedding_series = pd_vectors_with_summary_embeddings[
            pd_vectors_with_summary_embeddings['summary_embedding_qwen'].notna()
        ]['summary_embedding_qwen']
        
        if not first_valid_embedding_series.empty:
            first_embedding_vector = first_valid_embedding_series.iloc[0]
            print(f"Length of the first valid summary embedding vector: {len(first_embedding_vector)}")
        
        print("\nDataFrame with new Qwen summary embeddings (first 5 rows):")
        # Display relevant columns: 'summary' and the new 'summary_embedding_qwen'
        display(pd_vectors_with_summary_embeddings[['summary', 'summary_embedding_qwen']].head())
        
        # To update the original pd_vectors DataFrame, you can uncomment the following line:
        # pd_vectors = pd_vectors_with_summary_embeddings
    else:
        print("\nFailed to generate embeddings, or the new embedding column is missing from the result.")
else:
    print("\nError: 'pd_vectors' DataFrame not found. Please ensure it is loaded in a previous cell.")


Processing the first 10 summaries from 'pd_vectors'.
Starting embedding generation for 10 texts in column 'summary' using model 'text-embedding-v3'.
Error processing batch covering original indices 0 to 4: Must have equal len keys and value when setting with an iterable
Error processing batch covering original indices 5 to 9: Must have equal len keys and value when setting with an iterable
Embedding generation finished for column 'summary'.

Successfully generated embeddings for 0 out of 1409 summaries.

DataFrame with new Qwen summary embeddings (first 5 rows):


Unnamed: 0,summary,summary_embedding_qwen
0,Join Seth Juarez as he discusses ethical conce...,
1,"In this video, the speaker discusses the chall...",
2,The video discusses the limitations of general...,
3,The video discusses the importance of consider...,
4,The video discusses the importance of understa...,


In [None]:
# claude 
def create_embeddings_for_summaries(dataframe, model_name='text-embedding-v3'):
    """
    Create embeddings for all summary texts in the dataframe using the specified model.
    
    Args:
        dataframe: Pandas DataFrame containing a 'summary' column
        model_name: Name of the embedding model to use
        
    Returns:
        DataFrame with added 'summary_embeddings' column containing the embeddings
    """
    result_df = dataframe.copy()
    result_df['summary_embeddings'] = None
    result_embeddings = []
    
    print(f"Creating embeddings for {len(result_df)} summary texts...")
    
    # Process in batches to avoid rate limits and improve performance
    batch_size = 10
    for i in range(0, len(result_df), batch_size):
        batch = result_df.iloc[i:i+batch_size]
        summaries = batch['summary'].tolist()
        
        # Skip empty summaries
        valid_indices = []
        valid_summaries = []
        for idx, summary in enumerate(summaries):
            if isinstance(summary, str) and summary.strip():
                valid_indices.append(idx)
                valid_summaries.append(summary)
        
        if not valid_summaries:
            continue
            
        # Get embeddings for valid summaries
        response = client.embeddings.create(
            input=valid_summaries,
            model=model_name
        )
        
        # Update the dataframe with embeddings
        for local_idx, global_idx in enumerate(valid_indices):
            # result_df.iloc[i + global_idx, result_df.columns.get_loc('summary_embeddings')] = response.data[local_idx].embedding
            result_embeddings.append(response.data[local_idx].embedding)
        print(f"Processed {i + len(batch)} out of {len(result_df)} items")
    return result_embeddings
    # return result_df

# Create embeddings for all summaries
# pd_vectors_subset = pd_vectors.head(10).copy()  # For demonstration, using a subset
# pd_vectors_with_qwen_embeddings = create_embeddings_for_summaries(pd_vectors_subset)
# result_embeddings = create_embeddings_for_summaries(pd_vectors_subset)
# Display the first few rows to confirm embeddings were created
# print(f"First row summary embedding length: {len(pd_vectors_with_qwen_embeddings.iloc[0]['summary_embeddings'])}")
# pd_vectors_with_qwen_embeddings.head(2)

In [None]:
pd_vectors = load_dataset(DATASET_NAME, 10)
result_embeddings = create_embeddings_for_summaries(pd_vectors)

In [52]:
len(result_embeddings),type(result_embeddings[0]), len(result_embeddings[0]),result_embeddings[0]

(10,
 list,
 1024,
 [-0.06689998507499695,
  -0.05340397730469704,
  0.015613218769431114,
  -0.026972681283950806,
  -0.09110772609710693,
  -0.02726271003484726,
  -0.03362400829792023,
  0.021829502657055855,
  -0.009121406823396683,
  0.026643982157111168,
  0.0004670672642532736,
  -0.013022294268012047,
  0.006332295946776867,
  0.008236818946897984,
  0.040294673293828964,
  -0.025580542162060738,
  0.038593169301748276,
  0.0008126849425025284,
  -0.05464143306016922,
  -0.06790541857481003,
  -0.044123053550720215,
  -0.002137754112482071,
  -0.036156926304101944,
  -0.010789072141051292,
  0.09667627513408661,
  0.03886386379599571,
  -0.04860883206129074,
  -0.05796709656715393,
  0.017353391274809837,
  -0.03841915354132652,
  -0.04420039430260658,
  0.04157079756259918,
  -0.0028882038313895464,
  -0.025889907032251358,
  -0.0654691755771637,
  -0.03053036890923977,
  -0.013650690205395222,
  -0.019045226275920868,
  -0.07560084760189056,
  -0.03122643753886223,
  0.012770

This function is very simple, it just prints out the results of the search query.

In [55]:
def display_results(videos: pd.core.frame.DataFrame, query: str):
    def _gen_yt_url(video_id: str, seconds: int) -> str:
        """convert time in format 00:00:00 to seconds"""
        return f"https://youtu.be/{video_id}?t={seconds}"

    print(f"\nVideos similar to '{query}':")
    for _, row in videos.iterrows():
        youtube_url = _gen_yt_url(row["videoId"], row["seconds"])
        print(f" - {row['title']}")
        print(f"   Summary: {' '.join(row['summary'].split()[:15])}...")
        print(f"   YouTube: {youtube_url}")
        print(f"   Similarity: {row['similarity']}")
        print(f"   Speakers: {row['speaker']}")

1. First, the Embedding Index is loaded into a Pandas Dataframe.
2. Next, the user is prompted to enter a query.
3. Then the `get_videos` function is called to search the Embedding Index for the query.
4. Finally, the `display_results` function is called to display the results to the user.
5. The user is then prompted to enter another query. This process continues until the user enters `exit`.

![](../images/notebook-search.png?WT.mc_id=academic-105485-koreyst)

You will be prompted to enter a query. Enter a query and press enter. The application will return a list of videos that are relevant to the query. The application will also return a link to the place in the video where the answer to the question is located.

Here are some queries to try out:

- What is Azure Machine Learning?
- How do convolutional neural networks work?
- What is a neural network?
- Can I use Jupyter Notebooks with Azure Machine Learning?
- What is ONNX?

In [57]:
# pd_vectors = load_dataset(DATASET_NAME,10)

# get user query from imput
while True:
    query = input("Enter a query: ")
    if query == "exit":
        break
    videos = get_videos(query, pd_vectors, 5, result_embeddings)
    display_results(videos, query)

KeyError: "None of [Index([(-0.06689998507499695, -0.05340397730469704, 0.015613218769431114, -0.026972681283950806, -0.09110772609710693, -0.02726271003484726, -0.03362400829792023, 0.021829502657055855, -0.009121406823396683, 0.026643982157111168, 0.0004670672642532736, -0.013022294268012047, 0.006332295946776867, 0.008236818946897984, 0.040294673293828964, -0.025580542162060738, 0.038593169301748276, 0.0008126849425025284, -0.05464143306016922, -0.06790541857481003, -0.044123053550720215, -0.002137754112482071, -0.036156926304101944, -0.010789072141051292, 0.09667627513408661, 0.03886386379599571, -0.04860883206129074, -0.05796709656715393, 0.017353391274809837, -0.03841915354132652, -0.04420039430260658, 0.04157079756259918, -0.0028882038313895464, -0.025889907032251358, -0.0654691755771637, -0.03053036890923977, -0.013650690205395222, -0.019045226275920868, -0.07560084760189056, -0.03122643753886223, 0.012770935893058777, -0.00293654203414917, -0.01947060227394104, -0.08507512509822845, 0.007671262603253126, -0.016850674524903297, 0.0016144937835633755, -0.045940566807985306, -0.0012060365406796336, 0.05131576955318451, -0.0022356389090418816, 0.014143738895654678, 0.024207739159464836, -0.02544519677758217, 0.018561845645308495, 0.06314894556999207, 0.041725482791662216, 0.031439125537872314, -0.09219049662351608, 0.0068495143204927444, -0.07471142709255219, -0.052359871566295624, 0.0027721922378987074, -0.053481318056583405, 0.020398695021867752, 0.044935133308172226, -0.011881514452397823, 0.015990255400538445, -0.0061147743836045265, -0.0024169068783521652, -0.036466293036937714, -0.010102670639753342, -0.04346565529704094, 0.04686865955591202, -0.03223187103867531, 0.04269224405288696, -0.03986929729580879, 0.02449776791036129, -0.008923220448195934, 0.031883835792541504, 0.04771941155195236, -0.000538366031832993, 0.007671262603253126, -0.04102941229939461, 0.016222279518842697, 0.05560819432139397, -0.06902686506509781, 0.07123108208179474, -0.006975193507969379, -0.022912276908755302, -0.02631528303027153, -0.02059204690158367, 0.001793344970792532, -0.021210774779319763, -0.015874244272708893, 0.01431775651872158, -0.024787796661257744, 0.015651889145374298, -0.0058634160086512566, -0.014646455645561218, ...),\n                 (-0.030094854533672333, -0.029847081750631332, 0.011835911311209202, 0.017353618517518044, -0.06072337180376053, -0.017229732125997543, -0.0413971021771431, 0.06426842510700226, -0.059236735105514526, 0.03198174014687538, -0.019507335498929024, -0.028303267434239388, 0.038709722459316254, 0.0008940068073570728, 0.028188910335302353, -0.026873808354139328, 0.047534242272377014, 0.014332697726786137, -0.02334781363606453, -0.06705110520124435, 0.008219382725656033, 0.0010983002139255404, 0.006003723479807377, -0.007285470608621836, 0.07829617708921432, 0.09125659614801407, -0.01833518035709858, -0.024948805570602417, 0.08096449822187424, -0.018468596041202545, -0.04543770104646683, -0.01563826948404312, -0.011511901393532753, -0.014237401075661182, 0.016419706866145134, -0.0486396886408329, -0.03051416203379631, -0.03602233901619911, -0.04639067500829697, 0.052070386707782745, 0.026282966136932373, 0.009863259270787239, -0.036784715950489044, -0.059960994869470596, 0.027102522552013397, -0.03796640411019325, -0.01802069880068302, -0.052070386707782745, 0.03672754019498825, -0.0004297307750675827, 0.04719117283821106, -0.008019259199500084, -0.023424049839377403, -0.010835290886461735, 0.0007701204740442336, 0.03952927514910698, -0.015523913316428661, 0.0022299543488770723, -0.08081202208995819, 0.0644209012389183, -0.08256548643112183, -0.02075572870671749, -0.00790490210056305, -0.03245822340250015, 0.009463011287152767, 0.042769379913806915, -0.026035193353891373, -0.022737910971045494, -0.017077257856726646, -0.08065954595804214, 0.003552202833816409, 0.042083241045475006, -0.04463720694184303, 0.00423119543120265, -0.0769238993525505, 0.014504233375191689, -0.05931297317147255, -0.014361287467181683, -0.02073667012155056, 0.010692344978451729, 0.020107708871364594, -0.025615885853767395, 0.0785248875617981, -0.0009523763437755406, -0.003056657500565052, 0.01489495113492012, -0.019917113706469536, 0.014351757243275642, 0.030609458684921265, -0.019068969413638115, -0.045857008546590805, -0.02121315523982048, -0.005198461934924126, -0.024720093235373497, 0.0037904458586126566, -0.004467056132853031, -0.03243916481733322, 0.00239791558124125, 0.00252775801345706, 0.01632441021502018, ...),\n                                   (-0.06124632805585861, -0.02686375007033348, -0.00703709339722991, 0.016756784170866013, -0.07579280436038971, -0.02070511318743229, -0.015764979645609856, 0.05376528576016426, -0.05523882433772087, 0.0221975427120924, -0.043866124004125595, -0.03698960691690445, 0.041070178151130676, 0.01254397351294756, 0.06468458473682404, -0.009955834597349167, 0.02395445480942726, 0.021101834252476692, -0.0445084348320961, -0.06638482213020325, 0.0027983074542135, -0.01043756864964962, -0.011004313826560974, -0.028904035687446594, 0.042468152940273285, 0.08130913227796555, -0.051044903695583344, -0.03169998154044151, 0.03831201419234276, -0.01922212727367878, -0.02221643552184105, -0.019684970378875732, 0.03392918035387993, -0.01959051378071308, -0.03166219964623451, -0.08780781179666519, -0.015434376895427704, -0.01179775781929493, -0.03899211063981056, 0.013186285272240639, 0.03141660988330841, 0.0296219140291214, -0.01934492215514183, -0.06887850165367126, 0.009842485189437866, -0.055918917059898376, -0.07035204768180847, -0.06105741485953331, 0.06117076426744461, 0.010569809004664421, 0.009148221462965012, 0.012827346101403236, -0.027336038649082184, -0.001382623566314578, 0.025654692202806473, 0.054143115878105164, 0.010975976474583149, -0.013384646736085415, -0.06952081620693207, 0.016067244112491608, -0.06460902094841003, -0.033626917749643326, -0.005955553613603115, -0.027827218174934387, 0.03270123153924942, 0.03619616478681564, 0.00522822979837656, -0.007532996125519276, -0.046586502343416214, -0.058714862912893295, -0.003464233595877886, 0.03763192147016525, -0.014858185313642025, 0.0021536340937018394, -0.06710270047187805, 0.016804013401269913, -0.04057899862527847, -0.01663398928940296, -0.020138368010520935, -0.03817977383732796, 0.04647315666079521, -0.012742334976792336, 0.051233820617198944, -0.021252967417240143, 0.030472032725811005, 0.05346301943063736, -0.001479442697018385, 0.0037121849600225687, 0.03564830869436264, 0.010210869833827019, 0.013564116321504116, -0.04892905429005623, 0.006031119730323553, -0.019968343898653984, -0.005766638554632664, -0.01780526340007782, -0.04141022637486458, 0.03003752790391445, 0.02399223856627941, 0.04175027459859848, ...),\n                         (-0.037724271416664124, -0.02499185875058174, -0.005926343146711588, 0.039616163820028305, -0.08210799843072891, -0.0136310625821352, -0.02597564086318016, 0.04230264574289322, -0.03532157465815544, 0.02047024294734001, 0.0026510099414736032, -0.04215129464864731, -0.0051885065622627735, -0.015135114081203938, 0.03465941175818443, -0.0020550647750496864, 0.019751325249671936, 0.011332416906952858, -0.008191880770027637, -0.05550803244113922, 0.012022956274449825, 0.012751334346830845, 0.008433097042143345, -0.02277834713459015, 0.08142691850662231, 0.08846474438905716, -0.06405936926603317, -0.05210263282060623, 0.07855124771595001, -0.014539169147610664, -0.024102669209241867, 0.0018268555868417025, -0.017263490706682205, -0.0298918504267931, -0.031726982444524765, -0.05441073700785637, -0.01750943623483181, -0.03727021813392639, -0.06564855575561523, 0.0259945597499609, 0.006195937283337116, 0.010736471973359585, -0.020110784098505974, -0.04143237695097923, 0.016582408919930458, -0.05459992587566376, -0.029929688200354576, -0.03670265153050423, 0.04355129227042198, 0.006427694112062454, 0.010358094237744808, -0.024310776963829994, -0.041886430233716965, -0.024254020303487778, 0.00019155378686264157, 0.055432356894016266, -0.04449723660945892, -0.002915874356403947, -0.09255122393369675, 0.03923778608441353, -0.06088100001215935, -0.013848629780113697, 0.016639167442917824, -0.04987020418047905, 0.020413486286997795, 0.028964824974536896, -0.021340511739253998, -0.027564827352762222, -0.032540496438741684, -0.05127020180225372, -0.002842563670128584, 0.02081078290939331, -0.011389173567295074, 0.007889178581535816, -0.0616377554833889, 0.042226970195770264, -0.01459592580795288, -0.03482968360185623, 0.01476619578897953, 0.041545890271663666, -0.00548647902905941, -0.01889997348189354, 0.0790809765458107, -0.033505361527204514, 0.046729665249586105, 0.015210790559649467, -0.02323240041732788, 0.027356719598174095, 0.06557288020849228, 0.010348634794354439, -0.026354018598794937, -0.019808081910014153, 0.03791346028447151, -0.014331061393022537, 0.01999727077782154, 0.01178647018969059, -0.06038910523056984, 0.011871605180203915, -0.025218885391950607, 0.017244571819901466, ...),\n                    (-0.05193201079964638, -0.026286806911230087, -0.04706338420510292, 0.03985479846596718, -0.06827399134635925, 0.0005658244481310248, -0.02413555234670639, 0.022550418972969055, -0.03317459300160408, 0.030796891078352928, 0.016860920935869217, 0.0029721264727413654, -0.0038165876176208258, -0.032382022589445114, 0.06031057611107826, 0.018210170790553093, 0.030815761536359787, -0.018776290118694305, -0.003191497642546892, -0.03830740600824356, -0.005854617338627577, -0.02626793645322323, 0.014077500440180302, 0.025984875857830048, 0.08129473030567169, 0.08642754703760147, -0.05113944411277771, -0.058763183653354645, 0.06850043684244156, -0.015228609554469585, -0.0038755584973841906, 0.002305756788700819, -0.037495967000722885, -0.010765702463686466, -0.014888937585055828, -0.04921463876962662, -0.01049207802861929, 0.004812014289200306, -0.062386348843574524, -0.028758861124515533, 0.010850620456039906, 0.03474085405468941, -0.009841040708124638, -0.01766292192041874, 0.0051422505639493465, -0.03966609388589859, -0.05510227754712105, -0.04264765605330467, 0.011360127478837967, -0.018446054309606552, -0.0010078103514388204, -0.024229906499385834, -0.0016570783918723464, -0.03513713926076889, 0.012152694165706635, 0.06355632841587067, -0.04842207208275795, -0.02638115920126438, -0.05902737379074097, 0.014407736249268055, -0.07457678020000458, -0.02170124091207981, 0.03592970594763756, -0.02821161225438118, 0.016728825867176056, 0.02489037811756134, 0.009402298368513584, -0.04151541739702225, -0.03351426124572754, -0.04695016145706177, -0.014596442691981792, 0.012379142455756664, 0.0055526867508888245, 0.01913483254611492, -0.0506865493953228, 0.031004467979073524, -0.03381619229912758, -0.06468856334686279, 0.008340824395418167, 0.04913915693759918, -0.008680496364831924, 0.030551571398973465, 0.06408470869064331, -0.023286374285817146, 0.020418036729097366, -0.02336185611784458, -0.011369562707841396, -0.011190291494131088, 0.05008268728852272, -0.00883146096020937, 0.002589995739981532, -0.01630423590540886, 0.04117574542760849, -0.006170700304210186, 0.004644537344574928, 0.03677888587117195, -0.06834947317838669, 0.007647328078746796, 0.005812158342450857, 0.030532700940966606, ...),\n                     (-0.03477071225643158, 0.0024079824797809124, -0.03558104857802391, -0.02874845452606678, -0.05094056949019432, -0.02657528594136238, -0.04022205248475075, 0.06659475713968277, 0.07399826496839523, 0.02758820354938507, -0.042689889669418335, -0.01714593917131424, -0.01728406548500061, -0.036280881613492966, 0.042984556406736374, -0.00881238654255867, -0.014917519874870777, -0.03635454922914505, 0.008973532356321812, -0.03403404355049133, 0.0180667731910944, 0.047257229685783386, 0.021823778748512268, -0.033923543989658356, 0.07289326190948486, 0.0746612623333931, -0.08007577061653137, -0.06044358015060425, 0.01944802515208721, 0.04887789860367775, -0.0462258942425251, 0.009189928881824017, 0.004159870091825724, -0.001880804542452097, -0.04968823119997978, -0.01704464852809906, -0.0144478939473629, -0.023923281580209732, -0.050682734698057175, 0.015626562759280205, 0.0007619906100444496, 0.02740403637290001, 0.012394432909786701, -0.029116788879036903, -0.025028282776474953, -0.04493672773241997, -0.00044315162813290954, -0.07587676495313644, 0.0462258942425251, 0.0013398142764344811, 0.00389973446726799, 0.011280223727226257, -0.005106027703732252, -0.034255046397447586, -0.053666237741708755, 0.048362232744693756, -0.009806888177990913, 0.008651240728795528, -0.06722092628479004, 0.02629903517663479, -0.001220105797983706, 0.0018209503032267094, -0.013232392258942127, -0.0423583909869194, -0.009549054317176342, 0.022615697234869003, -0.024936199188232422, 0.022173695266246796, -0.04449472576379776, -0.11801415681838989, -0.011455181986093521, 0.019098108634352684, -0.02326028048992157, 0.0030663791112601757, -0.05860191211104393, -0.016501355916261673, -0.025120366364717484, 0.00929122045636177, -0.05075640231370926, -0.014724144712090492, 0.006666842382401228, -0.020258359611034393, 0.06431108713150024, 0.020387277007102966, 0.01872977428138256, -0.008319740183651447, 0.005000131670385599, -0.0027578994631767273, 0.06335341930389404, -0.027514535933732986, 0.0009075975394807756, 0.0031538584735244513, 0.034457627683877945, -0.03620721399784088, -0.013057434000074863, 0.060885582119226456, -0.01675918884575367, -0.02114236168563366, -0.001042845193296671, 0.014991186559200287, ...),\n                         (-0.043655842542648315, -0.045805998146533966, -0.027798444032669067, 0.03457527607679367, -0.05348512902855873, -0.007285573054105043, 0.0044682929292321205, 0.01751801185309887, -0.01395681593567133, 0.04116012901067734, -0.004204322583973408, 0.00277648470364511, 0.030409347265958786, -0.009301343932747841, 0.029411060735583305, -0.012785748578608036, 0.015646222978830338, -0.018449105322360992, -0.01220021490007639, -0.06216254457831383, -0.004878646228462458, 0.013188902288675308, -0.051411762833595276, -0.024266045540571213, 0.09537477046251297, 0.07725203037261963, -0.056326404213905334, -0.0725293681025505, 0.029391862452030182, -0.04991433396935463, -0.029161488637328148, 0.026147430762648582, 0.03413372486829758, -0.0320027656853199, -0.04688107967376709, -0.05571207404136658, -0.044961296021938324, 0.00207576435059309, -0.05974361672997475, -0.013841629028320312, 0.04519166797399521, 0.03167640417814255, -0.02666577324271202, -0.08692773431539536, 0.012814545072615147, -0.06788349151611328, 0.0012106625363230705, -0.05671036243438721, 0.0511045977473259, 0.028144005686044693, 0.003100448055192828, 0.0016054176958277822, 0.0038395640440285206, -0.039777882397174835, -0.013428875245153904, 0.04772578179836273, -0.005408986005932093, 0.010309229604899883, -0.05302438139915466, 0.04979914799332619, -0.040776170790195465, -0.03486324101686478, 0.007251976523548365, -0.028873521834611893, 0.020234502851963043, 0.031234854832291603, -0.0019161824602633715, 0.00942612998187542, -0.0028484766371548176, -0.030601326376199722, -0.0169132798910141, 0.01937060058116913, 0.014916706830263138, 0.02142476849257946, -0.06949611008167267, 0.01369764469563961, -0.03730136528611183, -0.012948930263519287, 0.006954410579055548, 0.05191090703010559, 0.027376092970371246, 0.02420845255255699, 0.05893730744719505, -0.0037843703757971525, 0.035957518965005875, 0.058783724904060364, -0.005005831830203533, 0.04565241560339928, 0.03981627896428108, -0.06346799433231354, -0.030082985758781433, -0.010117251425981522, -0.00863901898264885, -0.020618459209799767, 0.005951324477791786, -0.0018597888993099332, -0.05743987858295441, -0.00521220825612545, -0.00845663994550705, 0.024861177429556847, ...),\n                        (-0.04950153827667236, -0.011154876090586185, -0.010246590711176395, 0.03985100984573364, -0.07587964832782745, -0.05960620939731598, 0.018998296931385994, 0.050826121121644974, 0.010596659034490585, 0.015034010633826256, -0.01611259952187538, 0.026283498853445053, -0.030730312690138817, -0.008586132898926735, 0.07228435575962067, -0.03860211744904518, 0.0109088821336627, -0.00961268413811922, -0.02015257626771927, -0.025980737060308456, -0.036482784897089005, -0.036974772810935974, -0.006533029489219189, -0.035650189965963364, 0.04598193243145943, 0.06566144526004791, -0.06074156612157822, -0.026321344077587128, 0.025621209293603897, 0.005449710413813591, -0.01210100669413805, 0.025280602276325226, 0.03898056969046593, 0.010965649969875813, -0.03057893179357052, -0.026340266689658165, -0.039056260138750076, 0.005364558659493923, -0.03322809562087059, -0.05283191427588463, 0.0037206574343144894, 0.06316366046667099, 0.0012169601395726204, -0.05252915248274803, -0.009423458017408848, -0.0487067885696888, -0.05146948993206024, -0.05752472206950188, 0.033057793974876404, 0.019414594396948814, 0.008884163573384285, -0.008898355066776276, 0.009106503799557686, -0.0348554402589798, -0.0509396567940712, 0.020814865827560425, -0.0030725582037121058, -0.03795874863862991, -0.03126014396548271, 0.03572588041424751, -0.06108217313885689, 0.03330378606915474, 0.040759291499853134, -0.04590624198317528, 0.07277633994817734, 0.011646864004433155, 0.0027958150021731853, -0.033398400992155075, -0.01937674917280674, -0.05211285501718521, -0.03818581998348236, 0.008174565620720387, -0.005951159633696079, -0.01633967086672783, -0.06331504136323929, 0.02707824856042862, -0.03226304426789284, -0.0482526458799839, -0.010454739443957806, -0.00933830626308918, 0.007616349030286074, -0.015242159366607666, 0.06395840644836426, -0.01644374430179596, 0.0031719019170850515, 0.04980430006980896, -0.025848280638456345, -5.6324319302802905e-05, 0.01849684678018093, -0.007550119888037443, -0.002202118281275034, -0.0042504905723035336, -0.009068658575415611, -0.004647864960134029, 0.03273610770702362, 0.04087282717227936, -0.03182782232761383, 0.01958489790558815, 0.023274805396795273, -0.007029748056083918, ...),\n                    (-0.05637377128005028, 0.014122421853244305, 0.013803653419017792, 0.025675375014543533, -0.0605853833258152, -0.02924944832921028, -0.011437037028372288, 0.020111409947276115, -0.017590239644050598, 0.03438838571310043, -0.003151463810354471, 0.03044724650681019, 0.017058957368135452, -0.010171621106564999, 0.060005802661180496, -0.024033231660723686, 0.02212061919271946, -0.00808513443917036, -0.038252249360084534, -0.006785910576581955, 0.007703578099608421, 0.00538043025881052, -0.005829604342579842, -0.014663362875580788, 0.05791931599378586, 0.10486526042222977, -0.04520720615983009, -0.03220530226826668, 0.04818237945437431, -0.013330330140888691, -0.03695785626769066, 0.009234634228050709, 0.0238400399684906, -0.03647487238049507, 0.005438388325273991, -0.06549248844385147, -0.01315645594149828, -0.02503783628344536, -0.03535435348749161, 0.02416846714913845, -0.007143318187445402, 0.04949609562754631, -0.00018142047338187695, -0.0824548527598381, 0.03933413326740265, -0.07607947289943695, -0.008394244126975536, -0.060392189770936966, 0.03968188166618347, 0.01843062974512577, 0.017339088022708893, -0.048414211720228195, -0.011127927340567112, -0.019898898899555206, 0.00956789217889309, 0.01731976866722107, 0.00286891870200634, -0.03790450468659401, -0.04064784571528435, 0.04729369282722473, -0.08454133570194244, 0.007037061732262373, -0.0011917604133486748, -0.013137136586010456, 0.08067747205495834, 0.016073672100901604, -0.003064526943489909, -0.010548347607254982, -0.01305019948631525, -0.07781821489334106, -0.019454553723335266, 0.030485885217785835, 0.011784784495830536, -0.023028627038002014, -0.08794153481721878, 0.005423898808658123, -0.03761471435427666, -0.0007739802240394056, -0.0015274336328729987, 0.030196094885468483, 0.038426123559474945, -0.012586535885930061, 0.052046243101358414, -0.012943943031132221, 0.003960460424423218, -0.014576425775885582, -0.02289339154958725, 0.019019868224859238, 0.050346143543720245, -0.0022108545526862144, -0.029983581975102425, -0.03786586597561836, 0.0065492489375174046, -0.03724764660000801, 0.005433558486402035, 0.03597257286310196, -0.019898898899555206, -0.001972985453903675, 0.014054804109036922, 0.007181956432759762, ...),\n                                  (-0.06208082288503647, -0.0639725923538208, 0.01649504527449608, 0.02966984547674656, -0.08014912158250809, -0.06567131727933884, -0.009960728697478771, 0.04934035614132881, -0.02893630415201187, 0.02656194381415844, 0.0163792222738266, 0.015925584360957146, 0.015317516401410103, -0.016832860186696053, 0.06486055999994278, -0.01153398398309946, 0.048761241137981415, -0.02341543510556221, -0.022353729233145714, -0.04999668151140213, -0.027951812371611595, -0.04158025234937668, 0.022604677826166153, -0.011485724709928036, 0.06154031679034233, 0.06069095432758331, -0.036754317581653595, -0.06574853509664536, 0.045132141560316086, -0.025673972442746162, -0.024631569162011147, 0.0069493455812335014, -0.030963195487856865, -0.024785999208688736, -0.02036544308066368, -0.06076816841959953, 0.02644612081348896, -0.016726689413189888, -0.05937829986214638, -0.02080943062901497, 0.027295485138893127, 0.031021106988191605, -0.03227585181593895, -0.07347002625465393, 0.02633029967546463, -0.04984225332736969, -0.060613736510276794, -0.028801176697015762, 0.0266970694065094, 0.025172075256705284, 0.04702390730381012, -0.005660821218043566, 0.0017494013300165534, -0.006920390296727419, 0.0011986414901912212, 0.04953339323401451, -0.021253416314721107, -0.0295347198843956, -0.06103841960430145, 0.02046196348965168, -0.06930042058229446, -0.027198966592550278, 0.03320242837071419, -0.04918592423200607, 0.02059708908200264, 0.07401053607463837, 0.010057247243821621, -0.02009519189596176, -0.021736009046435356, -0.021504364907741547, -0.01176562812179327, 0.03837583214044571, -0.01896592229604721, 0.018377158790826797, -0.03472742438316345, 0.06389537453651428, -0.03517141193151474, -0.012595688924193382, 0.01045297458767891, 0.06428144872188568, 0.028009723871946335, 0.005255442578345537, 0.01626339927315712, -0.06208082288503647, 0.03905146196484566, 0.02625308372080326, -0.0473327673971653, -0.009415398351848125, 0.034129008650779724, -0.03465021029114723, -0.024612266570329666, -0.01011515874415636, 0.013802172616124153, -0.019680161029100418, 0.024361317977309227, -0.0036411676555871964, -0.021427148953080177, -0.0014019339578226209, -0.00725820567458868, -0.003421587636694312, ...)],\n      dtype='object')] are in the [columns]"

In [10]:
pd_vectors = load_dataset(DATASET_NAME)
pd_vectors.head(5)

Unnamed: 0,speaker,title,videoId,start,seconds,summary,ada_v2
0,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:00:00,0,Join Seth Juarez as he discusses ethical conce...,"[0.004357332363724, -0.028409153223037, 0.0111..."
1,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:03:07,187,"In this video, the speaker discusses the chall...","[-0.0038613036740570003, -0.004626247566193000..."
2,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:06:13,373,The video discusses the limitations of general...,"[0.00287682027556, -0.012365541420876001, 0.02..."
3,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:09:21,561,The video discusses the importance of consider...,"[0.015913352370262, 0.000721095071639, 0.02349..."
4,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:12:24,744,The video discusses the importance of understa...,"[5.447878720588051e-06, -0.011837740428745, 0...."
