### Retrival Augmentented Generation

#### Vector embeddings

In [3]:
%pip install langchain_cohere -q
%pip install spacy -q 

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
# now you need to run this in a terminal window
# python -m spacy download en_core_web_md
# now restart your kernel

Standard imports for the libraires we will be using in this notebook.  Try to keep your imports in the first cell so this can this code can more easliy be converted into a python program later

In [30]:
import boto3
import pandas as pd
import json
import numpy as np
from langchain.embeddings import BedrockEmbeddings
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_cohere import CohereEmbeddings

# Create the AWS client for the Bedrock runtime with boto3
aws_client = boto3.client(service_name="bedrock-runtime")

#### Lets define functions that will use various embedding models so we can generate vector embeddings
Spacey

In [2]:
def generate_spacy_vector_embedding(text):
    embedder = SpacyEmbeddings(model_name="en_core_web_md")
    query_embedding = embedder.embed_query(text)

    return(np.array(query_embedding))

Huggingface Bert

In [3]:
def generate_huggingface_vector_embedding(text):
    embeddings = HuggingFaceEmbeddings(model_name="bert-base-uncased")
    embedded_texts = embeddings.embed_query(text_data)
    return(embedded_texts)

Cohere

In [4]:
# send in an array size of one and only return the 0th element
def generate_cohere_vector_embedding(text_data):
    input_type = "clustering"
    truncate = "NONE" # optional
    model_id = "cohere.embed-english-v3" # or "cohere.embed-multilingual-v3"
    
    # Create the JSON payload for the request
    json_params = {
            'texts': [text_data],
            'truncate': truncate, 
            "input_type": input_type
        }
    json_body = json.dumps(json_params)
    params = {'body': json_body, 'modelId': model_id,}
    
    # Invoke the model and print the response
    result = aws_client.invoke_model(**params)
    response = json.loads(result['body'].read().decode())
    return(np.array(response['embeddings'][0]))


In [5]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    similarity = dot_product / (norm_vec1 * norm_vec2)
    return similarity



Amazon Titan

In [6]:
# Let's generate a dense vector using Amazon Titan with LangChain
def generate_titan_vector_embedding(text):
    #create an Amazon Titan Text Embeddings client
    embeddings_client = BedrockEmbeddings(region_name="us-west-2") 

    #Invoke the model
    embedding = embeddings_client.embed_query(text)
    return(np.array(embedding))



Now let's generate vectors for simple words and compare how we can compute cosine similarity

In [7]:
#Titan
king_vector = generate_titan_vector_embedding("King")
queen_vector = generate_titan_vector_embedding("Queen")
man_vector = generate_titan_vector_embedding("man")
woman_vector = generate_titan_vector_embedding("woman")
print(king_vector[:5])

[-0.25       -0.91015625  0.55078125  0.59375     0.5546875 ]


In [8]:
calculated_queen_vector = king_vector - man_vector + woman_vector

similarity = cosine_similarity(calculated_queen_vector, queen_vector)
print(f"Cosine Similarity distance between Titan King - Queen: {similarity:.4f}")

similarity = cosine_similarity(king_vector, queen_vector)
print(f"Cosine Similarity of Titan King to Queen: {similarity:.4f}")

Cosine Similarity distance between Titan King - Queen: 0.6676
Cosine Similarity of Titan King to Queen: 0.6291


In [9]:
# Input cohere for embedding 
king_vector = generate_cohere_vector_embedding('King')
queen_vector = generate_cohere_vector_embedding("Queen")
man_vector = generate_cohere_vector_embedding("man")
woman_vector = generate_cohere_vector_embedding("woman")
print(king_vector[:5])


[ 0.02391052 -0.02467346  0.02523804 -0.04049683 -0.06323242]


In [10]:
calculated_queen_vector = king_vector - man_vector + woman_vector

similarity = cosine_similarity(calculated_queen_vector, queen_vector)
print(f"Cosine Similarity distance between Cohere King - Queen: {similarity:.4f}")

similarity = cosine_similarity(king_vector, queen_vector)
print(f"Cosine Similarity of Cohere King to Queen: {similarity:.4f}")

Cosine Similarity distance between Cohere King - Queen: 0.7226
Cosine Similarity of Cohere King to Queen: 0.7328


In [11]:
#Spacy
king_vector = generate_spacy_vector_embedding("King")
queen_vector = generate_spacy_vector_embedding("Queen")
man_vector = generate_spacy_vector_embedding("man")
woman_vector = generate_spacy_vector_embedding("woman")
print(king_vector[:5])

[ 0.19343001 -4.19689989  4.81750011 -0.72863001  2.31769991]


In [12]:
calculated_queen_vector = king_vector - man_vector + woman_vector

similarity = cosine_similarity(calculated_queen_vector, queen_vector)
print(f"Cosine Similarity distance between Spacey King - Queen: {similarity:.4f}")

similarity = cosine_similarity(king_vector, queen_vector)
print(f"Cosine Similarity of Spacey King to Queen: {similarity:.4f}")

Cosine Similarity distance between Spacey King - Queen: 0.6892
Cosine Similarity of Spacey King to Queen: 0.6593


Let's examine other phrases

In [25]:
similarity = cosine_similarity(generate_titan_vector_embedding("cat"), generate_titan_vector_embedding("book"))
print(f"Cosine Similarity of cat to book using Titan: {similarity:.4f}")

Cosine Similarity of cat to book using Titan: 0.3514


In [14]:
similarity = cosine_similarity(generate_cohere_vector_embedding("cat"), generate_cohere_vector_embedding("book"))
print(f"Cosine Similarity of cat to book using Cohere: {similarity:.4f}")

Cosine Similarity of cat to book using Cohere: 0.5575


In [24]:
similarity = cosine_similarity(generate_spacy_vector_embedding("cat"), generate_spacy_vector_embedding("book"))
print(f"Cosine Similarity of cat to book using Spacey: {similarity:.4f}")

Cosine Similarity of cat to book using Spacey: 0.0696


Now let's look at a larger sentences and see how larger models with more complexity handle the same task
Here are 2 sentences that semantically similar but use different words and phrasing.

1. The majestic, towering skyscrapers, their gleaming windows reflecting the golden rays of the setting sun, stood as a testament to human ingenuity and the 
 indomitable spirit of progress, while the bustling streets below teemed with life as people from all walks of life hurried to their destinations, their faces 
 a mix of determination and weariness, yet each individual contributing to the vibrant tapestry of the city's existence.

2. The awe-inspiring, colossal high-rises, their polished glass facades mirroring the warm, amber glow of the fading daylight, served as a powerful symbol of human
 innovation and the unyielding drive for advancement, as the lively thoroughfares beneath pulsed with energy, filled with individuals from diverse backgrounds rushing
 to their intended locations, their expressions an amalgamation of resolve and fatigue, yet all playing a vital role in the dynamic, intricate mosaic that shaped the 
 city's vibrant identity.


In [19]:
sentence1 = "The majestic, towering skyscrapers, their gleaming windows reflecting the golden rays of the setting sun, stood as a testament to human ingenuity and the indomitable spirit of progress, while the bustling streets below teemed with life as people from all walks of life hurried to their destinations, their faces a mix of determination and weariness, yet each individual contributing to the vibrant tapestry of the city's existence."
sentence2 = "The awe-inspiring, colossal high-rises, their polished glass facades mirroring the warm, amber glow of the fading daylight, served as a powerful symbol of human innovation and the unyielding drive for advancement, as the lively thoroughfares beneath pulsed with energy, filled with individuals from diverse backgrounds rushing to their intended locations, their expressions an amalgamation of resolve and fatigue, yet all playing a vital role in the dynamic, intricate mosaic that shaped the city's vibrant identity."
similarity = cosine_similarity(generate_spacy_vector_embedding(sentence1), generate_spacy_vector_embedding(sentence2))
print(f"Cosine Similarity of S1 to S2 using Spacey: {similarity:.4f}")

Cosine Similarity of S1 to S2 using Spacey: 0.9776


In [20]:
sentence1 = "The majestic, towering skyscrapers, their gleaming windows reflecting the golden rays of the setting sun, stood as a testament to human ingenuity and the indomitable spirit of progress, while the bustling streets below teemed with life as people from all walks of life hurried to their destinations, their faces a mix of determination and weariness, yet each individual contributing to the vibrant tapestry of the city's existence."
sentence2 = "The awe-inspiring, colossal high-rises, their polished glass facades mirroring the warm, amber glow of the fading daylight, served as a powerful symbol of human innovation and the unyielding drive for advancement, as the lively thoroughfares beneath pulsed with energy, filled with individuals from diverse backgrounds rushing to their intended locations, their expressions an amalgamation of resolve and fatigue, yet all playing a vital role in the dynamic, intricate mosaic that shaped the city's vibrant identity."
similarity = cosine_similarity(generate_titan_vector_embedding(sentence1), generate_titan_vector_embedding(sentence2))
print(f"Cosine Similarity of S1 to S2 using Titan: {similarity:.4f}")

Cosine Similarity of S1 to S2 using Titan: 0.8726


In [21]:
sentence1 = "The majestic, towering skyscrapers, their gleaming windows reflecting the golden rays of the setting sun, stood as a testament to human ingenuity and the indomitable spirit of progress, while the bustling streets below teemed with life as people from all walks of life hurried to their destinations, their faces a mix of determination and weariness, yet each individual contributing to the vibrant tapestry of the city's existence."
sentence2 = "The awe-inspiring, colossal high-rises, their polished glass facades mirroring the warm, amber glow of the fading daylight, served as a powerful symbol of human innovation and the unyielding drive for advancement, as the lively thoroughfares beneath pulsed with energy, filled with individuals from diverse backgrounds rushing to their intended locations, their expressions an amalgamation of resolve and fatigue, yet all playing a vital role in the dynamic, intricate mosaic that shaped the city's vibrant identity."
similarity = cosine_similarity(generate_cohere_vector_embedding(sentence1), generate_cohere_vector_embedding(sentence2))
print(f"Cosine Similarity of S1 to S2 using Cohere: {similarity:.4f}")

Cosine Similarity of S1 to S2 using Cohere: 0.8156


#### Conclusions: 
 Bigger models aren't necessarily more capable for all tasks.  On simple sentences Spacey appears to be the best model for similarity comparision but depending on the task, other models will likey perform better.  Here is a list of benchmarks for large embedded tasks.

 [Massive Text Embedding Benchmark](https://huggingface.co/spaces/mteb/leaderboard)

#### Persistance of embeddings for later retrieval
 In order to do semantic search and retrieve relevant content we need to store that content for later use.  We can store the embedding in several different persistence technologies.  To start simply let's store the data in memory using pandas dataframe.

In [41]:
def clean_value(value):
    value_str = str(value)
    cleaned_value = ''.join(char for char in value_str if char.isalnum())
    return cleaned_value

In [None]:
# Read the CSV file into a DataFrame
#df = pd.read_csv('data/latest_research_articles.csv')
#df['abstract'] = df['abstract'].apply(clean_value)
# Add a new column 'square_root' by applying the function to an existing column
df['embedded_abstract'] = df['abstract'].apply(generate_spacy_vector_embedding)

# Display the updated DataFrame
#df.info

In [40]:
for index, row in df.iterrows():
    # Extract the value from the specified column
    value = row['abstract']
    #print(type(value))
    # Convert the value to a string
    value_str = str(value)
    cleaned_value = ''.join(char for char in value_str if char.isalnum())
    # Print or process the value as needed
    print(cleaned_value)

<class 'str'>
AccurateestimatesofthereproductionratioarecrucialforprojectingtheevolutionofaninfectiousdiseaseepidemicandforguidingthepublichealthresponseHereweprovethatestimatesofthereproductionratiobasedoninferencefromsurveillancedatacanbeinaccurateifthepopulationcomprisesspatiallydistinctcommunitiesasthespacemobilityinterplaymayhidethetrueevolutionoftheepidemicfromsurveillancedataConsequentlysurveillancemayunderestimatethereproductionratiooverlongperiodsevenmistakinggrowingepidemicsassubsidingToaddressthisweusethespectralpropertiesofthematrixdescribingthespatialepidemicspreadtoreweightsurveillancedataWeproposeacorrectionthatremovesthebiasacrossallepidemicphasesWevalidatethiscorrectionagainstsimulatedepidemicsanduseCOVID19asacasestudyHoweverourresultsapplytoanyepidemicinwhichmobilityisadriverofcirculationOurfindingsmayhelpimproveepidemicmonitoringandsurveillanceandinformstrategiesforpublichealthresponsesSpatialdynamicscanobscureepidemictrendsfromsurveillancedatabiasingreproductionrati

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
df