# RAG From Scratch - Part 2: TF-IDF and Cosine Similarity to Improve Similarity Search

In [None]:
!pip install scikit-learn
!pip install pinecone-client
!pip install sentence-transformers

In [34]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import requests

# Identifier of the sentence-transformers model to load
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# Instantiate the encoder used to turn text into embedding vectors
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

![Bad Similarity Problems in Retrieval Augmented Generation](images/a-key-challenge-of-retrieval-augmented-generation-systems-semantics.jpg)

In [35]:

# Define the corpus of documents.
# Each entry is one short activity suggestion; the list position of an entry is
# the index used later to map similarity scores back to their text.
corpus_of_documents = [
    "Take a leisurely walk in the park and enjoy the fresh air.",
    "Visit a local museum and discover something new.",
    "Attend a live music concert and feel the rhythm.",
    "Go for a hike and admire the natural scenery.",
    "Have a picnic with friends and share some laughs.",
    "Explore a new cuisine by dining at an ethnic restaurant.",
    "Take a yoga class and stretch your body and mind.",
    "Join a local sports league and enjoy some friendly competition.",
    "Attend a workshop or lecture on a topic you're interested in.",
    "Visit an amusement park and ride the roller coasters."
]


# Generate embeddings for the documents: the whole corpus is encoded in one call,
# producing one embedding row per document (same order as the list above).
doc_embeddings = model.encode(corpus_of_documents)

In [36]:
# Question that gets embedded and compared against the document embeddings for retrieval
query = "What's the best outside activity?"

In [37]:
# Display the embedding matrix (the repr is abbreviated with "..." for large arrays)
doc_embeddings

array([[ 0.07121077, -0.01088003,  0.11746485, ...,  0.01414924,
        -0.13175762, -0.00402598],
       [ 0.04881528, -0.03166641,  0.07468717, ..., -0.0627827 ,
        -0.11120284,  0.03045147],
       [ 0.05019967, -0.09127751,  0.08517756, ...,  0.01286453,
        -0.07415231, -0.06140357],
       ...,
       [ 0.05416266, -0.03030902,  0.02475943, ..., -0.01272294,
        -0.06512289,  0.05848261],
       [-0.00401894, -0.04562395, -0.00900753, ...,  0.03939738,
        -0.12731643,  0.05255723],
       [ 0.0504604 ,  0.0143044 ,  0.08787955, ..., -0.01778724,
        -0.05246406, -0.02887336]], dtype=float32)

In [38]:
# Embed the query (wrapped in a list so the result is a batch of one),
# then score it against every document embedding.
query_embedding = model.encode([query])
similarities = cosine_similarity(query_embedding, doc_embeddings)

In [39]:
# The result is 2-D; row 0 holds the scores of our single query against each document
similarities[0]

array([0.502352  , 0.32826388, 0.31544408, 0.50193346, 0.44371974,
       0.18485212, 0.21045846, 0.25540656, 0.2216403 , 0.45777753],
      dtype=float32)

In [40]:
# Pair every similarity score with the position of its document in the corpus
indexed = [(doc_position, score) for doc_position, score in enumerate(similarities[0])]

In [41]:
# Show the (document index, score) pairs, still in corpus order
indexed

[(0, 0.502352),
 (1, 0.32826388),
 (2, 0.31544408),
 (3, 0.50193346),
 (4, 0.44371974),
 (5, 0.18485212),
 (6, 0.21045846),
 (7, 0.25540656),
 (8, 0.2216403),
 (9, 0.45777753)]

In [42]:
# Rank the (index, score) pairs from most to least similar, leaving `indexed` untouched
sorted_index = list(indexed)
sorted_index.sort(key=lambda pair: pair[1], reverse=True)

In [43]:
# Show the (document index, score) pairs ordered by descending score
sorted_index

[(0, 0.502352),
 (3, 0.50193346),
 (9, 0.45777753),
 (4, 0.44371974),
 (1, 0.32826388),
 (2, 0.31544408),
 (7, 0.25540656),
 (8, 0.2216403),
 (6, 0.21045846),
 (5, 0.18485212)]

In [44]:
# Minimum cosine similarity a document must exceed to be passed on as context
SIMILARITY_THRESHOLD = 0.3

# Print every document with its score (best first) and keep only those above the threshold
recommended_documents = []
for doc_index, score in sorted_index:
    document = corpus_of_documents[doc_index]
    score_text = "{:.2f}".format(score)
    print(f"{score_text} => {document}")
    if score > SIMILARITY_THRESHOLD:
        recommended_documents.append(document)

0.50 => Take a leisurely walk in the park and enjoy the fresh air.
0.50 => Go for a hike and admire the natural scenery.
0.46 => Visit an amusement park and ride the roller coasters.
0.44 => Have a picnic with friends and share some laughs.
0.33 => Visit a local museum and discover something new.
0.32 => Attend a live music concert and feel the rhythm.
0.26 => Join a local sports league and enjoy some friendly competition.
0.22 => Attend a workshop or lecture on a topic you're interested in.
0.21 => Take a yoga class and stretch your body and mind.
0.18 => Explore a new cuisine by dining at an ethnic restaurant.


## Adding in our LLM: Llama 2

In [45]:
import requests
import json

In [61]:

# Prompt template for the LLM. The {recommended_activities} and {user_input}
# placeholders are filled in later with str.format.
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.

These are potential activities:

{recommended_activities}


The user's query is: {user_input}

Provide the user with 2 recommended activities based on their query.
"""

# Collapse the retrieved documents into one newline-separated string for the prompt
recommended_activities = "\n".join(recommended_documents)

In [62]:
# Inspect the context block that will be inserted into the prompt
print(recommended_activities)

Take a leisurely walk in the park and enjoy the fresh air.
Go for a hike and admire the natural scenery.
Visit an amusement park and ride the roller coasters.
Have a picnic with friends and share some laughs.
Visit a local museum and discover something new.
Attend a live music concert and feel the rhythm.


In [63]:
# Message from the user that is inserted into the prompt.
# NOTE: the retrieved documents were selected using `query`, not this input.
user_input = "I like to hike"

In [64]:
# Fill both placeholders of the template to obtain the final prompt text
full_prompt = prompt.format(
    recommended_activities=recommended_activities,
    user_input=user_input,
)

In [65]:
# Text-generation endpoint of a locally running LLM server
# (port and path look like Ollama's defaults - adjust for your setup).
url = 'http://localhost:11434/api/generate'
data = {
    "model": "llama2",
    "prompt": full_prompt
}

headers = {'Content-Type': 'application/json'}

# The server streams its answer as newline-delimited JSON objects; collect the
# text fragment carried by each one.
full_response = []

# timeout=(connect, read): fail fast if the server is not running, but allow a
# generous gap between chunks so slow generation is not cut off.
# Using the response as a context manager guarantees the connection is released
# even if parsing fails part-way through the stream.
with requests.post(url, data=json.dumps(data), headers=headers, stream=True, timeout=(10, 300)) as response:
    if not response.ok:
        # Surface the server's own message instead of failing later with a KeyError
        raise RuntimeError(
            f"Generation request failed with HTTP {response.status_code}: {response.text}"
        )

    for line in response.iter_lines():
        # Filter out keep-alive new lines
        if not line:
            continue

        decoded_line = json.loads(line.decode('utf-8'))

        # A chunk may report an error instead of carrying generated text
        if 'error' in decoded_line:
            raise RuntimeError(f"LLM server returned an error: {decoded_line['error']}")

        full_response.append(decoded_line.get('response', ''))

print(''.join(full_response))

 Sure, here are two recommended activities for someone who likes to hike:

1. Go for a hike and admire the natural scenery.
2. Visit an amusement park and ride the roller coasters.


![simplified version of retrieval augmented generation](images/simplified-version-of-retrieval-augmented-generation.jpg)

If you're lucky, the LLM will gracefully handle user input that conflicts with the recommended documents. We can see that below.

1. Pinecone documentation: https://docs.pinecone.io/docs/overview
2. Sentence Transformers documentation: https://www.sbert.net/docs/quickstart.html
3. scikit-learn TF-IDF documentation: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
4. scikit-learn cosine similarity documentation: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html
