A Simple RAG implementation from scratch

In [24]:
corpus_of_documents = [
    "Take a leisurely walk in the park and enjoy the fresh air.",
    "Visit a local museum and discover something new.",
    "Attend a live music concert and feel the rhythm.",
    "Go for a hike and admire the natural scenery.",
    "Have a picnic with friends and share some laughs.",
    "Explore a new cuisine by dining at an ethnic restaurant.",
    "Take a yoga class and stretch your body and mind.",
    "Join a local sports league and enjoy some friendly competition.",
    "Attend a workshop or lecture on a topic you're interested in.",
    "Visit an amusement park and ride the roller coasters.",
    "yoga is very good for living healthy lifesytle."
]
corpus_of_documents

['Take a leisurely walk in the park and enjoy the fresh air.',
 'Visit a local museum and discover something new.',
 'Attend a live music concert and feel the rhythm.',
 'Go for a hike and admire the natural scenery.',
 'Have a picnic with friends and share some laughs.',
 'Explore a new cuisine by dining at an ethnic restaurant.',
 'Take a yoga class and stretch your body and mind.',
 'Join a local sports league and enjoy some friendly competition.',
 "Attend a workshop or lecture on a topic you're interested in.",
 'Visit an amusement park and ride the roller coasters.',
 'yoga is very good for living healthy lifesytle.']

In [2]:
from collections import Counter
import math

In [21]:
def cosine_similarity(query, document):
    query_tokens = query.lower().split(" ")
    document_tokens = document.lower().split(" ")

    query_counter = Counter(query_tokens)
    document_counter = Counter(document_tokens)
    print(f"Query Counter: {query_counter}")
    print(f"Document Counter: {document_counter}")

    dot_product = sum(query_counter[token] * document_counter[token] for token in query_counter.keys() & document_counter.keys())
    print(f"Dot Product: {dot_product}")

    query_magnitude = math.sqrt(sum(count ** 2 for count in query_counter.values()))
    document_magnitude = math.sqrt(sum(count ** 2 for count in document_counter.values()))

    cosine_similarity = dot_product / (query_magnitude * document_magnitude) if query_magnitude and document_magnitude else 0.0
    print(f"Cosine Similarity: {cosine_similarity}")

    return cosine_similarity





In [None]:
query = "is yoga good for health"
document = "yoga is very good for living healthy lifesytle."
cosine_similarity(query, document)


Query Counter: Counter({'is': 1, 'yoga': 1, 'good': 1, 'for': 1, 'health': 1})
Document Counter: Counter({'yoga': 1, 'is': 1, 'very': 1, 'good': 1, 'for': 1, 'living': 1, 'healthy': 1, 'lifesytle.': 1})
Dot Product: 4
Cosine Similarity: 0.6324555320336759


0.6324555320336759

In [25]:
most_similar_document = None
highest_similarity = 0.0
for document in corpus_of_documents:
    similarity = cosine_similarity(query, document)
    if similarity > highest_similarity:
        highest_similarity = similarity
        most_similar_document = document

print(f"Most similar document: {most_similar_document}")
print(f"Highest similarity score: {highest_similarity}")

Query Counter: Counter({'is': 1, 'yoga': 1, 'good': 1, 'for': 1, 'health': 1})
Document Counter: Counter({'the': 2, 'take': 1, 'a': 1, 'leisurely': 1, 'walk': 1, 'in': 1, 'park': 1, 'and': 1, 'enjoy': 1, 'fresh': 1, 'air.': 1})
Dot Product: 0
Cosine Similarity: 0.0
Query Counter: Counter({'is': 1, 'yoga': 1, 'good': 1, 'for': 1, 'health': 1})
Document Counter: Counter({'visit': 1, 'a': 1, 'local': 1, 'museum': 1, 'and': 1, 'discover': 1, 'something': 1, 'new.': 1})
Dot Product: 0
Cosine Similarity: 0.0
Query Counter: Counter({'is': 1, 'yoga': 1, 'good': 1, 'for': 1, 'health': 1})
Document Counter: Counter({'attend': 1, 'a': 1, 'live': 1, 'music': 1, 'concert': 1, 'and': 1, 'feel': 1, 'the': 1, 'rhythm.': 1})
Dot Product: 0
Cosine Similarity: 0.0
Query Counter: Counter({'is': 1, 'yoga': 1, 'good': 1, 'for': 1, 'health': 1})
Document Counter: Counter({'go': 1, 'for': 1, 'a': 1, 'hike': 1, 'and': 1, 'admire': 1, 'the': 1, 'natural': 1, 'scenery.': 1})
Dot Product: 1
Cosine Similarity: 0.1

In [37]:
prompt = """
"You are an intelligent assistant that recommends activities based on user input. Compare the user's query with a corpus of documents and identify the most relevant activity. Respond politely and concisely.

User Input: {user_input}
Most Relevant Activity (from corpus): {relevant_document}
Based on the user's input and the relevant activity, provide a short and helpful recommendation."

"""

In [26]:
import requests
import json

In [40]:
url = 'http://localhost:11434/api/generate'
query = "I like spending time outdoors and staying active. What are some good activities for me?"
data = {
    "model": "llama3.1:latest",
    "prompt": prompt.format(user_input=query, relevant_document=corpus_of_documents)
}
headers = {'Content-Type': 'application/json'}

response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)

In [41]:
full_response = []
try:
    for line in response.iter_lines():
        # filter out keep-alive new lines
        if line:
            decoded_line = json.loads(line.decode('utf-8'))
            # print(decoded_line['response'])  # uncomment to results, token by token
            full_response.append(decoded_line['response'])
finally:
    response.close()
    
    
print(''.join(full_response))

Based on your love of spending time outdoors and staying active, I highly recommend taking a hike to admire the natural scenery! It's an excellent way to get some exercise while enjoying the beauty of nature. Give it a try and see where the trail takes you!
