<a href="https://colab.research.google.com/github/arockiaranjini/testsep1/blob/main/rag_solution2_week_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
pip install langchain_community



In [13]:
import os
from langchain_community.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [14]:
# Step 1: Loading a File
from langchain_community.document_loaders import TextLoader
loader = TextLoader("/content/sample_data/tennis_details.md")
text_doc = loader.load()
#print(text_doc)
#print(text_doc[0].page_content)

In [15]:
# Step 2: divide the data into chunks
from langchain_text_splitters import MarkdownHeaderTextSplitter
split_condition = [("##", "title")]
splitter = MarkdownHeaderTextSplitter(split_condition)
doc_splits = splitter.split_text(text_doc[0].page_content)
#print(doc_splits)
text_chunks = [split.page_content for split in doc_splits]
print(text_chunks)


['# Tennis', "Tennis is a popular sport played between two players (singles) or two teams of two players each (doubles). The game involves using a racket to hit a ball over a net into the opponent's court.", '- A match can be played as best of three or five sets.\n- Each set consists of games, and each game consists of points.\n- Points are scored as **0 (Love), 15, 30, 40**, and then **game**.\n- A player must win a game by at least **two points**.\n- The ball must land within the designated court boundaries.', '```plaintext\n0 points  -> Love\n1 point   -> 15\n2 points  -> 30\n3 points  -> 40\n4 points  -> Game (if leading by 2)\nDeuce     -> 40-40 (must win two consecutive points to win the game)\nAdvantage -> If a player wins a point at deuce, they gain the advantage\n```', '- **Grand Slam Events**:\n- Australian Open\n- French Open\n- Wimbledon\n- US Open', '- **Racket**: Used to hit the ball.\n- **Tennis Ball**: Yellow-green in color, designed for optimal bounce.\n- **Court**: Ca

In [16]:
print(len(text_chunks))

7


In [17]:
# Step 3: Generate Embeddings each chunck 384 embedding

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
#text embedding as input return vector embedding

def embed_chunk(chunk):
  return embedding_model.encode([chunk], normalize_embeddings = True)

In [18]:
print(text_chunks[1])

Tennis is a popular sport played between two players (singles) or two teams of two players each (doubles). The game involves using a racket to hit a ball over a net into the opponent's court.


In [19]:
sample_embedding = embed_chunk(text_chunks[1]).tolist()[0]

In [20]:
print(sample_embedding)

[0.043128401041030884, 0.013731017708778381, 0.040937382727861404, -0.060120921581983566, -0.11004157364368439, 0.03762723505496979, 0.06258527934551239, 0.058430612087249756, 0.07231482118368149, 0.13938894867897034, -0.08466644585132599, 0.03008531779050827, -0.008123097009956837, 0.01305976789444685, 0.028446480631828308, -0.0328884981572628, 0.01718791201710701, -0.0006705721607431769, 0.03371882066130638, 0.03483973443508148, 0.006430466193705797, -0.06199755147099495, 0.02903241105377674, -0.1022266075015068, -0.023730630055069923, -0.0007567994180135429, -0.04043148085474968, 0.06892861425876617, -0.07998340576887131, 0.03472358360886574, -0.027686500921845436, 0.01230208296328783, -0.03259429708123207, 0.04529410973191261, -0.19071798026561737, 0.003529939102008939, -0.017625831067562103, 0.04938877746462822, -0.021366601809859276, 0.017831694334745407, 0.03800325468182564, -0.031719643622636795, 0.017327267676591873, 0.04867412894964218, 0.007676777429878712, 0.105700135231018

In [21]:
len(sample_embedding)

384

In [5]:
#vector db
!pip install chromadb




In [9]:
!pip install chromadb langchain sentence-transformers




In [10]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

In [22]:
# Step 4: Store embeddings in ChromaDB

#vector_db = Chroma.from_texts(text_chunks, HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"), persist_directory="/tmp/chroma_db")
# Create embedding model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_db = Chroma.from_texts(
    texts=text_chunks,
    embedding=embeddings,
    persist_directory="/tmp/chroma_db"
)


In [23]:
vector_db._collection.get(include=['embeddings','documents'])

{'ids': ['98e4ba5b-f2d5-4888-8f1f-9056a509894b',
  '9e1ed40f-13f1-4b3e-9753-52cefa589bb8',
  'a5551e6d-cc3f-4ebf-ad36-01bd38c3c739',
  '82928a61-83ae-4dea-91b4-d5b7e7522929',
  '525a3687-b3a3-4412-b785-e5dfc74e44ac',
  '7205b2f8-25ba-4b4c-aacf-9edd2baa3de0',
  'a24b283d-2d6c-412e-85f6-5115b5b324d6'],
 'embeddings': array([[ 0.0227568 ,  0.05737348,  0.06708645, ..., -0.09128203,
          0.03132669,  0.02229537],
        [ 0.04312836,  0.013731  ,  0.04093744, ..., -0.01512668,
         -0.00087324,  0.03250713],
        [ 0.03571488,  0.02733695, -0.01912353, ...,  0.03562167,
         -0.01786362,  0.02190564],
        ...,
        [ 0.01810161,  0.02553317,  0.02062308, ..., -0.07975578,
         -0.04529489,  0.0281473 ],
        [ 0.05494464,  0.04179734,  0.03473552, ..., -0.04557597,
          0.05043149,  0.04526827],
        [ 0.02283715,  0.03019896,  0.07116921, ..., -0.01202571,
         -0.01404738,  0.03577407]]),
 'documents': ['# Tennis',
  "Tennis is a popular sport p

In [24]:
#step 5: Set up a LLM
pipe = pipeline("text-generation", model="Qwen/Qwen2.5-1.5B-Instruct")

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [30]:
# Step 6: Retrieval and Generation
def retrieve_and_generate(query, threshold=1):
    """Retrieves relevant context from the vector database and generates an answer."""
    search_results = vector_db.similarity_search_with_score(query, k=1)

    print(search_results)

    if not search_results or search_results[0][1] > threshold:
        return "I don't know the answer. There is no available context in vector DB."

    retrieved_context = search_results[0][0].page_content
    similarity_score = search_results[0][1]
    print(f"Similarity Score: {similarity_score}")
    print(f"Retrieved Context: {retrieved_context}")

    prompt = f"Answer the question using the given context\nContext: {retrieved_context}\nQuestion: {query}\nAnswer: "
    print(prompt)
    response = pipe(prompt, max_new_tokens=100)
    return response[0]["generated_text"]

In [26]:
question = "what is tennis"
response = retrieve_and_generate(question)
print(response)


[(Document(metadata={}, page_content="Tennis is a popular sport played between two players (singles) or two teams of two players each (doubles). The game involves using a racket to hit a ball over a net into the opponent's court."), 0.2799421548843384)]
Similarity Score: 0.2799421548843384
Retrieved Context: Tennis is a popular sport played between two players (singles) or two teams of two players each (doubles). The game involves using a racket to hit a ball over a net into the opponent's court.
Answer the question using the given context
Context: Tennis is a popular sport played between two players (singles) or two teams of two players each (doubles). The game involves using a racket to hit a ball over a net into the opponent's court.
Question: what is tennis
Answer: 
Answer the question using the given context
Context: Tennis is a popular sport played between two players (singles) or two teams of two players each (doubles). The game involves using a racket to hit a ball over a net i

In [33]:
question3 = "what is cricket?"
response = retrieve_and_generate(question3)
print(response)

[(Document(metadata={}, page_content="Tennis is a popular sport played between two players (singles) or two teams of two players each (doubles). The game involves using a racket to hit a ball over a net into the opponent's court."), 1.066832423210144)]
I don't know the answer. There is no available context in vector DB.


In [35]:
question3 = "what is scoring system?"
response = retrieve_and_generate(question3)
print(response)

[(Document(metadata={}, page_content='- A match can be played as best of three or five sets.\n- Each set consists of games, and each game consists of points.\n- Points are scored as **0 (Love), 15, 30, 40**, and then **game**.\n- A player must win a game by at least **two points**.\n- The ball must land within the designated court boundaries.'), 0.9572556018829346)]
Similarity Score: 0.9572556018829346
Retrieved Context: - A match can be played as best of three or five sets.
- Each set consists of games, and each game consists of points.
- Points are scored as **0 (Love), 15, 30, 40**, and then **game**.
- A player must win a game by at least **two points**.
- The ball must land within the designated court boundaries.
Answer the question using the given context
Context: - A match can be played as best of three or five sets.
- Each set consists of games, and each game consists of points.
- Points are scored as **0 (Love), 15, 30, 40**, and then **game**.
- A player must win a game by at

In [36]:
question3 = "what is the basic rules?"
response = retrieve_and_generate(question3)
print(response)

[(Document(metadata={}, page_content='- A match can be played as best of three or five sets.\n- Each set consists of games, and each game consists of points.\n- Points are scored as **0 (Love), 15, 30, 40**, and then **game**.\n- A player must win a game by at least **two points**.\n- The ball must land within the designated court boundaries.'), 1.2696161270141602)]
I don't know the answer. There is no available context in vector DB.
