In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

import os
import tempfile
import uuid
import pandas as pd
import re
from dotenv import load_dotenv


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
load_dotenv()

True

In [3]:
# Read the OpenAI key from the process environment (None when it is not set).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [4]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
llm.invoke("Tell me a joke about cats")

AIMessage(content='Why did the cat sit on the computer?\n\nBecause it wanted to keep an eye on the mouse!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 13, 'total_tokens': 34, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0392822090', 'id': 'chatcmpl-BORAUnzEvGG8l8EYFl2Pe1lDgkajn', 'finish_reason': 'stop', 'logprobs': None}, id='run-3a3a43e6-0495-44bd-a7f6-afea629f2017-0', usage_metadata={'input_tokens': 13, 'output_tokens': 21, 'total_tokens': 34, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## 1. Process PDF

The PDF is loaded and split into pages; these pages are then split into chunks of a bounded size (1500 characters here). Each chunk stores its text together with metadata such as the page number and the document it came from.


In [5]:
loader = PyPDFLoader("../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf")
pages = loader.load()

In [6]:
# Chunk the pages: up to 1500 characters per chunk (measured with len),
# with 200 characters of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", ""],
)
chunks = text_splitter.split_documents(pages)

In [7]:
chunks[:5]

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-17T12:11:44-04:00', 'moddate': '2023-10-17T12:11:49-04:00', 'trapped': '/False', 'source': '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf', 'total_pages': 76, 'page': 0, 'page_label': '1'}, page_content='This Page Intentionally Left Blank  \nIt is here to hold a place for cover for screen version.  \nDO NOT INCLUDE AS PART OF PRINT FILE!\nOFFICIAL\nRULES'),
 Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-17T12:11:44-04:00', 'moddate': '2023-10-17T12:11:49-04:00', 'trapped': '/False', 'source': '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf', 'total_pages': 76, 'page': 1, 'page_label': '2'}, page_content='- 2 -\nRULES INDEX\nCourt Diagram 8\nRule No. 1 – Court Dimensions – Equipment 9\n Section I – Court and Dimensions 9\n Section II – Equipment  9\nRule No. 

In [8]:
def calculate_chunk_ids(chunks):
    """Assign a unique, deterministic ID to every chunk, in place.

    IDs have the form ``"<source>:<page>:<chunk index>"``, for example
    ``"../documents/rules.pdf:6:2"``. The chunk index counts the chunks seen
    so far for that source/page pair, starting at 0.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict. Its
            ``"source"`` and ``"page"`` entries are read (``None`` if missing).

    Returns:
        The same ``chunks`` object, with ``metadata["id"]`` set on every chunk.
    """
    # One counter per page, rather than a comparison with the previous chunk
    # only: chunks of the same page that are not adjacent in the input would
    # otherwise restart at index 0 and end up sharing an ID.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        current_chunk_index = next_index_by_page.get(current_page_id, 0)
        next_index_by_page[current_page_id] = current_chunk_index + 1

        # Store the ID in the chunk's metadata.
        chunk.metadata["id"] = f"{current_page_id}:{current_chunk_index}"

    return chunks

In [9]:
calculate_chunk_ids(chunks)

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-17T12:11:44-04:00', 'moddate': '2023-10-17T12:11:49-04:00', 'trapped': '/False', 'source': '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf', 'total_pages': 76, 'page': 0, 'page_label': '1', 'id': '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:0:0'}, page_content='This Page Intentionally Left Blank  \nIt is here to hold a place for cover for screen version.  \nDO NOT INCLUDE AS PART OF PRINT FILE!\nOFFICIAL\nRULES'),
 Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-17T12:11:44-04:00', 'moddate': '2023-10-17T12:11:49-04:00', 'trapped': '/False', 'source': '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf', 'total_pages': 76, 'page': 1, 'page_label': '2', 'id': '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:1:0'}, page_content='- 2 -\nRULES

In [10]:
len(chunks)

195

## 1.1 Store chunks in a Chroma db

The chunks are embedded with an OpenAI embedding function and stored in a local Chroma database. Each chunk is identified by an id, so that the same chunk is never uploaded to the database twice.

In [11]:
def get_embeddings():
    """Build the OpenAI embedding client.

    Returns:
        An ``OpenAIEmbeddings`` instance for the ``text-embedding-ada-002``
        model, authenticated with ``OPENAI_API_KEY``.
    """
    return OpenAIEmbeddings(
        model="text-embedding-ada-002",
        openai_api_key=OPENAI_API_KEY,
    )
embedding_function = get_embeddings()
#test_vector = embedding_function.embed_query("cat")

In [12]:
# Sanity check of the embedding function: load LangChain's "embedding_distance"
# evaluator with it and score one pair of strings.
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator = "embedding_distance", embeddings = embedding_function)
# The cell output is a dict with a single "score" entry for this pair.
evaluator.evaluate_strings(prediction="Amsterdam", reference = "coffeeshop")

{'score': 0.1745443723078154}

In [13]:
def add_to_chroma(chunks: list, vectorstore_path: str):
    """Add to the Chroma store only the chunks it does not already hold.

    Every chunk is given an ID by ``calculate_chunk_ids``; chunks whose ID is
    already present in the store are skipped, so calling this again with the
    same chunks adds nothing.

    Args:
        chunks: Document chunks to store.
        vectorstore_path: Directory passed to Chroma as ``persist_directory``.
    """
    # Load the existing database.
    db = Chroma(
        persist_directory=vectorstore_path, embedding_function=get_embeddings()
    )

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    existing_items = db.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Only add documents that don't exist in the DB.
    new_chunks = [
        chunk
        for chunk in chunks_with_ids
        if chunk.metadata["id"] not in existing_ids
    ]

    if not new_chunks:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(new_chunks)}")
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    # No separate `source` argument is passed: the source path is already
    # stored in each chunk's metadata.
    db.add_documents(new_chunks, ids=new_chunk_ids)

    # NOTE(review): recent Chroma versions persist automatically and some
    # wrappers no longer expose persist(); call it only when it exists.
    # Confirm against the installed version.
    if hasattr(db, "persist"):
        db.persist()

In [14]:
# Create vectorstore
add_to_chroma(chunks, "football_basketball_data")

  db = Chroma(


Number of existing documents in DB: 482
✅ No new documents to add


## 2. Access to database

Open the Chroma database as `db` and create a retriever from it. The retriever runs a similarity search between the input text and the stored chunks and returns the k most similar chunks (k defaults to 4 in LangChain, which matches the four sources listed below).

In [36]:
db = Chroma(persist_directory="football_basketball_data", embedding_function=get_embeddings())

In [40]:
## Query relevant data
# Plain similarity search; no search_kwargs are given, so k stays at the
# retriever's default.
retriever = db.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("Which document contains rules about basketball?")
# Chunk IDs ("<source>:<page>:<chunk index>") of the retrieved chunks; None if a chunk has no id.
sources = [doc.metadata.get("id", None) for doc in relevant_chunks]

In [41]:
sources

['../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:4:0',
 '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:3:0',
 '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:1:0',
 '../documents/Laws_of_the Game_2024_25.pdf:216:0']

## 3. Prompt creation

The prompt is a copy of the one Than Vu uses in her RAG elements video. It gives the model *instructions* not to make up any information, and asks it to form the answer from a *context* (the k most relevant chunks retrieved from the PDFs) and a *question* (the user's input).

In [31]:
# Prompt template: general question-answering prompt with {context}, {sources}
# and {question} placeholders.
# NOTE: the following cell reassigns PROMPT_TEMPLATE, and the prompt printed
# further down is built from that later version.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING. Always cite the sources where the context comes from.

{context}

---
The context available comes from the following documents: {sources}
Answer the question based on the above context: {question}
"""

In [32]:
# Source-identification prompt. Placeholders: {context} (retrieved chunk text),
# {sources} (where the chunks come from) and {question} (the user's question).
PROMPT_TEMPLATE ="""You are an assistant for document source identification and question-answering tasks.

Use the following retrieved context to determine **which document(s)** most likely contain the answer to the user's question.
If the answer is unclear, say so. DO NOT MAKE UP ANY INFORMATION.
Always mention the source document(s) the information comes from.

Context:
{context}

---
The context above comes from the following documents: {sources}

Question:
{question}

Instructions:
- Identify which document(s) contain the relevant information.
- Mention the document title or source.
- Optionally mention the page number or chunk if available.
- Then briefly explain your reasoning."""


In [42]:
# Join the text of the retrieved chunks into one context block.
context_text = "\n\n---\n\n".join(doc.page_content for doc in relevant_chunks)
question = "Which document contains information about basketball?"

# Fill the template with the context, the chunk sources and the question,
# then print the resulting prompt for inspection.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(
    context=context_text,
    sources=sources,
    question=question,
)
print(prompt)

Human: You are an assistant for document source identification and question-answering tasks.

Use the following retrieved context to determine **which document(s)** most likely contain the answer to the user's question.
If the answer is unclear, say so. DO NOT MAKE UP ANY INFORMATION.
Always mention the source document(s) the information comes from.

Context:
- 5 -
 Section VIII – Eight-Second Rule 36
 Section IX – Ball in Backcourt  36
 Section X – Swinging of Elbows 37
 Section XI – Entering Basket from Below 37
 Section XII – Illegal Assist in Scoring 37
 Section XIII – Traveling  37
 Section XIV – Offensive Screen Set Out-of-Bounds 38
 Section XV – Offensive Player Out-of-Bounds 38
 Section XVI – Five-Second Back-to-the-Basket Violation 38
 Section XVII – Flopping  38
Rule No. 11 – Basket Interference – Goaltending  39
 Section I – A Player Shall Not: 39
Rule No. 12A – Fouls and Penalties (Technical Foul) 39
 Section I – Excessive Timeouts 39
 Section II – Delay of Game 40
 Section

In [43]:
llm.invoke(prompt)

AIMessage(content='The documents that contain information about basketball are:\n\n1. **2023-24 NBA Season Official Playing Rules** (Document: `../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf`)\n   - Relevant sections include rules about various aspects of basketball such as shot clocks, fouls, and out-of-bounds regulations. \n\nThe indication that this document contains basketball-related information comes from its title, which specifically mentions "NBA Season Official Playing Rules," suggesting that it encompasses the rules governing basketball gameplay.\n\n2. **Laws of the Game 2024/25** (Document: `../documents/Laws_of_the Game_2024_25.pdf`)\n   - Though less specific, this document also contains general sports laws, which could potentially relate to basketball among other sports.\n\nOverall, the 2023-24 NBA Season Official Playing Rules is the primary document focused on basketball rules, while the Laws of the Game may include broader information that could apply to ba

In [22]:
sources

['../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:49:2',
 '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:37:1',
 '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:32:2',
 '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:70:2']

In [23]:
relevant_chunks

[Document(metadata={'creationdate': '2023-10-17T12:11:44-04:00', 'creator': 'Adobe InDesign 18.5 (Windows)', 'id': '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:49:2', 'moddate': '2023-10-17T12:11:49-04:00', 'page': 49, 'page_label': '50', 'producer': 'Adobe PDF Library 17.0', 'source': '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf', 'total_pages': 76, 'trapped': '/False'}, page_content='four remaining players in the game. If the offended player is unable to attempt his free \nthrows as a result of being ejected, any of the four remaining players may attempt \nthe free throws. The ball will be awarded to the offended team at the free throw line \nextended in the frontcourt. The injured player may return to the game. \nRULE NO. 13—INST ANT REPLA Y\nSection I—Instant Replay Review T riggers\na. Instant replay will be triggered in the following situations:\n(1) A field goal made with  no time remaining on the clock (0:00) at the end of any  \nperiod.\nNOTE: Ins

In [24]:
relevant_chunks[0].page_content

'four remaining players in the game. If the offended player is unable to attempt his free \nthrows as a result of being ejected, any of the four remaining players may attempt \nthe free throws. The ball will be awarded to the offended team at the free throw line \nextended in the frontcourt. The injured player may return to the game. \nRULE NO. 13—INST ANT REPLA Y\nSection I—Instant Replay Review T riggers\na. Instant replay will be triggered in the following situations:\n(1) A field goal made with  no time remaining on the clock (0:00) at the end of any  \nperiod.\nNOTE: Instant replay will NOT be used to check a successful basket in subsection (1) \nabove if the throw-in, free throw attempt, or jump ball started with .2 or .1 on the game \nclock. The officials will judge the legality of the basket in these situations based on the \nguidelines as set forth in Comments on the Rules, Section II(L).'

In [25]:
relevant_chunks

[Document(metadata={'creationdate': '2023-10-17T12:11:44-04:00', 'creator': 'Adobe InDesign 18.5 (Windows)', 'id': '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf:49:2', 'moddate': '2023-10-17T12:11:49-04:00', 'page': 49, 'page_label': '50', 'producer': 'Adobe PDF Library 17.0', 'source': '../documents/2023-24-NBA-Season-Official-Playing-Rules.pdf', 'total_pages': 76, 'trapped': '/False'}, page_content='four remaining players in the game. If the offended player is unable to attempt his free \nthrows as a result of being ejected, any of the four remaining players may attempt \nthe free throws. The ball will be awarded to the offended team at the free throw line \nextended in the frontcourt. The injured player may return to the game. \nRULE NO. 13—INST ANT REPLA Y\nSection I—Instant Replay Review T riggers\na. Instant replay will be triggered in the following situations:\n(1) A field goal made with  no time remaining on the clock (0:00) at the end of any  \nperiod.\nNOTE: Ins

This auxiliary function stores information about the k most relevant chunks in a DataFrame: the document name (useful when the db holds several documents), the page where the information can be found, and the content of the chunk.

In [26]:
def process_relevant_chunks(relevant_chunks):
    """Summarise retrieved chunks in a DataFrame, one row per chunk.

    Args:
        relevant_chunks: Iterable of objects exposing a ``metadata`` dict and
            a ``page_content`` string.

    Returns:
        A ``pd.DataFrame`` with the columns ``document_name`` (the last
        ``/``-separated part of ``metadata["source"]``), ``page``
        (``metadata["page"]``, or ``""`` when missing) and ``content``
        (``page_content`` with surrounding whitespace stripped). The three
        columns are present even when ``relevant_chunks`` is empty.
    """
    columns = ["document_name", "page", "content"]

    rows = [
        {
            # `or ""` also covers a source that is present but None.
            "document_name": (doc.metadata.get("source") or "").split("/")[-1],
            "page": doc.metadata.get("page", ""),
            "content": doc.page_content.strip(),
        }
        for doc in relevant_chunks
    ]

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

process_relevant_chunks(relevant_chunks)

Unnamed: 0,document_name,page,content
0,2023-24-NBA-Season-Official-Playing-Rules.pdf,49,four remaining players in the game. If the off...
1,2023-24-NBA-Season-Official-Playing-Rules.pdf,37,"EXCEPTION: (1) injury , (2) inbounding the bal..."
2,2023-24-NBA-Season-Official-Playing-Rules.pdf,32,until the ball is legally touched by a player ...
3,2023-24-NBA-Season-Official-Playing-Rules.pdf,70,wound up and/or followed through after making ...
