RAG AGENT

SET UP ALL PREREQUISITES

In [78]:
import os
from getpass import getpass

# Prompt for the OpenAI key only when the environment does not already hold a non-empty one.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter OPENAI_API_KEY: ")

CREATE A BASIC LLM AGENT AND ASK IT AN UP-TO-DATE QUESTION TO SHOW THAT ITS PARAMETRIC KNOWLEDGE IS LIMITED

In [79]:
from agents import Agent

# Baseline agent: only a name and a model are given (no instructions, no tools).
agent = Agent(
    name="Agent",
    model="gpt-4.1-mini"
)

In [80]:
from agents import Runner

# A question whose answer is not supplied anywhere in the agent's configuration.
query = "What is my name?"

# Top-level `await` works here because the cell runs inside the notebook's event loop.
result = await Runner.run(
    starting_agent=agent,
    input=query,
)

# Show only the agent's final text answer.
print(result.final_output)

I don't have access to your personal information, so I don't know your name. How can I assist you today?


CREATE ANOTHER LLM WITH ADDITIONAL SOURCE DATA TO SHOW THAT LLMS CAN USE ADDITIONAL DATA TO CREATE AN ANSWER

In [81]:
# Same name and model as before, but the missing fact is now supplied via `instructions`.
# NOTE: this rebinds `agent`, replacing the agent defined in the earlier cell.
agent = Agent(
    name="Agent",
    instructions="The user's name is James",
    model="gpt-4.1-mini"
)

In [82]:
# Ask the identical question again, this time against the agent that carries instructions.
query = "What is my name?"

# Top-level `await` works here because the cell runs inside the notebook's event loop.
result = await Runner.run(
    starting_agent=agent,
    input=query,
)

# Show only the agent's final text answer.
print(result.final_output)

Your name is James. How can I assist you today, James?


GET THE HUGGING FACE DATASET

In [83]:
from datasets import load_dataset

# Load the "train" split of the "aurelio-ai/jfk-files" dataset.
dataset = load_dataset(
    "aurelio-ai/jfk-files",
    split="train"
)

In [84]:
# Display the first record to see which fields each document carries.
dataset[0]

{'id': 'doc_21c0d725_0fa9_40ef_a217_c062909cc236',
 'filename': '104-10110-10340.pdf',
 'url': 'https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10110-10340.pdf',
 'date': datetime.datetime(2025, 3, 18, 0, 0),
 'content': '[704-10710-10340}\n\n<!-- image -->',
 'pages': 1}

CREATE A KNOWLEDGE BASE USING PINECONE

In [85]:
# Prompt for the Pinecone key only when the environment does not already hold a non-empty one.
if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Enter PINECONE_API_KEY: ")

In [86]:
from pinecone import Pinecone, ServerlessSpec
    
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "jfk-example"

# Create the serverless index only when no index of that name exists yet;
# otherwise the existing index is reused as-is.
existing_index_names = pc.list_indexes().names()
index_missing = index_name not in existing_index_names
if index_missing:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        dimension=1536,  # length of each vector stored in this index
        metric='cosine',
        spec=serverless_spec,
    )

# Handle used by the later cells for stats, upserts and queries.
index = pc.Index(index_name)

In [87]:
# Display the index statistics before the embedding/upsert cell runs.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 670}},
 'total_vector_count': 670,
 'vector_type': 'dense'}

MAKE A BASIC EMBEDDING EXAMPLE

In [88]:
from openai import OpenAI

# OpenAI client used by the embedding calls in this notebook. No arguments are
# passed — presumably it picks up OPENAI_API_KEY from the environment; confirm.
client = OpenAI()

In [89]:
# Two short sample strings for the embedding demonstration.
texts = [
    'this is the first chunk of text',
    'then this is the second chunk of text'
]

In [90]:
# Embed both sample strings in a single request.
res = client.embeddings.create(
    input=texts,
    model="text-embedding-3-small"
)

In [91]:
# (number of embeddings returned, length of the first embedding vector)
len(res.data), len(res.data[0].embedding)

(2, 1536)

EMBED HUGGING FACE FILES

In [92]:
from tqdm.auto import tqdm
import tiktoken

def chunk_text(text, chunk_size=4000):
    """Split `text` into pieces of at most `chunk_size` tokens each.

    The text is tokenized with tiktoken's "cl100k_base" encoding, the token
    list is cut into consecutive windows of `chunk_size` tokens, and every
    window is decoded back into a string.

    Returns a list of strings; the list is empty when the text encodes to
    no tokens.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    token_ids = encoding.encode(text)
    window_starts = range(0, len(token_ids), chunk_size)
    return [
        encoding.decode(token_ids[start:start + chunk_size])
        for start in window_starts
    ]

def truncate_text(text, max_length=1000):
    """Cap `text` so it stays small enough to store as metadata.

    Text longer than `max_length` characters is cut to its first `max_length`
    characters and suffixed with "..."; anything shorter or equal is returned
    unchanged.
    """
    fits = len(text) <= max_length
    return text if fits else text[:max_length] + "..."

# Convert the dataset to a DataFrame so documents can be sliced into row batches.
data = dataset.to_pandas()

# Number of source documents handled per embedding/upsert round trip.
batch_size = 100

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i+batch_size)
    batch = data.iloc[i:i_end]

    # Parallel lists: entry k of each list describes the same chunk.
    all_ids = []
    all_texts = []
    all_metadata = []

    for _, row in batch.iterrows():
        # Treat missing content as empty text so the tokenizer is never handed None.
        chunks = chunk_text(row['content'] or "")

        for chunk_idx, chunk in enumerate(chunks):
            # Chunk IDs are derived from the document ID, so re-running this cell
            # writes to the same IDs instead of inventing new ones.
            all_ids.append(f"{row['id']}-{chunk_idx}")
            all_texts.append(chunk)
            all_metadata.append({
                'text': truncate_text(chunk),  # Truncate text in metadata
                'source': row['url'],
                'title': row['filename'],
                'chunk_id': chunk_idx
            })

    # A batch whose documents produced no chunks has nothing to embed; skip it
    # rather than sending an empty `input` list to the embeddings endpoint.
    if not all_texts:
        continue

    # Create embeddings for all chunks of this batch in one request.
    embeds = client.embeddings.create(
        input=all_texts,
        model="text-embedding-3-small"
    )

    vectors = [record.embedding for record in embeds.data]

    # Materialize the (id, vector, metadata) tuples: a bare zip iterator can be
    # consumed only once and has no length.
    index.upsert(vectors=list(zip(all_ids, vectors, all_metadata)))

  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:25<00:00,  3.62s/it]


In [93]:
# Display the index statistics again after the upsert loop.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 670}},
 'total_vector_count': 670,
 'vector_type': 'dense'}

MAKING RAG CHATBOT

In [94]:
# Manual retrieval: embed one question, then fetch its three closest stored chunks.
question_embedding_response = client.embeddings.create(
    input=["What were the key findings in the JFK assassination investigation?"],
    model="text-embedding-3-small"
)
query_embedding = question_embedding_response.data[0].embedding

results = index.query(vector=query_embedding, top_k=3, include_metadata=True)

# Print each hit's stored text, its source URL and file name, then a divider line.
separator = "-" * 80 + "\n"
for match in results["matches"]:
    metadata = match["metadata"]
    print(metadata["text"])
    print("\nSource:", metadata["source"])
    print("File:", metadata["title"])
    print(separator)

2025 RELEASE UNDER THE PRESIDENT JOHN F\_ KENNEDY ASSASSINATION RECORDS ACT OF 1992

HOUSE

<!-- image -->

IP
| REI=   | REVIEED   |
|--------|-----------|
|        | BE        |

DCD-45s/78 19 April 1978

FROY

Ruth Elliff DCD/FIO/PAO

SUBJECT

House Select Committee on Assassinations Request (OLC 78-0986/1}

is forvarded in response to subject request:

- a DCD file A-19-91-59 on Abran Chayes
- b Docunents concerning Monica Krzner and Rita Nazan. (Please escuse the Poor qualitr of sone of this naterial it was iapossible to clear reproduction fron OUI microfiln.) get

<!-- image -->

Attachnents a/s


## RELLIFF:vfc Distribution

- DCD Chrono
- 0 Addressee
- 1 Staff A
- 3 Control
- 3 RElliff

E2 IMPDET CL BY 386090

4~

FRON

SUBJECT

Case 64574

USSR Exteraal Folicy

1 and a set of questions (Enclosure 5) at Qur request\_ inirial of each US participzt, as to the true identities of As result, sOne felt they had attended. cases , holever , Gere questia as to the true idcitity of\_the 

EXPOSE RAG AS A TOOL FOR THE AGENT

In [95]:
from agents import function_tool

@function_tool
def return_source_knowledge(query: str) -> str:
    """Search the indexed JFK files and return the passages most relevant to a query.

    Args:
        query: Natural-language question or search phrase to look up.

    Returns:
        The stored text of up to three matching passages joined by newlines,
        or a short notice when the index yields no usable passage.
    """
    # 1. Embed the query with the same model that was used to embed the documents.
    embeds_response = client.embeddings.create(
        input=[query],
        model="text-embedding-3-small"
    )
    query_embedding = embeds_response.data[0].embedding

    # 2. Fetch the three nearest chunks together with their metadata.
    results = index.query(
        vector=query_embedding,
        top_k=3,
        include_metadata=True
    )

    # 3. Keep only matches that actually carry stored text.
    passages = []
    for match in results["matches"]:
        metadata = match["metadata"]
        if metadata and "text" in metadata:
            passages.append(metadata["text"])

    # Say explicitly that nothing was found instead of returning an empty
    # string the agent might paper over with its own knowledge.
    if not passages:
        return "No relevant passages were found in the JFK files index."

    return "\n".join(passages)

MAKE FINAL AGENT

In [96]:
# Agent that answers JFK questions through the retrieval tool. The instructions
# spell out that the knowledge base covers only part of the released files and
# that answers must come from the tool output.
rag_agent = Agent(
    name="JFK Document Assistant",
    model="gpt-4o",
    instructions="""You are an assistant specialized in answering questions about the JFK assassination and related documents.
    When users ask questions about JFK, the assassination, or related historical events, use the return_source_knowledge tool
    to retrieve relevant information from the official JFK files. The tool searches only a subset of the released files,
    so answer using only what the tool returns, and say so when it returns nothing relevant.""",
    tools=[return_source_knowledge]
)

In [97]:
# A question that matches the wording of one of the indexed documents.
query = "Tell me about the House Select Committee on Assassinations Request"

# Top-level `await` works here because the cell runs inside the notebook's event loop.
result = await Runner.run(
    starting_agent=rag_agent, 
    input=query,
)

# Show only the agent's final text answer.
print(result.final_output)

The House Select Committee on Assassinations (HSCA) sought information related to the assassinations of prominent figures such as Martin Luther King Jr. As part of their inquiry, the Committee requested information on certain individuals and organizations related to these events. 

Documents reveal that the Committee's request was limited to information during the immediate period surrounding the assassination. In this context, they referred to various groups, including the Flemish Order of Militants (Vlaamse Militantenorganisatie, VMO), and relied on available intelligence records, even when most references were scant.

The HSCA's investigations were part of broader efforts to understand political assassinations in the United States and were linked to legal mandates under President John F. Kennedy Assassination Records Act of 1992 for eventual public disclosure.
