In [1]:
from helper_utils import project_embeddings, word_wrap
from pypdf import PdfReader
import os
from openai import OpenAI
from dotenv import load_dotenv

from pypdf import PdfReader
import umap


# Load environment variables from .env file
load_dotenv()

# openai_key = os.getenv("OPENAI_API_KEY")
# client = OpenAI(api_key=openai_key)

# Gemini is reached through its OpenAI-compatible endpoint, so the regular
# OpenAI client is reused with a different base URL and API key.
gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
    # Fail fast: when api_key is None the OpenAI client falls back to the
    # OPENAI_API_KEY environment variable, which would then be sent to the
    # Gemini endpoint instead of surfacing a clear configuration error.
    raise RuntimeError(
        "GEMINI_API_KEY is not set; add it to your environment or .env file."
    )
client = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=gemini_api_key,
)

# Extract the text of every page, trimmed of surrounding whitespace.
reader = PdfReader("data/microsoft-annual-report.pdf")
pdf_texts = [p.extract_text().strip() for p in reader.pages]

# Filter the empty strings
pdf_texts = [text for text in pdf_texts if text]
# print(
#     word_wrap(
#         pdf_texts[0],
#         width=100,
#     )
# )

# split the text into smaller chunks


from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)

# Chunking is done in two passes:
#   1. RecursiveCharacterTextSplitter - a "dumb" splitter that tries the
#      separators in order ("\n\n" first, then "\n", and so on) until the
#      pieces fit within chunk_size = 1000.
#   2. SentenceTransformersTokenTextSplitter - re-splits each piece so it fits
#      the token window of the sentence transformer model you would like to use.
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""],
)
character_split_texts = character_splitter.split_text("\n\n".join(pdf_texts))

# Uncomment to inspect a character-level chunk:
# print(word_wrap(character_split_texts[10]))
# print(f"\nTotal chunks: {len(character_split_texts)}")

token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=256,
    chunk_overlap=0,
)
# Flatten the per-chunk token splits into one list, preserving document order.
token_split_texts = [
    token_chunk
    for character_chunk in character_split_texts
    for token_chunk in token_splitter.split_text(character_chunk)
]

# Uncomment to inspect a token-level chunk:
# print(word_wrap(token_split_texts[10]))
# print(f"\nTotal chunks: {len(token_split_texts)}")


import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

embedding_function = SentenceTransformerEmbeddingFunction()
# print(embedding_function([token_split_texts[10]]))

chroma_client = chromadb.Client()
# get_or_create_collection (instead of create_collection) keeps this cell
# re-runnable: create_collection raises when a collection with this name
# already exists, e.g. when the cell is executed again in the same kernel.
chroma_collection = chroma_client.get_or_create_collection(
    "microsoft-collection", embedding_function=embedding_function
)

# Embed and store every token-level chunk under its positional index.
# upsert (instead of add) makes a re-run overwrite the same ids rather than
# depend on how duplicates are handled. NOTE: if the chunking changes and
# yields fewer chunks, ids from an earlier run stay in the collection until
# the kernel is restarted.
ids = [str(i) for i in range(len(token_split_texts))]
chroma_collection.upsert(ids=ids, documents=token_split_texts)
chroma_collection.count()  # value is discarded here (not the cell's last expression)

query = "What was the total revenue for the year?"


# Baseline retrieval with the plain query; these results are replaced later
# by the augmented-query retrieval.
results = chroma_collection.query(query_texts=[query], n_results=5)
retrieved_documents = results["documents"][0]

# for document in retrieved_documents:
#     print(word_wrap(document))
#     print("\n")


def augment_query_generated(query, model="gemini-2.5-flash"):
    """Ask the chat model for a hypothetical example answer to ``query``.

    The system prompt asks for an answer "that might be found in a document
    like an annual report"; the caller appends it to the original question
    before retrieval.

    Args:
        query: The user's question, sent as the user message.
        model: Name of the chat model passed to the completions endpoint.

    Returns:
        The text of the first choice. An empty string is returned when the
        API reports no text content, so callers that interpolate the result
        into a string never embed the literal "None".
    """
    prompt = """You are a helpful expert financial research assistant. 
   Provide an example answer to the given question, that might be found in a document like an annual report."""
    messages = [
        {
            "role": "system",
            "content": prompt,
        },
        {"role": "user", "content": query},
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )
    # message.content is optional in the OpenAI SDK and may be None.
    content = response.choices[0].message.content
    return content if content is not None else ""


original_query = "What was the total profit for the year, and how does it compare to the previous year?"

# The hypothetical answer is made up by the model on purpose: it only serves
# as an example of what a real answer could look like, so that the combined
# query is worded more like the documents we want to retrieve.
hypothetical_answer = augment_query_generated(original_query)
joint_query = f"{original_query} {hypothetical_answer}"
print(word_wrap(joint_query))

# Retrieve with the augmented query and keep the embeddings of the hits.
results = chroma_collection.query(
    query_texts=joint_query,
    n_results=5,
    include=["documents", "embeddings"],
)
retrieved_documents = results["documents"][0]
retrieved_embeddings = results["embeddings"][0]

# Uncomment to inspect the retrieved chunks:
# for doc in retrieved_documents:
#     print(word_wrap(doc))
#     print("")

# Fit UMAP on every stored embedding; fixed seeds keep the projection repeatable.
embeddings = chroma_collection.get(include=["embeddings"])["embeddings"]
umap_transform = umap.UMAP(random_state=0, transform_seed=0)
umap_transform.fit(embeddings)

# Embed both versions of the query with the collection's embedding function.
original_query_embedding = embedding_function([original_query])
augmented_query_embedding = embedding_function([joint_query])

# Project the dataset, both queries and the retrieved chunks with the same
# fitted transform so they can be drawn on one plot.
projected_dataset_embeddings = project_embeddings(embeddings, umap_transform)
projected_original_query_embedding = project_embeddings(
    original_query_embedding, umap_transform
)
projected_augmented_query_embedding = project_embeddings(
    augmented_query_embedding, umap_transform
)
projected_retrieved_embeddings = project_embeddings(
    retrieved_embeddings, umap_transform
)


  from .autonotebook import tqdm as notebook_tqdm


What was the total profit for the year, and how does it compare to the previous year? T
he Company recorded a net profit of **$2.3 billion** for the fiscal year ended December
 31, 2023. This represents a **15% increase** compared to the net profit of **$2.0 bill
ion** reported for the fiscal year ended December 31, 2022.


  warn(


In [17]:
# Number of stored embeddings, then the length of a single embedding vector.
print(len(embeddings), len(embeddings[0]), sep="\n")

embeddings


359
384


array([[-0.04634688, -0.02581087,  0.05038458, ..., -0.03059213,
        -0.02240589, -0.0518907 ],
       [-0.09822965, -0.00692612, -0.00180436, ..., -0.09671083,
         0.04921794,  0.00239094],
       [-0.03088396, -0.04228372, -0.00010988, ..., -0.0845217 ,
         0.06621506, -0.03285785],
       ...,
       [ 0.02055793, -0.01914324,  0.00779968, ...,  0.01622162,
        -0.05980263,  0.01522017],
       [-0.00536957, -0.00242523, -0.01556219, ..., -0.01464541,
        -0.0233831 , -0.05169443],
       [-0.04356471,  0.00185994, -0.05116569, ...,  0.06841221,
         0.00977197, -0.00619247]], shape=(359, 384))

In [3]:
# Display the dataset embeddings after projection with the fitted UMAP
# transform (the output below shows one 2-value row per embedding).
projected_dataset_embeddings

array([[ 3.6789658,  3.047822 ],
       [ 3.6097682,  3.6192114],
       [ 3.9814134,  3.0865893],
       [ 3.4197474,  3.3040833],
       [ 3.4442086,  3.5675867],
       [ 3.7329476,  3.3342032],
       [ 4.054826 ,  3.292517 ],
       [ 3.4913561,  3.7180307],
       [ 3.4253604,  3.6908681],
       [ 5.076144 ,  3.9225113],
       [ 3.6639848,  3.6981165],
       [ 3.87123  ,  3.7278483],
       [ 3.7271836,  2.913031 ],
       [ 4.570864 ,  2.7867846],
       [ 6.9958525,  2.0595717],
       [ 4.2547374,  2.1516478],
       [ 5.1830945,  2.8689494],
       [ 2.8996165,  2.7830584],
       [ 3.4710119,  2.7857282],
       [ 3.7953703,  2.2923586],
       [ 3.9424713,  2.9725907],
       [ 3.711887 ,  2.7133963],
       [ 3.9281049,  2.5437055],
       [ 3.657644 ,  2.4144633],
       [ 3.9316993,  2.2574682],
       [ 3.6175265,  2.954042 ],
       [ 3.3729312,  2.701552 ],
       [ 3.3073044,  2.5463924],
       [ 3.4362657,  2.813005 ],
       [ 3.2641602,  2.6076405],
       [ 2

In [None]:

import matplotlib.pyplot as plt

# Plot the projected query and retrieved documents in the embedding space
fig, ax = plt.subplots()

# Each layer is (projected points, scatter styling), drawn in this order:
# whole dataset, retrieved chunks, original query, augmented query.
scatter_layers = (
    (projected_dataset_embeddings, {"s": 10, "color": "gray"}),
    (
        projected_retrieved_embeddings,
        {"s": 100, "facecolors": "none", "edgecolors": "g"},
    ),
    (
        projected_original_query_embedding,
        {"s": 150, "marker": "X", "color": "r"},
    ),
    (
        projected_augmented_query_embedding,
        {"s": 150, "marker": "X", "color": "orange"},
    ),
)
for points, marker_style in scatter_layers:
    ax.scatter(points[:, 0], points[:, 1], **marker_style)

ax.set_aspect("equal", "datalim")
ax.set_title(f"{original_query}")
ax.axis("off")
plt.show()  # display the plot
