In [1]:
import os
import requests
from pprint import pprint
from dotenv import load_dotenv
import nest_asyncio

from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.core import VectorStoreIndex, Document

from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever, SummaryIndexRetriever, TransformRetriever

from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

load_dotenv()
nest_asyncio.apply()

In [2]:
COHERE_API_KEY = os.environ["COHERE_API_KEY"] 
LLAMA_CLOUD_API_KEY = os.environ["LLAMA_CLOUD_API_KEY"]

doc_name = "Academic-CV-V1.pdf"

### Reading Data

In [3]:
# # random text about books
# books = [
#     "Harry Potter is a series of seven fantasy novels written by British author J. K. Rowling. The novels chronicle the lives of a young wizard, Harry Potter, and his friends Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. The books follow Voldemort's rise to power, Harry's struggle against him, and his eventual downfall.",
#     "The Lord of the Rings is an epic high-fantasy novel written by English author and scholar J. R. R. Tolkien. The story began as a sequel to Tolkien's 1937 fantasy novel The Hobbit, but eventually developed into a much larger work. Written in stages between 1937 and 1949, The Lord of the Rings is one of the best-selling books in history, with over 150 million copies sold.",
#     "The Hunger Games is a dystopian novel written by American author Suzanne Collins. It is set in a dark vision of the future, where a totalitarian government, known as the Capitol, exercises full control over the twelve districts of Panem. The novel follows the story of Katniss Everdeen, a sixteen-year-old girl who volunteers to take the place of her younger sister in the annual Hunger Games, a televised competition in which twenty-four children aged twelve to eighteen are chosen at random to fight to the death.",
#     "The Da Vinci Code is a mystery thriller novel by Dan Brown, first published on March 18, 2003. The novel follows the fictional character Robert Langdon, a Harvard University symbologist, who is summoned to the Louvre Museum in Paris to examine a mysterious painting. The painting is believed to contain hidden clues about the Holy Grail, which is said to be the chalice used by Jesus Christ at the Last Supper.",
#     "The Alchemist is a novel by Brazilian author Paulo Coelho that was first published in 1988. The story follows the journey of a young Andalusian shepherd boy named Santiago who travels to Egypt in search of a treasure buried in the Pyramids. Along the way, he meets a series of characters who teach him about the world and the nature of his quest. The novel is a philosophical fable about the importance of following one's dreams and the power of the universe to conspire in their fulfillment.",
#     "The Catcher in the Rye is a novel by American author J. D. Salinger, first published in 1951. The story follows the adventures of sixteen-year-old Holden Caulfield, a troubled teenager who has been expelled from his prep school and spends three days wandering around New York City. The novel is known for its exploration of themes such as alienation, disillusionment, and the struggle for self-identity.",
#     "The Great Gatsby is a novel by American author F. Scott Fitzgerald, first published in 1925. The story follows the life of Jay Gatsby, a mysterious millionaire who throws extravagant parties on Long Island, and his relationship with Nick Carraway, a young man from the Midwest who becomes his neighbor. The novel is a critique of the American Dream and the excesses of the Roaring Twenties.",
#     "The Shining is a horror novel by American author Stephen King, first published in 1977. The story follows the life of Jack Torrance, a struggling writer who takes a job as the winter caretaker of the isolated Overlook Hotel in Colorado. Jack and his family, wife Wendy and son Danny, move into the hotel, where they encounter strange occurrences and Jack's descent into madness.",
#     "The Girl with the Dragon Tattoo is a novel by Swedish author Stieg Larsson, first published in 2005. The story follows the investigation of a series of murders by journalist Mikael Blomkvist and computer hacker Lisbeth Salander. The novel is a critique of the Swedish elite and the media, and it explores themes such as corruption, power, and justice.",
#     "The Road is a post-apocalyptic novel by American author Cormac McCarthy, first published in 2006. The story follows the journey of a father and his young son as they travel through a desolate and dangerous landscape. The novel is a meditation on themes such as survival, hope, and the nature of human relationships.",
# ]

# documents = [Document(text=t) for t in books]

In [4]:
# Using LlamParse to parse the PDF. The API key is should be set in the enviorment variablse. 
# check https://docs.cloud.llamaindex.ai/llamaparse/getting_started/python
parser = LlamaParse(
    result_type="text",
)

# use SimpleDirectoryReader to parse our file
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(input_files=[f"../data/{doc_name}"], file_extractor=file_extractor).load_data()

Started parsing the file under job_id b9443ed1-974b-447d-b4ab-7f5ba4b7789c


In [10]:
print(documents)

[Document(id_='7fe3a325-b1db-44cc-a7b6-29dfa016f03b', embedding=None, metadata={'file_path': '../data/Academic-CV-V1 .pdf', 'file_name': 'Academic-CV-V1 .pdf', 'file_type': 'application/pdf', 'file_size': 149705, 'creation_date': '2024-08-17', 'last_modified_date': '2024-07-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='HOSSEIN GOLMOHAMMADI                                             COMPUTER SCIENCE STUDENT\n\n        +989038029473                        hossein.gmohammadi@gmail.com                              Isfahan-Iran\n        github.com/artmiss-gns               linkedin.com/in/hossein-golmohammadi-gns                  WIP...\n\n   HIGHLIGHTS\n       Fourth-year CS student specializing in ML, ranked in top 10% of class\n       4+ years Pyt

### Model initialization

In [28]:
embed_model = CohereEmbedding(
    api_key=COHERE_API_KEY,
    model_name="embed-english-v3.0",
    input_type="search_query",
)

In [29]:
# create an index from the parsed markdown
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

### Retriever

In [None]:
# retriever = index.as_retriever()
# nodes = retriever.retrieve("Contact")
# for node in nodes:
#     print(node.node.get_content())
#     print("\n\n")

In [None]:
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
    vector_store_kwargs={"score_threshold": 0.7},
)

In [None]:
nodes = retriever.retrieve("What are his GPA")
for node in nodes:
    print(node.node.get_content())
    print("\n\n")