In [2]:
!pip install llama-index openai

Collecting llama-index
  Downloading llama_index-0.11.1-py3-none-any.whl.metadata (11 kB)
Collecting openai
  Downloading openai-1.42.0-py3-none-any.whl.metadata (22 kB)
Collecting llama-index-agent-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_agent_openai-0.3.0-py3-none-any.whl.metadata (728 bytes)
Collecting llama-index-cli<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_cli-0.3.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.12.0,>=0.11.1 (from llama-index)
  Downloading llama_index_core-0.11.1-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-index-embeddings-openai<0.3.0,>=0.2.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.2.3-py3-none-any.whl.metadata (635 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.3.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.3.0-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloadin

In [7]:
from google.colab import userdata

# Fetch the key from Colab's user secrets instead of hardcoding it in the notebook.
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")

In [8]:
import os

# Export the key as an environment variable of this process.
# NOTE(review): presumably the OpenAI clients created later read it from here — confirm.
os.environ.update(OPENAI_API_KEY=OPENAI_API_KEY)

In [9]:
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader
from IPython.display import Markdown, display

In [11]:
import os
import subprocess

# Download the source PDF into `download_dir` and give it a stable, readable name.

# URL of the PDF file to download
pdf_url = "https://blogmedia.testbook.com/blog/wp-content/uploads/2022/03/indus-valley-civilization-0dfc01e1.pdf"

# Directory the PDF is saved into
download_dir = "data"

# Final name of the downloaded PDF file
new_filename = "indus-valley-civilization.pdf"

# Create the directory if needed; exist_ok avoids the check-then-create race
# and keeps the cell safe to re-run.
os.makedirs(download_dir, exist_ok=True)

# Pass wget its arguments as a list and run it without a shell, so the URL and
# directory are never interpreted (split, globbed, expanded) by a shell.
command = ["wget", "-P", download_dir, pdf_url]

try:
    subprocess.run(command, check=True)
except (subprocess.CalledProcessError, FileNotFoundError) as e:
    # CalledProcessError: wget exited with a non-zero status.
    # FileNotFoundError: the wget executable itself is not installed.
    print(f"Error: {e}")
else:
    print(f"PDF downloaded successfully to {download_dir}")

    # wget -P saves the file under the last path component of the URL;
    # move it to the desired name (os.replace overwrites a previous copy).
    old_filepath = os.path.join(download_dir, os.path.basename(pdf_url))
    new_filepath = os.path.join(download_dir, new_filename)
    os.replace(old_filepath, new_filepath)
    print(f"PDF renamed to {new_filename}")

PDF downloaded successfully to data
PDF renamed to indus-valley-civilization.pdf


In [12]:
%%capture
!pip install pypdf

In [13]:
# Read every file found in the "data" directory into Document objects.
documents = SimpleDirectoryReader(input_dir="data").load_data()

In [14]:
# Number of Document objects the reader produced for the PDF.
len(documents)

16

In [15]:
# Peek at the first Document to check its metadata and extracted text.
documents[0]

Document(id_='08a7d7b5-8315-4c1a-bd1b-cbddcaf921cf', embedding=None, metadata={'page_label': '1', 'file_name': 'indus-valley-civilization.pdf', 'file_path': '/content/data/indus-valley-civilization.pdf', 'file_type': 'application/pdf', 'file_size': 545423, 'creation_date': '2024-08-27', 'last_modified_date': '2022-03-22'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text=' \n \nDownload Testbook App  \n \n \nIndus Valley Civilization (IVC) holds a prominent place in the glorious history of India. It \nis also known as ‘Harappan Civilization’ as Harappa was the first city to be discovered \nalong the banks of river Ravi by Daya Ram Sahni in 1921. Indus Valley Civilization is one \nof the most important topics for  UPSC IAS Examination . \nIn this article on

In [16]:
# Build the vector index from the loaded documents
# (document splitting, embedding, storing embeddings + chunks).
# NOTE(review): presumably this calls the OpenAI embedding API with the key
# exported earlier — confirm.
index = VectorStoreIndex.from_documents(documents)

In [17]:
# Plain query engine: each query is answered on its own, no history is preserved.
query_engine = index.as_query_engine()

In [18]:
# Ask a question against the index; the result is kept so the next cell can render it.
response = query_engine.query("what is the Geographical Extent of Indus Valley Civilization?")

In [19]:
# Render the answer text as Markdown in the cell output.
display(Markdown(str(response)))

The Geographical Extent of Indus Valley Civilization covered regions such as Punjab, Sindh, Balochistan, Rajasthan, Gujarat, and Western Uttar Pradesh. It extended from Sutkagendor in Balochistan, Pakistan in the West to Alamgirpur in Western Uttar Pradesh in the East, and from Mandu in Jammu in the North to Daimabad in Ahmednagar, Maharashtra in the South. Additionally, some Indus Valley sites have been found as far away as Afghanistan and Turkmenistan.

In [20]:
# Save the index to disk; "./storage" is the library's default location and
# the directory the index is loaded back from below.
index.storage_context.persist(persist_dir="./storage")

In [21]:
from llama_index.core import StorageContext, load_index_from_storage

# Point a storage context at the persisted files in ./storage ...
storage_context = StorageContext.from_defaults(persist_dir="./storage")
# ... and rebuild the index from it, replacing the in-memory `index`.
index = load_index_from_storage(storage_context=storage_context)

In [24]:
from llama_index.core import Settings, set_global_service_context
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI

In [25]:
# Define the LLM once; low temperature and a 512-token cap on each completion.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo", max_tokens=512)

# Configure the global defaults through `Settings` (this replaces the old
# ServiceContext). Reuse the `llm` defined above so its temperature and
# max_tokens are the ones actually applied to queries.
Settings.llm = llm
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
Settings.context_window = 3900

# Rebuild the index from the documents now that the embedding model and
# chunking settings have changed.
index = VectorStoreIndex.from_documents(documents)

In [26]:
# Query engine that streams its answer; the response is printed as it is generated.
query_engine = index.as_query_engine(streaming=True)
# The query text is embedded for retrieval and sent to the LLM, so the spelling matters.
response = query_engine.query("What is significance of Indus Valley?")
response.print_response_stream()

The significance of the Indus Valley Civilization lies in its advanced agricultural practices, extensive trade networks, unique burial practices, and scientific and technological achievements such as the development of a decimal-based numerical system that influenced other ancient civilizations.