In [None]:
%pip install -q cassio datasets langchain openai tiktoken

In [None]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

from datasets import load_dataset

In [None]:
%pip install PyPDF2

In [33]:
from PyPDF2 import PdfReader
from dotenv import load_dotenv
import os
load_dotenv()
key = os.environ["OPENAI_API_KEY"]
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
ASTRA_DB_ID=os.environ["ASTRA_DB_ID"]



In [None]:
pdfreader = PdfReader('./pdfs/budget_speech.pdf')

In [None]:
from typing_extensions import Concatenate
raw_text=''
for i,page in enumerate(pdfreader.pages):
    content=page.extract_text()
    if content:
       raw_text += content 
print(raw_text)

In [34]:
import cassio 
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN,database_id=ASTRA_DB_ID)

In [35]:

llm = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
embedding = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])

In [36]:
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None
)

In [37]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap= 200,
    length_function =len
)

texts= text_splitter.split_text(raw_text)
print(texts[:10])

['GOVERNMENT OF INDIA\nINTERIM BUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2024 \nCONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nInclusive Development and Growth  2 \nSocial Justice   3  \nExemplary  Track Record of Governance,  \nDevelopment and Performance (GDP)  7 \nEconomic Management  8 \nGlobal Context  9 \nVision for ‘Viksit Bharat’  10 \nStrategy for  ‘Amrit Kaal’  11 \nInfrastructure Development  17 \nAmrit Kaal as Kartavya Kaal  22 \nRevised Estimates 2023 -24 23 \nBudget Estimates 2024 -25 23 \nPART – B \nDirect taxes  25 \nIndirect Taxes   26 \nEconomy – Then and Now  28 \n  \n  1 \n Interim Budget 2024 -2025  \nSpeech of  \nNirmala Sitharaman  \nMinister of Finance  \nFebruary 1, 2024  \nHon’ble Speaker,  \n I present the Interim Budget for 2024 -25.', '1 \n Interim Budget 2024 -2025  \nSpeech of  \nNirmala Sitharaman  \nMinister of Finance  \nFebruary 1, 2024  \nHon’ble Speaker,  \n I present the Interim Budget for 2024 -2

In [39]:
astra_vector_store.add_texts(texts[:10])
print(" Inserted %i headlines" %len(texts[:10]))
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)


 Inserted 10 headlines


In [42]:
first_question = True
query_text = input("\n Enter your first question or type QUIT to exit").strip()
answer = astra_vector_index.query(query_text, llm=llm).strip()
print("\ANSWER: \"%s\"" % answer)


# while True:
#     if first_question:
#         query_text = input("\n Enter your first question or type QUIT to exit").strip()
#         first_question= False
#     else:
#         query_text = input("\n Enter your next question or type QUIT to exit").strip()
    
#     if query_text.lower() == "quit":
#         break

#     print(query_text)
#     if query_text == "":
#         continue
    
# first_question = False

# print("\nQUESTION: \"%s\"" % query_text)
# answer = astra_vector_index.query(query_text, llm=llm).strip()
# print("\ANSWER: \"%s\"" % answer)

print("\FIRST DOCUMENT BY RELEVANCE:")
for doc , score in astra_vector_store.similarity_search_with_score(query_text,k=4):
    print(" [%0.4f] \" %s ...\"" %(score,doc.page_content[:84]))







\ANSWER: "The four major castes mentioned in the context are 'Garib' (Poor), 'Mahilayen' (Women), 'Yuva' (Youth), and 'Annadata' (Farmer)."
\FIRST DOCUMENT BY RELEVANCE:
 [0.9154] " 9. As our Prime Minister firmly believes , we need to focus on 
four major castes. T ..."
 [0.9154] " 9. As our Prime Minister firmly believes , we need to focus on 
four major castes. T ..."
 [0.9154] " 9. As our Prime Minister firmly believes , we need to focus on 
four major castes. T ..."
 [0.9154] " 9. As our Prime Minister firmly believes , we need to focus on 
four major castes. T ..."
