## Querying a PDF using LangChain and Astra DB

In [1]:
!pip install pyarrow



In [2]:
!pip install python-dotenv



In [3]:
!pip install -q cassio datasets langchain openai tiktoken

In [4]:
!pip install PyPDF2



### Import the packages needed

In [5]:
# Standard library
import hashlib
import os

# LangChain components
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

import cassio
from PyPDF2 import PdfReader
from dotenv import load_dotenv

### Setup

In [6]:
# Load settings from a .env file into the process environment so the
# os.getenv lookups in the next cell can find the credentials.
load_dotenv()

True

In [7]:
# Pull the three credentials from the environment in one pass; any variable
# that is not set comes back as None.
ASTRA_DB_APPLICATION_TOKEN, ASTRA_DB_ID, OPENAI_API_KEY = (
    os.environ.get(variable_name)
    for variable_name in (
        "ASTRA_DB_APPLICATION_TOKEN",
        "ASTRA_DB_ID",
        "OPENAI_API_KEY",
    )
)

In [8]:
# Path of the PDF file to query; change this to point at a different document.
PDF_PATH = 'Generative_AI.pdf'
pdfreader = PdfReader(PDF_PATH)

In [9]:
# Concatenate the extracted text of every page, skipping pages for which
# extract_text() returns nothing.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(text for text in page_texts if text)

### Initializing connection to the database

In [10]:
# Initialise cassio with the Astra DB credentials read from the environment.
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

### Create the LangChain embedding and LLM objects

In [11]:
# Both OpenAI-backed objects authenticate with the same API key.
openai_credentials = {"openai_api_key": OPENAI_API_KEY}
llm = OpenAI(**openai_credentials)
embedding = OpenAIEmbeddings(**openai_credentials)

  warn_deprecated(
  warn_deprecated(


### Create your LangChain vector store (backed by Astra DB)

In [12]:
# session and keyspace are passed as None - presumably so the store falls back
# to the connection configured by cassio.init(); confirm against the cassio docs.
astra_vector_store = Cassandra(
    table_name="qa_mini_demo",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [13]:
# Chunking parameters, measured in characters because length_function is len.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200

# CharacterTextSplitter is imported in the notebook's import cell.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [14]:
# Preview only a handful of chunks; dumping 50 of them floods the output.
texts[:5]

['CATCHWORD\nGenerative AI\nStefan Feuerriegel •Jochen Hartmann •Christian Janiesch •\nPatrick Zschech\nReceived: 29 April 2023 / Accepted: 7 August 2023 / Published online: 12 September 2023\n/C211The Author(s) 2023\nKeywords Generative AI /C1Artiﬁcial intelligence /C1\nDecision support /C1Content creation /C1Information systems\n1 Introduction\nTom Freston is credited with saying ‘‘Innovation is taking\ntwo things that exist and putting them together in a new\nway’’. For a long time in history, it has been the prevailingassumption that artistic, creative tasks such as writing\npoems, creating software, designing fashion, and compos-\ning songs could only be performed by humans. Thisassumption has changed drastically with recent advances in\nartiﬁcial intelligence (AI) that can generate new content in',
 'ing songs could only be performed by humans. Thisassumption has changed drastically with recent advances in\nartiﬁcial intelligence (AI) that can generate new content in\nways that c

### Load the text chunks into the vector store

In [15]:
# Derive each row id from the chunk's content so that re-running this cell
# writes to the same ids instead of adding a fresh copy of every chunk under
# new random ids (which shows up as repeated hits in similarity search).
# NOTE(review): rows inserted by earlier runs with random ids are not removed
# by this change; clear the table once if duplicates are already present.
chunk_ids = [
    # surrogatepass keeps hashing from failing on any lone surrogate
    # characters that PDF extraction may have produced.
    hashlib.sha256(chunk.encode("utf-8", errors="surrogatepass")).hexdigest()
    for chunk in texts
]
astra_vector_store.add_texts(texts, ids=chunk_ids)
print("Inserted %i text chunks." % len(texts))
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 150 headlines.


### Run the Q/A cycle

In [17]:
# Interactive loop: read a question, answer it through the index, then list the
# closest stored chunks. Typing 'quit' (in any letter case) ends the loop.
first_question = True
while True:
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        # Blank entry: ask again without changing the prompt wording.
        continue
    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    matches = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for document, relevance in matches:
        # Show only the first 84 characters of each matched chunk.
        print("    [%0.4f] \"%s ...\"" % (relevance, document.page_content[:84]))


Enter your question (or type 'quit' to exit):  what is generative AI?



QUESTION: "what is generative AI?"
ANSWER: "Generative AI refers to computational techniques that are capable of generating seemingly new, meaningful content such as text, images, or audio from training data. It has the potential to transform industries that rely on creativity, innovation, and knowledge processing."

FIRST DOCUMENTS BY RELEVANCE:
    [0.9463] "ing songs could only be performed by humans. Thisassumption has changed drastically  ..."
    [0.9463] "ing songs could only be performed by humans. Thisassumption has changed drastically  ..."
    [0.9463] "ing songs could only be performed by humans. Thisassumption has changed drastically  ..."
    [0.9447] "models and systems could be used and combined with each
other to form applications f ..."



What's your next question (or type 'quit' to exit):  QUIT
