In [1]:
import cassio

# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
from dotenv import load_dotenv

# Load key/value pairs from a local .env file (if one exists) into the process
# environment. Without this call the os.getenv() lookups below only see
# variables that were already exported before the kernel started.
load_dotenv()

# Astra DB application token (read from the ASTRA_DB_TOKEN variable).
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_TOKEN")

# Astra DB database ID.
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")

# OpenAI API key.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast with a readable message instead of passing None to the
# client constructors in later cells.
_missing_env_vars = [
    env_name
    for env_name, env_value in (
        ("ASTRA_DB_TOKEN", ASTRA_DB_APPLICATION_TOKEN),
        ("ASTRA_DB_ID", ASTRA_DB_ID),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not env_value
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Export them or add them to a .env file."
    )

In [4]:
from PyPDF2 import PdfReader
# Source document for the Q&A demo; the relative path is resolved against the
# kernel's current working directory.
PDF_PATH = 'attention.pdf'
pdfreader = PdfReader(PDF_PATH)

In [7]:
from typing_extensions import Concatenate
# Read text from the PDF: extract each page's text and concatenate the
# non-empty results (pages whose extract_text() returns nothing are skipped).
# Pages are joined with no separator between them.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(content for content in page_texts if content)

In [10]:
# Initialise cassio with the Astra DB credentials read from the environment.
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

In [11]:
# The completion model and the embedding model authenticate with the same key.
openai_credentials = {"openai_api_key": OPENAI_API_KEY}
llm = OpenAI(**openai_credentials)
embedding = OpenAIEmbeddings(**openai_credentials)

  llm = OpenAI(openai_api_key=OPENAI_API_KEY)
  embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


In [12]:
# Vector store backed by the "qa_mini_demo" table, embedding texts with the
# OpenAI embedding model created earlier.
# NOTE(review): session and keyspace are passed as None; presumably the
# connection set up by cassio.init() is picked up instead — confirm.
vector_store_options = {
    "table_name": "qa_mini_demo",
    "embedding": embedding,
    "session": None,
    "keyspace": None,
}
astra_vector_store = Cassandra(**vector_store_options)

In [13]:
from langchain.text_splitter import CharacterTextSplitter
# Split the raw text into overlapping chunks on newline boundaries so that
# each piece handed to the embedding model stays small and does not inflate
# the token count.
splitter_options = dict(
    separator="\n",
    chunk_size=800,       # target maximum chunk length, as measured by length_function
    chunk_overlap=200,    # amount of text shared between neighbouring chunks
    length_function=len,  # len() on a str counts characters
)
text_splitter = CharacterTextSplitter(**splitter_options)
texts = text_splitter.split_text(raw_text)

In [14]:
# Only the first 50 chunks are embedded and stored. Take the slice once so the
# inserted data and the reported count cannot drift apart.
chunks_to_insert = texts[:50]
astra_vector_store.add_texts(chunks_to_insert)

# The stored items are PDF text chunks, so report them as such.
print(f"Inserted {len(chunks_to_insert)} text chunks.")

# Wrap the store so it can be queried with an LLM via .query().
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 50 headlines.


In [15]:
# Interactive question-answering loop.
# Each round: read a question, answer it through the vector index with the
# LLM, then list the four stored chunks most similar to the question.
# Typing "quit" (any letter case) ends the loop; blank input re-prompts.
first_question = True
while True:
    # The wording of the prompt changes once a first question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        continue

    first_question = False

    print(f'\nQUESTION: "{query_text}"')
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print(f'ANSWER: "{answer}"\n')

    print("FIRST DOCUMENTS BY RELEVANCE:")
    ranked_docs = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in ranked_docs:
        # Show the score and only the first 84 characters of each chunk.
        print(f'    [{score:0.4f}] "{doc.page_content[:84]} ..."')


QUESTION: "What is this PDF about?"
ANSWER: "This PDF is about the training regime and hardware used for neural machine translation models. It also includes information on different model variations and their performance on English-to-German translation."

FIRST DOCUMENTS BY RELEVANCE:
    [0.8635] "and semantic structure of the sentences.
5 Training
This section describes the train ..."
    [0.8634] "Recognition , pages 770–778, 2016.
[11] Sepp Hochreiter, Yoshua Bengio, Paolo Frasco ..."
    [0.8621] "tensorflow/tensor2tensor .
Acknowledgements We are grateful to Nal Kalchbrenner and  ..."
    [0.8606] "architectures from the literature. We estimate the number of ﬂoating point operation ..."
