### Importing packages 

In [10]:
#langchain components to use 
from langchain_community.vectorstores import Cassandra
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

#support for dataset retrieval with hugging face
from datasets import load_dataset

#with CassIO, the engine powering the Astra DB integration in Langchain
#It helps to initialize the DB connection
import cassio

import os

In [2]:
from PyPDF2 import PdfReader

In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
pdf_reader = PdfReader('SlotFinder_A_Spatio-temporal_based_Car_Parking_System.pdf')

In [5]:
from typing_extensions import Concatenate
#read text from pdf 
raw_text = ''
for i, page in enumerate(pdf_reader.pages):
    content = page.extract_text()
    if content:
        raw_text+=content

In [6]:
raw_text

'2022 25th International Conference on Computer and Information Technology (ICCIT)\n17-19 December 2022, Cox’s Bazar, Bangladesh\nSlotFinder: A Spatio-temporal based Car Parking\nSystem\nMebin Rahman Fateha, Md. Saddam Hossain Mukta, Md. Abir Hossain, Mahmud Al Islam, Salekul Islam\nDepartment of CSE, United International University (UIU)\nPlot-2, United City, Madani Avenue, Badda, Dhaka-1212, Bangladesh\nEmail: mfateha171124@bscse.uiu.ac.bd, saddam@cse.uiu.ac.bd, mhossain171125@bscse.uiu.ac.bd,\nmislam171131@bscse.uiu.ac.bd, salekul@cse.uiu.ac.bd\nAbstract —Nowadays, the increasing number of vehicles and\nshortage of parking spaces have become an inescapable condition\nin big cities across the world. Car parking problem is not a\nnew phenomenon, especially in a crowded city such as Dhaka,\nBangladesh. Shortage of parking spaces leads to several problems\nsuch as road congestion, illegal parking on the streets, and fuel\nwaste in searching for a free parking space. In order to overcome

### Initialize the connection to my Database

In [7]:
cassio.init(
    token= os.environ['ASTRA_DB_APPLICATION_TOKEN'],
    database_id= os.environ['ASTRA_DB_ID']
)

### Creating the Langchain embedding and LLM object for later usage

In [8]:
llm = ChatOpenAI(
    api_key= os.environ['OPENAI_API_KEY'],
    model="gpt-4o-mini",
    temperature=0.6
)
embedding = OpenAIEmbeddings(
    api_key=os.environ['OPENAI_API_KEY']
)

### Create my Langchain vector store backed by AstraBD

In [9]:
astra_vector_store = Cassandra(
    embedding= embedding,
    table_name='qa_mini',
    session=None,
    keyspace=None
)

### Converting the whole data into chunks

In [14]:
text_spliter = RecursiveCharacterTextSplitter(
    separators=["\n"],
    chunk_size = 800, 
    chunk_overlap = 200,
    length_function = len
    )
texts = text_spliter.split_text(raw_text)
    

### Loading the dataset into the vector store

In [16]:
astra_vector_store.add_texts(texts=texts)

['ae378674ddad4fcdb7d2074b69625f4c',
 '11b28939bb1240b682f4cc1f5afb21fe',
 '61a266af473d4bdd8230e220cb741901',
 '5237fa96bfe744869f8030f811260765',
 'f164b88d1d6f481dacd02652f991ab33',
 '6eb047c02b7e444b9ffdd662bbd3ae55',
 'fdd0983fa24b4b40a8748a8db4ec991e',
 '81bac33759d64a89a8ecfe03f95f1713',
 '5c3551dabb924da6bc0c37c9f7676ae7',
 'af52342e38864c05bdc3f613cb7e26ac',
 '16420ab4b2a94a8eb05fe8dfcdc89b23',
 '9643a56d755844f1b4fb45dfccd67555',
 '61fd5442f5444ce687eed10228407601',
 '1ee1fa5215f54a9d9bf47c53c84a4aec',
 '806c5fddfc31445e9bfba44046156092',
 '29030f00be34465ea55823e6405fa2de',
 'ee001cdaca0b45ad97e84302337aa141',
 'a3541e1b979f4972ba2571f2c0aaab58',
 '8b7ee1309b9d4badb9a8ff2e4543c212',
 '3a0dbb9e1fe24e679815e78988b9c5f6',
 '998a9e895dd74bf6a4abdf6ac4ee7c97',
 '0067285eaef6477c9c525ce8e750c676',
 'bce3adb1c5c24f448e1d32a565727b79',
 '39adf209a7b245f68d803a608410f2e6',
 '707e8ec41f4749508e131f4a66d7dc9a',
 '41a4e0917e97480393a5ad641b0f7827',
 '9314cb9ad77f465c9e08a8c8c6c5a456',
 

### Running the QA cycle

In [18]:
retriever = astra_vector_store.as_retriever(search_kwargs={"k": 3})

In [23]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

prompt = ChatPromptTemplate.from_template("""
Answer the question using only the context below.

Context:
{context}
Question:
{question}
"""
)

rag_chain = (
    {"context" : retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
)

In [24]:
first_question = True

while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()

    if query_text.lower() == 'quit':
        break
    if query_text == "":
        continue

    first_question = False

    print(f'\nQuestion : "{query_text}"')

   
    answer = rag_chain.invoke(query_text)
    print("\nAnswer:")
    print(answer.content)

    print("\nFirst documents by relevance:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=3):
        print(f'     [{score:.4f}] "{doc.page_content[:84]}..."')


Question : "how many buildings were there?"

Answer:
There were 408 buildings.

First documents by relevance:
     [0.9050] "Number of
instances408We also choose those areas where parking problem is a
common i..."
     [0.8837] "parking space, for how much time in a day it remains vacant,
whether a specific park..."
     [0.8790] "Shyamoli 8:00 AM - 9:00 AM 5:00 PM - 6:00 PM 42
Mohammadpur 8:00 AM - 9:00 AM 5:00 P..."

Question : "how many area were there?"

Answer:
There were seven areas.

First documents by relevance:
     [0.8981] "Number of
instances408We also choose those areas where parking problem is a
common i..."
     [0.8899] "the number of instances from each area based on average
departure and arrival time.
..."
     [0.8848] "seven different areas to group 408 parking spaces. We use
longitude and latitude val..."

Question : "who wrote the full document?"

Answer:
The context provided does not specify who wrote the full document.

First documents by relevance:
     [0.849