#### Importing Libraries

In [1]:
import openai
import langchain
import pinecone
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_openai import ChatOpenAI
import os

  from .autonotebook import tqdm as notebook_tqdm


#### Loading environment variables

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

### Reading the pdf document

In [3]:
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

In [4]:
doc = read_doc('document/')

In [5]:
len(doc)

3

### Converting the documents into chunks for the model restriction of token size

In [6]:
def chunk_data(doc, chunk_size = 500, chunk_overlap = 50):
    text_spliter = RecursiveCharacterTextSplitter(chunk_size = chunk_size, chunk_overlap = chunk_overlap)
    doc = text_spliter.split_documents(doc)
    return doc

In [7]:
documents = chunk_data(doc = doc)
documents

[Document(metadata={'producer': 'Skia/PDF m132', 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36', 'creationdate': '2025-02-05T22:06:35+00:00', 'title': 'View your immigration status – GOV.UK', 'moddate': '2025-02-05T22:06:35+00:00', 'source': 'document\\View your immigration status – GOV.UK.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='Name HOSSAIN MD\nABIR\nDate of\nbirth\n4 November 1997\nNational\nity\nBGD\nStatus Graduate Route\nValid\nfrom\n19 December\n2024\nValid\nuntil\n19 December\n2026\nIf any of the information\ndisplayed on your status\nis incorrect, contact UK\nVisas and Immigration\n(https://www.gov.uk/contact-\nukvi-inside-outside-uk).\nProve your status\nIf you need to prove your\nimmigration status to\nsomeone, you can do this\nonline with a share code.\nYour immigration\nstatus\nView and prove your immigration status'),
 Document(metadata={'producer': 'Skia/PDF m132', '

In [8]:
print(len(documents))

8


### Embedding Technique of OpenAI


In [9]:
embedding = OpenAIEmbeddings(api_key= os.environ['OPENAI_API_KEY'])

In [10]:
embedding

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001A5DCAEE0E0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001A5DD2C3D60>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [11]:
vectors = embedding.embed_query("How Are you?")
len(vectors)

1536

### Vector search DB in PineCone

In [12]:
from pinecone import Pinecone
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("langchainvector")

In [None]:
from langchain_pinecone import PineconeVectorStore

vectorstore = PineconeVectorStore.from_documents(
    documents=documents, 
    embedding=embedding, 
    index_name="langchainvector"
)

### Cosine Similarity to Retreive Results from Pinecone VectorDB

In [19]:
def retreive_query(query, k=2):
    matching_results = vectorstore.similarity_search(query, k=k)
    return matching_results

In [20]:
from langchain_classic.chains.question_answering import load_qa_chain

In [21]:
llm = ChatOpenAI(
    model = "gpt-4o-mini",
    api_key= os.environ["OPENAI_API_KEY"],
    temperature=0.6
)
chain = load_qa_chain(llm, chain_type="stuff")

### Search answers from VectorDB

In [22]:
def retreive_answer(query):
    doc_search = retreive_query(query)
    print(doc_search)
    response = chain.run(input_documents = doc_search, question = query)
    return response

In [25]:
my_query = "When his visa going to be end?"
answer  = retreive_answer(my_query)
print(answer)

[Document(id='5e55ef47-da04-4615-97ba-7f0859c7872c', metadata={'creationdate': '2025-02-05T22:06:35+00:00', 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36', 'moddate': '2025-02-05T22:06:35+00:00', 'page': 0.0, 'page_label': '1', 'producer': 'Skia/PDF m132', 'source': 'document\\View your immigration status – GOV.UK.pdf', 'title': 'View your immigration status – GOV.UK', 'total_pages': 3.0}, page_content='Name HOSSAIN MD\nABIR\nDate of\nbirth\n4 November 1997\nNational\nity\nBGD\nStatus Graduate Route\nValid\nfrom\n19 December\n2024\nValid\nuntil\n19 December\n2026\nIf any of the information\ndisplayed on your status\nis incorrect, contact UK\nVisas and Immigration\n(https://www.gov.uk/contact-\nukvi-inside-outside-uk).\nProve your status\nIf you need to prove your\nimmigration status to\nsomeone, you can do this\nonline with a share code.\nYour immigration\nstatus\nView and prove your immigration status\n05/02