In [1]:
from langchain_openai import AzureChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from pinecone import Pinecone
from langchain_pinecone.vectorstores import PineconeVectorStore


  from tqdm.autonotebook import tqdm


In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [3]:
directory = "E:/2024_GenAI/LANGCHAIN/Langchain_Pinecone/data/Canada.pdf"

def load_docs(directory):
    loader = PyPDFLoader(directory)
    documents = loader.load()
    return documents

documents = load_docs(directory)
len(documents)

65

In [4]:
def split_docs(documents, chunk_size=500, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs

docs = split_docs(documents)
len(docs)

511

In [5]:
embeddings = AzureOpenAIEmbeddings(
    api_key=AZURE_OPENAI_API_KEY,
    api_version='2024-02-15-preview',
    azure_deployment='vector-search-instance2',
    azure_endpoint='https://cb-att-openai-instance.openai.azure.com/'
)

In [6]:
pc = Pinecone(
    api_key=PINECONE_API_KEY,
    environment = 'us-east-1'
)

index_name = 'langchain-pinecone'
index = pc.Index(index_name)

vectordb = PineconeVectorStore.from_documents(documents=docs, embedding=embeddings, index_name=index_name)

In [7]:
vectordb.similarity_search("what is the population in canada", k=2)

[Document(page_content="has also produc ed one of the world's most successful and widely\nused soundi ng rockets, the Black Brant.[322]\nThe 2021 Canadian census enumerated a total\npopul ation of 36,991,981, an increase of around 5.2\npercent over the 2016 figure.[324] It is estimated that\nCanada's popul ation surpassed 40,000,000 in 2023.[325]\nThe main drivers of popul ation growth are immigration\nand, to a lesser extent, natural growth.[326] Canada has\none of the highest per-capita immigration rates in the", metadata={'page': 16.0, 'source': 'E:/2024_GenAI/LANGCHAIN/Langchain_Pinecone/data/Canada.pdf'}),
 Document(page_content="has also produc ed one of the world's most successful and widely\nused soundi ng rockets, the Black Brant.[322]\nThe 2021 Canadian census enumerated a total\npopul ation of 36,991,981, an increase of around 5.2\npercent over the 2016 figure.[324] It is estimated that\nCanada's popul ation surpassed 40,000,000 in 2023.[325]\nThe main drivers of popul ation

In [8]:
vectordb.similarity_search_with_score("what is the culture in canada?", k=2)

[(Document(page_content='cultural, political, social, and econom ic fabric of Canada".[467]\nCanada has a well-developed media sector, but its cultural output—\nparticularly in English films, television shows, and magazines—is\noften overshadowed by imports from the United States.[468] As a\nresult, the preservation of a distinctly Canadian culture is suppor ted\nby federal gove rnment programs, laws, and institutions such as the\nCanadian Broadcasting Corporation (CBC), the National Film', metadata={'page': 22.0, 'source': 'E:/2024_GenAI/LANGCHAIN/Langchain_Pinecone/data/Canada.pdf'}),
  0.868315756),
 (Document(page_content='cultural, political, social, and econom ic fabric of Canada".[467]\nCanada has a well-developed media sector, but its cultural output—\nparticularly in English films, television shows, and magazines—is\noften overshadowed by imports from the United States.[468] As a\nresult, the preservation of a distinctly Canadian culture is suppor ted\nby federal gove rnment p

In [9]:
from langchain_community.chat_models import AzureChatOpenAI

In [10]:
llm = AzureChatOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version='2024-02-15-preview',
    azure_deployment='atttestgpt35turbo',
    azure_endpoint='https://cb-att-openai-instance.openai.azure.com/'
)

  warn_deprecated(


In [15]:
from langchain.chains.question_answering import load_qa_chain

chain = load_qa_chain(llm=llm, chain_type="stuff")

In [16]:
query = "what is the population in canada"

matching_docs = vectordb.similarity_search(query)
answer = chain.run(input_documents=matching_docs, question=query)
answer

"According to the 2021 Canadian census, the total population of Canada is 36,991,981. It is estimated that Canada's population surpassed 40,000,000 in 2023."