## PDF Query using LangChain

In [21]:
from IPython.display import display as disp

### Loading the PDF with a LangChain document loader

In [22]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [23]:
# Open the PDF at ./tmp/tmp.pdf and keep the documents returned by `load_and_split` in `pages`.
# NOTE(review): `load_and_split` presumably applies a default text splitter as well, so the
# count printed below may be a number of chunks rather than PDF pages — confirm.
loader = PyPDFLoader("./tmp/tmp.pdf")
pages = loader.load_and_split()
# Report how many documents were produced and the text length of the first one.
print(f'no of page  = {len(pages)}')
print(f'page_content length = {len(pages[0].page_content)}')

no of page  = 380
page_content length = 192


#### Splitting the data into smaller document chunks

In [24]:
# Splitter configured with chunk_size=1000 and no overlap between chunks.
text_spliter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

# Split the loaded documents and display one of the resulting chunks.
texts = text_spliter.split_documents(pages)
texts[8]

Document(page_content='be able to reproduce, animals must solve a continuous stream of problems during theirlives, e.g., ﬁnding food, avoiding predators, mating, and parenting. This suggests that\nhuman intelligence primarily evolved for solving everyday problems related to survival\nin the different habitats of Homo sapiens .\nArtiﬁcial Intelligence started as an attempt to reproduce parts of human intelligence\nin machines and, just like the notion of human intelligence, it is associated with a\ncertain vagueness regarding its deﬁnition, targeted problems, performance measures,\nand relations to neighboring research ﬁelds.\nRecently, AI research has been quite successful at producing systems that are gen-\neral in the sense that they can translate between many languages, play many games,\nmanipulate many objects, predict many video frames, write many texts, generate many\nimages, and diagnose many diseases.\nStill, many of the basic challenges of AGI remain unsolved. In fact, we do n

##### Database Connection 

## Embedding with Chroma DB 

In [25]:
import os
import numpy as np 
from pathlib import Path
from langchain.vectorstores import Chroma  
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders.pdf import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import  OpenAI

from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain.document_loaders import TextLoader, DirectoryLoader

In [26]:
# Same load-and-split steps as the earlier cells, repeated here so that
# `pages` and `texts` are defined for the embedding section.
loader = PyPDFLoader("./tmp/tmp.pdf")
pages = loader.load_and_split()

text_spliter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_spliter.split_documents(pages)

# Display one chunk as a quick check.
texts[8]

Document(page_content='be able to reproduce, animals must solve a continuous stream of problems during theirlives, e.g., ﬁnding food, avoiding predators, mating, and parenting. This suggests that\nhuman intelligence primarily evolved for solving everyday problems related to survival\nin the different habitats of Homo sapiens .\nArtiﬁcial Intelligence started as an attempt to reproduce parts of human intelligence\nin machines and, just like the notion of human intelligence, it is associated with a\ncertain vagueness regarding its deﬁnition, targeted problems, performance measures,\nand relations to neighboring research ﬁelds.\nRecently, AI research has been quite successful at producing systems that are gen-\neral in the sense that they can translate between many languages, play many games,\nmanipulate many objects, predict many video frames, write many texts, generate many\nimages, and diagnose many diseases.\nStill, many of the basic challenges of AGI remain unsolved. In fact, we do n

In [27]:
# Working directories used by the notebook:
#   CACHE_DATASET - passed as the cache folder when the sentence-transformer model is loaded
#   DATABASE_DIR  - passed as the persist directory of the Chroma vector store
CACHE_DATASET = Path('cache_dir')
DATABASE_DIR = Path('Knowledge_space')

# `exist_ok=True` makes the cell safe to re-run and avoids the separate
# "check, then create" step; a regular file at either path still raises FileExistsError.
for _directory in (DATABASE_DIR, CACHE_DATASET):
    _directory.mkdir(parents=True, exist_ok=True)

In [28]:
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence-transformer, passing the absolute
# CACHE_DATASET path as its cache folder.
embedding_model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    cache_folder=str(CACHE_DATASET.resolve()),
)

# text_embedding_vector = embedding_model.encode([text.page_content for text in texts])

In [29]:
# Gather the raw text of every chunk into a NumPy array and show its shape.
exp_text = np.array([chunk.page_content for chunk in texts])
exp_text.shape

(1203,)

In [34]:
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma

# LangChain embedding wrapper configured with the all-MiniLM-L6-v2 model name;
# this is the object handed to the Chroma vector stores below.
embedding_function = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

### Creating Vector Store 

In [36]:
# Build a Chroma vector store from the chunks in `texts`, embedding them with
# `embedding_function` and using the absolute DATABASE_DIR path as the persist directory.
vector_db = Chroma.from_documents(
    documents=texts,
    embedding=embedding_function,
    persist_directory=str(DATABASE_DIR.resolve()),
)

In [None]:
import chromadb 
# Create a chromadb client with default arguments.
# `db_client` is not referenced by any later cell visible in this notebook.
db_client = chromadb.Client()

In [None]:
from InstructorEmbedding import INSTRUCTOR
# Load the 'hkunlp/instructor-large' INSTRUCTOR model.
model = INSTRUCTOR('hkunlp/instructor-large')
sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments"
instruction = "Represent the Science title:"
# `encode` is given a list containing a single [instruction, sentence] pair.
embeddings = model.encode([[instruction,sentence]])
print(embeddings)


In [None]:
from InstructorEmbedding import INSTRUCTOR
# Same experiment as the previous cell, but with the 'hkunlp/instructor-xl' model;
# this rebinds `model` and `embeddings`.
model = INSTRUCTOR('hkunlp/instructor-xl')
sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments"
instruction = "Represent the Science title:"
# `encode` is given a list containing a single [instruction, sentence] pair.
embeddings = model.encode([[instruction,sentence]])
print(embeddings)


## Chroma DB Client and LangChain Chroma

In [39]:
import chromadb

In [40]:
# Root folder handed to the persistent Chroma client, plus a per-user sub-folder beneath it.
DATABASE_PATH = Path('VectorDB')
user_1_db = DATABASE_PATH.joinpath('user1')

In [41]:
# Persistent Chroma client whose storage path is the absolute form of DATABASE_PATH.
client = chromadb.PersistentClient(path=str(DATABASE_PATH.resolve()))

In [42]:
import chromadb
# HTTP client pointed at localhost:8000.
# NOTE(review): presumably requires a Chroma server listening on that port — confirm.
# `chroma_client` is not referenced by the later cells visible here; they use `client`.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [45]:
# Fetch "Collection2", creating it on first use so that this cell also runs
# against a fresh database (a plain `get_collection` fails when the collection
# does not exist yet, and the `create_collection` call had been commented out).
collection = client.get_or_create_collection(name="Collection2")

In [49]:
# Exploration aid: print every attribute name available on the persistent client.
print(dir(client))


['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slotnames__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_add', '_admin_client', '_count', '_create_system_if_not_exists', '_delete', '_get', '_get_identifier_from_settings', '_identifer_to_system', '_identifier', '_modify', '_peek', '_populate_data_from_system', '_query', '_server', '_system', '_update', '_upsert', '_validate_tenant_database', 'clear_system_cache', 'count_collections', 'create_collection', 'database', 'delete_collection', 'from_system', 'get_collection', 'get_or_create_collection', 'get_settings', 'get_version', 'heartbeat', 'list_collections', 'max_batch_size', 'reset', 'set_database'

In [53]:
# Display the collections known to the persistent client.
client.list_collections()

[Collection(name=Collection2), Collection(name=Collection1)]

In [54]:
# LangChain Chroma wrapper bound to the persistent `client`, using the collection
# named "Collection_1" and `embedding_function` for embeddings.
# NOTE(review): the recorded output of `client.list_collections()` shows "Collection1"
# and "Collection2" (no underscore) — confirm that "Collection_1" is the intended name.
langchain_chroma = Chroma(
    client=client,
    collection_name="Collection_1",
    embedding_function=embedding_function,
)

In [59]:
# `from_documents` is a classmethod: calling it on the `langchain_chroma` instance
# builds a brand-new store and ignores that instance's client and collection.
# Pass both explicitly so the chunks in `texts` are written to "Collection_1"
# through the persistent `client`.
lang_chroma = Chroma.from_documents(
    documents=texts,
    embedding=embedding_function,
    client=client,
    collection_name="Collection_1",
)

In [58]:
# Display the contents of the collection wrapped by `langchain_chroma`.
# The recorded execution count (58) is lower than the preceding cell's (59),
# so the saved output was produced before that cell ran.
langchain_chroma.get()

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

In [61]:
# Expose the vector store through LangChain's retriever interface and display it.
lang_chroma.as_retriever()

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002117DDD26D0>)