In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings 
from langchain_chroma import Chroma
from langchain_core.documents import Document
import chromadb
from chromadb.utils import embedding_functions



## Load the PDF and split it into chunks

In [2]:
# Load the PDF; the first count printed below is the number of Documents
# returned by the loader (one per page, judging by the `page` metadata).
pdf_path = "./DLVS.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load()
print(len(pages))

# Split into chunks of at most 2500 characters, with 1000 characters of
# overlap between neighbouring chunks so that context spanning a chunk
# boundary is still retrievable.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500,
    chunk_overlap=1000,
    length_function=len,
    is_separator_regex=False,
)
chunks = text_splitter.split_documents(pages)

# Give every chunk an explicit, deterministic *string* id (its position in
# the chunk list). Vector-store ids are strings, so convert here instead of
# relying on implicit int -> str coercion inside Document.
documents = [
    Document(page_content=chunk.page_content, metadata=chunk.metadata, id=str(idx))
    for idx, chunk in enumerate(chunks)
]
print(len(documents))

145
180


In [3]:
# Unpack the chunk Documents into the three parallel lists that are later
# handed to the Chroma collection: ids, metadata dicts and raw texts.
ids = [chunk.id for chunk in documents]
metadata = [chunk.metadata for chunk in documents]
docs = [chunk.page_content for chunk in documents]

In [4]:
# Connect to the Chroma server and prepare the embedding function used for
# the collection.
client = chromadb.HttpClient(host='localhost', port=8000)
collection_name = 'thesis'
ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name='all-MiniLM-L6-v2')

existing_collections = list(client.list_collections())
print(existing_collections)

# Delete the collection if it already exists so the notebook starts from a
# clean slate. list_collections() may return Collection objects (as in the
# captured output) or plain names depending on the chromadb version, so
# compare by name: testing the string against the objects themselves never
# matches and would silently skip the delete.
existing_names = {getattr(entry, "name", entry) for entry in existing_collections}
if collection_name in existing_names:
    client.delete_collection(collection_name)


  from tqdm.autonotebook import tqdm, trange


[Collection(id=f5563a7f-2e8d-4b17-a094-641a5fd980be, name=thesis)]


In [5]:
# Create the collection (or reuse it if it is already there) with cosine
# distance configured for the HNSW index.
thesis_collection = client.create_collection(
    name=collection_name,
    embedding_function=ef,
    metadata={"hnsw:space": "cosine"},
    get_or_create=True,
)

# No embeddings are passed explicitly; the texts are expected to be embedded
# via the collection's embedding function `ef`.
thesis_collection.add(ids=ids, documents=docs, metadatas=metadata)

## Test that the stored collection can be loaded and queried

In [2]:
# Reconnect to the Chroma server and fetch the existing collection, passing
# the same embedding model so query texts are embedded consistently with
# the stored chunks.
client = chromadb.HttpClient(host='localhost', port=8000)
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name='all-MiniLM-L6-v2'
)
thesis_collection = client.get_collection(name='thesis', embedding_function=ef)

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Ask the collection for the three chunks closest to the query text.
query = "DIFRINT"
matches = thesis_collection.query(query_texts=query, n_results=3)

In [4]:
# Display the retrieved chunk texts (a nested list: one inner list per query text).
matches['documents']

[['100 \n 7.1 DIFRINT Implementation  \nDIFRINT proposes a deep frame interpolation architecture intended for   video \nstabilization, that is depicted in Figure 7.1. The training will take place in an \nunsupervised manner. We will go over the training and testing scheme separately as \nthey differ and finally, we will introduce two inference parameters: number of \niterations and the ‘skip’ parameter, which are both cruci al for achieving high quality \nstabilization.  \n \nFigure 7.1: The DIFRINT   [40] framework during (a) training and (b) testing.  \n \n7.1.1  Training Scheme  \nThe basic idea is that given two adjacent frames 𝑓𝑖−1,𝑓𝑖+1 we want to generate the \nintermediate frame 𝑓𝑖𝑛𝑡. The first part is generating the input to the network. Instead \nof simply concatenating the two images and feeding them to the network, the authors \nwarp the two frames towards the intermediate frame 𝑓𝑖, through optical flow \nestimated with PWC -Net [8], thus producing the warped frames 𝑓𝑤−,𝑓𝑤+.

In [5]:
# Wrap the same server-side collection in a LangChain vector store, reusing
# the existing `client` connection. The embedding model name matches the one
# the collection was built with.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store_from_client = Chroma(
    collection_name="thesis",
    embedding_function=embeddings,
    client=client,
)

In [6]:
# Run the same query through the LangChain wrapper; results come back as
# Document objects (text plus page/source metadata) rather than raw lists.
vector_store_from_client.similarity_search(query)

[Document(metadata={'page': 110, 'source': './DLVS.pdf'}, page_content='100 \n 7.1 DIFRINT Implementation  \nDIFRINT proposes a deep frame interpolation architecture intended for   video \nstabilization, that is depicted in Figure 7.1. The training will take place in an \nunsupervised manner. We will go over the training and testing scheme separately as \nthey differ and finally, we will introduce two inference parameters: number of \niterations and the ‘skip’ parameter, which are both cruci al for achieving high quality \nstabilization.  \n \nFigure 7.1: The DIFRINT   [40] framework during (a) training and (b) testing.  \n \n7.1.1  Training Scheme  \nThe basic idea is that given two adjacent frames 𝑓𝑖−1,𝑓𝑖+1 we want to generate the \nintermediate frame 𝑓𝑖𝑛𝑡. The first part is generating the input to the network. Instead \nof simply concatenating the two images and feeding them to the network, the authors \nwarp the two frames towards the intermediate frame 𝑓𝑖, through optical flow \ne