In [37]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

openai.api_key = os.environ['OPENAI_API_KEY']
  


In [38]:
from langchain.document_loaders import PyPDFLoader

# Load PDF
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader("docs/papers/paper_1.pdf"),
    PyPDFLoader("docs/papers/paper_2.pdf"),
    PyPDFLoader("docs/papers/paper_3.pdf"),
    PyPDFLoader("docs/papers/paper_4.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [39]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [40]:
splits = text_splitter.split_documents(docs)

In [41]:
len(splits)

312

In [42]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [43]:
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

In [44]:
embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

In [45]:
import numpy as np

In [46]:
np.dot(embedding1, embedding2)

0.9631675619330512

In [47]:
np.dot(embedding1, embedding3)

0.7710630976675917

In [48]:
np.dot(embedding2, embedding3)

0.7596682675219103

In [49]:
from langchain.vectorstores import Chroma

In [50]:
persist_directory = 'docs/chroma/'

In [51]:
!rm -rf ./docs/chroma  # remove old database files if any

'rm' is not recognized as an internal or external command,
operable program or batch file.


In [52]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [53]:
print(vectordb._collection.count())

460


In [54]:
question = "is there an abstract"

In [55]:
docs = vectordb.similarity_search(question,k=3)

In [56]:
len(docs)

3

In [57]:
docs[0].page_content

'9  Summary  and  outlook 78\nmore  efficient  strategy  for  real  data  set  creation  can  also  be  significantly  reduced.  \nImprovements  in  synthetic  PBR  data  set  creation  could  also  reduce  or  completely  \neliminate  the  need  for  real  training  data  in  the  future.Machine Translated by Google'

In [58]:
vectordb.persist()

In [59]:
question = "what did they say about yolox?"

In [60]:
docs = vectordb.similarity_search(question,k=5)

In [61]:
docs[0]

Document(page_content='\x08\x04\x08\x0c\x03\x06\x05\x07\x03\x0b\t\x04\x05\x08\x03\t\x08\x08\x03\x07\x05\x04\x03\t\x08\x0b\x03\x07\x05\x05\x03\t\t\x0b\n\x08\x0b\x03\x0b\x05\x06\x03\x05\x08\x0c\x03\r\x05\x06\x03\x0b\x08\x08\r\x03\x08\x08\n\x03\x06\r\x03\r\t\x05\x06\x08\n\x03\x0c\x05\x04\x03\r\x08\x0c\x03\x05\x05\x05\x03\n\x08\x07\x03\x07\x0c\x03\x07\x08\t\x03\x06\r\x03\x05\n\x14\x07\n\x03\x0b\x0c\x03\r\x07\r\x03\n\r\x03\n\x06\x07\x03\t\x05\n\x03\x08\x06\x08\x03\x06\x05\n\x03\n\x11\x08\x08\x03\t\x05\x05\x03\x05\x08\n\x03\x08\x05\x06\x03\x07\x10\x08\x0c\x03\x06\x05\x07\x03\x0b\t\x04\x05\x08\x03\t\x16\t\x04\x03\x08\x05\t\x03\x07\x05\n\x03\x07\x0f\x1b\x1b\x1c\x19\x1c\x1a\x1f"\x0e\x1a"\x08\x08\x03\x0b\x12\x18\x1f \x0e\x1a"\x02\x1e\x17\x13\x10\x13\x16\x01\x15\x1c\x1f$\x08\x07\x03\n\x06\x04\x03\n\x05\x04\x03\x06\x06\x05\x03\x0c\x05\x04\x03\x08\x08\x06\x03\x0c\x06\x07\x03\t\x05\n\x03\x08\x06\x08\x03\x06\x05\n\x03\n\x08\x06\x03\x0b\x0c\x08\x05\x03\t\x08\x05\x03\r\x0b\x03\x0c\x08\x05\x0b\x03\x0b\x

In [62]:
docs[1]

Document(page_content='[16]9\nWith  YOLOX,  further  changes  were  implemented  in  2021  based  on  YOLOv3  and  \nsignificant  improvements  were  achieved.This  will  be  expanded  in  2018  with  YOLOv3  to  include  skip  connections  and  thus  \nbecome  a  ResNet,  which  also  made  further  depth  possible.  This  gave  the  feature  \nextractor  53  convolutional  layers  and  the  name  Darknet-53  [20].\nAnother  change  of  YOLOX  compared  to  the  other  published  versions  from  YOLOv2  is  \nthat  the  architecture  was  again  designed  without  anchor  boxes,  as  these  generalize  \nmore  poorly  and  increase  complexity,  which  could  be  a  limitation  for  further  reducing  \nlatency  [17].  Furthermore,  in  recent  years,  anchor-free  detectors  have  achieved  results  \ncomparable  to  anchor-based  ones  [23].YOLOXSince  the  release  of  YOLO,  there  have  been  some  efforts  to  improve  the  original  \napproach.  In  2017,  anchor  boxes  were  

In [63]:
for doc in docs:
    print(doc.metadata)

{'page': 0, 'source': 'docs/papers/paper_3.pdf'}
{'page': 17, 'source': 'docs/papers/paper_4.pdf'}
{'page': 5, 'source': 'docs/papers/paper_3.pdf'}
{'page': 1, 'source': 'docs/papers/paper_3.pdf'}
{'page': 0, 'source': 'docs/papers/paper_3.pdf'}
