In [1]:
%%time

from IPython.display import clear_output

! pip install sentence_transformers==2.2.2

! pip install -qq -U langchain
! pip install -qq -U langchain-community
! pip install -qq -U langchain-huggingface
! pip install -qq -U tiktoken
! pip install -qq -U pypdf
! pip install -qq -U faiss-gpu-cu12
! pip install -qq -U InstructorEmbedding 

! pip install -qq -U transformers 
! pip install -qq -U accelerate
! pip install -qq -U bitsandbytes
! pip install -qU langchain-openai

Collecting sentence_transformers==2.2.2
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers==2.2.2)
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting tqdm (from sentence_transformers==2.2.2)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.6.0 (from sentence_transformers==2.2.2)
  Using cached torch-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision (from sentence_transformers==2.2.2)
  Using cached torchvision-0.22.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting numpy (from sentence_transformers==2.2.2)
  Using cached numpy-2.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting scikit-learn (from sentence_transformers==2.2.2)
  Using cached scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy (from sent

In [2]:
import getpass
import os

# Ask for the key interactively only when the environment variable is
# missing or empty; the value is never written into the notebook itself.
if os.getenv("OPENAI_API_KEY", "") == "":
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [9]:
from langchain_openai import OpenAIEmbeddings

In [10]:
%%time

import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time

import langchain_community
import langchain

### loaders
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain_community.vectorstores import FAISS

### models
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

CPU times: user 364 μs, sys: 109 μs, total: 473 μs
Wall time: 477 μs


# Create configuration

In [20]:
class CFG:
    """Central place for the tunable settings used by the notebook cells."""

    # --- LLM settings (disabled) ---
    # model_name = 'llama2-13b-chat' # wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B
    # temperature = 0
    # top_p = 0.95
    # repetition_penalty = 1.15

    # --- text splitting: chunk size and overlap handed to the splitter ---
    split_chunk_size, split_overlap = 800, 0

    # --- embeddings (disabled) ---
    # embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # --- retrieval: number of passages requested per query ---
    k = 10

    # --- file locations, relative to the notebook's working directory ---
    PDFs_path = './input/books/'
    Embeddings_path = './output/book-embeddings/'
    Output_folder = './output/books-vectordb'

# Loading the data

In [21]:
# Show, in sorted order, every entry found under the PDF input folder.
sorted(glob.iglob(CFG.PDFs_path + '*'))

['./input/books/Journey of Black and Red, A - Mecanimus.pdf']

In [22]:
%%time

loader = DirectoryLoader(
    CFG.PDFs_path,
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True
)

documents = loader.load()

100%|██████████| 1/1 [00:57<00:00, 57.07s/it]

CPU times: user 57 s, sys: 100 ms, total: 57.1 s
Wall time: 57.1 s





In [25]:
# Report how many page documents were loaded ...
page_total = len(documents)
print(f'We have {page_total} pages in total')

# ... and display the raw text of the document at index 8 as a sample.
documents[8].page_content

We have 5570 pages in total


'How I wish I could convey my outrage at being held like this! Not\neven a bucket of water, or a chamber pot! Am I to live like a beast? I\ndo not want to think about it. I do not want to think about a great\nmany things.\nThe smaller, white man jumps in surprise and even the Asian\nguardian lifts an eyebrow. What is wrong with them? Did they\nexpect me to cower, to beg?\n“Well,  Milady. Forgive this humble Baudouin, heh? Did not expect ya\nto be so…”\nI huff with impatience and address his companion.\n“How about you, warrior, care to explain why I am being held so?”\nWhile Baudouin is flustered, this one seems barely amused.\n“It is for your own safety.”\n“My safety? I will be secure when I am unbound and at home, you\nrogue! What will it take for you to release me?”\nBaudouin interrupts me, apparently miffed at being ignored.\n“Don’t ya worry your cute little head, Lady, you’ll be released soon\nenough.”\n“I… I…”\nI want to go on, I want to extract information from the reluctant\nduo

# Chunking the data

In [32]:
# Break the page documents into chunks, using the chunk size and overlap
# configured in CFG.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CFG.split_overlap,
    chunk_size=CFG.split_chunk_size,
)
texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 11473 chunks from 5570 pages


# Creating the embeddings

In [27]:
# The `text-embedding-3` family lets the caller choose the size of the
# returned vectors; 1024-dimensional embeddings are requested here.
embeddings = OpenAIEmbeddings(dimensions=1024, model="text-embedding-3-large")

# Instantiating the database

In [None]:
%%time

from langchain_huggingface import HuggingFaceEmbeddings

### we create the embeddings only if they do not exist yet
if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):

    ### create embeddings and DB
    vectordb = FAISS.from_documents(
        documents = texts, 
        embedding = embeddings
    )

    ### persist vector database
    vectordb.save_local(f"{CFG.Output_folder}/faiss_index_hp") # save in output folder
#     vectordb.save_local(f"{CFG.Embeddings_path}/faiss_index_hp") # save in input folder

CPU times: user 3.99 s, sys: 795 ms, total: 4.79 s
Wall time: 57.2 s


In [None]:
## Load previously generated embeddings

In [29]:
%%time
from langchain_huggingface import HuggingFaceEmbeddings

### load vector DB embeddings
vectordb = FAISS.load_local(
#    CFG.Embeddings_path, # from input folder
    CFG.Output_folder + '/faiss_index_hp', # from output folder
    embeddings,
    allow_dangerous_deserialization=True
)

CPU times: user 29.2 ms, sys: 69.2 ms, total: 98.4 ms
Wall time: 111 ms


# Test the database

In [30]:
### sanity check: a simple query should return documents from the loaded vector DB
vectordb.similarity_search(query = 'magic creatures')

[Document(id='e18175d3-5991-4cd9-92e7-56fb724c432f', metadata={'producer': 'calibre (7.18.0) [https://calibre-ebook.com]', 'creator': 'calibre (7.18.0) [https://calibre-ebook.com]', 'creationdate': '2025-05-17T13:48:53+00:00', 'author': 'Mecanimus', 'moddate': '2025-05-17T13:48:53+00:00', 'title': 'A Journey of Black and Red', 'source': 'input/books/Journey of Black and Red, A - Mecanimus.pdf', 'total_pages': 5570, 'page': 3629, 'page_label': '3630'}, page_content='their midst. There are tales of witches, werewolves and fae filled\nwith inexact statements and exaggerations, and yet the flowing\nprose makes them so very believable. The mysterious author also\nincludes absurd creatures such as chupacabras and drop bears\nwhich we are reasonably certain are jokes. He even mentions\ntraveling courts of magical dancers and small winged creatures!”\nWait.\nWait.\nHold on.\nThis sounds awfully familiar.\n“Ahem, imagine that. What does he say about the fae?”'),
 Document(id='81cfdb9e-6800-4b7a

In [31]:
### max-marginal-relevance search, requesting CFG.k passages
question = "Who turns Ariane into a vampire?"
vectordb.max_marginal_relevance_search(query = question, k = CFG.k)

[Document(id='cf279924-e021-4fc5-899f-4ec40517c2ee', metadata={'producer': 'calibre (7.18.0) [https://calibre-ebook.com]', 'creator': 'calibre (7.18.0) [https://calibre-ebook.com]', 'creationdate': '2025-05-17T13:48:53+00:00', 'author': 'Mecanimus', 'moddate': '2025-05-17T13:48:53+00:00', 'title': 'A Journey of Black and Red', 'source': 'input/books/Journey of Black and Red, A - Mecanimus.pdf', 'total_pages': 5570, 'page': 763, 'page_label': '764'}, page_content='attended church every Sunday. I am Ariane of the Nirari, the\ndaughter of Thorn and Hunger, she who carved a bloody path to\nfreedom through vampires and werewolves alike. That Ariane does'),
 Document(id='9ede49bb-1973-4477-9179-3797771c9537', metadata={'producer': 'calibre (7.18.0) [https://calibre-ebook.com]', 'creator': 'calibre (7.18.0) [https://calibre-ebook.com]', 'creationdate': '2025-05-17T13:48:53+00:00', 'author': 'Mecanimus', 'moddate': '2025-05-17T13:48:53+00:00', 'title': 'A Journey of Black and Red', 'source': '

In [None]:
### plain similarity search for the same question, requesting CFG.k passages
question = "Who turns Ariane into a vampire?"
vectordb.similarity_search(query = question, k = CFG.k)

# Instantiating the model

In [None]:
# ChatOpenAI is not imported by any of the cells above (only OpenAIEmbeddings
# is), so import it here to avoid a NameError on a fresh kernel.
from langchain_openai import ChatOpenAI

# gpt-4o chat model with temperature 0; no explicit token limit or timeout
# is passed, and up to 2 retries are requested.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)