Steps for setting up the knowledge base (vector store) for the LLM

1. Load raw PDFs, Books, Notes
2. Create chunks
3. Create vector embeddings
4. Store embeddings in FAISS

In [2]:
#REQUIREMENTS 

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 


In [3]:
# Loading raw PDFs

data_path = "data/"


def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob and read with ``PyPDFLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Read every PDF under data_path and report how many documents were loaded.
documents = load_pdf_files(data_path)
print("Length of PDF pages:", len(documents))

Length of PDF pages: 153


In [4]:
# Create Chunks


def create_chunks(extracted_data, chunk_size=1000, chunk_overlap=500):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``).
        chunk_size: Forwarded as the splitter's ``chunk_size``.
            Defaults to 1000, the value this notebook originally hard-coded.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.
            Defaults to 500, the value this notebook originally hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    # The sizes are parameters so experiments with other chunking settings
    # do not require editing this function.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Split the loaded documents and report how many chunks were produced.
chunks = create_chunks(documents)
print('length of text chunks: ', len(chunks))

length of text chunks:  365


In [5]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [6]:
# Create Vector Embeddings


def get_embedding_model(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the embedding model used to vectorize text chunks.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``, the model
            this notebook originally hard-coded.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    # NOTE(review): an index built with one embedding model presumably has to
    # be queried with the same model; confirm before changing the default.
    return HuggingFaceEmbeddings(model_name=model_name)

# Instantiate the embedding model once; the FAISS cell below reuses it.
embedding_model = get_embedding_model()

  embedding_model=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from dotenv import find_dotenv, load_dotenv

# Load the .env file located by find_dotenv(); the bare call is kept as the
# last expression so the cell displays load_dotenv's return value.
load_dotenv(find_dotenv())

True

In [8]:
# Store embeddings in FAISS

DB_FAISS_PATH = "vectorstore/db_faiss"

# Build a FAISS store from the chunks using the embedding model,
# then save it under DB_FAISS_PATH.
db = FAISS.from_documents(documents=chunks, embedding=embedding_model)
db.save_local(folder_path=DB_FAISS_PATH)
