# Module for computing embedding vectors from the training documents and storing them, so the LLM can use them as context for a query when producing a response.

I'll make a documents folder which holds the PDFs that I'll use as training material.

In [4]:
from pathlib import Path

# Create the "docs" directory; exist_ok=True makes re-running this cell a no-op
Path("docs").mkdir(parents=True, exist_ok=True)

Make a .env file to hold environment variables

In [5]:
from getpass import getpass
from pathlib import Path

# SECURITY: never hardcode a real API key in a notebook -- it ends up in version
# control and in shared copies. Any key that has ever been committed must be
# treated as compromised and rotated.
# The key is prompted for instead (input is not echoed) and .env is only created
# when it does not exist yet, so re-running this cell neither overwrites an
# existing file nor appends duplicate entries to it.
env_path = Path(".env")
if not env_path.exists():
    api_key = getpass("OpenAI API key: ")
    env_path.write_text(f"OPENAI_API_KEY='{api_key}'\n")

Writing .env


Install python-dotenv and load the environment variables from the .env file.

In [6]:
!pip install python-dotenv
import os
from dotenv import load_dotenv

load_dotenv()

Collecting python-dotenv
  Using cached python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0


True

Make a training script that takes paths to training documents, creates embeddings from them, and stores the embeddings in a local vector store.

In [7]:
%%writefile -a train.py
import sys
sys.path.append('..')
import os
import argparse
from ai.embeddings.openai_embeddings import OpenAI_Embeddings
from utils.loaders.pymupdf import PyMuPDF_Loader
from utils.splitters.recursive import RecursiveCharacter_TextSplitter
from utils.vectorstores.deep_lake import DeeplakeDB

openai_api_key = os.environ.get('OPENAI_API_KEY')

def load_and_split(pdf):
    """
    Load a single training document and cut its text into overlapping chunks.

    :param pdf: path to training document

    :return: split langchain Document objects
    """
    # pull the text out of the document
    extracted = PyMuPDF_Loader(pdf).load_text()
    # chunking setup: recursive split on "\n\n", then "\n", then " ";
    # chunk size and overlap are measured with len
    chunker = RecursiveCharacter_TextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return chunker.split_data(extracted)

def embed_and_store(docs):
    """
    Embed the given document chunks and save them in the local vector store.

    :param docs: list of split langchain Document objects
    """
    # embedding model handed to the vector store
    vectorizer = OpenAI_Embeddings(api_key=openai_api_key).vectorizer
    # the store computes the embeddings with that model when the docs are added
    # and keeps them under the given path
    store = DeeplakeDB(store_path='./embeddings_deeplake', embedding_model=vectorizer)
    store.add_docs(docs)

def main():
    """
    Entry point: read one or more document paths from the command line
    (separated by spaces), load and split each one, then embed and store
    all resulting chunks together.
    """
    parser = argparse.ArgumentParser(description='Training Script')
    parser.add_argument('PDF_paths', nargs='+', type=str, help='Paths to the documents')
    args = parser.parse_args()

    split_docs = []
    for doc in args.PDF_paths:
        # extend (not append): embed_and_store expects one flat list of chunks,
        # not a list holding one sub-list per input file
        split_docs.extend(load_and_split(doc))

    embed_and_store(split_docs)

if __name__=="__main__":
    main()

Writing train.py


To run `train.py`, navigate to the training directory and open a shell with `python -m pipenv shell` or `pipenv shell`. Then run `python train.py path_to_training_file1 path_to_training_file2 ...`

ex. python train.py docs/ePortEngageComboInstallGuide.pdf docs/ePortG9QuickstartGuide.pdf docs/ePortG11InstallGuide.pdf

In [None]:
%%writefile train.py
import sys
sys.path.append('..')
import os
import argparse
from ai.embeddings.embeddings_mapper import Embeddings_Mapper
from utils.loaders.loader_mapper import LoaderMapper
from utils.splitters.recursive import RecursiveCharacter_TextSplitter
from utils.vectorstores.deep_lake import DeeplakeDB

def choose_embeddings(model):
    """
    Look up an embeddings model by name.

    :param model: string that denotes which embeddings model to use
                  (the lookup is delegated to Embeddings_Mapper.find_model)

    :return: the result of Embeddings_Mapper.find_model for that name
    """
    return Embeddings_Mapper().find_model(model)


def get_files():
    """
    Read the training document paths passed on the command line.

    :return: list of the path strings given as positional arguments
             (at least one is required)
    """
    cli = argparse.ArgumentParser(description='Training Script')
    cli.add_argument('PDF_paths', nargs='+', type=str, help='Paths to the documents')
    return cli.parse_args().PDF_paths

def load_and_split(pdf):
    """
    Load a single training document and cut its content into overlapping chunks.

    :param pdf: path to training document

    :return: split langchain Document objects
    """
    # let the mapper pick the loader for this path, then load the document
    source_loader = LoaderMapper().find_loader(pdf)
    loaded = source_loader.load()
    # chunking setup: recursive split on "\n\n", then "\n", then " ";
    # chunk size and overlap are measured with len
    chunker = RecursiveCharacter_TextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return chunker.split_data(loaded)

def embed_and_store(docs):
    """
    Embed the given document chunks and save them in the local vector store.

    :param docs: list of split langchain Document objects
    """
    # embedding model handed to the vector store
    vectorizer = choose_embeddings("openai")
    # the store computes the embeddings with that model when the docs are added
    # and keeps them under the given path
    store = DeeplakeDB(store_path='./embeddings_deeplake', embedding_model=vectorizer)
    store.add_docs(docs)

def main():
    """
    Entry point: read one or more document paths from the command line
    (separated by spaces), load and split each one, then embed and store
    all resulting chunks together.
    """
    split_docs = []
    for doc in get_files():
        # extend (not append): embed_and_store expects one flat list of chunks,
        # not a list holding one sub-list per input file
        split_docs.extend(load_and_split(doc))

    embed_and_store(split_docs)

if __name__=="__main__":
    main()

# Now, we need to write test cases for the training module.

In [1]:
from pathlib import Path

# Create the "tests" directory; unlike `!mkdir`, this does not fail when the
# directory already exists and does not depend on the shell in use
Path("tests").mkdir(parents=True, exist_ok=True)