# Download embedding and inference models to cache

In [1]:
from pathlib import Path
import os

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader, UnstructuredMarkdownLoader

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain_community.vectorstores import Chroma
#from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Paths for the project, both located in the current working directory
DB_DIR: Path = Path.cwd().joinpath('vectorstore.db')
HF_CACHE: Path = Path.cwd().joinpath('model_cache')

# exist_ok=True makes re-running this cell safe when the directory is present
HF_CACHE.mkdir(exist_ok=True)

# Plain-str form of the cache path for APIs (here langchain) that are handed a
# string instead of a pathlib object. It is derived from HF_CACHE so that both
# names always refer to the same directory on every OS; the previous
# hand-built "\model_cache" suffix only worked with Windows separators.
HF_CACHE_W_PATH: str = str(HF_CACHE)


# Alternative embedding models (currently disabled):
# EMBEDDING_MODEL = "mistralai/Mistral-7B-Instruct-v0.1"
# EMBEDDING_MODEL = "intfloat/e5-mistral-7b-instruct"
EMBEDDING_MODEL = "all-mpnet-base-v2"

### Load models into specific cache

In [2]:
# Build the HuggingFace embeddings object for EMBEDDING_MODEL, placed on the
# selected device and using the project-local cache folder.
embedding_model_kwargs = {"device": device}
huggingface_embeddings = HuggingFaceEmbeddings(
    cache_folder=HF_CACHE_W_PATH,
    model_kwargs=embedding_model_kwargs,
    model_name=EMBEDDING_MODEL,
)

  return self.fget.__get__(instance, owner)()


In [3]:
# One DirectoryLoader per supported file type, each matching its own glob
# pattern under "data/"
pdf_loader = DirectoryLoader("data/", loader_cls=PyPDFLoader, glob="**/*.pdf")
markdown_loader = DirectoryLoader(
    "data/", loader_cls=UnstructuredMarkdownLoader, glob="**/*.md"
)
text_loader = DirectoryLoader("data/", loader_cls=TextLoader, glob="**/*.txt")

all_loaders = [pdf_loader, markdown_loader, text_loader]

# Gather the documents of every loader into one flat list, in loader order
loaded_documents = [
    document
    for loader in all_loaders
    for document in loader.load()
]

# Split the loaded documents into chunks (chunk_size=500, chunk_overlap=40)
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=40, chunk_size=500)
chunked_documents = text_splitter.split_documents(loaded_documents)


Auto

In [4]:
from langchain_community.vectorstores import Chroma

# Build a Chroma vector store from the chunked documents, embedding them with
# huggingface_embeddings and storing the data under DB_DIR (POSIX-style path).
persist_path = DB_DIR.as_posix()
vector_database = Chroma.from_documents(
    embedding=huggingface_embeddings,
    documents=chunked_documents,
    persist_directory=persist_path,
)

# NOTE(review): newer Chroma releases may persist automatically and deprecate
# this explicit call — confirm against the installed version.
vector_database.persist()