# Setup

In [None]:
# Run this cell if using Google Colab
!pip -q install langchain-community==0.3.2
!pip -q install langchain-ollama==0.2.0 langchain-core==0.3.10 langchain-chroma==0.1.4
!pip -q install tiktoken chromadb pypdf transformers InstructorEmbedding
!pip -q install accelerate bitsandbytes
!pip -q install sentence_transformers==2.2.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m407.2/407.2 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.8/295.8 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# LangChain Multi-Doc Retriever with ChromaDB

***Points***
- Multiple Files - PDFs
- ChromaDB
- Local LLM
- Instructor Embeddings


## Setting up LangChain

In [None]:
import os

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import CSVLoader
from langchain.document_loaders import JSONLoader
from langchain.document_loaders import DirectoryLoader

from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


## Instructor Embeddings

In [None]:
# HuggingFace
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Build the embedding object from the "hkunlp/instructor-xl" checkpoint.
# The commented-out `model_kwargs` line is kept as an optional device
# override (presumably to run on a CUDA GPU — confirm before enabling).
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    # model_kwargs={"device": "cuda"},
)

In [None]:
# Ollama
from langchain_ollama import OllamaEmbeddings

# This rebinds `instructor_embeddings`: when the notebook is run top to
# bottom, the Ollama-backed object replaces the HuggingFace one created in
# the previous cell.
# NOTE(review): the model tag looks like an instruct/chat model — confirm it
# is the one intended for producing embeddings.
instructor_embeddings = OllamaEmbeddings(
    model="qwen2.5:14b-instruct-q5_K_M",
)

## Load and process multiple documents

In [None]:
# Load the source documents.
#
# Alternative loaders for other input types, kept for reference:
#   loader = TextLoader('single_text_file.pdf')
#   loader = CSVLoader(file_path="./example_data/mlb_teams_2012.csv")
#   loader = JSONLoader(file_path='./data/indonesian_rupiah_dataset.json', jq_schema='.', text_content=False)
#   loader = DirectoryLoader('./puebi/puebi/', glob="./*.csv", loader_cls=CSVLoader)
#
# Active loader: every file in the dataset directory matching the PDF glob
# is handed to PyPDFLoader.
loader = DirectoryLoader(
    './dataset/puebi-dan-penyuluhan',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)

documents = loader.load()

In [None]:
# Number of items returned by loader.load() (sanity check on the load step).
len(documents)

640

In [None]:
# Split the loaded documents into overlapping chunks; the chunk size and
# the overlap between consecutive chunks are set on the splitter below.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [None]:
# Number of chunks produced by text_splitter.split_documents(documents).
len(texts)

1092

## Create the DB

In [None]:
# NOTE(review): the setup cell installs `langchain-chroma`, but Chroma is
# imported from `langchain.vectorstores` here — confirm which package is
# intended before changing it.
from langchain.vectorstores.chroma import Chroma

# Supplying a persist_directory will store the embeddings on disk.
persist_directory = 'db'

# Embedding backend for the store: whichever object `instructor_embeddings`
# currently refers to.
embedding = instructor_embeddings

# Embed the text chunks and build the vector store from them.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)