In [1]:
pip install langchain langchain-community langchain-core langchain-ollama chromadb sentence-transformers pypdf python-dotenv unstructured[pdf] tiktoken

Collecting tiktoken
  Using cached tiktoken-0.9.0-cp313-cp313-win_amd64.whl.metadata (6.8 kB)
Collecting unstructured[pdf]
  Using cached unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting chardet (from unstructured[pdf])
  Using cached chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured[pdf])
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured[pdf])
  Using cached python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting lxml (from unstructured[pdf])
  Using cached lxml-5.4.0-cp313-cp313-win_amd64.whl.metadata (3.6 kB)
Collecting nltk (from unstructured[pdf])
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting emoji (from unstructured[pdf])
  Using cached emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured[pdf])
  Using cached python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect 

ERROR: Could not install packages due to an OSError: [Errno 28] No space left on device



langchain, langchain-community, langchain-core: The core LangChain framework for building LLM applications.

langchain-ollama: Specific integration for using Ollama models with LangChain.

chromadb: The local vector database for storing document embeddings.

sentence-transformers: Used for an alternative local embedding method (explained later).

pypdf: A library for loading PDF documents.

python-dotenv: For managing environment variables (optional but good practice).

unstructured[pdf]: An alternative, powerful document loader, especially for complex PDFs.

tiktoken: Used by LangChain for token counting.

## How to Build a local RAG system with Qwen 3
- RAG: Retrieval-Augmented Generation, it a powerful techinque that enhances LLMs by providing them with external knowledge.
- Rag Process involves:
    - Loading and splitting documents into manageable chunks
    - Converting these chunks into numerical representations using an embedding model.
    - Storing these embeddings in a vector database for efficient searching.
    - When a query comes in, embedding the query and searching the vector database for the most similar document chunks.
    - Providing these relevant chunks along with the original query to the llm to generate an informed ans.

## Load documents in python

In [2]:
!pip install langchain_community



In [3]:
!pip install pypdf



In [4]:
import os 
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader

load_dotenv()

DATA_PATH = ''
PDF_FILENAME = 'test.pdf'

def load_document():
    # pdf_path = os.path.join(DATA_PATH, PDF_FILENAME)
    loader = PyPDFLoader(r'C:\Users\user\Desktop\Git-hub\LLM-engieering\test.pdf')

    documents = loader.load()
    print(f'Loaded {len(documents)} page(s) from {r'C:\Users\user\Desktop\Git-hub\LLM-engieering\test.pdf'}')
    return documents

load_document()

Loaded 2 page(s) from C:\Users\user\Desktop\Git-hub\LLM-engieering\test.pdf


[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20250516052051', 'source': 'C:\\Users\\user\\Desktop\\Git-hub\\LLM-engieering\\test.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content="RESUME\nNANDANI KUMARI\nVill:-Harivanshpur, Post:-Harhaki, Dist:-Madhubani\nMob:- 9308057178\nEmail:- sambhusk1716@gmail.com\n\nCAREER OBJECTIVE\nLooking for an entry into a world-class, highly professional organization with challenging and\ncompetitive environment, where I can use my knowledge base as well as personal attributes to\nachieve the organization goals.\n\nEDUCATIONAL QUALIFICATION\n10th - B.S.E.B PATNA - Passed in 2018 - 2ND Division\n12th - B.S.E.B PATNA - Passed in 2020 - 1st Division\nB.A Psychology Hons - Lalit Narayan Mithila University - 2023 - 1st Division\nP.G. - Lalit Narayan Mithila University (Bihar) - 2023-25 - Pursuing\n\nCOMPUTER SKILLS\nComputer Course in KYP\n\nPERSONAL SKILLS\nQuick learner\

## Split Documents

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    all_splits = text_splitter.split_documents(documents)
    print(f'Split into {len(all_splits)} chunks')
    return all_splits
loaded_docs = load_document()
split_documents(loaded_docs)

Loaded 2 page(s) from C:\Users\user\Desktop\Git-hub\LLM-engieering\test.pdf
Split into 2 chunks


[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20250516052051', 'source': 'C:\\Users\\user\\Desktop\\Git-hub\\LLM-engieering\\test.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content="RESUME\nNANDANI KUMARI\nVill:-Harivanshpur, Post:-Harhaki, Dist:-Madhubani\nMob:- 9308057178\nEmail:- sambhusk1716@gmail.com\n\nCAREER OBJECTIVE\nLooking for an entry into a world-class, highly professional organization with challenging and\ncompetitive environment, where I can use my knowledge base as well as personal attributes to\nachieve the organization goals.\n\nEDUCATIONAL QUALIFICATION\n10th - B.S.E.B PATNA - Passed in 2018 - 2ND Division\n12th - B.S.E.B PATNA - Passed in 2020 - 1st Division\nB.A Psychology Hons - Lalit Narayan Mithila University - 2023 - 1st Division\nP.G. - Lalit Narayan Mithila University (Bihar) - 2023-25 - Pursuing\n\nCOMPUTER SKILLS\nComputer Course in KYP\n\nPERSONAL SKILLS\nQuick learner\

## Choose and Configure Embedding Model

In [6]:
!pip install langchain_ollama



In [7]:
## option A: ollama embeddings
from langchain_ollama import OllamaEmbeddings

def get_embedding_function(model_name="nomic-embed-text"):
    """Initializes the Ollama embedding function."""
    # Ensure Ollama server is running (ollama serve)
    embeddings = OllamaEmbeddings(model=model_name)
    print(f"Initialized Ollama embeddings with model: {model_name}")
    return embeddings

# embedding_function = get_embedding_function() # Call this later

In [8]:
!pip install sentence-transformers



In [9]:
!pip install sentence-transformers



In [11]:
## option b
# Alternative embedding function using Sentence Transformers
from langchain_community.embeddings import HuggingFaceEmbeddings

def get_embedding_function_hf(model_name="all-MiniLM-L6-v2"):
     """Initializes HuggingFace embeddings (runs locally)."""
     embeddings = HuggingFaceEmbeddings(model_name=model_name)
     print(f"Initialized HuggingFace embeddings with model: {model_name}")
     return embeddings

embedding_function = get_embedding_function()

Initialized Ollama embeddings with model: nomic-embed-text


In [12]:
from langchain_community.vectorstores import Chroma

CHROMA_PATH = "chroma_db" # Directory to store ChromaDB data

def get_vector_store(embedding_function, persist_directory=CHROMA_PATH):
    """Initializes or loads the Chroma vector store."""
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_function
    )
    print(f"Vector store initialized/loaded from: {persist_directory}")
    return vectorstore

embedding_function = get_embedding_function()
vector_store = get_vector_store(embedding_function)

Initialized Ollama embeddings with model: nomic-embed-text


  vectorstore = Chroma(


Vector store initialized/loaded from: chroma_db
