# Data Ingestion

In [1]:
from langchain_core.documents import Document

In [2]:
# Hand-built sample corpus: one single-page Document per fruit, all sharing the
# same author. NOTE(review): "yelow" looks like a typo for "yellow"; it is kept
# unchanged here because it is runtime data.
documents = [
    Document(
        page_content=text,
        metadata={"title": title, "pages": 1, "author": "Adil Khan"},
    )
    for text, title in (
        ("Apples are red", "apple_book"),
        ("Blueberries are blue", "blueberry_book"),
        ("Bananas are yelow", "banana_book"),
    )
]

In [3]:
documents

[Document(metadata={'title': 'apple_book', 'pages': 1, 'author': 'Adil Khan'}, page_content='Apples are red'),
 Document(metadata={'title': 'blueberry_book', 'pages': 1, 'author': 'Adil Khan'}, page_content='Blueberries are blue'),
 Document(metadata={'title': 'banana_book', 'pages': 1, 'author': 'Adil Khan'}, page_content='Bananas are yelow')]

In [4]:
import os
# Create the folder that will hold the sample text files; exist_ok=True means
# re-running this cell does not raise when the folder already exists.
os.makedirs("../data/text_files",exist_ok=True)

In [5]:
# Sample corpus: maps each destination path to the text written to that file.
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",

    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

# Write every sample to disk. The parent directory is created here as well, so
# the cell does not depend on an earlier cell having been executed first
# (it must survive Restart Kernel -> Run All).
for filepath,content in sample_texts.items():
    parent_dir=os.path.dirname(filepath)
    if parent_dir:  # os.makedirs("") raises, so skip bare filenames
        os.makedirs(parent_dir,exist_ok=True)
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [6]:
from langchain.document_loaders import TextLoader

# Point a TextLoader at one of the sample files; the explicit encoding matches
# the utf-8 encoding used when the file was written.
loader=TextLoader("../data/text_files/python_intro.txt",encoding="utf-8")

# load() returns a list of Document objects (a single one here, per the recorded output).
doc=loader.load()

doc

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]

# Embedding and Vector Store

In [7]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
import torch
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

ModuleNotFoundError: No module named 'sentence_transformers'

### Check GPU and CUDA Availability

In [None]:
import torch

# Report the installed PyTorch build and whether a CUDA device is usable.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version (PyTorch built against): {torch.version.cuda}")
# Only query the device name when CUDA is usable; otherwise report "CPU".
print(f"GPU name: {'CPU' if not torch.cuda.is_available() else torch.cuda.get_device_name(0)}")


PyTorch: 2.8.0+cu128
CUDA available: True
CUDA version (PyTorch built against): 12.8
GPU name: NVIDIA GeForce GTX 1650


### Class to Load Model & create Embeddings

In [None]:
class EmbeddingManager:
    """Handles document embedding generation using a Sentence Transformer model.

    The model is loaded eagerly when the manager is constructed and is placed
    on "cuda" when ``torch.cuda.is_available()`` is true, otherwise on "cpu".
    """

    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager and load the model immediately.

        Args:
            model_name: Huggingface model name for sentence embeddings

        Raises:
            Exception: any error raised while loading the model is re-raised.
        """
        self.model_name=model_name
        # Stays None until _load_model() succeeds.
        self.model=None
        self._load_model()

    def _load_model(self):
        """Load the Sentence Transformer model named by ``self.model_name``.

        Raises:
            Exception: the original loading error, after it has been printed.
        """
        try:
            print(f"Loading Sentence Transformer Model:{self.model_name}")
            device = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"Loading Model into {device}")
            self.model = SentenceTransformer(self.model_name, device=device)
            print(f"Model Loaded sucessfully,Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            # Report for the notebook reader, then propagate so the failure is not hidden.
            print(f"Error in loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self,texts: List[str])->np.ndarray:
        """
        Generate embeddings for a list of text

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of embeddings with shape (len(texts),embedding_dim);
            an empty list yields an array of shape (0,embedding_dim).

        Raises:
            ValueError: if the model has not been loaded.
        """
        # Compare against None explicitly: truthiness of a model object is not
        # a reliable "is it loaded" signal (container-like objects are falsy
        # when empty).
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")

        if len(texts) == 0:
            # Skip the encoder for empty input and return a 2-D array so the
            # documented (len(texts), embedding_dim) shape still holds.
            # float32 is assumed to match the encoder's output dtype - TODO confirm.
            embeddings=np.empty((0,self.get_embedding_dimension()),dtype=np.float32)
        else:
            embeddings=self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

    def get_embedding_dimension(self)-> int:
        """Get the embedding dimension of model

        Raises:
            ValueError: if the model has not been loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded")

        return self.model.get_sentence_embedding_dimension()
    
# Initialize the embedding manager

# Constructing the manager calls _load_model() right away, so this line loads
# the default "all-MiniLM-L6-v2" model; the bare name below displays the instance.
embedding_manager=EmbeddingManager()
embedding_manager

Loading Sentence Transformer Model:all-MiniLM-L6-v2
Model Loaded sucessfully,Embedding dimention: 384


<__main__.EmbeddingManager at 0x78d6d4ef7c50>

### Vector Store

In [None]:
class VectorStore:
    """Wrapper around a persistent ChromaDB collection holding documents and their precomputed embeddings."""

    def __init__(self,collection_name:str="pdf_documents",persist_directory:str="../data/vector_store"):
        """
        Initialize Vector Store

        Args:
            collection_name: Name of VectorDB(ChromaDB) collection
            persist_directory: Directory to persist the vector store

        Raises:
            Exception: any error raised while creating the client or collection is re-raised.
        """

        self.collection_name=collection_name
        self.persist_directory=persist_directory
        # Both stay None until _initialize_store() succeeds.
        self.client=None
        self.collection=None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection

        Raises:
            Exception: the original initialization error, after it has been printed.
        """
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store

        Args:
            documents: List of LangChain documents (each must expose
                ``metadata`` and ``page_content``)
            embeddings: Corresponding embeddings for the documents, one row per document

        Raises:
            ValueError: if the number of documents and embeddings differ.
            Exception: any error raised by the collection while adding is re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        if len(documents) == 0:
            # Nothing to store. Returning early avoids handing empty lists to
            # the collection (ChromaDB is expected to reject an empty add() -
            # confirm for the installed version).
            print("No documents to add to vector store")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for ChromaDB.
        # IDs: random 8-hex-char prefix plus the position in this batch.
        ids = [f"doc_{uuid.uuid4().hex[:8]}_{i}" for i in range(len(documents))]
        # Metadata: copy each document's own metadata (the document is not
        # mutated) and add its batch position and content length.
        metadatas = []
        for i, doc in enumerate(documents):
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
        documents_text = [doc.page_content for doc in documents]
        # Nested plain-Python lists, one inner list per document.
        embeddings_list = np.asarray(embeddings).tolist()

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise


# Instantiating with the defaults opens (or creates) the "pdf_documents" collection
# persisted under ../data/vector_store; the bare name below displays the instance.
vectorstore=VectorStore()
vectorstore

In [None]:
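# 43:11  NOTE(review): stray text, not valid Python; kept as a comment so the notebook can run top to bottom - confirm it can be removed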

NameError: name 'chunks' is not defined