In [13]:
# Sample document about Python programming
sample_document = """
Python is a high-level, interpreted programming language known for its simplicity and readability. 
It was created by Guido van Rossum and first released in 1991. Python supports multiple programming 
paradigms including procedural, object-oriented, and functional programming.

One of Python's key strengths is its extensive standard library, which provides tools for many common 
programming tasks. The language emphasizes code readability with its use of significant indentation. 
Python's syntax allows programmers to express concepts in fewer lines of code compared to languages 
like C++ or Java.

Python is widely used in web development, data science, artificial intelligence, scientific computing, 
and automation. Popular frameworks include Django and Flask for web development, NumPy and Pandas for 
data analysis, and TensorFlow and PyTorch for machine learning.

The Python Package Index (PyPI) hosts hundreds of thousands of third-party packages that extend Python's 
capabilities. Installation is simple using pip, Python's package installer. The active community contributes 
to a rich ecosystem of libraries and frameworks.

Python continues to be one of the most popular programming languages worldwide. Its beginner-friendly nature 
makes it ideal for education, while its powerful features support professional software development and research.
"""

print(f"Document length: {len(sample_document)} characters")
print(f"Document length: {len(sample_document.split())} words")
print(f"\nFirst 200 characters:\n{sample_document[:200]}...")

Document length: 1367 characters
Document length: 187 words

First 200 characters:

Python is a high-level, interpreted programming language known for its simplicity and readability. 
It was created by Guido van Rossum and first released in 1991. Python supports multiple programming...


In [14]:
def chunk_by_characters(text, chunk_size=200, overlap=50):
    """
    Split text into chunks of specified character length.
    
    Args:
        text: The text to chunk
        chunk_size: Number of characters per chunk
        overlap: Number of characters to overlap between chunks
    
    Returns:
        List of text chunks
    """
    chunks = []
    start = 0
    
    while start < len(text):
        # Get chunk from start to start + chunk_size
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        
        # Move start position (with overlap)
        start += chunk_size - overlap
    
    return chunks

# Test it
chunks = chunk_by_characters(sample_document, chunk_size=200, overlap=50)

print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks[:3], 1):  # Show first 3 chunks
    print(f"Chunk {i} ({len(chunk)} chars):")
    print(chunk)
    print("-" * 80)

Number of chunks: 10

Chunk 1 (200 chars):

Python is a high-level, interpreted programming language known for its simplicity and readability. 
It was created by Guido van Rossum and first released in 1991. Python supports multiple programming
--------------------------------------------------------------------------------
Chunk 2 (200 chars):
ased in 1991. Python supports multiple programming 
paradigms including procedural, object-oriented, and functional programming.

One of Python's key strengths is its extensive standard library, which
--------------------------------------------------------------------------------
Chunk 3 (200 chars):
strengths is its extensive standard library, which provides tools for many common 
programming tasks. The language emphasizes code readability with its use of significant indentation. 
Python's syntax
--------------------------------------------------------------------------------


In [15]:
def chunk_by_words(text, chunk_size=50, overlap=10):
    """
    Split text into chunks of specified word count.
    
    Args:
        text: The text to chunk
        chunk_size: Number of words per chunk
        overlap: Number of words to overlap between chunks
    
    Returns:
        List of text chunks
    """
    # Split text into words
    words = text.split()
    chunks = []
    start = 0
    
    while start < len(words):
        # Get chunk of words
        end = start + chunk_size
        chunk_words = words[start:end]
        
        # Join words back into text
        chunk = ' '.join(chunk_words)
        chunks.append(chunk)
        
        # Move start position (with overlap)
        start += chunk_size - overlap
    
    return chunks

# Test it
chunks = chunk_by_words(sample_document, chunk_size=50, overlap=10)

print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks[:3], 1):  # Show first 3 chunks
    print(f"Chunk {i} ({len(chunk.split())} words):")
    print(chunk)
    print("-" * 80)

Number of chunks: 5

Chunk 1 (50 words):
Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python supports multiple programming paradigms including procedural, object-oriented, and functional programming. One of Python's key strengths is its extensive standard library, which provides tools for
--------------------------------------------------------------------------------
Chunk 2 (50 words):
strengths is its extensive standard library, which provides tools for many common programming tasks. The language emphasizes code readability with its use of significant indentation. Python's syntax allows programmers to express concepts in fewer lines of code compared to languages like C++ or Java. Python is widely used in web
--------------------------------------------------------------------------------
Chunk 3 (50 words):
like C++ or Java. Python is widely used in web development, d

In [16]:
def chunk_by_sentences(text, max_chunk_size=500):
    """
    Split text into chunks by sentences, keeping sentences intact.
    
    Args:
        text: The text to chunk
        max_chunk_size: Maximum characters per chunk
    
    Returns:
        List of text chunks
    """
    # Simple sentence splitting (split on . ! ?)
    import re
    sentences = re.split(r'(?<=[.!?])\s+', text)
    
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        # Check if adding this sentence would exceed max size
        if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
            # Save current chunk and start new one
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            # Add sentence to current chunk
            current_chunk += " " + sentence if current_chunk else sentence
    
    # Don't forget the last chunk
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# Test it
chunks = chunk_by_sentences(sample_document, max_chunk_size=400)

print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i} ({len(chunk)} chars):")
    print(chunk)
    print("-" * 80)

Number of chunks: 4

Chunk 1 (398 chars):
Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python supports multiple programming 
paradigms including procedural, object-oriented, and functional programming. One of Python's key strengths is its extensive standard library, which provides tools for many common 
programming tasks.
--------------------------------------------------------------------------------
Chunk 2 (320 chars):
The language emphasizes code readability with its use of significant indentation. Python's syntax allows programmers to express concepts in fewer lines of code compared to languages 
like C++ or Java. Python is widely used in web development, data science, artificial intelligence, scientific computing, 
and automation.
--------------------------------------------------------------------------------
Chunk 3 (332 chars):
Popular frameworks include Django 

In [17]:
def chunk_by_paragraphs(text, min_chunk_size=100):
    """
    Split text by paragraphs (double newlines).
    
    Args:
        text: The text to chunk
        min_chunk_size: Minimum characters per chunk (combine small paragraphs)
    
    Returns:
        List of text chunks
    """
    # Split by double newlines (paragraph separator)
    paragraphs = text.split('\n\n')
    
    chunks = []
    current_chunk = ""
    
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
            
        # If paragraph is too small, combine with next
        if len(para) < min_chunk_size:
            current_chunk += "\n\n" + para if current_chunk else para
        else:
            # Save previous chunk if exists
            if current_chunk:
                chunks.append(current_chunk.strip())
            # Start new chunk with this paragraph
            current_chunk = para
    
    # Don't forget the last chunk
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# Test it
chunks = chunk_by_paragraphs(sample_document, min_chunk_size=100)

print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i} ({len(chunk)} chars):")
    print(chunk)
    print("-" * 80)

Number of chunks: 5

Chunk 1 (277 chars):
Python is a high-level, interpreted programming language known for its simplicity and readability. 
It was created by Guido van Rossum and first released in 1991. Python supports multiple programming 
paradigms including procedural, object-oriented, and functional programming.
--------------------------------------------------------------------------------
Chunk 2 (323 chars):
One of Python's key strengths is its extensive standard library, which provides tools for many common 
programming tasks. The language emphasizes code readability with its use of significant indentation. 
Python's syntax allows programmers to express concepts in fewer lines of code compared to languages 
like C++ or Java.
--------------------------------------------------------------------------------
Chunk 3 (270 chars):
Python is widely used in web development, data science, artificial intelligence, scientific computing, 
and automation. Popular frameworks include Djang

In [18]:
def chunk_with_metadata(text, source_name, chunk_size=50, overlap=10):
    """
    Create chunks with metadata.
    
    Args:
        text: The text to chunk
        source_name: Name of the source document
        chunk_size: Number of words per chunk
        overlap: Number of words to overlap
    
    Returns:
        List of dictionaries with 'text' and 'metadata'
    """

    # Get basic chunks 
    words= text.split()
    chunks=[]
    start=0
    chunk_index=0

    while start<len(words):
        end=start+chunk_size
        chunk_words=words[start:end]
        chunk_text= ' '.join(chunk_words)

        #create chunk with metadata
        chunk_with_meta ={
            'text':chunk_text,
            'metadata':{'source':source_name,
                        'chunk_index':chunk_index,
                        'chunk_size':len(chunk_words),
                        'char_count':len(chunk_text),
                        'start_word':start,
                        'end_word':end
                        }
        }
        chunks.append(chunk_with_meta)
        start += chunk_size - overlap
        chunk_index +=1

    return chunks

# Test it 
chunks= chunk_with_metadata(sample_document,source_name='python_intro.txt',chunk_size=50,overlap=10)
print(f"Total chunks: {len(chunks)}\n")
print("First chunk with metadata:")
print("=" * 80)
print(f"Text: {chunks[1]['text'][:200]}...")
print(f"\nMetadata:")
for key, value in chunks[1]['metadata'].items():
    print(f"  {key}: {value}")


Total chunks: 5

First chunk with metadata:
Text: strengths is its extensive standard library, which provides tools for many common programming tasks. The language emphasizes code readability with its use of significant indentation. Python's syntax a...

Metadata:
  source: python_intro.txt
  chunk_index: 1
  chunk_size: 50
  char_count: 329
  start_word: 40
  end_word: 90


6.1.  Loading Text Files 

In [19]:
def load_and_chunk_text_file(file_path, chunk_size=500, overlap=50):
    """
    Load a text file and chunk it.
    
    Args:
        file_path: Path to the text file
        chunk_size: Characters per chunk
        overlap: Character overlap between chunks
    
    Returns:
        List of chunks with metadata
    """
    import os 

    #Read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        text= f.read()

    #get file metadata
    file_name = os.path.basename(file_path)
    file_size= os.path.getsize(file_path)

    # chunk the text
    chunks= chunk_by_sentences(text,max_chunk_size=chunk_size)

    #Add metadata to each chunk 
    chunks_with_metadata =[]
    for i, chunk in enumerate(chunks):
        chunks_with_metadata.append({
            'text': chunk,
            'metadata': {
                'source': file_name,
                'file_path': file_path,
                'file_size': file_size,
                'chunk_index': i,
                'total_chunks': len(chunks)
            }
        })
    
    return chunks_with_metadata

# Example usage (create a sample file first)
sample_file_path = 'sample_document.txt'
with open(sample_file_path, 'w', encoding='utf-8') as f:
    f.write(sample_document)

# Load and chunk
chunks = load_and_chunk_text_file(sample_file_path, chunk_size=400)

print(f"Loaded and chunked: {chunks[0]['metadata']['source']}")
print(f"Total chunks: {len(chunks)}")
print(f"\nChunk 1:")
print(chunks[0]['text'])

Loaded and chunked: sample_document.txt
Total chunks: 4

Chunk 1:
Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python supports multiple programming 
paradigms including procedural, object-oriented, and functional programming. One of Python's key strengths is its extensive standard library, which provides tools for many common 
programming tasks.


In [20]:
chunks[2]['metadata']

{'source': 'sample_document.txt',
 'file_path': 'sample_document.txt',
 'file_size': 1387,
 'chunk_index': 2,
 'total_chunks': 4}

In [21]:
from sentence_transformers import SentenceTransformer
import numpy as np

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [22]:
# Load a small, fast embedding model
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model loaded!")
print(f"Model produces {model.get_sentence_embedding_dimension()} dimensional embeddings")

Loading embedding model...


✅ Model loaded!
Model produces 384 dimensional embeddings


Generate Embeddings

In [23]:
text= "The cat sat on the mat"

embedding= model.encode(text)
print(f"Original text: {text}")
print(f"Embedding shape: {embedding.shape}")
print(f"Embedding type: {type(embedding)}")
print(f"\nFirst 10 values: {embedding[:10]}")

Original text: The cat sat on the mat
Embedding shape: (384,)
Embedding type: <class 'numpy.ndarray'>

First 10 values: [ 0.13040186 -0.01187012 -0.02811704  0.05123863 -0.05597441  0.03019154
  0.03016129  0.02469839 -0.01837056  0.05876678]


cosine similarity

In [24]:
def cosine_similarity(vec1,vec2):
    """
    Calculate cosine similarity between two vectors.
    Return a score between -1 -1 (higher= more similar)
    """
    dot_product= np.dot(vec1,vec2)
    norm1= np.linalg.norm(vec1)
    norm2= np.linalg.norm(vec2)
    return dot_product/(norm1*norm2)

print("Similarity function ready!")

Similarity function ready!


### Testing Similarity

In [25]:
# Create test sentences
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",      # Similar meaning, different words
    "Dogs are loyal animals",          # Different topic
    "Python is a programming language" # Completely unrelated
]


# generate embeddings for all sentenses
embeddings= model.encode(sentences)

#compare first sentence to all others 
print("Comparing to: 'The cat sat on the mat'\n")

for i, sentence in enumerate(sentences):
    similarity= cosine_similarity(embeddings[0],embeddings[i])
    print(f"Similarity to '{sentence}'")
    print(f"Score: {similarity:.3f}\n")

Comparing to: 'The cat sat on the mat'

Similarity to 'The cat sat on the mat'
Score: 1.000

Similarity to 'A feline rested on the rug'
Score: 0.564

Similarity to 'Dogs are loyal animals'
Score: 0.165

Similarity to 'Python is a programming language'
Score: 0.031



## Building a Simple Semantic Search

In [26]:
# Sample knowledge base
documents = [
    "Python is a high-level programming language known for simplicity",
    "Machine learning enables computers to learn from data",
    "Neural networks are inspired by biological brains",
    "Dogs are loyal and friendly pets that need exercise",
    "Cats are independent animals that make great companions",
    "JavaScript is used for web development and runs in browsers",
    "Deep learning uses multi-layered neural networks",
    "Puppies require training and socialization from an early age"
]

print(f"Knowledge base: {len(documents)} documents")

Knowledge base: 8 documents


## Embed All Documents

In [27]:
#Generate embeddings for all documents 
print("Generating embeddings for ALL DOCUMENTS...")
doc_embeddings = model.encode(documents)

print(f"✅ Created {len(doc_embeddings)} embeddings")
print(f"Each embedding has {doc_embeddings[0].shape[0]} dimensions")

Generating embeddings for ALL DOCUMENTS...
✅ Created 8 embeddings
Each embedding has 384 dimensions


## Search Functions

In [34]:
def search(query,documents,doc_embeddings,top_k=3):
    """ Search for documents similar to the query.
    
    Args:
    query:Search query (string)
    documents:List of document texts
    doc_embeddings: Pre-computed documents embeddings
    top_k: Number of results to return
    
    Returns:
        List of (document, similarity_score) tuples
    """

    #Embed the query
    query_embedding= model.encode(query)

    #calculate similarities
    similarities=[]
    for i,doc_emb in enumerate (doc_embeddings):
        similarity = cosine_similarity(query_embedding,doc_emb)
        similarities.append((documents[i],similarity))

    #sorting by similarity (highest first)

    similarities.sort(key=lambda x:x[1],reverse=True)

    #return top k results
    return similarities[:top_k]
print("✅ Search function ready!")

✅ Search function ready!


## Try it out 

In [35]:
# Test different queries
queries = [
    "What is artificial intelligence?",
    "Tell me about pet dogs",
    "How do I code in Python?"
]

for query in queries:
    print(f"\n{'='*80}")
    print(f"QUERY: {query}")
    print(f"{'='*80}")
    
    results = search(query, documents, doc_embeddings, top_k=3)
    
    for i, (doc, score) in enumerate(results, 1):
        print(f"\n{i}. (Score: {score:.3f})")
        print(f"   {doc}")


QUERY: What is artificial intelligence?

1. (Score: 0.408)
   Machine learning enables computers to learn from data

2. (Score: 0.395)
   Neural networks are inspired by biological brains

3. (Score: 0.326)
   Python is a high-level programming language known for simplicity

QUERY: Tell me about pet dogs

1. (Score: 0.548)
   Dogs are loyal and friendly pets that need exercise

2. (Score: 0.437)
   Puppies require training and socialization from an early age

3. (Score: 0.413)
   Cats are independent animals that make great companions

QUERY: How do I code in Python?

1. (Score: 0.554)
   Python is a high-level programming language known for simplicity

2. (Score: 0.148)
   Puppies require training and socialization from an early age

3. (Score: 0.138)
   JavaScript is used for web development and runs in browsers


Comparing Different models 

In [37]:
# # Load two different models for comparison
# print("Loading models...\n")

# model_small = SentenceTransformer('all-MiniLM-L6-v2')      # 384 dimensions
# model_large = SentenceTransformer('all-mpnet-base-v2')     # 768 dimensions

# print("✅ Both models loaded!")
# print(f"Small model: {model_small.get_sentence_embedding_dimension()} dimensions")
# print(f"Large model: {model_large.get_sentence_embedding_dimension()} dimensions")

Loading models...



'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 1fc49098-6c69-4314-b9c9-05dabde18039)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 83307f74-43b6-4a32-affc-5d5acba3dfa8)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 2s [Retry 2/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 57537c91-bf9d-4688-b74d-cf382f9d2d74)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 4s [Retry 3/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co'

KeyboardInterrupt: 

In [None]:
# # Compare on a similarity task
# test_pairs = [
#     ("The dog is running", "A canine is jogging"),           # Similar
#     ("I love pizza", "Pizza is delicious"),                  # Related
#     ("Python programming", "Cooking pasta")                  # Unrelated
# ]

# print("Comparing model performance:\n")
# for text1, text2 in test_pairs:
#     # Small model
#     emb1_small = model_small.encode([text1, text2])
#     sim_small = cosine_similarity(emb1_small[0], emb1_small[1])
    
#     # Large model  
#     emb1_large = model_large.encode([text1, text2])
#     sim_large = cosine_similarity(emb1_large[0], emb1_large[1])
    
#     print(f"Pair: '{text1}' vs '{text2}'")
#     print(f"  Small model: {sim_small:.3f}")
#     print(f"  Large model: {sim_large:.3f}")
#     print()

Comparing model performance:



NameError: name 'model_small' is not defined

# make sue you complete module 3 

## Module 4

In [39]:
import faiss
import numpy as np 
from sentence_transformers import SentenceTransformer
print(f"FAISS version: {faiss.__version__}")
print("✅ Libraries imported successfully!")


ModuleNotFoundError: No module named 'faiss'

In [None]:
# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
embeddings = model.encode(documents)

print(f"Generated {len(embeddings)} embeddings")
print(f"Each embedding has {embeddings.shape[1]} dimensions")
print(f"Embeddings shape: {embeddings.shape}")

In [None]:
# Get embedding dimension
dimension = embeddings.shape[1]

# Create FAISS index (IndexFlatL2 = exact search with L2 distance)
index = faiss.IndexFlatL2(dimension)

# Add embeddings to index
index.add(embeddings)

print(f"✅ FAISS index created!")
print(f"Total vectors in index: {index.ntotal}")

In [None]:
# Sample documents
documents = [
    "Python is a versatile programming language used for web development and data science.",
    "Machine learning models require large amounts of training data to perform well.",
    "Neural networks are inspired by the structure of the human brain.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning is a subset of machine learning using multi-layered neural networks.",
    "Data visualization helps communicate insights from complex datasets.",
    "Cloud computing provides on-demand access to computing resources.",
    "Cybersecurity protects systems and networks from digital attacks.",
    "Blockchain technology enables secure, decentralized transactions.",
    "Quantum computing uses quantum mechanics to solve complex problems."
]

print(f"Total documents: {len(documents)}")

In [None]:
# Query
query = "What is artificial intelligence and machine learning?"

# Embed query
query_embedding = model.encode([query])

# Search: find top 3 most similar vectors
k = 3
distances, indices = index.search(query_embedding, k)

print(f"Query: {query}\n")
print(f"Top {k} results:\n")

for i, (idx, distance) in enumerate(zip(indices[0], distances[0]), 1):
    print(f"{i}. (Distance: {distance:.4f})")
    print(f"   {documents[idx]}")
    print()

## Using Cosine Similarity with FAISS

In [None]:
#Normalize embeddings for cosine similarity
embddings_normalized = embeddings/np.linalg.norm(embeddings,axis=1,keepdims=True)

#Create index with inner product(equivalent to cosine for normalized vectors)
index_cosine=faiss.IndexFlatIP(dimension)
index_cosine.add(embddings_normalized)

#search with normalized query
# Search with normalized query
query_embedding_normalized = query_embedding / np.linalg.norm(query_embedding)
scores, indices = index_cosine.search(query_embedding_normalized, k=3)

print(f"Query: {query}\n")
print(f"Top {k} results with cosine similarity:\n")

for i, (idx, score) in enumerate(zip(indices[0], scores[0]), 1):
    print(f"{i}. (Similarity: {score:.4f})")
    print(f"   {documents[idx]}")
    print()


### Saving and Loading faiss index

In [None]:
#save index to disk

faiss.write_index(index_cosine,"my_faiss_index.bin")
import pickle
with open("documents.pkl","wb") as f:
    pickle.dump(documents,f)



In [None]:
# Load index from disk
loaded_index = faiss.read_index("my_faiss_index.bin")
print(f"✅ Index loaded: {loaded_index.ntotal} vectors")

# Load documents
with open("documents.pkl", "rb") as f:
    loaded_documents = pickle.load(f)
print(f"✅ Documents loaded: {len(loaded_documents)} documents")

# Chromadb

In [1]:
import chromadb

print("chroma version",chromadb.__version__)

chroma version 1.3.5


In [2]:
#create chroma client(persistent client)
client= chromadb.PersistentClient(path="./chroma_db")

#create or get collection

collection= client.get_or_create_collection(name="my_documents",metadata={"description":"Sample documents"})

print(f"✅ Collection created: {collection.name}")
print(f"Current count: {collection.count()} documents")
print(f"📁 Data persisted to: ./chroma_db/")


✅ Collection created: my_documents
Current count: 0 documents
📁 Data persisted to: ./chroma_db/


## Add document to chroma

In [None]:
# Sample documents with metadata
documents = [
    "Python is a versatile programming language used for web development and data science.",
    "Machine learning models require large amounts of training data to perform well.",
    "Neural networks are inspired by the structure of the human brain.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning is a subset of machine learning using multi-layered neural networks."
]

# Metadata for each document
metadatas = [
    {"category": "programming", "topic": "python"},
    {"category": "AI", "topic": "machine learning"},
    {"category": "AI", "topic": "neural networks"},
    {"category": "AI", "topic": "NLP"},
    {"category": "AI", "topic": "deep learning"}
]

## its looking like chromma takes the documents or chunks in form of a list each seprated sentence by a comma

#chroma needs ids to add into collection

#creATING ids fro each documents 

ids= [f"doc_{i}" for i in range(len(documents))]

#adding to collection(chromma handles embeddings automatically)
collection.add(documents=documents,metadatas=metadatas,ids=ids)

print(f"✅ Added {len(documents)} documents to collection")
print(f"Total documents: {collection.count()}")

In [None]:
#query the collection

result= collection.query(query_texts=["what is artificial intelligence"],n_results=3)

print("Query: What is artificial intelligence?\n")
print("Top 3 results:\n")

for i, (doc, metadata, distance) in enumerate(zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0]
), 1):
    print(f"{i}. (Distance: {distance:.4f})")
    print(f"   Document: {doc}")
    print(f"   Metadata: {metadata}")
    print()

In [8]:
from chromadb.utils import embedding_functions

#use sentence-transformer embedding function 
sentence_trans_embeding_func= embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
colection_custom= client.get_or_create_collection(name="custom_embeddingscollection",embedding_function=sentence_trans_embeding_func)


#adding to the ollection 
colection_custom.add(documents=documents,metadatas=metadatas,ids=ids)


In [9]:
colection_custom.count()

5

In [10]:
results= colection_custom.query(query_texts=["What is artificial intelligence?"],n_results=3,include=["embeddings", "documents", "metadatas", "distances"])

In [11]:
results

{'ids': [['doc_4', 'doc_3', 'doc_2']],
 'embeddings': [array([[-0.08768701, -0.026981  ,  0.06650561, ...,  0.0354312 ,
          -0.00644909,  0.01132433],
         [ 0.01383317,  0.01758453,  0.09789699, ...,  0.08880782,
           0.04175101, -0.056559  ],
         [-0.06560455, -0.08392676,  0.05001441, ...,  0.16255856,
           0.05264854, -0.06086662]], shape=(3, 384))],
 'documents': [['Deep learning is a subset of machine learning using multi-layered neural networks.',
   'Natural language processing enables computers to understand human language.',
   'Neural networks are inspired by the structure of the human brain.']],
 'uris': None,
 'included': ['embeddings', 'documents', 'metadatas', 'distances'],
 'data': None,
 'metadatas': [[{'topic': 'deep learning', 'category': 'AI'},
   {'category': 'AI', 'topic': 'NLP'},
   {'category': 'AI', 'topic': 'neural networks'}]],
 'distances': [[0.5752362012863159, 0.6204118132591248, 0.6279884576797485]]}

In [None]:
# Update a document
collection.update(
    ids=["doc_0"],
    documents=["Python is an amazing programming language for AI and data science!"],
    metadatas=[{"category": "programming", "topic": "python", "updated": True}]
)
print("✅ Document updated")

# Delete a document
# collection.delete(ids=["doc_4"])
# print("✅ Document deleted")

print(f"\nTotal documents after update: {collection.count()}")

In [None]:
# import re

# class RAGretriever:
#     def __init__(self,collection_name='rag_collection',persist_dir='./rag_db'):
#         #create chroma client(using persistentclient for chromadb)
#         self.client= chromadb.PersistentClient(path=persist_dir)

#         #create collectiom with sentence tramnsformer 
#         embedding_fn= embedding_fun

# module 5 (Prompt Augmentation & Generation)

This module is all about prompt engineering

# Module 7

In [2]:
# Setup
import os
from dotenv import load_dotenv

load_dotenv()
openai_api_key = os.getenv("openai_key")

if not openai_api_key:
    print("⚠️ Warning: OPENAI_API_KEY not found. Set it in .env file.")
else:
    print("✅ API key loaded successfully")

✅ API key loaded successfully


In [9]:
# Sample documents for all examples
documents = [
    "Python is a high-level programming language known for readability and simplicity.",
    "Machine learning is a subset of AI that enables systems to learn from data.",
    "RAG combines retrieval and generation to provide accurate, grounded responses."
]

### Using Langchain implementaton with FAISS

In [None]:
from langchain_community.docstore.document import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel,RunnablePassthrough


# convert documents
loc_docs=[Document(page_content=doc) for doc in documents]

#vector store
embeddings= OpenAIEmbeddings(openai_api_key=openai_api_key)

vectorstore=FAISS.from_documents(loc_docs,embeddings)
retriever= vectorstore.as_retriever()


#llm 
llm=ChatOpenAI(model="gpt-4o-mini",
    temperature=0,
    openai_api_key=openai_api_key)


#Prompt
prompt= ChatPromptTemplate.from_messages([
    ("system", "You are an expert assistant. Use ONLY the retrieved context."),
    ("human", "{question}\n\nContext:\n{context}")
])

#Build RAG pipeline
rag_chain = (RunnableParallel(context=retriever,question=RunnablePassthrough())
             |prompt
             |llm
             )

# Query
response = rag_chain.invoke("what is RAG")
print(response)


content='RAG combines retrieval and generation to provide accurate, grounded responses.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 13, 'prompt_tokens': 175, 'total_tokens': 188, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_ee69c2ef48', 'id': 'chatcmpl-CnVlJTvzF5ZNA7Fb6XiWXfqyGX3a5', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='lc_run--019b28ce-c8e9-7340-9290-8e56dba06c9c-0' usage_metadata={'input_tokens': 175, 'output_tokens': 13, 'total_tokens': 188, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}


In [4]:
response.content

'RAG combines retrieval and generation to provide accurate, grounded responses.'

## Approach 3: LlamaIndex IMplemention

In [12]:
from llama_index.core import Document, VectorStoreIndex,settings
from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.embeddings.openai import OpenAIEmbedding


#configure llamaindex

settings.llm= LlamaOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=openai_api_key)
settings.embed_model= OpenAIEmbedding(api_key=openai_api_key)

# Create documents and Index 
llama_docs =[Document(text=doc) for doc in documents]
index= VectorStoreIndex.from_documents(llama_docs)


#Query
query_engine= index.as_query_engine()
response= query_engine.query("what is RAG?")

print("LlamaIndex Answer:")
print(response)
print(response.response)

ValueError: 
******
Could not load OpenAI embedding model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

Consider using embed_model='local'.
Visit our documentation for more embedding options: https://developers.llamaindex.ai/python/framework/module_guides/models/embeddings/
******

# Module 8 RAG with Langchain

In [13]:
from langchain_core.documents import Document

# Create documents directly (you can also load from files using loaders)
documents = [
    Document(page_content="LangChain is a framework for developing applications powered by language models."),
    Document(page_content="RAG combines retrieval with generation to provide accurate, grounded responses."),
    Document(page_content="Vector stores enable efficient similarity search over embedded documents."),
    Document(page_content="FAISS is a library for efficient similarity search developed by Meta AI."),
    Document(page_content="Text splitters chunk documents into smaller pieces for better retrieval.")
]

print(f"Created {len(documents)} documents")
print(f"First document: {documents[0].page_content}")

Created 5 documents
First document: LangChain is a framework for developing applications powered by language models.


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter=RecursiveCharacterTextSplitter(chunk_size=100,
                                             chunk_overlap=20,
                                             length_function=len)

# split documents
chunks= text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks")
for i, chunk in enumerate(chunks[:3]):
    print(f"\nChunk {i+1}: {chunk.page_content}")

NameError: name 'documents' is not defined

## Embedding

In [17]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=openai_api_key
)

# Test embedding
test_embedding = embeddings.embed_query("What is RAG?")
print(f"Embedding dimension: {len(test_embedding)}")
print(f"First 5 values: {test_embedding[:5]}")

Embedding dimension: 1536
First 5 values: [0.0006281227106228471, 0.02569717727601528, 0.007161187008023262, 0.03336399793624878, -0.031968604773283005]


## vectorsctore

In [4]:
from langchain_community.vectorstores import FAISS

#CREATE VECTOR STORE FROM DOCUMENTS 
vectorstore= FAISS.from_documents(chunks,embeddings)
print(f"✅ Vector store created with {len(chunks)} chunks")

# Test similarity search
query = "What is FAISS?"
results=vectorstore.similarity_search(query,k=2)

print(f"\nQuery: {query}")
for i, doc in enumerate(results):
    print(f"\nResult {i+1}: {doc.page_content}")


NameError: name 'chunks' is not defined

# Part2 Building RAG with LCEL (langchain expression LAnguage)


In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough ,RunnableParallel
from langchain_core.output_parsers import StrOutputParser

#create llm

llm= ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=openai_api_key
)

#create retriever
retriever =vectorstore.as_retriever(search_kwargs={"k": 2})

#create prompt
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer using ONLY the provided context."),
    ("human", "{question}\n\nContext:\n{context}")])

# Helper function to format documents
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain =(RunnableParallel(context=retriever | format_docs, question=RunnablePassthrough())
            |prompt
            |llm
            |StrOutputParser)

print("✅ RAG chain created")

In [None]:
# Query the chain
response = rag_chain.invoke("What is LangChain?")
print(response)

## Simple Conversational RAG

In [None]:
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.prompts import MessagesPlaceholder

chat_store ={}

def get_session_history(session_id: str):
    if session_id not in chat_store:
        chat_store[session_id] = InMemoryChatMessageHistory()
    return chat_store[session_id]


#create conversational prompt
conv_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer using the context provided."),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "Context: {context}\n\nQuestion: {question}")
])

#build base chain
con_chain_base=(RunnableParallel(
    context=lambda x: format_docs(retriever.invoke(x["question"])),
    question=lambda x: x["question"],
    chat_history=lambda x: x.get("chat_history", [])
)
|conv_prompt
|llm
|StrOutputParser())


#wrap with message history
conv_chain= RunnableWithMessageHistory(con_chain_base,get_session_history,input_messages_key="question",history_messages_key="chat_history")



### Production_RAG

In [1]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

print("✅ All libraries imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


✅ All libraries imported successfully!


In [4]:
import os
from dotenv import load_dotenv

# Load API key from .env file
# Create a .env file in this directory with: OPENAI_API_KEY=your-key-here
load_dotenv()
api_key = os.environ.get("openai_key")

if api_key:
    print("✅ API key loaded!")
else:
    print("⚠️  Warning: OPENAI_API_KEY not found in .env file")
    print("Create a .env file with: OPENAI_API_KEY=your-key-here")

✅ API key loaded!


In [None]:
loader= DirectoryLoader('documents/',glob="**/*.txt",loader_cls=TextLoader)
documents=loader.load()
print(f"✅ Loaded {len(documents)} documents")

#### chunk documents

In [None]:
text_splitter= RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50)
chunks= text_splitter.split_documents(documents)
print(f"✅ Created {len(chunks)} chunks")

### Create Embeddings 


In [None]:

embeddings = OpenAIEmbeddings(model="text-embedding-3-small",openai_api_key=api_key)

test_embedding= embeddings.embed_query("what is RAG")
print(f"✅ Embedding model loaded!")
print(f"Embedding dimension: {len(test_embedding)}")


## Create Vextor Store

In [None]:
print("Create vector store...")
vectorstore= Chroma.from_documents(documents=chunks,embedding=embeddings,persist_directory="./chroma_db")

print(f"Vector store created with {len(chunks)} chunks!")

## Step 7b: OR Load Existing Vector Store (Skip Step 4-7 if using this)

In [None]:
# Load existing vector store (uncomment to use)
vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embeddings
)
print(f"✅ Loaded existing vector store!")

## Step 7c: Smart Approach - Load if Exists, Create if Not

**Best practice**: Check if the vector store already exists:

In [None]:
# Smart approach - load if exists, create if not (uncomment to use)
# import os
# 
# if os.path.exists("./chroma_db"):
#     print("Loading existing vector store...")
#     vectorstore = Chroma(
#         persist_directory="./chroma_db",
#         embedding_function=embeddings
#     )
#     print("✅ Loaded existing vector store!")
# else:
#     print("Creating new vector store...")
#     vectorstore = Chroma.from_documents(
#         documents=chunks,
#         embedding=embeddings,
#         persist_directory="./chroma_db"
#     )
#     print(f"✅ Vector store created with {len(chunks)} chunks!")

## Step 8: Set Up OpenAI LLM

We'll use GPT-3.5-turbo for fast, cost-effective answers:

In [None]:
#Initialize OPenai llm 
llm= ChatOpenAI(model="gpt-3.5-turbo",temperature=0,openai_api_key=api_key)


print("✅ OpenAI LLM initialized!")

## Step 9: Test Retrieval Only

Before we do full generation, let's see what documents we retrieve:

In [None]:
retriever= vectorstore.as_retriever(search_kwargs={"k":3})

#test retreival 
query= "What is machine learning?"
retrieved_docs= retriever.invoke(query)

print(f"Query: {query}\n")
print(f"Retrieved {len(retrieved_docs)} documents:\n")

for i, doc in enumerate(retrieved_docs, 1):
    print(f"{i}. {doc.page_content[:200]}...\n")

## Step 10: Create the RAG Prompt Template

This prompt tells the LLM how to use the retrieved context:

In [None]:
template="""You are a helpful assistant answering questions based on the provided context.

Context:
{context}

Question:{question}
Answer the question based on the context above. if you cannot answer based on the context,say so.

Answer:"""

prompt= ChatPromptTemplate.from_template(template)
print("Prompt template created!")



## Build the RAG Chain
This combinesretrieval + generation into one pipeline:

In [None]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


In [None]:
# Build the RAG chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("✅ RAG chain built!")

#  RAG CORE 

In [None]:
import os 
from typing import List,Dict
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader,TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document


In [None]:
class DocumentProcessors:
    def __init__(self,chunk_size: int=500,chunk_overlap:int=50):
        self.chunk_size= chunk_size
        self.chunk_overlap=chunk_overlap
        self.text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap,length_function=len)

    def load_documents(self,documents_folder:str) -> List[Document]:
        loader = DirectoryLoader(
            documents_folder,
            glob="**/*.txt",
            loader_cls=TextLoader
        )
        documents = loader.load()
        print(f"✅ Loaded {len(documents)} documents")
        return documents
    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        chunks=self.text_splitter.split_documents(documents)
        print(f"✅ Created {len(chunks)} chunks")
        return chunks
    

class VectorStoreManager:
    def __init__(
        self,
        embedding_model: str = "text-embedding-3-small",
        persist_directory: str = "./chroma_db"):
        
        load_dotenv()
        self.embedding_model=embedding_model
        self.persistent_directory= persist_directory
        # Initialize embeddings
        print(f"Loading embedding model: {embedding_model}...")
        self.embeddings = OpenAIEmbeddings(
            model=embedding_model
        )
        print("✅ Embedding model loaded!")

        self.vectorstore = None

    def create_vectorestore(self,chunks: List[Document]):
        """
        Create a vector store from document chunks.
        """
        print("Creating vector store with embeddings...")
        self.vectorstore=Chroma.from_documents(documents=chunks,embedding=self.embeddings,persist_directory=self.persistent_directory)
        print(f"✅ Vector store created with {len(chunks)} chunks!")

    def load_vectorstore(self):
        """
        Load an existing vector store from disk.
        """
        print("Loading existing vector store...")
        self.vectorestore=Chroma(persist_directory=self.persistent_directory,embedding_function=self.embeddings)
        print("✅ Vector store loaded!")
    def search(self, query: str, top_k: int = 3) -> Dict:
        """
        Search for relevant documents.
        """
        if self.vectorestore is None:
            raise ValueError("No vector store loaded. Create or load one first.")
        results_with_scores= self.vectorestore.similarity_search_with_score(query,k=top_k)
        formatted_results = {
            "query": query,
            "results": []
        }

        for doc, score in results_with_scores:
            formatted_results["results"].append({
                "text": doc.page_content,
                "metadata": doc.metadata,
                "score": float(score)
            })

        return formatted_results
    
    def get_retriever(self, top_k: int = 3):
        """
        Get a LangChain retriever for the vector store.
        """
        if self.vectorstore is None:
            raise ValueError("No vector store loaded. Create or load one first.")

        return self.vectorstore.as_retriever(search_kwargs={"k": top_k})
    
    def get_stats(self) -> Dict:
        """
        Get statistics about the vector store.
        """
        if self.vectorstore is None:
            return {"error": "No vector store loaded"}

        collection = self.vectorstore._collection
        return {
            "total_chunks": collection.count(),
            "embedding_model": self.embedding_model
        }


class RAGGenerator:
    """
    Generates answers by combining retrieval with an LLM.
    """

    def __init__(
        self,
        vectorstore_manager: VectorStoreManager,
        openai_model: str = "gpt-3.5-turbo",
        temperature: float = 0.0,
        top_k: int = 3
    ):
        """
        Initialize the RAG generator.
        """
        load_dotenv()

        self.vectorstore_manager = vectorstore_manager
        self.openai_model = openai_model
        self.temperature = temperature
        self.top_k = top_k

        # Initialize LLM
        self.llm = ChatOpenAI(
            model=openai_model,
            temperature=temperature
        )
        print(f"✅ OpenAI LLM initialized ({openai_model})!")

        # Build RAG chain
        self._build_rag_chain()
    def _build_rag_chain(self):
        """
        Build the RAG chain (retrieval + generation pipeline).
        """
        # Get retriever
        retriever = self.vectorstore_manager.get_retriever(top_k=self.top_k)

        # Create prompt template
        template = """You are a helpful assistant answering questions based on the provided context.

Context:
{context}

Question: {question}

Answer the question based on the context above. If you cannot answer based on the context, say so.

Answer:"""
        prompt = ChatPromptTemplate.from_template(template)

        # Helper function to format documents
        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        # Build the RAG chain
        self.rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | self.llm
            | StrOutputParser()

        )

        print("✅ RAG chain built!")
    def query(self, question: str) -> Dict:
        """
        Ask a question and get an AI-generated answer.
        """
        answer = self.rag_chain.invoke(question)

        return {
            "question": question,
            "answer": answer
        }
# Convenience function to build a complete RAG system
def build_rag_system(documents_folder: str) -> tuple:
    """
    Build a complete RAG system from documents.
    """
    print(f"\n{'='*60}")
    print("BUILDING RAG SYSTEM")
    print(f"{'='*60}\n")
    doc_processor=DocumentProcessors()
    documents=doc_processor.load_documents(documents_folder)
    chunks= doc_processor.chunk_documents(documents)

    #Step 2
    vectorstore_manager = VectorStoreManager()
    vectorstore_manager.create_vectorstore(chunks)

    # Step 3: Create RAG generator
    rag_generator = RAGGenerator(vectorstore_manager)

    print(f"\n{'='*60}")
    print("RAG SYSTEM READY!")
    print(f"{'='*60}\n")

    return doc_processor, vectorstore_manager, rag_generator




