In [1]:
!pip install langchain langchain-openai langchain-community chromadb tiktoken unstructured python-dotenv
!pip install "unstructured[all-docs]"

Collecting langchain-openai
  Downloading langchain_openai-0.3.30-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting chromadb
  Downloading chromadb-1.0.17-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting unstructured
  Downloading unstructured-0.18.13-py3-none-any.whl.metadata (24 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-

In [2]:
import os
import sys
from pathlib import Path
from typing import List, Dict, Any
import warnings
warnings.filterwarnings('ignore')

# Core imports
from dotenv import load_dotenv
import chromadb

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    DirectoryLoader,
    TextLoader,
    PyPDFLoader,
    Docx2txtLoader,
    CSVLoader
)
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain.prompts import PromptTemplate

print("All libraries imported successfully!")

All libraries imported successfully!


In [3]:
load_dotenv()

True

In [4]:
if not os.getenv("OPENAI_API_KEY"):
    print("⚠️  Please set your OPENAI_API_KEY!")
    print("You can:")
    print("1. Create a .env file with: OPENAI_API_KEY=your_api_key_here")
    print("2. Or uncomment the line above and add your API key")
else:
    print("✅ OpenAI API key is set!")

✅ OpenAI API key is set!


In [5]:
def create_sample_documents(directory: str = "./sample_docs"):
    """Create sample documents for testing"""
    os.makedirs(directory, exist_ok=True)

    sample_texts = {
        "ai_overview.txt": """
        Artificial Intelligence (AI) Overview

        Artificial Intelligence is the simulation of human intelligence processes by machines,
        especially computer systems. These processes include learning, reasoning, and self-correction.

        Key AI Technologies:
        - Machine Learning: Algorithms that improve through experience
        - Natural Language Processing: Understanding and generating human language
        - Computer Vision: Interpreting and understanding visual information
        - Robotics: Physical AI systems that interact with the world

        AI is transforming industries including healthcare, finance, transportation, and education.
        """,

        "machine_learning_guide.txt": """
        Machine Learning Guide

        Machine Learning is a subset of AI that enables systems to automatically learn
        and improve from experience without being explicitly programmed.

        Types of Machine Learning:

        1. Supervised Learning
        - Uses labeled training data
        - Examples: Classification, Regression
        - Algorithms: Linear Regression, Decision Trees, Random Forest

        2. Unsupervised Learning
        - Finds patterns in unlabeled data
        - Examples: Clustering, Association Rules
        - Algorithms: K-Means, Hierarchical Clustering

        3. Reinforcement Learning
        - Learns through interaction with environment
        - Uses rewards and penalties
        - Examples: Game playing, Robotics

        Popular ML Libraries: Scikit-learn, TensorFlow, PyTorch
        """,

        "deep_learning_intro.txt": """
        Introduction to Deep Learning

        Deep Learning is a subset of machine learning that uses artificial neural networks
        with multiple layers to model and understand complex patterns in data.

        Neural Network Components:
        - Neurons: Basic processing units
        - Layers: Input, Hidden, Output layers
        - Weights and Biases: Parameters that are learned
        - Activation Functions: ReLU, Sigmoid, Tanh

        Training Process:
        1. Forward Propagation: Data flows through network
        2. Loss Calculation: Compare predictions to actual values
        3. Backpropagation: Update weights to minimize loss
        4. Iteration: Repeat until convergence

        Applications:
        - Image Recognition: CNNs for computer vision
        - Natural Language Processing: RNNs, Transformers
        - Speech Recognition: Deep neural networks
        - Autonomous Vehicles: Multi-modal deep learning
        """,

        "ai_ethics.txt": """
        AI Ethics and Considerations

        As AI becomes more prevalent, ethical considerations become increasingly important.

        Key Ethical Issues:

        1. Bias and Fairness
        - AI systems can perpetuate or amplify existing biases
        - Need for diverse training data and testing
        - Regular auditing of AI systems

        2. Privacy and Security
        - AI systems often require large amounts of personal data
        - Risk of data breaches and misuse
        - Need for privacy-preserving techniques

        3. Transparency and Explainability
        - Many AI systems are "black boxes"
        - Need for interpretable AI
        - Right to explanation for automated decisions

        4. Job Displacement
        - AI automation may replace human jobs
        - Need for retraining and education programs
        - Consideration of economic impacts

        5. Autonomous Systems
        - Questions of responsibility and liability
        - Need for human oversight
        - Safety considerations

        Best Practices:
        - Inclusive design processes
        - Regular ethical reviews
        - Stakeholder engagement
        - Continuous monitoring and improvement
        """
    }

    for filename, content in sample_texts.items():
        with open(os.path.join(directory, filename), 'w', encoding='utf-8') as f:
            f.write(content.strip())

    print(f"✅ Sample documents created in {directory}")
    print(f"Created {len(sample_texts)} files:")
    for filename in sample_texts.keys():
        print(f"  - {filename}")
    for content in sample_texts.values():
        print(f"  - {content[:50]}...")

# Create sample documents
create_sample_documents()

✅ Sample documents created in ./sample_docs
Created 4 files:
  - ai_overview.txt
  - machine_learning_guide.txt
  - deep_learning_intro.txt
  - ai_ethics.txt
  - 
        Artificial Intelligence (AI) Overview

  ...
  - 
        Machine Learning Guide

        Machine L...
  - 
        Introduction to Deep Learning

        De...
  - 
        AI Ethics and Considerations

        As ...


In [6]:
DOCUMENTS_PATH = "./sample_docs"  # Change this to your documents directory
PERSIST_DIRECTORY = "./chroma_db"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

In [7]:
embeddings = OpenAIEmbeddings()
llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0.7,
    max_tokens=500
)

In [8]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

print("✅ RAG components initialized!")

✅ RAG components initialized!


In [9]:
def load_documents_from_directory(directory_path: str) -> List[Document]:
    """Load documents from directory"""
    documents = []

    # Define loaders for different file types
    loaders = {
        '.txt': TextLoader,
        '.pdf': PyPDFLoader,
        '.docx': Docx2txtLoader,
        '.csv': CSVLoader,
    }

    print(f"📁 Loading documents from: {directory_path}")

    if not os.path.exists(directory_path):
        print(f"❌ Directory {directory_path} does not exist!")
        return documents

    # Load files
    for file_path in Path(directory_path).rglob('*'):
        if file_path.is_file():
            file_extension = file_path.suffix.lower()

            if file_extension in loaders:
                try:
                    loader = loaders[file_extension](str(file_path))
                    file_docs = loader.load()

                    # Add metadata
                    for doc in file_docs:
                        doc.metadata.update({
                            'source': str(file_path),
                            'filename': file_path.name,
                            'file_type': file_extension
                        })

                    documents.extend(file_docs)
                    print(f"  ✅ Loaded: {file_path.name}")

                except Exception as e:
                    print(f"  ❌ Error loading {file_path.name}: {str(e)}")
            else:
                print(f"  ⚠️  Unsupported file type: {file_path.name}")

    print(f"\n📊 Total documents loaded: {len(documents)}")
    return documents

# Load documents
documents = load_documents_from_directory(DOCUMENTS_PATH)

# Show first document preview
if documents:
    print(f"\n📄 Preview of first document:")
    print(f"Source: {documents[0].metadata.get('filename', 'Unknown')}")
    print(f"Content (first 200 chars): {documents[0].page_content[:200]}...")

📁 Loading documents from: ./sample_docs
  ✅ Loaded: machine_learning_guide.txt
  ✅ Loaded: deep_learning_intro.txt
  ✅ Loaded: ai_overview.txt
  ✅ Loaded: ai_ethics.txt

📊 Total documents loaded: 4

📄 Preview of first document:
Source: machine_learning_guide.txt
Content (first 200 chars): Machine Learning Guide

        Machine Learning is a subset of AI that enables systems to automatically learn
        and improve from experience without being explicitly programmed.

        Types o...


In [10]:
def split_documents(documents: List[Document]) -> List[Document]:
    """Split documents into smaller chunks"""
    print("✂️  Splitting documents into chunks...")

    chunks = text_splitter.split_documents(documents)

    print(f"📊 Created {len(chunks)} chunks from {len(documents)} documents")

    # Show chunk statistics
    chunk_lengths = [len(chunk.page_content) for chunk in chunks]
    avg_length = sum(chunk_lengths) / len(chunk_lengths) if chunk_lengths else 0

    print(f"📊 Chunk statistics:")
    print(f"  Average chunk length: {avg_length:.0f} characters")
    print(f"  Min chunk length: {min(chunk_lengths) if chunk_lengths else 0}")
    print(f"  Max chunk length: {max(chunk_lengths) if chunk_lengths else 0}")

    return chunks

# Split documents
if documents:
    chunks = split_documents(documents)

    # Preview first chunk
    if chunks:
        print(f"\n📄 Preview of first chunk:")
        print(f"Source: {chunks[0].metadata.get('filename', 'Unknown')}")
        print(f"Content: {chunks[0].page_content[:300]}...")
else:
    print("❌ No documents to split!")

✂️  Splitting documents into chunks...
📊 Created 5 chunks from 4 documents
📊 Chunk statistics:
  Average chunk length: 759 characters
  Min chunk length: 490
  Max chunk length: 937

📄 Preview of first chunk:
Source: machine_learning_guide.txt
Content: Machine Learning Guide

        Machine Learning is a subset of AI that enables systems to automatically learn
        and improve from experience without being explicitly programmed.

        Types of Machine Learning:

        1. Supervised Learning
        - Uses labeled training data
        - E...


In [11]:
def create_vectorstore(chunks: List[Document], persist_dir: str):
    """Create ChromaDB vectorstore from document chunks"""
    print("🗄️  Creating ChromaDB vectorstore...")

    # Remove existing directory if it exists
    if os.path.exists(persist_dir):
        import shutil
        shutil.rmtree(persist_dir)
        print(f"  🗑️  Removed existing vectorstore at {persist_dir}")

    # Create vectorstore
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_dir
    )

    print(f"✅ Vectorstore created successfully!")
    print(f"  📍 Location: {persist_dir}")
    print(f"  📊 Number of chunks stored: {len(chunks)}")

    return vectorstore

if 'chunks' in locals() and chunks:
    vectorstore = create_vectorstore(chunks, PERSIST_DIRECTORY)
else:
    print("❌ No chunks available to create vectorstore!")

🗄️  Creating ChromaDB vectorstore...
✅ Vectorstore created successfully!
  📍 Location: ./chroma_db
  📊 Number of chunks stored: 5


In [12]:
def setup_qa_chain(vectorstore, llm):
    """Setup the Question-Answering chain"""
    print("🔗 Setting up QA chain...")

    # Custom prompt template
    prompt_template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer based on the context, just say that you don't know, don't try to make up an answer.

    Context: {context}

    Question: {question}

    Answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # Create QA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5}  # Number of documents to retrieve
        ),
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True
    )

    print("✅ QA chain setup complete!")
    return qa_chain

# Setup QA chain
if 'vectorstore' in locals():
    qa_chain = setup_qa_chain(vectorstore, llm)
else:
    print("❌ Vectorstore not available!")

🔗 Setting up QA chain...
✅ QA chain setup complete!


In [None]:
def query_rag_system(qa_chain, question: str):
    """Query the RAG system with a question"""
    print(f"❓ Question: {question}")
    print("-" * 60)

    try:
        # Get response
        result = qa_chain({"query": question})

        # Display answer
        print(f"💡 Answer: {result['result']}")

        # Display sources
        print(f"\n📚 Sources used:")
        for i, doc in enumerate(result['source_documents'], 1):
            filename = doc.metadata.get('filename', 'Unknown')
            print(f"  {i}. {filename}")
            print(f"     Preview: {doc.page_content[:100]}...")

        return result

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return None

# Test queries
if 'qa_chain' in locals():
    test_questions = [
        "What is artificial intelligence?",
        "What are the different types of machine learning?",
        "How does deep learning work?",
        "What are the ethical considerations in AI?",
        "What is the difference between supervised and unsupervised learning?"
    ]

    print("🚀 Testing RAG System with Sample Questions")
    print("=" * 60)

    for i, question in enumerate(test_questions, 1):
        print(f"\n{'='*60}")
        print(f"TEST QUERY {i}")
        print('='*60)

        result = query_rag_system(qa_chain, question)

        if i < len(test_questions):  # Don't wait after last question
            input("\nPress Enter to continue to next question...")
else:
    print("❌ QA chain not available!")

🚀 Testing RAG System with Sample Questions

TEST QUERY 1
❓ Question: What is artificial intelligence?
------------------------------------------------------------


  result = qa_chain({"query": question})


💡 Answer: Artificial Intelligence is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning, reasoning, and self-correction.

📚 Sources used:
  1. ai_overview.txt
     Preview: Artificial Intelligence (AI) Overview

        Artificial Intelligence is the simulation of human in...
  2. machine_learning_guide.txt
     Preview: Machine Learning Guide

        Machine Learning is a subset of AI that enables systems to automatic...
  3. ai_ethics.txt
     Preview: AI Ethics and Considerations

        As AI becomes more prevalent, ethical considerations become in...
  4. ai_ethics.txt
     Preview: 4. Job Displacement
        - AI automation may replace human jobs
        - Need for retraining and...
  5. deep_learning_intro.txt
     Preview: Introduction to Deep Learning

        Deep Learning is a subset of machine learning that uses artif...

Press Enter to continue to next question...What is deep learning?

TEST QUERY 2
