In [None]:
!pip install langchain langchain-community pymupdf beautifulsoup4


### Loading Multiple Sources

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader, WebBaseLoader

# Source 1: a local PDF, parsed with PyMuPDF.
pdf_path = "PDFs/Erickson_Kretschmer_Mendis_chapter_4_PD.pdf"
pdf_docs = PyMuPDFLoader(pdf_path).load()

# Source 2: a web article, fetched and parsed by WebBaseLoader.
web_url = "https://medium.com/@vikrampande783/introduction-to-langchain-9e09aae37e62"
web_docs = WebBaseLoader(web_url).load()

# One flat list holding the documents from both sources (PDF first, then web).
all_docs = [*pdf_docs, *web_docs]

print(f"Loaded {len(all_docs)} documents total.")


Loaded 31 documents total.


### Chunking

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings are in characters: 500-character chunks, with 100
# characters shared between neighbouring chunks to preserve context.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)

# Break every loaded document into chunks.
split_docs = text_splitter.split_documents(all_docs)

print(f"Total chunks created: {len(split_docs)}")


Total chunks created: 241


### Indexing

In [None]:
from langchain.indexes import index, SQLRecordManager
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import os

# 1. Set up the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 2. Create an EMPTY FAISS vector store.
# The store must not be pre-filled with `split_docs`: the indexing API adds the
# chunks itself, so seeding the store here would leave every chunk in it twice,
# and the seeded copies would never be tracked (or cleaned up) by the record
# manager. A throw-away text is embedded only so FAISS can size the index, and
# is removed again immediately.
_BOOTSTRAP_ID = "__bootstrap__"
vectorstore = FAISS.from_texts(["bootstrap"], embedding_model, ids=[_BOOTSTRAP_ID])
vectorstore.delete([_BOOTSTRAP_ID])

# 3. Set up Record Manager with SQLite
record_manager = SQLRecordManager(
    namespace="faiss_index",
    db_url="sqlite:///record_manager.db"
)
record_manager.create_schema()

# The SQLite file outlives the kernel, but the FAISS store above is rebuilt in
# memory on every run. Drop records left over from a previous run so the
# indexing API does not skip chunks that are missing from the fresh store.
# Keys are deleted in batches to stay well below SQLite's bound-parameter limit.
_stale_keys = record_manager.list_keys()
for _start in range(0, len(_stale_keys), 500):
    record_manager.delete_keys(_stale_keys[_start:_start + 500])


In [7]:
# 4. Sync the chunks into the vector store through LangChain's indexing API.
#    `cleanup` also accepts "full", "scoped_full" or None; "incremental" needs
#    `source_id_key` to know which metadata field identifies a chunk's origin.
results = index(
    docs_source=split_docs,
    record_manager=record_manager,
    vector_store=vectorstore,
    source_id_key="source",
    cleanup="incremental",
)

print("✅ Indexing complete:", results)

✅ Indexing complete: {'num_added': 241, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}


### Adding more resources

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader, WebBaseLoader

# --- Local PDF sources ---
pdf_paths = [
    "PDFs/Erickson_Kretschmer_Mendis_chapter_4_PD.pdf",
    "PDFs/Vox-Jenkins.pdf",  # Public domain + culture
    "PDFs/Public Domain and Access to Knowledge.pdf",  # DigitalCommons UGA
    "PDFs/Giblin - What Happens When Books Enter the Public Domain.pdf",  # Harvard Ruggie
]

# Load each PDF in turn and flatten the results into one list.
pdf_docs = [
    doc
    for pdf_path in pdf_paths
    for doc in PyMuPDFLoader(pdf_path).load()
]

# --- Web sources ---
web_urls = [
    "https://medium.com/@vikrampande783/introduction-to-langchain-9e09aae37e62",  # Original
    "https://www.digitalocean.com/community/tutorials/langchain-language-model",  # Source #6
    "https://www.elastic.co/blog/langchain-tutorial",  # Source #7
]

# Same pattern for the web pages: one loader per URL, results flattened.
web_docs = [
    doc
    for web_url in web_urls
    for doc in WebBaseLoader(web_url).load()
]

# --- Everything together (PDF documents first, then web documents) ---
all_docs = [*pdf_docs, *web_docs]

print(f"✅ Loaded {len(all_docs)} documents from {len(pdf_paths)} PDFs and {len(web_urls)} web pages.")


✅ Loaded 134 documents from 4 PDFs and 3 web pages.


In [33]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Rebuild the splitter with the same settings as before:
# 500 characters per chunk, 100 characters of overlap between chunks.
splitter_settings = dict(chunk_size=500, chunk_overlap=100)
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Re-split the enlarged document set.
split_docs = text_splitter.split_documents(all_docs)

print(f"Total chunks created: {len(split_docs)}")


Total chunks created: 1129


In [None]:
# Point a record manager at the same namespace and SQLite file as before, so
# the indexing API can compare the new chunks with what it already recorded.
record_manager = SQLRecordManager(namespace="faiss_index", db_url="sqlite:///record_manager.db")

# Incremental sync of the enlarged chunk set into the existing vector store.
results = index(
    docs_source=split_docs,
    record_manager=record_manager,
    vector_store=vectorstore,
    source_id_key="source",
    cleanup="incremental",
)

print("✅ Re-indexing complete:", results)



✅ Re-indexing complete: {'num_added': 1019, 'num_updated': 0, 'num_skipped': 108, 'num_deleted': 133}


### LLM

In [8]:
from dotenv import load_dotenv
import os
import getpass


# Load environment variables from a .env file, if one is present.
load_dotenv()

# Read the Groq API key from the environment.
groq_key = os.getenv("GROQ_API_KEY")

# Fall back to an interactive, non-echoing prompt when the key is missing or
# empty. The prompt must be a fixed label: passing the key variable itself as
# the prompt would show "None" here, and would echo the secret if it were set.
if not groq_key:
    groq_key = getpass.getpass("Enter your GROQ_API_KEY: ")
    os.environ["GROQ_API_KEY"] = groq_key

In [9]:
from langchain_groq import ChatGroq

# Groq-hosted chat model shared by the QA chain and the agent.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,    # zero sampling temperature
    max_retries=2,    # retry budget passed to the client
    max_tokens=None,  # no explicit cap set here
    timeout=None,     # no explicit timeout set here
    # further ChatGroq parameters can be added here
)

### QA Retriever

In [10]:
from langchain.chains import RetrievalQA

# Reuse vectorstore and llm from the previous cells
retriever = vectorstore.as_retriever()

# Build the QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # other types: "map_reduce", "refine"
    return_source_documents=True  # optional: to see which docs were used
)

# Ask a question. `.invoke` replaces calling the chain object directly, which
# is deprecated and emits a LangChainDeprecationWarning.
query = "What are the main findings about the value of the public domain?"
result = qa_chain.invoke({"query": query})

# Output
print("💬 Answer:\n", result["result"])

# The retriever may return nothing; do not index into an empty list.
source_documents = result["source_documents"]
top_source = (
    source_documents[0].metadata.get("source", "No source found")
    if source_documents
    else "No source found"
)
print("\n📎 Source Document:\n", top_source)


  result = qa_chain({"query": query})


💬 Answer:
 Based on the provided context, the main findings about the value of the public domain can be inferred from the works of R. Pollock, 'The Value of the Public Domain' (2006). However, the specific findings are not explicitly stated in the given text.

But, R. Pollock's work is mentioned as 'The Value of the Public Domain' (London: Institute for Public Policy Research, 2006), available at http://rufuspollock.org/papers/value_of_public_domain.ippr.pdf (accessed 30 September 2018). 

This suggests that R. Pollock's work provides insights into the value of the public domain, but the actual findings are not provided in the given context.

📎 Source Document:
 Erickson_Kretschmer_Mendis_chapter_4_PD.pdf


In [11]:
query = "What is LangChain and what are its main use cases?"
# `.invoke` replaces the deprecated direct call on the chain object.
result = qa_chain.invoke({"query": query})

print("💬 Answer:\n", result["result"])

# Guard against an empty retrieval before reading the first source document.
source_documents = result["source_documents"]
top_source = source_documents[0].metadata.get("source") if source_documents else None
print("\n📎 Source:\n", top_source)


💬 Answer:
 LangChain is an open-source software framework that integrates Large Language Models (LLMs) into domain-specific applications. Released in October 2022, it has gained popularity in the industry and research for its easy-to-use interface. LangChain is designed to simplify the development, productionization, and deployment of LLM-powered applications.

LangChain has a set of building blocks for almost every stage of the LLM application lifecycle, making it a versatile tool for various use cases. Some of the main use cases of LangChain include:

1. **Chatbots and Conversational AI**: LangChain can be used to build chatbots that can understand and respond to user queries, using LLMs to generate human-like responses.
2. **Text Summarization and Generation**: LangChain can be used to summarize long pieces of text, generate text based on a prompt, or even create entire articles.
3. **Question Answering and Knowledge Retrieval**: LangChain can be used to build question-answering sys

### Gradio & Agent

In [29]:
from langchain.agents import Tool
from langchain.tools.retriever import create_retriever_tool

# Expose the vector store as a retriever ...
retriever = vectorstore.as_retriever()

# ... and wrap that retriever as a named tool the agent can call.
retriever_tool = create_retriever_tool(
    retriever=retriever,
    name="document_search",
    description="Use this tool to search information from uploaded documents",
)


In [40]:
from langchain.agents import initialize_agent
from langchain.memory import ConversationBufferMemory
from langchain.agents.agent_types import AgentType

# Conversation history is kept as message objects under the "chat_history" key.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Conversational ReAct agent whose only tool is the document retriever.
agent = initialize_agent(
    [retriever_tool],
    llm,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    memory=memory,
    verbose=True,
)


In [None]:
import gradio as gr

def chat_with_docs(message, history):
    """Answer one chat message by running it through the document agent.

    Args:
        message: The user's latest message text.
        history: Chat history supplied by the Gradio chat callback. It is not
            used here; only `message` is forwarded to the agent.

    Returns:
        The agent's final answer, read from the "output" key of its result.
    """
    # `.invoke` replaces the deprecated `agent.run(...)`; the agent takes the
    # user text under "input" and returns its answer under "output".
    return agent.invoke({"input": message})["output"]

# Build and launch the chat UI. The Chatbot component is given the same
# "messages" format as the ChatInterface so the two do not disagree on the
# history format.
gr.ChatInterface(
    fn=chat_with_docs,
    title="📚 Ask Your Documents (LLaMA 3)",
    chatbot=gr.Chatbot(show_copy_button=True, type="messages"),
    examples=[
        # 📘 PDF: Erickson_Kretschmer_Mendis_chapter_4_PD.pdf
        "What is the legal framework discussed in Chapter 4 on the public domain?",
        "How do the authors define cultural commons?",

        # 📕 PDF: Vox-Jenkins.pdf
        "Why does Vox argue that the public domain is shrinking?",
        "How does the public domain support creativity according to the Vox PDF?",

        # 📗 PDF: Public Domain and Access to Knowledge.pdf
        "What role does the public domain play in access to knowledge?",
        "How does copyright affect the spread of knowledge?",

        # 📙 PDF: What Happens When Books Enter the Public Domain.pdf
        "What are the main effects of books entering the public domain?",
        "How does the public benefit when copyright expires?",

        # 🌐 Web: Medium - Introduction to LangChain
        "What is LangChain and why is it useful?",
        "What are document loaders in LangChain?",

        # 🌐 Web: DigitalOcean - LangChain Guide
        "How does LangChain integrate with LLMs?",
        "What is a simple chain example from the DigitalOcean tutorial?",

        # 🌐 Web: Elastic - LangChain Tutorial
        "How can LangChain be used with vector databases?",
        "What does Elastic suggest for building RAG apps?"
    ],
    theme="default",
    type="messages"
).launch()
