# Document Loaders and Text Splitters in LangChain


1. **File Loaders**: For loading local files (CSV, PDF, TXT, etc.)
2. **Web Loaders**: For loading content from web sources


In [6]:
from langchain_community.document_loaders import CSVLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

True

## Understanding Text Splitting

- RecursiveCharacterTextSplitter
- CharacterTextSplitter


In [None]:
# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Number of characters per chunk
    chunk_overlap=50,  # Number of overlapping characters
    length_function=len,
)

print("Text splitter initialized with chunk size of 500 and overlap of 50 characters")

## File Loader Example: CSV


In [7]:
def load_and_process_csv():
    # Initialize the CSV loader
    loader = CSVLoader(
        file_path="customers-100.csv",
        csv_args={
            'delimiter': ',',
            'quotechar': '"',
        }
    )
    
    # Load the documents
    documents = loader.load()
    print(f"Loaded {len(documents)} documents")
    
    # Split documents into chunks
    splits = text_splitter.split_documents(documents)
    print(f"Created {len(splits)} splits")
    
    # Take first few chunks to stay within token limits
    limited_splits = splits[:5]
    
    return "\n\n".join([doc.page_content for doc in limited_splits])

context = load_and_process_csv()

Loaded 100 documents
Created 100 splits


In [8]:
def setup_qa_chain():
    # Initialize OpenAI model
    llm = OpenAI(temperature=0)
    
    # Create prompt template
    prompt = PromptTemplate(
        template="""Based on the following customer data, please answer the question.
        
        Customer Data:
        {context}
        
        Question: {question}
        
        Answer: """,
        input_variables=["context", "question"]
    )
    
    return prompt | llm

# Set up the chain
qa_chain = setup_qa_chain()

# Test with a sample question
question = "What is sheryl's email address?"
response = qa_chain.invoke({"context": context, "question": question})
print(f"Question: {question}")
print(f"Answer: {response}")

Question: What is sheryl's email address?
Answer:  zunigavanessa@smith.info


## Web Loader Example


In [9]:
def load_and_process_webpage(url):
    # Initialize web loader with custom headers
    loader = WebBaseLoader(
        url,
        verify_ssl=False,
        header_template={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
    )
    
    # Load and process the webpage
    documents = loader.load()
    print(f"Loaded webpage with {len(documents)} documents")
    
    # Split the content
    splits = text_splitter.split_documents(documents)
    print(f"Created {len(splits)} splits")
    
    limited_splits = splits[:3]
    return "\n\n".join([doc.page_content for doc in limited_splits])

# Test with a sample URL
url = "https://en.wikipedia.org/wiki/LangChain"
web_context = load_and_process_webpage(url)

# Test with some questions
questions = [
    "What is LangChain?",
    "What are the main features of LangChain?"
]

for question in questions:
    response = qa_chain.invoke({"context": web_context, "question": question})
    print(f"\nQuestion: {question}")
    print(f"Answer: {response}")



Loaded webpage with 1 documents
Created 42 splits

Question: What is LangChain?
Answer: LangChain is a language learning platform that offers courses in 8 different languages: Persian, Korean, Hindi, Japanese, Portuguese, Thai, Turkish, and Chinese. It provides tools and resources for users to improve their language skills and offers a variety of features such as interactive lessons, vocabulary building exercises, and cultural insights. LangChain aims to make language learning accessible and enjoyable for all users.

Question: What are the main features of LangChain?
Answer: 
1. Multilingual Support: LangChain supports 8 different languages, making it accessible to a wide range of users.

2. Translation Tools: LangChain offers various translation tools, such as machine translation and human translation, to help users accurately translate content.

3. Collaboration: LangChain allows for collaboration between users, making it easier to work on translations and share resources.

4. Histor