In [1]:
from dotenv import load_dotenv, find_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import TextLoader, PyPDFLoader, WebBaseLoader, DirectoryLoader, CSVLoader

load_dotenv(find_dotenv())

USER_AGENT environment variable not set, consider setting it to identify your requests.


True

In [2]:
# 1. Text loader

loader = TextLoader('demo.txt')
docs = loader.load()

docs

[Document(metadata={'source': 'demo.txt'}, page_content='This is an example of using text file in langchain\nlangchain is a powerful framework')]

In [3]:
# 2. PDF Loader


pdf_loader = PyPDFLoader('demo.pdf')

docs  = pdf_loader.load()

docs

[Document(metadata={'producer': 'Skia/PDF m140 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Untitled document', 'source': 'demo.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content="The  hippocampus is  a  major  component  of  the  brain of  humans and  many  other  vertebrates.  It  plays  important  roles  in  the  consolidation of  information  from  short-term  memory to  long-term  memory,  and  in  spatial  memory that  enables  navigation.  In  humans  and  other  primates,  the  hippocampus  is  located  in  the  archicortex,  one  of  the  three  regions  of  allocortex,  in  each  hemisphere.  The  hippocampus  is  a  structure  found  in  all  vertebrates.  In  Alzheimer's  disease (and  other  forms  of  dementia),  the  hippocampus  is  one  of  the  first  regions  of  the  brain  to  suffer  damage;  short-term  memory  loss and  disorientation are  included  among  the  early  symptoms.  Damage  to  the  hippocampus  can  also

In [4]:
# 3. Web based loader
# Purpose: Scrape or extract text/content from URLs.
#pip install bs4

web_loader = WebBaseLoader('https://www.google.com')
docs = web_loader.load()
docs

[Document(metadata={'source': 'https://www.google.com', 'title': 'Google', 'language': 'en-IN'}, page_content='GoogleSearch Images Maps Play YouTube News Gmail Drive More »Web History | Settings | Sign in\xa0Advanced searchGoogle offered in:  हिन्दी বাংলা తెలుగు मराठी தமிழ் ગુજરાતી ಕನ್ನಡ മലയാളം ਪੰਜਾਬੀ AdvertisingBusiness SolutionsAbout GoogleGoogle.co.in© 2026 - Privacy - Terms')]

In [5]:
# 4. Directory Loaders

# Purpose: Load all files from a folder (with options for file types).

#pip install unstructured

dir_loader = DirectoryLoader('dataset', glob= '**/*.text')
docs = dir_loader.load()

print(docs)

[]


In [6]:
#5. CSV data loader:

csv_loader = CSVLoader('50_Startups.csv')

csv_data = csv_loader.load()

csv_data

[Document(metadata={'source': '50_Startups.csv', 'row': 0}, page_content='R&D Spend: 165349.2\nAdministration: 136897.8\nMarketing Spend: 471784.1\nState: New York\nProfit: 192261.83'),
 Document(metadata={'source': '50_Startups.csv', 'row': 1}, page_content='R&D Spend: 162597.7\nAdministration: 151377.59\nMarketing Spend: 443898.53\nState: California\nProfit: 191792.06'),
 Document(metadata={'source': '50_Startups.csv', 'row': 2}, page_content='R&D Spend: 153441.51\nAdministration: 101145.55\nMarketing Spend: 407934.54\nState: Florida\nProfit: 191050.39'),
 Document(metadata={'source': '50_Startups.csv', 'row': 3}, page_content='R&D Spend: 144372.41\nAdministration: 118671.85\nMarketing Spend: 383199.62\nState: New York\nProfit: 182901.99'),
 Document(metadata={'source': '50_Startups.csv', 'row': 4}, page_content='R&D Spend: 142107.34\nAdministration: 91391.77\nMarketing Spend: 366168.42\nState: Florida\nProfit: 166187.94'),
 Document(metadata={'source': '50_Startups.csv', 'row': 5}, 

In [None]:
# 6. API and Database loaders

from langchain_community.document_loaders import SQLDatabaseLoader
from langchain_community.utilities import SQLDatabase
from sqlalchemy import create_engine

# 1. Create the database connection engine
engine = create_engine("sqlite:///test.db")
dbEmployee = SQLDatabase(engine)

# dbEmployee.run("CREATE TABLE employees (id INTEGER, name TEXT);")
# dbEmployee.run("INSERT INTO employees VALUES (1, 'Alice');")

# 2. Define the loader with the SQL query
loader = SQLDatabaseLoader(
    db=dbEmployee, 
    query="SELECT * FROM employees"
)

# 3. Load the data as documents
docs = loader.load()

# Display the documents
print(docs)

OperationalError: (sqlite3.OperationalError) table employees already exists
[SQL: CREATE TABLE employees (id INTEGER, name TEXT);]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [None]:
#pip install jq
from langchain_community.document_loaders import JSONLoader

jsloader = JSONLoader(file_path='test.json',
                      jq_schema='.[]',
                      text_content=False)

docs = jsloader.load()
docs

[Document(metadata={'source': 'D:\\Study\\Simplylearn - AIML course\\Module 5\\test.json', 'seq_num': 1}, page_content='1'),
 Document(metadata={'source': 'D:\\Study\\Simplylearn - AIML course\\Module 5\\test.json', 'seq_num': 2}, page_content='PN')]