In [1]:
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

True

In [3]:
# 1. Text Loaders

# Purpose: Load plain text files or strings.

from langchain_community.document_loaders import TextLoader

loader = TextLoader('dataFolder/demo.txt')
docs = loader.load()

In [4]:
docs

[Document(metadata={'source': 'dataFolder/demo.txt'}, page_content='This is an example of using text file in langchain\nlangchain is a powerful framework')]

In [12]:
# 2. PDF Loaders

# Purpose: Extract text from PDF documents.
# pip install pypdf

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader('dataFolder/demo.pdf')
docs = loader.load()
docs

[Document(metadata={'producer': 'Skia/PDF m140 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Untitled document', 'source': 'dataFolder/demo.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content="The  hippocampus is  a  major  component  of  the  brain of  humans and  many  other  vertebrates.  It  plays  important  roles  in  the  consolidation of  information  from  short-term  memory to  long-term  memory,  and  in  spatial  memory that  enables  navigation.  In  humans  and  other  primates,  the  hippocampus  is  located  in  the  archicortex,  one  of  the  three  regions  of  allocortex,  in  each  hemisphere.  The  hippocampus  is  a  structure  found  in  all  vertebrates.  In  Alzheimer's  disease (and  other  forms  of  dementia),  the  hippocampus  is  one  of  the  first  regions  of  the  brain  to  suffer  damage;  short-term  memory  loss and  disorientation are  included  among  the  early  symptoms.  Damage  to  the  hippocampus

In [None]:
# 3. Web Page Loaders

# Purpose: Scrape or extract text/content from URLs.
#pip install bs4


from langchain_community.document_loaders import WebBaseLoader

#headers = {"User-Agent": "my-cool-app/1.0 (contact: youremail@example.com)"}
loader = WebBaseLoader("https://www.example.com")
docs = loader.load()

docs

[Document(metadata={'source': 'https://www.example.com', 'title': 'Example Domain', 'language': 'No language found.'}, page_content='\n\n\nExample Domain\n\n\n\n\n\n\n\nExample Domain\nThis domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.\nMore information...\n\n\n\n')]

In [11]:
! pip install pypdf



In [13]:
# 4. Directory Loaders

# Purpose: Load all files from a folder (with options for file types).

#pip install unstructured

from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader('dataFolder/dataset', glob='**/*.txt')
docs = loader.load()

docs


libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
Error loading file dataFolder/dataset/empData.txt


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/apoltavets/nltk_data'
    - '/Users/apoltavets/anna-apps/ml-and-engineering-docs/.venv/nltk_data'
    - '/Users/apoltavets/anna-apps/ml-and-engineering-docs/.venv/share/nltk_data'
    - '/Users/apoltavets/anna-apps/ml-and-engineering-docs/.venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# 5. CSV & DataFrame Loaders

# Purpose: Load data from CSVs, Excel, and other tabular sources.

from langchain_community.document_loaders import CSVLoader

loader = CSVLoader('dataFolder/50_Startups.csv')
docs = loader.load()
docs


[Document(metadata={'source': '50_Startups.csv', 'row': 0}, page_content='R&D Spend: 165349.2\nAdministration: 136897.8\nMarketing Spend: 471784.1\nState: New York\nProfit: 192261.83'),
 Document(metadata={'source': '50_Startups.csv', 'row': 1}, page_content='R&D Spend: 162597.7\nAdministration: 151377.59\nMarketing Spend: 443898.53\nState: California\nProfit: 191792.06'),
 Document(metadata={'source': '50_Startups.csv', 'row': 2}, page_content='R&D Spend: 153441.51\nAdministration: 101145.55\nMarketing Spend: 407934.54\nState: Florida\nProfit: 191050.39'),
 Document(metadata={'source': '50_Startups.csv', 'row': 3}, page_content='R&D Spend: 144372.41\nAdministration: 118671.85\nMarketing Spend: 383199.62\nState: New York\nProfit: 182901.99'),
 Document(metadata={'source': '50_Startups.csv', 'row': 4}, page_content='R&D Spend: 142107.34\nAdministration: 91391.77\nMarketing Spend: 366168.42\nState: Florida\nProfit: 166187.94'),
 Document(metadata={'source': '50_Startups.csv', 'row': 5}, 

In [16]:
# 7. API & Database Loaders

# Purpose: Load data from APIs or databases (SQL, NoSQL).

# Popular Classes:

# SQLDatabaseLoader (for SQL databases)
# Custom API loaders (via HTTP requests)

from langchain_community.document_loaders import SQLDatabaseLoader
from langchain_community.utilities import SQLDatabase

# Create the database connection
dbEmployee = SQLDatabase.from_uri("sqlite:///dataFolder/test.db")

loader = SQLDatabaseLoader(
    db=dbEmployee,
    query="SELECT * FROM employees"
)
docs = loader.load()

docs


[Document(metadata={}, page_content='id: 1\nname: John Doe\nage: 30\ndepartment: Sales')]