## Document Loaders In LangChain

#### TextLoader

In [2]:
from langchain.document_loaders import TextLoader


#### CSVLoader

In [3]:
from langchain.document_loaders.csv_loader import CSVLoader

#### UnstructuredURLLoader

LangChain's `UnstructuredURLLoader` internally uses the `unstructured` Python library to load content from URLs.

https://unstructured-io.github.io/unstructured/introduction.html

https://pypi.org/project/unstructured/#description

In [4]:
#installing necessary libraries, libmagic is used for file type detection
!pip3 install unstructured libmagic python-magic python-magic-bin



In [1]:
import requests
from bs4 import BeautifulSoup

def fetch_and_process_url(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled server would
            block the cell indefinitely.

    Returns:
        str: On an HTTP 200 response, the text extracted by BeautifulSoup's
        ``get_text`` with a single space as separator. For any other status
        code, ``"Failed to fetch: <status>"``. If the request or the parsing
        raises, ``"Error: <message>"``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return f"Failed to fetch: {response.status_code}"
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text(separator=' ')
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort a batch, so the
        # failure is reported in the returned string instead of being raised.
        return f"Error: {str(e)}"

# News articles to download.
urls = [
    "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
    "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html",
]

# Fetch the pages in order; each entry is whatever string the fetcher returned.
results = list(map(fetch_and_process_url, urls))
print(results)  # Print fetched and processed text


['  HDFC Bank re-appoints Sanmoy Chakrabarti as Chief Risk Officer                       \n \n     \n \n             \n \n       \n \n     English Hindi Gujarati Specials Search Quotes, News, Mutual Fund NAVs Moneycontrol Trending Stock Infosys\xa0 INE009A01021, INFY, 500209 State Bank of India\xa0 INE062A01020, SBIN, 500112 Yes Bank\xa0 INE528G01027, YESBANK, 532648 Bank Nifty\xa0 Nifty 500 \xa0 Quotes Mutual Funds Commodities Futures & Options Currency News Cryptocurrency Forum Notices Videos Glossary All    Hello, Login   Hello, Login Log-in or   Sign-Up My Account My Profile   My Portfolio My Watchlist FREE  Credit Score ₹100 Cash Reward My Alerts My Messages Price Alerts My Profile   My PRO My Portfolio My Watchlist FREE  Credit Score ₹100 Cash Reward My Alerts My Messages Price Alerts Logout Chat with Us Download App Follow us on: Premium My Alerts ->->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_BUSINESS_AS/MC_ENG_ROS_NWS_BUS_AS_ATF_728 Go  PRO  @₹99   PRO Advertisement Remove Ad Elections

## Text Splitters

In [5]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter configured to break on newlines, with chunk_size 200 and no overlap.
splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=0)