### HTML String

In [1]:
html_string = """

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Sample HTML Document</title>
</head>
<body>

    <h1>Introduction to Artificial Intelligence</h1>
    <p>
        Artificial Intelligence (AI) is a field of computer science focused on creating systems
        that can perform tasks requiring human intelligence.
    </p>

    <h2>History of AI</h2>
    <p>
        The concept of AI dates back to the 1950s when researchers began exploring machine-based
        reasoning and problem solving.
    </p>

    <h3>Early Years</h3>
    <p>
        Early AI systems relied heavily on symbolic reasoning and rule-based logic.
    </p>

    <h3>AI Winters</h3>
    <p>
        Due to limitations in computing power and data, AI research experienced periods of reduced
        funding known as AI winters.
    </p>

    <h2>Modern Artificial Intelligence</h2>
    <p>
        Modern AI systems leverage large datasets, powerful GPUs, and advanced algorithms.
    </p>

    <h3>Machine Learning</h3>
    <p>
        Machine learning allows systems to learn patterns directly from data without explicit programming.
    </p>

    <h4>Supervised Learning</h4>
    <p>
        Supervised learning uses labeled datasets to train predictive models.
    </p>

    <h4>Unsupervised Learning</h4>
    <p>
        Unsupervised learning discovers hidden patterns in unlabeled data.
    </p>

    <h3>Deep Learning</h3>
    <p>
        Deep learning uses multi-layer neural networks to achieve state-of-the-art performance
        in vision and language tasks.
    </p>

    <h2>Conclusion</h2>
    <p>
        Artificial Intelligence continues to evolve and transform industries worldwide.
    </p>

</body>
</html>

"""

### HTML Header Text Splitter

In [7]:
from langchain_text_splitters import HTMLHeaderTextSplitter

headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
    ("h3", "Header 4")
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)


In [8]:
html_header_splits

[Document(metadata={'Header 1': 'Introduction to Artificial Intelligence'}, page_content='Introduction to Artificial Intelligence'),
 Document(metadata={'Header 1': 'Introduction to Artificial Intelligence'}, page_content='Artificial Intelligence (AI) is a field of computer science focused on creating systems\n        that can perform tasks requiring human intelligence.'),
 Document(metadata={'Header 1': 'Introduction to Artificial Intelligence', 'Header 2': 'History of AI'}, page_content='History of AI'),
 Document(metadata={'Header 1': 'Introduction to Artificial Intelligence', 'Header 2': 'History of AI'}, page_content='The concept of AI dates back to the 1950s when researchers began exploring machine-based\n        reasoning and problem solving.'),
 Document(metadata={'Header 1': 'Introduction to Artificial Intelligence', 'Header 2': 'History of AI', 'Header 4': 'Early Years'}, page_content='Early Years'),
 Document(metadata={'Header 1': 'Introduction to Artificial Intelligence', '

## Using URL

In [19]:
from langchain_community.document_loaders import WebBaseLoader

url = "https://en.wikipedia.org/wiki/India/"

loader = WebBaseLoader(url)
docs = loader.load()


In [20]:
from langchain_text_splitters import HTMLHeaderTextSplitter

headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
    ("h3", "Header 4"),
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
html_header_splits = html_splitter.split_text(docs[0].page_content)


In [21]:
html_header_splits

[Document(metadata={}, page_content='India/ - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nIndia/\n\n\n\nAdd languages\n\n\n\n\t\t\t\t\t\tPage contents not supported in other languages.\n\t\t\t\t\t\n\n\n\n\n\n\n\n\n\n\nArticleTalk\n\n\n\n\n\nEnglish\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nTools\n\n\n\n\n\nTools\nmove to sidebar\nhide\n\n\n\n\t\tActio