---
title: "HTML to markdown web scraping"
description: "A simple web scraping project to convert HTML to markdown using Python"
date: 2024-05-20
tags: ["python", "web scraping", "html", "markdown"]
---

In [None]:
# install prerequisites
%pip install markdownify
%pip install requests
%pip install pandas

In [None]:
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

def find_content(soup):
    """Locate the element most likely to hold the page's main content.

    Candidates are tried in priority order -- ``<div class="main-content">``,
    then ``<main>``, then ``<article>`` -- and the first truthy match wins.
    When none of them match, the ``<body>`` element is returned, or ``None``
    if the document has no body either.
    """
    candidates = (
        ('div', {'class': 'main-content'}),
        ('main', {}),
        ('article', {}),
    )
    for tag_name, attributes in candidates:
        match = soup.find(tag_name, attrs=attributes)
        if match:
            return match

    # Nothing specific matched: fall back to the whole body (or None).
    return soup.find('body') or None

def html_to_markdown(url, timeout=10):
    """Download *url* and return its main content converted to Markdown.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The Markdown text (ATX-style ``#`` headings) of the element chosen by
        ``find_content``, or the string ``"Content not found."`` when no
        content element could be located.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # Fetch the page; raise_for_status turns 4xx/5xx answers into exceptions.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Narrow the document down to its main content using the heuristics.
    content_div = find_content(soup)
    if not content_div:
        return "Content not found."

    return md(str(content_div), heading_style="ATX")

# Example usage: convert the Bootstrap modal documentation page and print the
# resulting Markdown.
url = 'https://getbootstrap.com/docs/5.3/components/modal/'
markdown_output = html_to_markdown(url)
print(markdown_output)


In [None]:
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from datetime import datetime

def find_content(soup):
    """Pick the element that most plausibly wraps the page's main content.

    Lookup order: ``<div class="main-content">``, ``<main>``, ``<article>``.
    The first truthy result is returned; otherwise the ``<body>`` element,
    or ``None`` when the document has no body.
    """
    lookups = (
        ('div', {'class': 'main-content'}),
        ('main', {}),
        ('article', {}),
    )
    # Lazy generator: lookups stop as soon as one yields a truthy element.
    found = (soup.find(name, attrs=attrs) for name, attrs in lookups)
    best = next((element for element in found if element), None)
    if best is not None:
        return best
    return soup.find('body') or None

def extract_metadata(soup, url):
    """Collect title, description and retrieval time for a parsed page.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.
        url: Address the document was fetched from; stored unchanged.

    Returns:
        A dict with the keys ``title``, ``url``, ``date_retrieved`` (current
        time formatted as ``YYYY-MM-DD HH:MM:SS``) and ``description``.
        A missing ``<title>`` yields ``'No Title'``; a missing description
        meta tag, or one without a ``content`` attribute, yields
        ``'No Description'``.
    """
    # Look the title up once instead of once for the test and once for the value.
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else 'No Title'

    description_tag = soup.find('meta', attrs={"name": "description"})
    if description_tag is not None:
        # .get avoids a KeyError for <meta name="description"> tags that
        # carry no content attribute.
        description = description_tag.get('content', 'No Description')
    else:
        description = 'No Description'

    return {
        'title': title,
        'url': url,
        'date_retrieved': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'description': description,
    }

def _yaml_quoted(value):
    """Return *value* as a double-quoted YAML scalar with specials escaped."""
    import json  # local import: this cell's import list does not include json

    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # quotes, backslashes and newlines in scraped text cannot break the block.
    return json.dumps(str(value), ensure_ascii=False)


def html_to_markdown(url, timeout=10):
    """Download *url* and return its main content as Markdown with front matter.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        A YAML front matter block (``title``, ``url``, ``date_retrieved``,
        ``description``) followed by the page content as Markdown, or the
        string ``"Content not found."`` when no content element was located.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract content and metadata
    content_div = find_content(soup)
    metadata = extract_metadata(soup, url)

    if not content_div:
        return "Content not found."

    markdown_text = md(str(content_div), heading_style="ATX")

    # Tidy the Markdown: drop trailing whitespace and squeeze runs of blank
    # lines down to one. Indentation and single blank lines are kept because
    # nested lists, code blocks and paragraph breaks depend on them.
    tidy_lines = []
    for line in markdown_text.split('\n'):
        line = line.rstrip()
        if line or (tidy_lines and tidy_lines[-1]):
            tidy_lines.append(line)
    markdown_text = '\n'.join(tidy_lines).rstrip('\n')

    # Values are escaped so scraped titles/descriptions containing quotes or
    # line breaks still produce valid YAML.
    frontmatter = (
        "---\n"
        f"title: {_yaml_quoted(metadata['title'])}\n"
        f"url: {_yaml_quoted(metadata['url'])}\n"
        f"date_retrieved: {_yaml_quoted(metadata['date_retrieved'])}\n"
        f"description: {_yaml_quoted(metadata['description'])}\n"
        "---\n"
    )
    return frontmatter + markdown_text

# Example usage: convert a Jekyll documentation page and print the Markdown
# together with its generated front matter.
url = 'https://jekyllrb.com/docs/configuration/options/#global-configuration'
markdown_output = html_to_markdown(url)
print(markdown_output)


In [None]:
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from datetime import datetime

def find_content(soup):
    """Return the element that best represents the page's main content.

    Tries ``<div class="main-content">``, ``<main>`` and ``<article>`` in
    that order and returns the first truthy hit. Falls back to ``<body>``,
    and to ``None`` when even that is absent.
    """
    preferred = (
        ('div', {'class': 'main-content'}),
        ('main', {}),
        ('article', {}),
    )
    for tag_name, tag_attrs in preferred:
        element = soup.find(tag_name, attrs=tag_attrs)
        if not element:
            continue
        return element

    # No preferred container found; use the body if there is one.
    return soup.find('body') or None

def extract_metadata(soup, url):
    """Collect title, description and retrieval time for a parsed page.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.
        url: Address the document was fetched from; stored unchanged.

    Returns:
        A dict with the keys ``title``, ``url``, ``date_retrieved`` (current
        time formatted as ``YYYY-MM-DD HH:MM:SS``) and ``description``.
        A missing ``<title>`` yields ``'No Title'``; a missing description
        meta tag, or one without a ``content`` attribute, yields
        ``'No Description'``.
    """
    # Look the title up once instead of once for the test and once for the value.
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else 'No Title'

    description_tag = soup.find('meta', attrs={"name": "description"})
    if description_tag is not None:
        # .get avoids a KeyError for <meta name="description"> tags that
        # carry no content attribute.
        description = description_tag.get('content', 'No Description')
    else:
        description = 'No Description'

    return {
        'title': title,
        'url': url,
        'date_retrieved': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'description': description,
    }

def _normalize_heading(line):
    """Return ATX heading *line* with exactly one space after its '#' run."""
    text = line.lstrip('#')
    level = len(line) - len(text)
    if level > 6:
        # More than six '#' is not a Markdown heading; leave it untouched.
        return line
    # Only the leading run is rebuilt, so the heading level and any '#'
    # inside the heading text (e.g. "C#") survive.
    return ('#' * level + ' ' + text.strip()).rstrip()


def _normalize_list_marker(line):
    """Return *line* with a space after its leading list marker, if it is one."""
    if len(line) < 2 or line[1] in ' \t':
        # Lone marker, or already followed by whitespace: nothing to fix.
        return line
    marker = line[0]
    if line[1] == marker:
        # '**bold**', '---', '***', '++' ... are not list items.
        return line
    if marker == '*' and '*' in line[1:]:
        # '*emphasis*' opening a line is inline markup, not a bullet.
        return line
    return marker + ' ' + line[1:]


def clean_markdown(markdown_text):
    """Normalise spacing after heading and list markers in Markdown text.

    Lines starting with ``#`` get exactly one space between the ``#`` run and
    the heading text, keeping the heading level. Lines starting with ``-``,
    ``*`` or ``+`` that look like list items get a space after the marker.
    Lines inside fenced code blocks are passed through unchanged.

    Args:
        markdown_text: Markdown source with ``\\n`` line separators.

    Returns:
        The cleaned Markdown with the same number of lines.
    """
    cleaned_lines = []
    fence = None  # the opening fence marker while inside a fenced code block
    for line in markdown_text.split('\n'):
        stripped = line.lstrip()
        if fence is not None:
            # Code must not be rewritten ('#include', '-flag', '# comment').
            if stripped.startswith(fence):
                fence = None
        elif stripped.startswith(('```', '~~~')):
            fence = stripped[:3]
        elif line.startswith('#'):
            line = _normalize_heading(line)
        elif line.startswith(('-', '*', '+')):
            line = _normalize_list_marker(line)
        cleaned_lines.append(line)
    return '\n'.join(cleaned_lines)

def _yaml_quoted(value):
    """Return *value* as a double-quoted YAML scalar with specials escaped."""
    import json  # local import: this cell's import list does not include json

    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # quotes, backslashes and newlines in scraped text cannot break the block.
    return json.dumps(str(value), ensure_ascii=False)


def html_to_markdown(url, timeout=10):
    """Download *url* and return its main content as Markdown with front matter.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        A YAML front matter block (``title``, ``url``, ``date_retrieved``,
        ``description``) followed by the page content as Markdown passed
        through ``clean_markdown``, or the string ``"Content not found."``
        when no content element was located.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    content_div = find_content(soup)
    metadata = extract_metadata(soup, url)

    if not content_div:
        return "Content not found."

    raw_markdown = md(str(content_div), heading_style="ATX")
    markdown_text = clean_markdown(raw_markdown)

    # Values are escaped so scraped titles/descriptions containing quotes or
    # line breaks still produce valid YAML.
    frontmatter = (
        "---\n"
        f"title: {_yaml_quoted(metadata['title'])}\n"
        f"url: {_yaml_quoted(metadata['url'])}\n"
        f"date_retrieved: {_yaml_quoted(metadata['date_retrieved'])}\n"
        f"description: {_yaml_quoted(metadata['description'])}\n"
        "---\n"
    )
    return frontmatter + markdown_text

# Example usage: convert a Jekyll documentation page and print the cleaned
# Markdown together with its generated front matter.
url = 'https://jekyllrb.com/docs/configuration/options/#global-configuration'
markdown_output = html_to_markdown(url)
print(markdown_output)


In [2]:
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from datetime import datetime

def find_content(soup):
    """Pick the element that most plausibly wraps the page's main content.

    Lookup order: ``<div class="main-content">``, ``<main>``, ``<article>``.
    The first truthy result is returned; otherwise the ``<body>`` element,
    or ``None`` when the document has no body.
    """
    lookups = (
        ('div', {'class': 'main-content'}),
        ('main', {}),
        ('article', {}),
    )
    # Lazy generator: lookups stop as soon as one yields a truthy element.
    found = (soup.find(name, attrs=attrs) for name, attrs in lookups)
    best = next((element for element in found if element), None)
    if best is not None:
        return best
    return soup.find('body') or None

def extract_metadata(soup, url):
    """Collect title, description and retrieval time for a parsed page.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.
        url: Address the document was fetched from; stored unchanged.

    Returns:
        A dict with the keys ``title``, ``url``, ``date_retrieved`` (current
        time formatted as ``YYYY-MM-DD HH:MM:SS``) and ``description``.
        A missing ``<title>`` yields ``'No Title'``; a missing description
        meta tag, or one without a ``content`` attribute, yields
        ``'No Description'``.
    """
    # Look the title up once instead of once for the test and once for the value.
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else 'No Title'

    description_tag = soup.find('meta', attrs={"name": "description"})
    if description_tag is not None:
        # .get avoids a KeyError for <meta name="description"> tags that
        # carry no content attribute.
        description = description_tag.get('content', 'No Description')
    else:
        description = 'No Description'

    return {
        'title': title,
        'url': url,
        'date_retrieved': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'description': description,
    }

def _normalize_heading(line):
    """Return ATX heading *line* with exactly one space after its '#' run."""
    text = line.lstrip('#')
    level = len(line) - len(text)
    if level > 6:
        # More than six '#' is not a Markdown heading; leave it untouched.
        return line
    # Only the leading run is rebuilt, so the heading level and any '#'
    # inside the heading text (e.g. "C#") survive.
    return ('#' * level + ' ' + text.strip()).rstrip()


def _normalize_list_marker(line):
    """Return *line* with a space after its leading list marker, if it is one."""
    if len(line) < 2 or line[1] in ' \t':
        # Lone marker, or already followed by whitespace: nothing to fix.
        return line
    marker = line[0]
    if line[1] == marker:
        # '**bold**', '---', '***', '++' ... are not list items.
        return line
    if marker == '*' and '*' in line[1:]:
        # '*emphasis*' opening a line is inline markup, not a bullet.
        return line
    return marker + ' ' + line[1:]


def clean_markdown(markdown_text):
    """Collapse blank-line runs and normalise heading/list marker spacing.

    Outside fenced code blocks, consecutive blank (whitespace-only) lines are
    reduced to the first one, lines starting with ``#`` get exactly one space
    between the ``#`` run and the heading text (the heading level is kept),
    and lines starting with ``-``, ``*`` or ``+`` that look like list items
    get a space after the marker. Lines inside fenced code blocks are passed
    through unchanged, including their blank lines.

    Args:
        markdown_text: Markdown source with ``\\n`` line separators.

    Returns:
        The cleaned Markdown text.
    """
    cleaned_lines = []
    consecutive_blank_lines = 0
    fence = None  # the opening fence marker while inside a fenced code block

    for line in markdown_text.split('\n'):
        stripped = line.lstrip()

        if fence is not None:
            # Code must not be rewritten ('#include', '-flag', blank lines).
            if stripped.startswith(fence):
                fence = None
            consecutive_blank_lines = 0
            cleaned_lines.append(line)
            continue

        if stripped.startswith(('```', '~~~')):
            fence = stripped[:3]
            consecutive_blank_lines = 0
            cleaned_lines.append(line)
            continue

        if line.strip() == '':
            consecutive_blank_lines += 1
            if consecutive_blank_lines > 1:
                # Keep only the first blank line of a run.
                continue
        else:
            consecutive_blank_lines = 0

        if line.startswith('#'):
            line = _normalize_heading(line)
        elif line.startswith(('-', '*', '+')):
            line = _normalize_list_marker(line)
        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)

def _yaml_quoted(value):
    """Return *value* as a double-quoted YAML scalar with specials escaped."""
    import json  # local import: this cell's import list does not include json

    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # quotes, backslashes and newlines in scraped text cannot break the block.
    return json.dumps(str(value), ensure_ascii=False)


def html_to_markdown(url, timeout=10):
    """Download *url* and return its main content as Markdown with front matter.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        A YAML front matter block (``title``, ``url``, ``date_retrieved``,
        ``description``) followed by the page content as Markdown passed
        through ``clean_markdown``, or the string ``"Content not found."``
        when no content element was located.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    content_div = find_content(soup)
    metadata = extract_metadata(soup, url)

    if not content_div:
        return "Content not found."

    raw_markdown = md(str(content_div), heading_style="ATX")
    markdown_text = clean_markdown(raw_markdown)

    # Values are escaped so scraped titles/descriptions containing quotes or
    # line breaks still produce valid YAML.
    frontmatter = (
        "---\n"
        f"title: {_yaml_quoted(metadata['title'])}\n"
        f"url: {_yaml_quoted(metadata['url'])}\n"
        f"date_retrieved: {_yaml_quoted(metadata['date_retrieved'])}\n"
        f"description: {_yaml_quoted(metadata['description'])}\n"
        "---\n"
    )
    return frontmatter + markdown_text

# Example usage: convert the Bootstrap modal documentation page and print the
# cleaned Markdown together with its generated front matter.
url = 'https://getbootstrap.com/docs/5.3/components/modal/'
markdown_output = html_to_markdown(url)
print(markdown_output)


---
title: "Modal · Bootstrap v5.3"
url: "https://getbootstrap.com/docs/5.3/components/modal/"
date_retrieved: "2024-05-20 21:32:11"
description: "Use Bootstrap’s JavaScript modal plugin to add dialogs to your site for lightboxes, user notifications, or completely custom content."
---

[View on GitHub](https://github.com/twbs/bootstrap/blob/v5.3.3/site/content/docs/5.3/components/modal.md "View and edit this file on GitHub") 

# Modal

Use Bootstrap’s JavaScript modal plugin to add dialogs to your site for lightboxes, user notifications, or completely custom content.

 On this page
 
* *On this page**

---

* [How it works](#how-it-works)
* [Examples](#examples)
	+ [Modal components](#modal-components)
	+ [Live demo](#live-demo)
	+ [Static backdrop](#static-backdrop)
	+ [Scrolling long content](#scrolling-long-content)
	+ [Vertically centered](#vertically-centered)
	+ [Tooltips and popovers](#tooltips-and-popovers)
	+ [Using the grid](#using-the-grid)
	+ [Varying modal content](#varyin