### Medium.com Articles to Markdown Files 
https://dsavir-h.medium.com/medium-to-markdown-via-python-ease-of-use-4a3687a53704

In [None]:
from datetime import datetime
import html2text
import requests
from bs4 import BeautifulSoup
import sys
import re

def get_html_element(element: str, soup: "BeautifulSoup") -> str:
    """
    Return the text of the first occurrence of an HTML element in a parsed document.

    Parameters:
    - element (str): The tag name of the HTML element to search for (e.g., 'h1', 'div').
    - soup (BeautifulSoup): A BeautifulSoup object containing the parsed HTML document.

    Returns:
    - str: The text of the first matching element if found; otherwise, an empty string
      (a notice naming the missing element is printed in that case).
    """
    result = soup.find(element)
    if result is None:
        # Plain {element}: the previous "${element}" printed a stray "$" before the tag name.
        print(f"No element {element} found.")
        return ""
    return result.text

def cut_text_at_marker(marker:str,text:str,beginning:bool):
    """
    Trim `text` at `marker` and return the part that is kept.

    Parameters:
    - marker (str): The substring to cut at; the marker itself is never included in the result.
    - text (str): The text to trim.
    - beginning (bool): If True, return everything after the first occurrence of the marker.
      If False, return everything before the last occurrence of the marker.

    Returns:
    - str: The trimmed text, or an empty string when the marker does not occur in `text`.
    """
    # Locate the first marker when trimming the start, the last one when trimming the end
    position = text.find(marker) if beginning else text.rfind(marker)

    # Marker absent: nothing is kept
    if position == -1:
        return ""

    if beginning:
        return text[position + len(marker):]
    return text[:position]

### get html content from url
url = "https://medium.com/google-cloud/intelligent-document-discovery-with-vertex-ai-search-ca0641219ddb"
# requests has no default timeout, so bound the wait instead of hanging on a stalled connection
response = requests.get(url, timeout=30)
# stop on 4xx/5xx responses rather than converting an error page into markdown
response.raise_for_status()
html_content = response.text

### define soup
soup = BeautifulSoup(html_content, 'lxml')

### get title
title = get_html_element('h1',soup) # for front matter

# validate first: nothing below makes sense without a title
if not title:
    print("no title")
    sys.exit(1) # non-zero status so a caller can tell the conversion failed

title_name = title.lower().replace(" ","-") # for filename
# drop characters that are invalid or troublesome in file names (path separators,
# Windows-reserved punctuation, and "." as before)
title_name = re.sub(r'[\\/:*?"<>|.]', "", title_name)

### get subtitle
subtitle = get_html_element('h2',soup) # for front matter

if not subtitle:
    print("no subtitle")
    sys.exit(1) # non-zero status so a caller can tell the conversion failed

### code blocks
# wrap every <pre> element in ``` markers before the conversion
html_content = html_content.replace("<pre", "```<pre").replace("</pre>", "</pre>```")

### text separators
# every element carrying role="separator" is swapped for an <hr>, which markdown recognizes
separator_elements = soup.find_all(attrs={"role": "separator"})
for separator_html in map(str, separator_elements):
    html_content = html_content.replace(separator_html, "<hr>")

### convert to markdown
converter = html2text.HTML2Text()
converter.ignore_links = False  # keep hyperlinks in the output
markdown_text = converter.handle(html_content)

### cut end
# raw string: the marker is a literal backslash followed by "--"
# (a plain '\--' is an invalid escape sequence and triggers a warning on recent Python)
end_marker = r'\--'
if end_marker in markdown_text:
    markdown_text = cut_text_at_marker(end_marker,markdown_text,False)
else:
    # cut_text_at_marker returns "" for a missing marker, which would discard the whole article
    print(f"marker {end_marker!r} not found; end of article left untrimmed")

### cut beginning
start_marker = 'Share'
if start_marker in markdown_text:
    markdown_text = cut_text_at_marker(start_marker,markdown_text,True)
else:
    print(f"marker {start_marker!r} not found; beginning of article left untrimmed")

### get tags
# every "[ ... ]" span in the markdown; the last five are taken to be the article's tags
pattern = r"\[\s*([^\]]+?)\s*\]"
matches = re.findall(pattern, markdown_text)
tags = matches[-5:]


### cut end part II: remove the tags from the content
if tags:
    # cut at the last place where the first tag opens a bracket
    pattern = r'\[\s*{}'.format(re.escape(tags[0]))
    all_patterns = list(re.finditer(pattern, markdown_text))
    if all_patterns:
        first_tag = all_patterns[-1]
        second_cutoff = first_tag.start()
        markdown_text = markdown_text[:second_cutoff]
else:
    # previously tags[0] raised IndexError here
    print("no tags found; end of article left untrimmed")

### code blocks part II: remove empty lines
# a line consisting solely of ``` followed by whitespace-only lines collapses to the bare
# ``` line (the match also consumes leading whitespace of the next non-blank line)
pattern = r'(^```$)(\s*\n\s*)+'
fence_cleanup = re.compile(pattern, flags=re.MULTILINE)
markdown_text = fence_cleanup.sub(r'\1\n', markdown_text)

### get formatted date
today = datetime.now()
formatted_date_str = f"{today:%Y-%m-%d}"

### save file
# name is "<date>-<title slug>.md"
filename = "-".join([formatted_date_str, title_name]) + ".md"
print(filename)

with open(filename, 'w', encoding='utf-8') as markdown_file:
    markdown_file.write(markdown_text)