### Ingest Medium.com Articles to Markdown Files 
list of URLs -> markdown files in GCS Bucket

In [None]:
from datetime import datetime
import html2text
import requests
from bs4 import BeautifulSoup
import sys
import re
from google.cloud import storage
import os

def get_html_element(element,soup) -> str:
    """
    Searches for the first occurrence of a specified HTML element in a BeautifulSoup object and returns its text.

    Parameters:
    - element (str): The tag name of the HTML element to search for (e.g., 'h1', 'div').
    - soup (BeautifulSoup): A BeautifulSoup object containing the parsed HTML document.

    Returns:
    - str: The text of the first occurrence of the specified element if found; otherwise, an empty string.
    """
    result = soup.find(element)
    if result:
        return result.text
    else:
        print(f"No element ${element} found.")
        return ""

def cut_text_at_marker(marker:str,text:str,beginning:bool):
    """
    Cuts the text at the specified marker and returns the resulting substring. The function can return the
    text after the first occurrence of the marker (if beginning is True) or before the last occurrence
    of the marker (if beginning is False).
    """
    # Find the index of the substring
    cut_off_index = 0
    if beginning:
        cut_off_index = text.find(marker)
    else:
        cut_off_index = text.rfind(marker)
    # Slice the string if the substring is found
    newText = ""
    if cut_off_index != -1:
        if beginning:
            newText = text[cut_off_index + len(marker):]
        else:
            newText = text[:cut_off_index]
    return newText

def process_url(url: str):
    """
    Processes a single URL to extract content, convert to Markdown, and upload to GCS.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
        html_content = response.text

        soup = BeautifulSoup(html_content, 'lxml')

        title = get_html_element('h1', soup)
        title_name = title.lower().replace(" ", "-").replace(":", "").replace(".", "").replace("/", "").replace(")", "").replace("(", "")

        if not title:
            print(f"No title found for {url}")
            return

        subtitle = get_html_element('h2', soup)

        if not subtitle:
            print(f"No subtitle found for {url}")
            return

        ### code blocks
        html_content = html_content.replace("<pre", "```<pre")
        html_content = html_content.replace("</pre>", "</pre>```")

        ### text separators
        # Find all elements with role="separator"
        separator_elements = soup.find_all(attrs={"role": "separator"})

        # replace with <hr> element, markdown recognizes this
        for element in separator_elements:
            html_content = html_content.replace(str(element), "<hr>")

        ### convert to markdown
        converter = html2text.HTML2Text()
        converter.ignore_links = False  # preserve hyperlinks
        markdown_text = converter.handle(html_content)

        ### cut end
        markdown_text = cut_text_at_marker('\--', markdown_text, False)

        ### cut beginning
        markdown_text = cut_text_at_marker('Share', markdown_text, True)

        ### get tags
        pattern = r"\[\s*([^\]]+?)\s*\]"
        matches = re.findall(pattern, markdown_text)
        tags = matches[-5:]

        ### cut end part II: remove the tags from the content
        pattern = r'\[\s*{}'.format(re.escape(tags[0]))
        all_patterns = list(re.finditer(pattern, markdown_text))
        first_tag = all_patterns[-1]
        second_cutoff = first_tag.start()
        if second_cutoff != -1:
            markdown_text = markdown_text[:second_cutoff]

        ### code blocks part II: remove empty lines
        pattern = r'(^```$)(\s*\n\s*)+'
        # Replace matches with just the "```" line
        markdown_text = re.sub(pattern, r'\1\n', markdown_text, flags=re.MULTILINE)

        ### get formatted date
        today = datetime.now()
        formatted_date_str = today.strftime("%Y-%m-%d")

        # Create the directory if it doesn't exist
        markdown_dir = "/Volumes/DataLakeActive/GCP-generative-ai/1-parsedURLs_markdown"
        os.makedirs(markdown_dir, exist_ok=True)

        filename = f"{formatted_date_str}-{title_name}.md"
        file_path = os.path.join(markdown_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(markdown_text)

        # DESTINATION GCS Bucket
        bucket_name = 'articles_extracted_markdown'
        upload_to_gcs(file_path, filename, bucket_name)

    except Exception as e:
        print(f"Error processing {url}: {e}")
        # Handle the error by prefixing the filename and uploading
        error_filename = f"ERROR_{formatted_date_str}-{title_name}.md"
        with open(error_filename, 'w', encoding='utf-8') as file:
            file.write(f"Error processing URL: {url}\n\n{str(e)}")
        upload_to_gcs(error_filename, 'articles_extracted_markdown')

def upload_to_gcs(file_path: str, filename: str, bucket_name: str):
    """Uploads a file to a GCS bucket."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(filename)
    blob.upload_from_filename(file_path)
    print(f"Uploaded {filename} to gs://{bucket_name}/{filename}")

# if __name__ == "__main__":
#     if len(sys.argv) < 2:
#         print("Usage: python script.py <urls_file.txt>")
#         sys.exit(1)

#    urls_file = sys.argv[1]



# Specify the file path directly
urls_file = "/Volumes/DataLakeActive/GCP-generative-ai/0-jc_notebooks/temp_list.txt"

with open(urls_file, 'r') as f:
    urls = [line.strip() for line in f]

for url in urls:
    process_url(url)

## GCP Documents

DOES NOT HAVE ANY CONTENT IN FILES

In [4]:
from datetime import datetime
import html2text
import requests
from bs4 import BeautifulSoup
import sys
import re
from google.cloud import storage
import os

def get_html_element(element,soup) -> str:
    """
    Searches for the first occurrence of a specified HTML element in a BeautifulSoup object and returns its text.

    Parameters:
    - element (str): The tag name of the HTML element to search for (e.g., 'h1', 'div').
    - soup (BeautifulSoup): A BeautifulSoup object containing the parsed HTML document.

    Returns:
    - str: The text of the first occurrence of the specified element if found; otherwise, an empty string.
    """
    result = soup.find(element)
    if result:
        return result.text
    else:
        print(f"No element ${element} found.")
        return ""

def cut_text_at_marker(marker:str,text:str,beginning:bool):
    """
    Cuts the text at the specified marker and returns the resulting substring. The function can return the
    text after the first occurrence of the marker (if beginning is True) or before the last occurrence
    of the marker (if beginning is False).
    """
    # Find the index of the substring
    cut_off_index = 0
    if beginning:
        cut_off_index = text.find(marker)
    else:
        cut_off_index = text.rfind(marker)
    # Slice the string if the substring is found
    newText = ""
    if cut_off_index != -1:
        if beginning:
            newText = text[cut_off_index + len(marker):]
        else:
            newText = text[:cut_off_index]
    return newText

def sanitize_filename(filename):
    """Sanitizes a filename to be compatible with GCS object naming rules."""
    # Remove newline characters
    filename = filename.replace('\n', '')
    # Replace disallowed characters with underscores
    filename = re.sub(r'[^\w\s.-]', '_', filename)
    # Replace multiple consecutive hyphens with a single hyphen
    filename = re.sub(r'-+', '-', filename)
    # Remove leading or trailing hyphens
    filename = filename.strip('-')
    return filename

def process_url(url: str):
    """
    Processes a single URL to extract content, convert to Markdown, 
    and upload to GCS.
    """
    # Get the formatted date string outside the try block
    formatted_date_str = datetime.now().strftime("%Y-%m-%d") 

    try:
        response = requests.get(url)
        response.raise_for_status()  
        html_content = response.text

        soup = BeautifulSoup(html_content, 'lxml')

        title = get_html_element('h1', soup)
        title_name = title.lower().replace(" ", "-").replace(":", "").replace(".", "").replace("/", "")

        if not title:
            print(f"No title found for {url}")
            return

        subtitle = get_html_element('h2', soup)

        if not subtitle:
            print(f"No subtitle found for {url}")
            return

        ### code blocks
        html_content = html_content.replace("<pre", "```<pre")
        html_content = html_content.replace("</pre>", "</pre>```")

        ### text separators
        # Find all elements with role="separator"
        separator_elements = soup.find_all(attrs={"role": "separator"})

        # replace with <hr> element, markdown recognizes this
        for element in separator_elements:
            html_content = html_content.replace(str(element), "<hr>")

        ### convert to markdown
        converter = html2text.HTML2Text()
        converter.ignore_links = False  # preserve hyperlinks
        markdown_text = converter.handle(html_content)

        ### cut end
        markdown_text = cut_text_at_marker('\--', markdown_text, False)

        ### cut beginning
        markdown_text = cut_text_at_marker('Share', markdown_text, True)

        ### get tags
        pattern = r"\[\s*([^\]]+?)\s*\]"
        matches = re.findall(pattern, markdown_text)
        tags = matches[-5:]

        ### cut end part II: remove the tags from the content
        if tags:  # Check if the tags list is not empty
            pattern = r'\[\s*{}'.format(re.escape(tags[0]))
            all_patterns = list(re.finditer(pattern, markdown_text))
            if all_patterns:
                first_tag = all_patterns[-1]
                second_cutoff = first_tag.start()
                if second_cutoff != -1:
                    markdown_text = markdown_text[:second_cutoff]

        ### code blocks part II: remove empty lines
        pattern = r'(^```$)(\s*\n\s*)+'
        # Replace matches with just the "```" line
        markdown_text = re.sub(pattern, r'\1\n', markdown_text, flags=re.MULTILINE)

        ### get formatted date
        today = datetime.now()
        formatted_date_str = today.strftime("%Y-%m-%d")

        # Create the directory if it doesn't exist
        markdown_dir = "/Volumes/DataLakeActive/GCP-generative-ai/1-parsedURLs_markdown"
        os.makedirs(markdown_dir, exist_ok=True)

        filename = f"{formatted_date_str}-{sanitize_filename(title_name)}.md"
        file_path = os.path.join(markdown_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(markdown_text)

        # DESTINATION GCS Bucket
        bucket_name = 'articles_extracted_markdown'
        upload_to_gcs(file_path, filename, bucket_name)

    except Exception as e:
        print(f"Error processing {url}: {e}")
        error_filename = f"ERROR_{formatted_date_str}-{title_name}.md"
        error_file_path = os.path.join(markdown_dir, error_filename)
        with open(error_file_path, 'w', encoding='utf-8') as file:
            file.write(f"Error processing URL: {url}\n\n{str(e)}")
        upload_to_gcs(error_file_path, error_filename, bucket_name)

def upload_to_gcs(file_path: str, filename: str, bucket_name: str):
    """Uploads a file to a GCS bucket."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(filename)
    blob.upload_from_filename(file_path)
    print(f"Uploaded {filename} to gs://{bucket_name}/{filename}")

# if __name__ == "__main__":
#     if len(sys.argv) < 2:
#         print("Usage: python script.py <urls_file.txt>")
#         sys.exit(1)

#    urls_file = sys.argv[1]



# Specify the file path directly
urls_file = "/Volumes/DataLakeActive/GCP-generative-ai/0-jc_notebooks/temp_list.txt"

with open(urls_file, 'r') as f:
    urls = [line.strip() for line in f]

for url in urls:
    process_url(url)

Uploaded 2024-09-01-google-models.md to gs://articles_extracted_markdown/2024-09-01-google-models.md
Uploaded 2024-09-01-embeddings-apis-overview.md to gs://articles_extracted_markdown/2024-09-01-embeddings-apis-overview.md
Uploaded 2024-09-01-document-understanding.md to gs://articles_extracted_markdown/2024-09-01-document-understanding.md
Uploaded 2024-09-01-model-tuning-for-gemini-text-models.md to gs://articles_extracted_markdown/2024-09-01-model-tuning-for-gemini-text-models.md
Uploaded 2024-09-01-tune-text-embeddings.md to gs://articles_extracted_markdown/2024-09-01-tune-text-embeddings.md
Uploaded 2024-09-01-llamaindex-on-vertex-ai-for-rag-overview.md to gs://articles_extracted_markdown/2024-09-01-llamaindex-on-vertex-ai-for-rag-overview.md
