## Hypertech RSS Markdown Converter

This script downloads the raw XML of each configured RSS feed, saves it to disk, and converts the feed's items into readable Markdown files.

In [None]:
import logging
import os
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Optional, Tuple

import requests

# Configure the root logger: INFO and above, prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]: %(message)s")

# Output directories. Each one is read from the environment variable of the
# same name; the hard-coded path is only the fallback when it is not set.
XML_SAVE_DIR = os.environ.get('XML_SAVE_DIR', '/Users/blaed/Documents/GitHub/hypertech-fosai/notebooks/xml')
MD_SAVE_DIR = os.environ.get('MD_SAVE_DIR', '/Users/blaed/Documents/GitHub/hypertech-fosai/notebooks/md')
# Feed URLs processed by main(), in this order.
RSS_URLS = [
    "https://allainews.com/feed/",
    "https://analyticsvidhya.com/blog/category/machine-learning/feed/",
    "https://aws.amazon.com/blogs/machine-learning/feed",
    "https://bair.berkeley.edu/blog/feed.xml",
    "https://blog.google/technology/ai/rss",
    "https://deepmind.com/blog/rss.xml",
    "https://feeds.feedburner.com/kdnuggets-data-mining-analytics",
    "https://jamesg.blog/openai.xml",
    "https://lexfridman.com/feed/podcast/",
    "https://marktechpost.com/feed",
    "https://mltechniques.com/feed",
    "https://news.mit.edu/topic/mitartificial-intelligence2-rss.xml",
    "https://nvidianews.nvidia.com/releases.xml"
]

def generate_md_preamble() -> str:
    """Return the report banner placed at the top of every markdown file.

    The banner is a ``---`` rule, the report heading, another ``---`` rule,
    and one blank line.
    """
    banner_lines = ("---", "# Hypertech News Report", "---", "", "")
    return "\n".join(banner_lines)

def generate_md_postamble() -> str:
    """Return the disclaimer footer placed at the bottom of every markdown file.

    The footer is a ``---`` rule, the disclaimer as a level-5 heading, another
    ``---`` rule, and one blank line.
    """
    disclaimer = (
        "##### Disclaimer: This report was generated from third-party sources "
        "and may be subject to change. Always refer to the original source "
        "for the most up-to-date information."
    )
    return "\n".join(("---", disclaimer, "---", "", ""))

def sanitize_for_filename(title: str) -> str:
    """Return *title* with filename-hostile characters replaced by ``_``.

    The characters replaced are ``/ \\ ? % * : | " < > #``; every other
    character is left untouched.
    """
    unsafe = '/\\?%*:|"<>#'
    # One translation table handles all replacements in a single pass.
    return title.translate(str.maketrans(unsafe, '_' * len(unsafe)))

def fetch_raw_rss(url: str) -> Optional[str]:
    """Download the feed at *url* and return its body as text.

    Args:
        url: Address of the RSS feed to fetch.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        server answers with an error status. The failure is logged, not raised.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Failed to fetch RSS feed from {url}. Error: {e}")
        return None

    # For a text/* response without an explicit charset, requests falls back
    # to ISO-8859-1, which garbles feeds that are actually UTF-8. In that case
    # let requests detect the encoding from the payload instead.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding
    return response.text

def generate_file_name(url: str) -> str:
    """Build the XML file name for a feed from the host part of *url*.

    The host is whatever sits between the first ``//`` and the next ``/``.
    A URL without ``//`` has no host, so the result is just ``.xml``.
    """
    _, _, after_scheme = url.partition("//")
    host, _, _ = after_scheme.partition("/")
    return host + ".xml"

def save_to_file(content: str, directory: str, file_name: str) -> str:
    """Write *content* as UTF-8 text to ``directory/file_name``.

    The directory is created first if it does not exist, and an existing file
    of the same name is overwritten.

    Returns:
        The path of the file that was written.
    """
    # exist_ok lets repeated runs reuse a directory that is already there.
    os.makedirs(directory, exist_ok=True)
    destination = os.path.join(directory, file_name)
    with open(destination, mode='w', encoding="utf-8") as out:
        out.write(content)
    return destination

def extract_xml_content(element, tag: str) -> str:
    """Return the text of the first *tag* match under *element*.

    Returns an empty string when no such child exists or when the child has
    no text.
    """
    node = element.find(tag)
    if node is None:
        return ""
    return node.text or ""

def convert_item_to_markdown(item) -> Tuple[str, str]:
    """Render one RSS ``<item>`` element as a markdown section.

    Args:
        item: The ``<item>`` element; its ``title``, ``link`` and
            ``description`` children are read.

    Returns:
        A ``(markdown, title)`` tuple. ``markdown`` is the empty string when
        the item lacks a title or a link; ``title`` is the cleaned title
        (possibly empty).
    """
    # Pretty-printed feeds leave newlines and indentation inside the element
    # text. Left in place they break the "##" heading and the "(url)" link
    # target, and end up in file names derived from the returned title.
    item_title = " ".join((item.findtext('title') or "").split())
    item_link = (item.findtext('link') or "").strip()
    item_description = (item.findtext('description') or "").strip()

    if not (item_title and item_link):
        return "", item_title

    parts = [f"## {item_title}\n[Source]({item_link})\n"]
    if item_description:
        parts.append(f"\n{item_description}\n")
    parts.append(f"\n\n[Click Here to Learn More]({item_link})\n\n")
    return "".join(parts), item_title

def xml_to_markdown_sections(xml_data: str) -> list:
    """Convert an RSS document into a list of markdown sections.

    Args:
        xml_data: The raw RSS XML text.

    Returns:
        A list of ``(markdown, title)`` tuples. The first entry is the channel
        header, titled ``"Header"``; each following entry is one feed item.
        Items that produce no markdown are left out. An empty list is returned
        when the XML cannot be parsed.
    """
    try:
        root = ET.fromstring(xml_data)
    except ET.ParseError as exc:
        # Include the parser's message so a malformed feed can be diagnosed.
        logging.error("Error parsing XML data: %s", exc)
        return []

    channel_title = extract_xml_content(root, 'channel/title')
    channel_description = extract_xml_content(root, 'channel/description')
    header = f"# {channel_title or 'Title Missing'}\n\n{channel_description or 'Description Missing'}\n\n"

    markdown_sections = [(header, "Header")]
    for item in root.findall('channel/item'):
        content, title = convert_item_to_markdown(item)
        # Empty content would otherwise become a boilerplate-only file
        # downstream, so such items are skipped.
        if content:
            markdown_sections.append((content, title))

    return markdown_sections

def generate_md_save_dir(base_dir, parent_file_name):
    """Return the markdown output directory for a feed.

    The directory is *base_dir* joined with *parent_file_name* minus its
    extension.
    """
    stem, _extension = os.path.splitext(parent_file_name)
    return os.path.join(base_dir, stem)

def save_master_markdown_file(content_list, directory, file_name):
    """Combine per-section markdown into one master file and save it.

    The preamble and postamble text are removed from every section in
    *content_list*, the results are concatenated in order, and a single
    postamble is appended at the end.

    Returns:
        The path returned by ``save_to_file`` for the written master file.
    """
    preamble = generate_md_preamble()
    postamble = generate_md_postamble()
    pieces = [
        section.replace(preamble, "").replace(postamble, "")
        for section in content_list
    ]
    pieces.append(postamble)
    return save_to_file("".join(pieces), directory, file_name)

def main():
    """Fetch every feed in RSS_URLS and write its XML and markdown files.

    For each feed the raw XML is saved under XML_SAVE_DIR. Each markdown
    section is then written to its own file in a per-feed directory under
    MD_SAVE_DIR, followed by a combined ``-master.md`` file. Feeds that cannot
    be fetched, or that yield no markdown sections, are skipped.
    """
    for rss_url in RSS_URLS:
        xml_content = fetch_raw_rss(rss_url)
        if not xml_content:
            continue

        file_name = generate_file_name(rss_url)
        xml_filepath = save_to_file(xml_content, XML_SAVE_DIR, file_name)
        logging.info(f"Saved RSS feed from {rss_url} to {xml_filepath}")

        markdown_sections = xml_to_markdown_sections(xml_content)
        if not markdown_sections:
            # Without this guard a master file holding only the disclaimer
            # would be written and reported as a success.
            logging.warning(f"No markdown sections produced for {rss_url}; skipping markdown output.")
            continue

        current_md_save_dir = generate_md_save_dir(MD_SAVE_DIR, file_name)
        master_markdown_content_list = []

        for idx, (markdown_section, section_title) in enumerate(markdown_sections, 1):
            sanitized_title = sanitize_for_filename(section_title) if section_title else "section"
            md_file_name = file_name.replace(".xml", f"-{sanitized_title}.md")

            markdown_with_preamble = generate_md_preamble() + markdown_section + generate_md_postamble()
            master_markdown_content_list.append(markdown_with_preamble)

            md_filepath = save_to_file(markdown_with_preamble, current_md_save_dir, md_file_name)
            logging.info(f"Saved markdown section {idx} from {rss_url} to: {md_filepath}")

        master_md_filename = file_name.replace(".xml", "-master.md")
        master_md_filepath = save_master_markdown_file(
            master_markdown_content_list, current_md_save_dir, master_md_filename
        )
        logging.info(f"Saved master markdown file for {rss_url} to: {master_md_filepath}")

# Run the conversion only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()