# ETL Development
### Phase 0
| Task                                  | Status      | Comments  |
|---------------------------------------|-------------|-----------|
| Manually clone Azure repo             | Complete    | Use a single-branch clone <br>* still receives 600k+ objects (31.57 GiB) <br>* full clone was 7.9 million objects |
| Crawl through repo & get all md files | In Progress | None      |
| Prepare md for indexing               | In Progress | None      |

In [None]:
# Set global vars
import os

# Absolute root of the local project checkout (machine-specific path).
PROJECT_ROOT = "/Users/alex/Desktop/code_projects/commuter-copilot/"

# Both data locations live under <PROJECT_ROOT>/local_files/data.
_DATA_ROOT = os.path.join(PROJECT_ROOT, "local_files", "data")
RAW_DATA_SOURCE_PATH = os.path.join(_DATA_ROOT, "raw", "azure-docs")
STAGED_DATA_SOURCE_PATH = os.path.join(_DATA_ROOT, "staging", "azure-docs-staging")

# Create the directories up front; exist_ok makes re-running the cell harmless.
for _data_dir in (RAW_DATA_SOURCE_PATH, STAGED_DATA_SOURCE_PATH):
    os.makedirs(_data_dir, exist_ok=True)

In [None]:
# Manually update & clone the Azure repo into local_files/ from a terminal.
# NOTE: the script invocation below is currently broken, so it stays commented out.
# !bash -c "$PROJECT_ROOT/data/etl/run_etl_azure_docs_raw.sh"

In [None]:
# find all md files in the raw data source path
def find_md_files(path):
    """Recursively collect the paths of all ``.md`` files beneath ``path``.

    Args:
        path: Directory to walk.

    Returns:
        A list of file paths (each directory joined with the file name), in
        the order ``os.walk`` yields them. Matching on the ``.md`` suffix is
        case-sensitive.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith(".md")
    ]


# Crawl the raw data directory once and keep the resulting list of markdown paths.
all_md_files = find_md_files(RAW_DATA_SOURCE_PATH)
# Preview the first three paths (as the last expression, this is the cell's output).
all_md_files[:3]

In [None]:
import uuid
from frontmatter import Frontmatter
import os


def process_markdown_file(file_path: str) -> "dict | None":
    """Process a markdown file into an indexable document.

    Args:
        file_path: Path of the markdown file to read.

    Returns:
        A dict with the keys ``id`` (a fresh random UUID4 string),
        ``filename``, ``path``, ``title``, ``content`` and ``metadata``
        (every frontmatter attribute except ``title``), or ``None`` when the
        file could not be processed. Errors are printed rather than raised.
    """
    try:
        post = Frontmatter.read_file(file_path)

        # The parsed attributes are not guaranteed to be a mapping containing
        # "title" (presumably they are None when a file has no frontmatter
        # block -- TODO confirm against the frontmatter package). Normalise to
        # a dict so a missing title falls back to the first markdown header
        # instead of raising and discarding the whole file.
        attributes = post["attributes"]
        if not isinstance(attributes, dict):
            attributes = {}
        body = post["body"]

        # Title: frontmatter 'title' key, else the first markdown header.
        title = attributes.get("title") or extract_title(body)

        return {
            "id": str(uuid.uuid4()),
            "filename": os.path.basename(file_path),
            "path": file_path,
            "title": title,
            "content": body,
            "metadata": {k: v for k, v in attributes.items() if k != "title"},
        }
    except Exception as e:
        # Best-effort boundary: report the failure and let the caller skip the file.
        print(f"Error processing file {file_path}: {e}")
        return None


def extract_title(content: str) -> str:
    """Return a title for markdown ``content`` using a simple heuristic.

    The title is the text of the first line that starts with ``#`` and lies
    outside a fenced code block, with the leading ``#``/space characters and
    surrounding whitespace removed. Heading lines with no text are skipped.

    Args:
        content: Markdown text; ``None`` or an empty string is tolerated.

    Returns:
        The heading text, or ``"Untitled"`` when no usable heading exists.
    """
    in_fence = False
    for line in (content or "").splitlines():
        # Lines opening/closing a fenced code block toggle the fence state so
        # that shell/Python comments inside code samples are not taken as titles.
        if line.lstrip().startswith(("```", "~~~")):
            in_fence = not in_fence
            continue
        if in_fence or not line.startswith("#"):
            continue
        heading = line.lstrip("# ").strip()
        # A bare "#" carries no text; keep looking rather than return "".
        if heading:
            return heading
    return "Untitled"

In [None]:
# Smoke-test process_markdown_file on two inputs: one hard-coded article path
# and the first file returned by the crawl.
single_md = "/Users/alex/Desktop/code_projects/commuter-copilot/local_files/data/raw/azure-docs/articles/container-apps/alerts.md"
single_md2 = all_md_files[0]  # raises IndexError if the crawl found no markdown files

post = process_markdown_file(single_md)
# print(post["metadata"])
# print(post.keys())
post2 = process_markdown_file(single_md2)
# process_markdown_file returns None on failure, so this may print "None".
print(post2)

In [None]:
# Small slice of the crawl kept for quick experiments; the loop below still
# runs over the full list.
md_sample = all_md_files[4:8]

# Convert every markdown file, collecting only the documents that parsed.
processed_docs = []
for file_path in all_md_files:
    try:
        processed_doc = process_markdown_file(file_path)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        continue

    if processed_doc is not None:
        processed_docs.append(processed_doc)
        # print(f"Processed {file_path} successfully.\n\n{processed_doc['title']}\n")
        continue

    # A None result means process_markdown_file already reported an error.
    print(f"Skipping {file_path} due to processing error.")