In [20]:
import os
import requests
import hashlib
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# --- Config ---
# Local folder that mirrors the remote directory; created up front so the
# functions below can list it and write into it.
LOCAL_DIR = "local_bls_data"
os.makedirs(LOCAL_DIR, exist_ok=True)

# Remote directory listing whose files are mirrored into LOCAL_DIR.
BLS_URL = "https://download.bls.gov/pub/time.series/pr/"

# Headers sent with every HTTP request issued by this script.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 "
        "(compatible; DataSyncBot/1.0; +https://ashwinsingaram.us)"
    ),
}

def fetch_bls_file_list(base_url, timeout=30):
    """Fetch the names of the files linked from a directory listing page.

    Args:
        base_url: URL of the directory listing page to scrape.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        A list of file names (the last path segment of each link), in the
        order they first appear on the page and without duplicates.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(base_url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    file_names = []
    seen = set()
    for anchor in soup.find_all("a", href=True):
        # Keep only the path part: a query string or fragment is not part of
        # a file name, and links made solely of one are not files at all.
        path = anchor["href"].strip().split("#", 1)[0].split("?", 1)[0]
        if not path or path.endswith("/"):  # ignore directories
            continue

        name = path.split("/")[-1]
        if name not in seen:  # the same file may be linked more than once
            seen.add(name)
            file_names.append(name)

    return file_names


def calculate_md5(content):
    """Return the hexadecimal MD5 digest of the bytes in ``content``."""
    hasher = hashlib.md5()
    hasher.update(content)
    return hasher.hexdigest()

def get_local_files(local_dir=None):
    """List the regular files in the local mirror directory.

    Subdirectories (for example a notebook's ``.ipynb_checkpoints``) are
    excluded: every name returned here that is missing remotely is later
    passed to ``os.remove``, which cannot delete a directory.

    Args:
        local_dir: Directory to list. Defaults to ``LOCAL_DIR``.

    Returns:
        A sorted list of file names (not full paths).

    Raises:
        FileNotFoundError: If the directory does not exist.
    """
    directory = LOCAL_DIR if local_dir is None else local_dir
    with os.scandir(directory) as entries:
        # Sorted so the resulting report has a deterministic order.
        return sorted(entry.name for entry in entries if entry.is_file())

def build_comparison_dataframe(bls_files, local_files):
    """Return a DataFrame classifying each file by its sync status.

    A file present on both sides is downloaded from ``BLS_URL`` and its MD5
    is compared with the MD5 of the copy in ``LOCAL_DIR``.

    Args:
        bls_files: File names available remotely.
        local_files: File names present in the local directory.

    Returns:
        A DataFrame with columns ``filename`` and ``status``. Remote files
        come first, in the order given, with status ``"New"`` (not present
        locally), ``"Up-to-date"`` (hashes match) or ``"Updated"`` (hashes
        differ). Local files absent remotely follow with status
        ``"Deleted"``. The columns are present even when there are no rows.

    Raises:
        requests.HTTPError: If downloading a file for comparison returns an
            error status.
    """
    # Sets make the membership tests below O(1) instead of scanning a list.
    local_names = set(local_files)
    remote_names = set(bls_files)
    comparison = []

    for name in bls_files:
        status = "New"

        if name in local_names:
            bls_resp = requests.get(
                urljoin(BLS_URL, name), headers=HEADERS, timeout=30
            )
            # Without this check an HTTP error body would be hashed and the
            # file misreported as "Updated".
            bls_resp.raise_for_status()
            bls_md5 = calculate_md5(bls_resp.content)

            with open(os.path.join(LOCAL_DIR, name), "rb") as lf:
                local_md5 = calculate_md5(lf.read())

            status = "Up-to-date" if bls_md5 == local_md5 else "Updated"

        comparison.append({"filename": name, "status": status})

    comparison.extend(
        {"filename": name, "status": "Deleted"}
        for name in local_files
        if name not in remote_names
    )

    # Explicit columns keep the frame's shape stable when there are no rows.
    return pd.DataFrame(comparison, columns=["filename", "status"])

def sync_files():
    """Mirror the remote directory into ``LOCAL_DIR`` and report the result.

    Fetches the remote file list, compares it with the local directory, then
    downloads files whose status is ``"New"`` or ``"Updated"`` and removes
    local files whose status is ``"Deleted"``. One line is printed per file.

    Returns:
        The comparison DataFrame (columns ``filename`` and ``status``) that
        drove the sync.

    Raises:
        requests.HTTPError: If listing the directory or downloading a file
            returns an error status.
    """
    bls_files = fetch_bls_file_list(BLS_URL)
    local_files = get_local_files()

    df = build_comparison_dataframe(bls_files, local_files)

    # Handle sync: download/update new/updated files, delete removed ones
    for _, row in df.iterrows():
        fname, status = row["filename"], row["status"]
        file_path = os.path.join(LOCAL_DIR, fname)

        if status in ("New", "Updated"):
            resp = requests.get(
                urljoin(BLS_URL, fname), headers=HEADERS, timeout=30
            )
            # Check the status before opening the file, so a failed download
            # never truncates or overwrites an existing local copy with an
            # error page.
            resp.raise_for_status()
            with open(file_path, "wb") as out:
                out.write(resp.content)
            print(f"{status}: {fname} saved locally.")

        elif status == "Deleted":
            # The local listing may contain directories, which os.remove
            # cannot delete; leave anything that is not a regular file alone.
            if os.path.isfile(file_path):
                os.remove(file_path)
                print(f"Deleted: {fname} removed from local.")
            else:
                print(f"Skipping: {fname} is not a regular file.")

        else:
            print(f"Skipping: {fname} already up-to-date.")

    return df

if __name__ == "__main__":
    # Run the sync, then print the per-file status table under a heading.
    df_result = sync_files()
    print("\nFinal Sync Status:\n", df_result, sep="\n")


New: pr.class saved locally.
New: pr.contacts saved locally.
New: pr.data.0.Current saved locally.
New: pr.data.1.AllData saved locally.
New: pr.duration saved locally.
New: pr.footnote saved locally.
New: pr.measure saved locally.
New: pr.period saved locally.
New: pr.seasonal saved locally.
New: pr.sector saved locally.
New: pr.series saved locally.
New: pr.txt saved locally.
Deleted: overview.txt removed from local.

Final Sync Status:

             filename   status
0            pr.class      New
1         pr.contacts      New
2   pr.data.0.Current      New
3   pr.data.1.AllData      New
4         pr.duration      New
5         pr.footnote      New
6          pr.measure      New
7           pr.period      New
8         pr.seasonal      New
9           pr.sector      New
10          pr.series      New
11             pr.txt      New
12       overview.txt  Deleted
