In [4]:
import os
import pathlib
import shutil  # move finished files into Google Drive safely
import sys
import time
import zipfile

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait

# -----------------------
# Page element IDs
# -----------------------
YEAR_DROPDOWN_ID = "cboYear"        # <select> with the years
QUARTER_DROPDOWN_ID = "cboPeriod"   # <select> with the quarters (ignored if QUARTERS=[])
DOWNLOAD_BTN_ID  = "btnDownload"    # "Download" button
CHECKBOX_ID      = "chkAllVars"     # "Select all variables" (used if REQUIRED_FIELDS=[])

# -----------------------
# Run settings
# -----------------------
FIRST_YEAR, LAST_YEAR = 2005, 2025      # inclusive loop
TIMEOUT_SEC           = 600             # max wait for page elements
HEADLESS              = True            # set False to watch the browser
DOWNLOAD_WAIT_SEC     = 600             # max time to wait for a download
OVERWRITE_FILES       = "overwrite"     # "overwrite", "timestamp", or "skip"
REFRESH_EVERY_N_DOWNLOADS = 4           # refresh page every N downloads to avoid popups

# -----------------------
# Paths
# -----------------------
# Google Drive (synced) folder (final destination).
# The location is machine- and account-specific, so it can be overridden with the
# BTS_GDRIVE_DIR environment variable; when that is unset (or empty) the original
# hard-coded folder is used, which keeps existing runs unchanged.
_DEFAULT_GDRIVE_DIR = (
    "/Users/akimovh/Library/CloudStorage/GoogleDrive-akimovhresearch@gmail.com/My Drive/predatory_pricing_airlines"
)
GDRIVE_MYDRIVE = pathlib.Path(os.environ.get("BTS_GDRIVE_DIR") or _DEFAULT_GDRIVE_DIR)

# Local (NOT synced) working directory for downloads/unzips.
# This avoids Google Drive syncing partially-downloaded files (common source of corruption/timeouts).
LOCAL_WORKDIR = pathlib.Path.home() / "bts_download_tmp"
LOCAL_WORKDIR.mkdir(parents=True, exist_ok=True)


# Selenium extractor

In [5]:
def make_driver() -> webdriver.Chrome:
    """Create a Chrome driver whose downloads land in ZIP_DIR.

    The download folder is configured twice: through Chrome profile preferences
    and through a DevTools (CDP) command. "Page.setDownloadBehavior" is tried
    first and "Browser.setDownloadBehavior" is used as a fallback.
    NOTE(review): the CDP call is presumably what makes downloads work in
    headless mode -- confirm against the Chrome version in use.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller must ``quit()`` it.

    Raises:
        Exception: whatever Selenium raises when Chrome cannot be started or
            when both CDP commands fail. In the latter case the browser that
            was already started is shut down before the error propagates.
    """
    download_dir = str(ZIP_DIR.resolve())
    opts = webdriver.ChromeOptions()

    if HEADLESS:
        opts.add_argument("--headless=new")

    # Extra flags applied only when running on Linux.
    if sys.platform.startswith("linux"):
        opts.add_argument("--no-sandbox")
        opts.add_argument("--disable-dev-shm-usage")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--disable-software-rasterizer")

    prefs = {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
        "profile.default_content_settings.popups": 0,
        "profile.default_content_setting_values.automatic_downloads": 1,
    }
    opts.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(options=opts)

    # A browser process exists from here on. If configuring it fails, the caller
    # never receives the driver and therefore cannot quit it, so do it here.
    try:
        try:
            driver.execute_cdp_cmd(
                "Page.setDownloadBehavior",
                {"behavior": "allow", "downloadPath": download_dir},
            )
        except Exception:
            driver.execute_cdp_cmd(
                "Browser.setDownloadBehavior",
                {"behavior": "allow", "downloadPath": download_dir},
            )
    except BaseException:
        driver.quit()
        raise

    return driver


def wait_for_download_complete(
    initial_files: set, max_wait_sec: int = DOWNLOAD_WAIT_SEC
) -> pathlib.Path | None:
    """Wait for a new file to appear in ZIP_DIR and finish downloading.

    A file counts as finished when it was not part of ``initial_files``, is
    larger than 1000 bytes, no new ``*.crdownload`` partial exists, and its size
    is unchanged between two consecutive polls.

    Args:
        initial_files: Paths that were in ZIP_DIR before the download was
            triggered; they are never reported as the new download.
        max_wait_sec: Give up after this many seconds.

    Returns:
        The path of the completed download, or ``None`` on timeout.
    """
    start_time = time.time()
    last_sizes = {}

    while time.time() - start_time < max_wait_sec:
        # Only partials created after the snapshot belong to this download. A
        # stale *.crdownload left by an interrupted run would otherwise keep
        # this loop "in progress" until the timeout.
        active_partials = [
            p for p in ZIP_DIR.glob("*.crdownload") if p not in initial_files
        ]
        if active_partials:
            print("Download in progress...")
            time.sleep(2)
            continue

        candidates = []  # (mtime, size, path) of new, non-trivial files
        for path in set(ZIP_DIR.iterdir()) - initial_files:
            try:
                if not path.is_file():
                    continue
                info = path.stat()
            except OSError:
                # The file was renamed/removed between listing and stat
                # (the browser does this while finalising a download).
                continue
            if info.st_size > 1000:
                candidates.append((info.st_mtime, info.st_size, path))

        if candidates:
            _, size_now, newest = max(candidates, key=lambda item: item[0])
            size_prev = last_sizes.get(newest, None)
            last_sizes[newest] = size_now

            # Same size on two consecutive polls => treat as fully written.
            if size_prev is not None and size_now == size_prev:
                print(f"‚úì Download complete: {newest.name} ({size_now} bytes)")
                return newest

        time.sleep(1)

    print(f"‚ö† Download timeout after {max_wait_sec} seconds")
    return None


def dismiss_popups(driver, wait):
    """Best-effort removal of survey popups/overlays that may cover the page.

    For each known CSS selector the first displayed match is clicked. If
    nothing was clicked at all, an ESCAPE key press is sent to the page body
    instead. Errors are swallowed or reported; nothing is raised to the caller.

    Args:
        driver: Selenium driver for the current page.
        wait: Accepted for signature compatibility; not used here.
    """
    try:
        popup_selectors = (
            "div[id*='QSIPopOver']",
            "div[class*='popup']",
            "div[class*='overlay']",
            "div[class*='modal']",
            "button[id*='close']",
            "button[class*='close']",
            "[aria-label*='close']",
            "[aria-label*='Close']",
        )

        clicked_any = False
        for selector in popup_selectors:
            try:
                matches = driver.find_elements(By.CSS_SELECTOR, selector)
                # Only the first visible match per selector is clicked.
                visible = next((el for el in matches if el.is_displayed()), None)
                if visible is None:
                    continue
                print(f"    üö´ Dismissing popup: {selector}")
                visible.click()
                clicked_any = True
                time.sleep(1)
            except Exception:
                continue

        if clicked_any:
            return

        # Nothing clickable was found: fall back to pressing ESCAPE.
        try:
            driver.find_element(By.TAG_NAME, "body").send_keys(Keys.ESCAPE)
            time.sleep(1)
        except Exception:
            pass

    except Exception as e:
        print(f"    ‚ö† Popup dismissal failed: {e}")


def select_data_fields(driver, wait):
    """Tick the data-field checkboxes on the download page.

    With an empty REQUIRED_FIELDS the page's "select all" checkbox
    (CHECKBOX_ID) is used; if that fails, execution falls through to the
    per-field path. Otherwise each name in REQUIRED_FIELDS is located by
    value/name/id (CSS first, then XPath) and ticked. Whenever at least one
    selection succeeded, the "chkDownloadZip" checkbox is ticked as well
    (best effort).

    Args:
        driver: Selenium driver for the current page.
        wait: WebDriverWait used to wait for the "select all" checkbox.

    Returns:
        tuple: ``(anything_selected, missing_field_names)``.
    """

    def _ensure_zip_checked():
        # Best effort: the checkbox may be absent, and that is not an error.
        try:
            zip_checkbox = driver.find_element(By.ID, "chkDownloadZip")
            if not zip_checkbox.is_selected():
                zip_checkbox.click()
        except Exception:
            pass

    if not REQUIRED_FIELDS:
        try:
            print("  üìã Using 'Select All' checkbox...")
            chk = wait.until(EC.element_to_be_clickable((By.ID, CHECKBOX_ID)))
            if chk.is_selected():
                print("  ‚úì All fields already selected")
            else:
                chk.click()
                print("  ‚úì All fields selected via checkbox")
        except Exception as e:
            print(f"  ‚ö† Select all checkbox failed: {e}")
            print("  üîÑ Falling back to manual field selection...")
        else:
            _ensure_zip_checked()
            return True, []

    print(f"  üìã Selecting {len(REQUIRED_FIELDS)} required data fields...")
    selected_count = 0
    missing_fields = []

    for field_name in REQUIRED_FIELDS:
        # Tried in order: exact value, then name/id substring; CSS before XPath.
        locators = [
            (By.CSS_SELECTOR, f"input[value='{field_name}']"),
            (By.CSS_SELECTOR, f"input[name*='{field_name}']"),
            (By.CSS_SELECTOR, f"input[id*='{field_name}']"),
            (By.XPATH, f"//input[@value='{field_name}']"),
            (By.XPATH, f"//input[contains(@name, '{field_name}')]"),
            (By.XPATH, f"//input[contains(@id, '{field_name}')]"),
        ]

        for strategy, query in locators:
            try:
                box = driver.find_element(strategy, query)
                if not box.is_selected():
                    driver.execute_script("arguments[0].scrollIntoView(true);", box)
                    time.sleep(0.1)
                    box.click()
                selected_count += 1
                print(f"    ‚úì {field_name}")
                break
            except Exception:
                continue
        else:
            # No locator produced a usable element for this field.
            missing_fields.append(field_name)
            print(f"    ‚ùå {field_name} (not found)")

    any_selected = selected_count > 0
    if any_selected:
        _ensure_zip_checked()

    return any_selected, missing_fields


def click_download_button_robustly(driver, wait):
    """Click the download button, trying three strategies in turn.

    1. Dismiss popups, wait until the button is clickable, native click.
    2. JavaScript ``click()`` on the element.
    3. Scroll the element into view, pause one second, native click.

    Args:
        driver: Selenium driver for the current page.
        wait: WebDriverWait used by the first strategy.

    Returns:
        True as soon as one strategy succeeds.

    Raises:
        RuntimeError: when all three strategies fail. The message starts with
            "All download button click methods failed" and then lists the
            underlying errors, so callers that inspect the text (for example
            for "click intercepted") can still see the real cause.
    """
    failures = []  # (strategy label, exception) for the final error message

    try:
        dismiss_popups(driver, wait)
        dl_btn = wait.until(EC.element_to_be_clickable((By.ID, DOWNLOAD_BTN_ID)))
        dl_btn.click()
        print("    ‚úì Download button clicked (normal)")
        return True
    except Exception as e:
        failures.append(("normal", e))
        print(f"    ‚ö† Normal click failed: {str(e)[:100]}...")

    try:
        dl_btn = driver.find_element(By.ID, DOWNLOAD_BTN_ID)
        driver.execute_script("arguments[0].click();", dl_btn)
        print("    ‚úì Download button clicked (JavaScript)")
        return True
    except Exception as e:
        failures.append(("javascript", e))
        print(f"    ‚ö† JavaScript click failed: {str(e)[:100]}...")

    try:
        dl_btn = driver.find_element(By.ID, DOWNLOAD_BTN_ID)
        driver.execute_script("arguments[0].scrollIntoView(true);", dl_btn)
        time.sleep(1)
        dl_btn.click()
        print("    ‚úì Download button clicked (scroll + click)")
        return True
    except Exception as e:
        failures.append(("scroll + click", e))
        print(f"    ‚ö† Scroll + click failed: {str(e)[:100]}...")

    # Keep the causes in the message; each is truncated because Selenium
    # messages can be very long.
    details = "; ".join(f"{label}: {str(err)[:200]}" for label, err in failures)
    raise RuntimeError(
        f"All download button click methods failed ({details})"
    ) from failures[-1][1]


def is_valid_zip(file_path: pathlib.Path) -> bool:
    """Report whether ``file_path`` can be opened as a ZIP and its index listed.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True when the archive opens and its member list can be read.
        False for a malformed archive, and also (after printing the error)
        for any other failure such as a missing file.
    """
    readable = False
    try:
        with zipfile.ZipFile(file_path, mode="r") as archive:
            archive.namelist()
    except zipfile.BadZipFile:
        # Not a ZIP / damaged index: an expected outcome, so stay quiet.
        pass
    except Exception as e:
        print(f"Error checking ZIP file: {e}")
    else:
        readable = True
    return readable


def unzip_to_drive(
    year: int, quarter: int = None, zip_path: pathlib.Path = None, overwrite: str = "skip"
) -> bool:
    """Extract a ZIP into RAW_DIR as flat files named "<prefix>_<original name>".

    The prefix is "<year>_Q<quarter>" when ``quarter`` is given, otherwise
    "<year>". Directory components inside the archive are dropped.

    Each member is first written to "<target>.tmp" and then renamed over the
    target, so the target is only ever replaced by a completely written file.
    If writing a member fails, its ".tmp" file is removed and an existing
    target is left as it was.

    Args:
        year: Year used for the filename prefix.
        quarter: Optional quarter used for the filename prefix.
        zip_path: Archive to extract.
        overwrite: What to do when the target already exists:
            "overwrite" replaces it, "timestamp" writes next to it with an
            "_HHMMSS" suffix, "skip" leaves it alone. Any other value behaves
            like "overwrite".

    Returns:
        True when every member was extracted or skipped, False on any error
        (the error is printed).
    """
    import datetime  # only needed for the "timestamp" mode; not in the import cell

    try:
        RAW_DIR.mkdir(parents=True, exist_ok=True)
        prefix = f"{year}_Q{quarter}" if quarter is not None else f"{year}"

        with zipfile.ZipFile(zip_path, "r") as z:
            members = [m for m in z.infolist() if not m.is_dir()]
            print(f"  Extracting {len(members)} files with prefix {prefix}")

            extracted_count = 0
            skipped_count = 0

            for m in members:
                base_name = pathlib.Path(m.filename).name
                stem = pathlib.Path(base_name).stem
                suffix = pathlib.Path(base_name).suffix

                out_path = RAW_DIR / f"{prefix}_{stem}{suffix}"

                if out_path.exists():
                    if overwrite == "skip":
                        skipped_count += 1
                        continue
                    if overwrite == "timestamp":
                        timestamp = datetime.datetime.now().strftime("%H%M%S")
                        out_path = RAW_DIR / f"{prefix}_{stem}_{timestamp}{suffix}"
                    # "overwrite": nothing to do here. The rename below replaces
                    # the old file in one step, so it is never missing.

                tmp_path = out_path.with_suffix(out_path.suffix + ".tmp")
                try:
                    with z.open(m, "r") as src, open(tmp_path, "wb") as dst:
                        shutil.copyfileobj(src, dst)
                    tmp_path.replace(out_path)
                except BaseException:
                    # Do not leave a partial file behind (e.g. CRC error, full disk).
                    tmp_path.unlink(missing_ok=True)
                    raise

                extracted_count += 1

            print(f"  üìä Extracted: {extracted_count}, Skipped: {skipped_count}")
        return True

    except Exception as e:
        print(f"Error extracting {zip_path}: {e}")
        return False


def sync_and_cleanup_one_download(download_id: str, clean_zip_path: pathlib.Path) -> None:
    """Move one finished download to Google Drive and tidy the local folders.

    Steps:
      1. Move ``clean_zip_path`` into DRIVE_ZIP_DIR (replacing a same-named file).
      2. Move every finished file in RAW_DIR whose name starts with
         "<download_id>_" into DRIVE_RAW_DIR (replacing same-named files).
         Partial "*.tmp" files are never moved.
      3. Delete leftover "*.crdownload" files in ZIP_DIR and "*.tmp" files in
         RAW_DIR.

    Failures in steps 1-2 are printed and counted rather than raised; the
    closing message states whether the sync was complete.

    Args:
        download_id: Prefix identifying this download, e.g. "2005" or "2005_Q1".
        clean_zip_path: Local path of the renamed ZIP.
    """
    failed = 0

    # 1) Move ZIP to Google Drive
    try:
        dest_zip = DRIVE_ZIP_DIR / clean_zip_path.name
        if dest_zip.exists():
            dest_zip.unlink()
        shutil.move(str(clean_zip_path), str(dest_zip))
    except Exception as e:
        failed += 1
        print(f"  ‚ö† Failed to move ZIP to Drive for {download_id}: {e}")

    # 2) Move extracted files (only those for this download_id)
    moved = 0
    for f in sorted(RAW_DIR.glob(f"{download_id}_*")):
        if f.suffix == ".tmp":
            # Partially written extraction output: must not reach Drive.
            # It is deleted in step 3.
            continue
        try:
            dest = DRIVE_RAW_DIR / f.name
            if dest.exists():
                dest.unlink()
            shutil.move(str(f), str(dest))
            moved += 1
        except Exception as e:
            failed += 1
            print(f"  ‚ö† Failed to move {f.name} to Drive: {e}")

    # 3) Delete any leftovers locally (safety)
    for tmp in ZIP_DIR.glob("*.crdownload"):
        try:
            tmp.unlink()
        except Exception:
            pass
    for tmp in RAW_DIR.glob("*.tmp"):
        try:
            tmp.unlink()
        except Exception:
            pass

    if failed:
        # Do not claim success when something stayed behind locally.
        print(
            f"  ‚ö† Sync incomplete for {download_id}: {failed} item(s) could not be moved "
            f"(moved {moved} raw files)"
        )
    else:
        print(f"  üßπ Synced to Drive + cleaned local for {download_id} (moved {moved} raw files)")


def main(dataset_name: str | None = None):
    """Run the full download loop for the dataset configured in the notebook globals.

    For every year in FIRST_YEAR..LAST_YEAR (inclusive) and every quarter in
    QUARTERS (or once per year when QUARTERS is empty) this selects the period
    on the START_URL page, picks the data fields, downloads the ZIP into
    ZIP_DIR, validates it, renames it to "<download_id>.zip", extracts it into
    RAW_DIR and moves the results to Google Drive. A failure in one period is
    printed and the loop continues with the next one. The browser is always
    closed at the end.

    Args:
        dataset_name: Label for the start-up banner. When omitted, the name of
            the Drive destination folder (the parent of DRIVE_RAW_DIR) is used,
            so the banner no longer says "T100D" for every annual dataset.

    Returns:
        None. Progress and a final summary are printed.
    """
    driver = None
    downloads_processed = 0
    is_quarterly = len(QUARTERS) > 0
    data_type = "quarterly" if is_quarterly else "annual"

    try:
        print(f"Starting BTS {data_type} data download...")
        dataset_label = dataset_name or DRIVE_RAW_DIR.parent.name
        print(f"Dataset: {dataset_label} ({data_type.capitalize()})")

        driver = make_driver()
        wait = WebDriverWait(driver, TIMEOUT_SEC)

        print(f"Navigating to: {START_URL}")
        driver.get(START_URL)

        try:
            wait.until(EC.presence_of_element_located((By.ID, YEAR_DROPDOWN_ID)))
            print("‚úì Page loaded successfully")
        except TimeoutException:
            print("‚ùå Page failed to load properly")
            return

        # A single None entry means "no quarter dropdown": one download per year.
        quarters_to_process = QUARTERS if is_quarterly else [None]

        for yr in range(FIRST_YEAR, LAST_YEAR + 1):
            for qtr in quarters_to_process:
                download_id = f"{yr}_Q{qtr}" if qtr is not None else f"{yr}"
                print(f"\nüì• Processing {download_id}...")

                # Reload the page after every REFRESH_EVERY_N_DOWNLOADS successes.
                if downloads_processed > 0 and downloads_processed % REFRESH_EVERY_N_DOWNLOADS == 0:
                    print("  üîÑ Refreshing page to avoid popups...")
                    driver.get(START_URL)
                    try:
                        wait.until(EC.presence_of_element_located((By.ID, YEAR_DROPDOWN_ID)))
                        print("  ‚úì Page refreshed successfully")
                    except TimeoutException:
                        print("  ‚ùå Page refresh failed")
                        continue

                # Snapshot taken before clicking, so the new file can be told apart.
                initial_files = set(ZIP_DIR.iterdir())

                try:
                    print(f"  Selecting year {yr}")
                    year_dropdown = Select(driver.find_element(By.ID, YEAR_DROPDOWN_ID))
                    year_dropdown.select_by_visible_text(str(yr))
                    time.sleep(1)

                    if qtr is not None:
                        print(f"  Selecting quarter {qtr}")
                        quarter_dropdown = Select(driver.find_element(By.ID, QUARTER_DROPDOWN_ID))
                        quarter_dropdown.select_by_visible_text(f"Quarter {qtr}")
                        time.sleep(1)

                    fields_selected, _ = select_data_fields(driver, wait)
                    if not fields_selected:
                        print(f"‚ùå No fields selected for {download_id}")
                        continue

                    print("  Clicking download button")
                    click_download_button_robustly(driver, wait)

                    print("  Waiting for download to complete...")
                    downloaded_file = wait_for_download_complete(initial_files)
                    if downloaded_file is None:
                        print(f"‚ùå Download failed for {download_id}")
                        continue

                    if not is_valid_zip(downloaded_file):
                        print(f"‚ùå Downloaded file is not a valid ZIP: {downloaded_file}")
                        # cleanup bad download locally
                        try:
                            downloaded_file.unlink()
                        except Exception:
                            pass
                        continue

                    # Rename locally to a clean name
                    clean_name = ZIP_DIR / f"{download_id}.zip"
                    if clean_name.exists():
                        clean_name.unlink()
                    downloaded_file.rename(clean_name)
                    print(f"  Renamed to: {clean_name.name}")

                    # Unzip locally
                    print("  Extracting files...")
                    if unzip_to_drive(yr, qtr, clean_name, overwrite=OVERWRITE_FILES):
                        print(f"‚úÖ {download_id} completed successfully")
                        downloads_processed += 1

                        # Immediately sync to Google Drive + delete local files
                        sync_and_cleanup_one_download(download_id, clean_name)
                    else:
                        print(f"‚ùå Extraction failed for {download_id}")
                        # If unzip failed, remove the local zip to avoid buildup
                        try:
                            if clean_name.exists():
                                clean_name.unlink()
                        except Exception:
                            pass

                except TimeoutException as e:
                    print(f"‚ùå Timeout error for {download_id}: {e}")
                    print("  üîÑ Attempting page refresh due to timeout...")
                    try:
                        driver.get(START_URL)
                        wait.until(EC.presence_of_element_located((By.ID, YEAR_DROPDOWN_ID)))
                    except Exception:
                        print("  ‚ùå Recovery refresh failed")

                except Exception as e:
                    print(f"‚ùå Error processing {download_id}: {e}")
                    # Only errors that look popup-related trigger a page reload.
                    if "click intercepted" in str(e) or "popup" in str(e).lower():
                        print("  üîÑ Popup detected, refreshing page...")
                        try:
                            driver.get(START_URL)
                            wait.until(EC.presence_of_element_located((By.ID, YEAR_DROPDOWN_ID)))
                        except Exception:
                            print("  ‚ùå Recovery refresh failed")

    except Exception as e:
        print(f"‚ùå Fatal error: {e}")

    finally:
        if driver:
            driver.quit()
            print("\nüîí Browser closed")

    print(f"\nüìã SUMMARY:")
    print(f"  Data type: {data_type}")
    print(f"  Downloads processed: {downloads_processed}")
    print(f"  Local ZIP dir: {ZIP_DIR}")
    print(f"  Local RAW dir: {RAW_DIR}")
    print(f"  Drive ZIP dir: {DRIVE_ZIP_DIR}")
    print(f"  Drive RAW dir: {DRIVE_RAW_DIR}")


# T100

In [3]:
# -----------------------
# CONFIG: T100D (Annual)
# -----------------------

# Final destination in Google Drive (synced)
DRIVE_ROOT = GDRIVE_MYDRIVE / "data" / "T100"
DRIVE_ZIP_DIR = DRIVE_ROOT / "zip"
DRIVE_RAW_DIR = DRIVE_ROOT / "raw"
DRIVE_ZIP_DIR.mkdir(parents=True, exist_ok=True)
DRIVE_RAW_DIR.mkdir(parents=True, exist_ok=True)

# Local working folders (NOT synced) - where Chrome downloads & unzip happens.
# Kept dataset-specific (as the T-F41 cell does) to avoid collisions: the sync
# step moves every local raw file named "<download_id>_*", and an annual id
# such as "2005" would also match a quarterly leftover like "2005_Q1_...".
ZIP_DIR = LOCAL_WORKDIR / "T100" / "zip"
RAW_DIR = LOCAL_WORKDIR / "T100" / "raw"
ZIP_DIR.mkdir(parents=True, exist_ok=True)
RAW_DIR.mkdir(parents=True, exist_ok=True)

START_URL = "https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=GEE&QO_fu146_anzr=Nv4+Pn44vr45"
QUARTERS = []          # Empty = annual data
REQUIRED_FIELDS = []   # Empty = use "select all" button
CHECKBOX_ID = "chkAllVars"


def cleanup_local_workdir():
    """Delete interrupted-run leftovers from the local working folders.

    Removes "*.crdownload" files from ZIP_DIR and "*.tmp" files from RAW_DIR.
    Finished files are not touched. Deletion is best effort: a file that
    cannot be removed is silently skipped.
    """
    for folder, pattern in ((ZIP_DIR, "*.crdownload"), (RAW_DIR, "*.tmp")):
        for leftover in folder.glob(pattern):
            try:
                leftover.unlink()
            except Exception:
                pass


# Run the download for the configuration defined in this cell.
try:
    main()
    cleanup_local_workdir()

    print(
        "\n‚úÖ Done.",
        f"   Google Drive ZIPs ‚Üí {DRIVE_ZIP_DIR}",
        f"   Google Drive CSVs ‚Üí {DRIVE_RAW_DIR}",
        f"   Local workdir (should be mostly empty) ‚Üí {LOCAL_WORKDIR}",
        sep="\n",
    )

except (KeyboardInterrupt, Exception) as e:
    # Both failure paths clean up the same way. Only a real error (not a manual
    # interrupt) prints the error text and ends with a non-zero exit.
    interrupted = isinstance(e, KeyboardInterrupt)
    if interrupted:
        print("\n‚èπ Download interrupted by user")
    else:
        print(f"\nüí• Unexpected error: {e}")
    cleanup_local_workdir()
    print("üßπ Cleaned local temp leftovers")
    if not interrupted:
        sys.exit(1)


Starting BTS annual data download...
Dataset: T100D (Annual)
Navigating to: https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=GEE&QO_fu146_anzr=Nv4+Pn44vr45

üîí Browser closed

‚èπ Download interrupted by user
üßπ Cleaned local temp leftovers


# DB1B

## Market

In [None]:
# -----------------------
# CONFIG: DB1B Market (Quarterly)
# -----------------------

# Final destination in Google Drive (synced)
DRIVE_ROOT = GDRIVE_MYDRIVE / "data" / "DB1B" / "Market"
DRIVE_ZIP_DIR = DRIVE_ROOT / "zip"
DRIVE_RAW_DIR = DRIVE_ROOT / "raw"
DRIVE_ZIP_DIR.mkdir(parents=True, exist_ok=True)
DRIVE_RAW_DIR.mkdir(parents=True, exist_ok=True)

# Local working folders (NOT synced) - Chrome downloads & unzip happen here.
# Kept dataset-specific (as the T-F41 cell does) to avoid collisions: other
# datasets produce local files with the same "<year>_Q<quarter>" names, and the
# sync step moves local raw files purely by that name prefix.
ZIP_DIR = LOCAL_WORKDIR / "DB1B_Market" / "zip"
RAW_DIR = LOCAL_WORKDIR / "DB1B_Market" / "raw"
ZIP_DIR.mkdir(parents=True, exist_ok=True)
RAW_DIR.mkdir(parents=True, exist_ok=True)

START_URL = "https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FHK&QO_fu146_anzr=b4vtv0+n0q+Qr56v0n6v10+f748rB"
QUARTERS = [1, 2, 3, 4]  # quarterly
REQUIRED_FIELDS = [
    "ITIN_ID", "MKT_ID", "MARKET_COUPONS", "YEAR", "QUARTER", "ORIGIN", "ORIGIN_COUNTRY",
    "ORIGIN_STATE_FIPS", "ORIGIN_STATE_ABR", "ORIGIN_STATE_NM", "ORIGIN_WAC", "DEST",
    "DEST_COUNTRY", "DEST_STATE_FIPS", "DEST_STATE_ABR", "DEST_STATE_NM", "DEST_WAC",
    "AIRPORT_GROUP", "WAC_GROUP", "TK_CARRIER_CHANGE", "TK_CARRIER_GROUP",
    "OP_CARRIER_CHANGE", "OP_CARRIER_GROUP", "REPORTING_CARRIER", "TICKET_CARRIER",
    "OPERATING_CARRIER", "BULK_FARE", "PASSENGERS", "MARKET_FARE", "MARKET_DISTANCE",
    "DISTANCE_GROUP", "MARKET_MILES_FLOWN", "NONSTOP_MILES", "ITIN_GEO_TYPE", "MKT_GEO_TYPE"
]
CHECKBOX_ID = "chkAllVars"  # unused when REQUIRED_FIELDS is non-empty, but safe to keep


def cleanup_local_workdir():
    """Remove partial files left behind when a run is interrupted.

    Deletes "*.crdownload" files in ZIP_DIR and "*.tmp" files in RAW_DIR.
    Files that cannot be deleted are ignored.
    """
    targets = ((ZIP_DIR, "*.crdownload"), (RAW_DIR, "*.tmp"))
    for directory, glob_pattern in targets:
        for stale in directory.glob(glob_pattern):
            try:
                stale.unlink()
            except Exception:
                pass


# Run the download for the configuration defined in this cell.
try:
    main()
    cleanup_local_workdir()

    print(
        "\n‚úÖ Done.",
        f"   Google Drive ZIPs ‚Üí {DRIVE_ZIP_DIR}",
        f"   Google Drive CSVs ‚Üí {DRIVE_RAW_DIR}",
        f"   Local workdir (should be mostly empty) ‚Üí {LOCAL_WORKDIR}",
        sep="\n",
    )

except (KeyboardInterrupt, Exception) as e:
    # Both failure paths clean up the same way. Only a real error (not a manual
    # interrupt) prints the error text and ends with a non-zero exit.
    interrupted = isinstance(e, KeyboardInterrupt)
    if interrupted:
        print("\n‚èπ Download interrupted by user")
    else:
        print(f"\nüí• Unexpected error: {e}")
    cleanup_local_workdir()
    print("üßπ Cleaned local temp leftovers")
    if not interrupted:
        sys.exit(1)


Starting BTS quarterly data download...
Dataset: DB1B (Quarterly)
Navigating to: https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FHK&QO_fu146_anzr=b4vtv0+n0q+Qr56v0n6v10+f748rB
‚úì Page loaded successfully

üì• Processing 2005_Q1...
  Selecting year 2005
  Selecting quarter 1
  üìã Selecting 35 required data fields...
    ‚úì ITIN_ID
    ‚úì MKT_ID
    ‚úì MARKET_COUPONS
    ‚úì YEAR
    ‚úì QUARTER
    ‚úì ORIGIN
    ‚úì ORIGIN_COUNTRY
    ‚úì ORIGIN_STATE_FIPS
    ‚úì ORIGIN_STATE_ABR
    ‚úì ORIGIN_STATE_NM
    ‚úì ORIGIN_WAC
    ‚úì DEST
    ‚úì DEST_COUNTRY
    ‚úì DEST_STATE_FIPS
    ‚úì DEST_STATE_ABR
    ‚úì DEST_STATE_NM
    ‚úì DEST_WAC
    ‚úì AIRPORT_GROUP
    ‚úì WAC_GROUP
    ‚úì TK_CARRIER_CHANGE
    ‚úì TK_CARRIER_GROUP
    ‚úì OP_CARRIER_CHANGE
    ‚úì OP_CARRIER_GROUP
    ‚úì REPORTING_CARRIER
    ‚úì TICKET_CARRIER
    ‚úì OPERATING_CARRIER
    ‚úì BULK_FARE
    ‚úì PASSENGERS
    ‚úì MARKET_FARE
    ‚úì MARKET_DISTANCE
    ‚úì DISTANCE_GROUP
    ‚úì MA

## Ticket

In [None]:
# -----------------------
# CONFIG: DB1B Ticket (Quarterly)
# -----------------------

# Final destination in Google Drive (synced)
DRIVE_ROOT = GDRIVE_MYDRIVE / "data" / "DB1B" / "Ticket"
DRIVE_ZIP_DIR = DRIVE_ROOT / "zip"
DRIVE_RAW_DIR = DRIVE_ROOT / "raw"
DRIVE_ZIP_DIR.mkdir(parents=True, exist_ok=True)
DRIVE_RAW_DIR.mkdir(parents=True, exist_ok=True)

# Local working folders (NOT synced) - Chrome downloads & unzip happen here.
# Kept dataset-specific (as the T-F41 cell does) to avoid collisions: other
# datasets produce local files with the same "<year>_Q<quarter>" names, and the
# sync step moves local raw files purely by that name prefix.
ZIP_DIR = LOCAL_WORKDIR / "DB1B_Ticket" / "zip"
RAW_DIR = LOCAL_WORKDIR / "DB1B_Ticket" / "raw"
ZIP_DIR.mkdir(parents=True, exist_ok=True)
RAW_DIR.mkdir(parents=True, exist_ok=True)

START_URL = "https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FKF&QO_fu146_anzr=b4vtv0%20n0q%20Qr56v0n6v10%20f748rB"
QUARTERS = [1, 2, 3, 4]  # quarterly
REQUIRED_FIELDS = [
    "ITIN_ID",
    "COUPONS",
    "YEAR",
    "QUARTER",
    "ORIGIN",
    "ORIGIN_AIRPORT_ID",
    "ORIGIN_AIRPORT_SEQ_ID",
    "ORIGIN_CITY_MARKET_ID",
    "ORIGIN_COUNTRY",
    "ORIGIN_STATE_FIPS",
    "ORIGIN_STATE_ABR",
    "ORIGIN_STATE_NM",
    "ORIGIN_WAC",
    "ROUNDTRIP",
    "ONLINE",
    "DOLLAR_CRED",
    "ITIN_YIELD",
    "REPORTING_CARRIER",
    "PASSENGERS",
    "ITIN_FARE",
    "BULK_FARE",
    "DISTANCE",
    "DISTANCE_GROUP",
    "MILES_FLOWN",
    "ITIN_GEO_TYPE",
]

CHECKBOX_ID = "chkAllVars"  # unused when REQUIRED_FIELDS is non-empty, but safe to keep


def cleanup_local_workdir():
    """Clear interrupted-download debris from the local working folders.

    "*.crdownload" files are removed from ZIP_DIR and "*.tmp" files from
    RAW_DIR. Each deletion is attempted independently and failures are ignored.
    """

    def _discard(candidates):
        # Best effort: keep going even if one file cannot be removed.
        for candidate in candidates:
            try:
                candidate.unlink()
            except Exception:
                pass

    _discard(ZIP_DIR.glob("*.crdownload"))
    _discard(RAW_DIR.glob("*.tmp"))


# Run the download for the configuration defined in this cell.
try:
    main()
    cleanup_local_workdir()

    print(
        "\n‚úÖ Done.",
        f"   Google Drive ZIPs ‚Üí {DRIVE_ZIP_DIR}",
        f"   Google Drive CSVs ‚Üí {DRIVE_RAW_DIR}",
        f"   Local workdir (should be mostly empty) ‚Üí {LOCAL_WORKDIR}",
        sep="\n",
    )

except (KeyboardInterrupt, Exception) as e:
    # Both failure paths clean up the same way. Only a real error (not a manual
    # interrupt) prints the error text and ends with a non-zero exit.
    interrupted = isinstance(e, KeyboardInterrupt)
    if interrupted:
        print("\n‚èπ Download interrupted by user")
    else:
        print(f"\nüí• Unexpected error: {e}")
    cleanup_local_workdir()
    print("üßπ Cleaned local temp leftovers")
    if not interrupted:
        sys.exit(1)


Starting BTS quarterly data download...
Dataset: DB1B (Quarterly)
Navigating to: https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FKF&QO_fu146_anzr=b4vtv0%20n0q%20Qr56v0n6v10%20f748rB
‚úì Page loaded successfully

üì• Processing 2005_Q1...
  Selecting year 2005
  Selecting quarter 1
  üìã Selecting 25 required data fields...
    ‚úì ITIN_ID
    ‚úì COUPONS
    ‚úì YEAR
    ‚úì QUARTER
    ‚úì ORIGIN
    ‚úì ORIGIN_AIRPORT_ID
    ‚úì ORIGIN_AIRPORT_SEQ_ID
    ‚úì ORIGIN_CITY_MARKET_ID
    ‚úì ORIGIN_COUNTRY
    ‚úì ORIGIN_STATE_FIPS
    ‚úì ORIGIN_STATE_ABR
    ‚úì ORIGIN_STATE_NM
    ‚úì ORIGIN_WAC
    ‚úì ROUNDTRIP
    ‚úì ONLINE
    ‚úì DOLLAR_CRED
    ‚úì ITIN_YIELD
    ‚úì REPORTING_CARRIER
    ‚úì PASSENGERS
    ‚úì ITIN_FARE
    ‚úì BULK_FARE
    ‚úì DISTANCE
    ‚úì DISTANCE_GROUP
    ‚úì MILES_FLOWN
    ‚úì ITIN_GEO_TYPE
  Clicking download button
    ‚úì Download button clicked (normal)
  Waiting for download to complete...
Download in progress...
Download in prog

# T-F41

In [7]:
# -----------------------
# CONFIG: T-F41 (Schedule P-1.2) - one download per year
# -----------------------

# Inclusive year range for this dataset (re-assigns the shared run settings).
FIRST_YEAR, LAST_YEAR = 2005, 2025

# Final destination in Google Drive (synced)
DRIVE_ROOT = GDRIVE_MYDRIVE / "data" / "T-F41"
DRIVE_ZIP_DIR = DRIVE_ROOT / "zip"
DRIVE_RAW_DIR = DRIVE_ROOT / "raw"
DRIVE_ZIP_DIR.mkdir(parents=True, exist_ok=True)
DRIVE_RAW_DIR.mkdir(parents=True, exist_ok=True)

# Local working folders (NOT synced) - keep dataset-specific to avoid collisions
ZIP_DIR = LOCAL_WORKDIR / "T-F41" / "zip"
RAW_DIR = LOCAL_WORKDIR / "T-F41" / "raw"
ZIP_DIR.mkdir(parents=True, exist_ok=True)
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Download page for Air Carrier Financial : Schedule P-1.2.
START_URL = "https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FMI&QO_fu146_anzr=Nv4%20Pn44vr4%20Sv0n0pvny"

# Annual loop: with QUARTERS empty, main() skips the quarter dropdown and
# downloads once per year.
QUARTERS = []
REQUIRED_FIELDS = []        # empty => select-all behavior
CHECKBOX_ID = "chkAllVars"  # keep for pages where it's a checkbox


def cleanup_local_workdir():
    """Remove leftovers if a run was interrupted mid-download.

    Deletes "*.crdownload" files from ZIP_DIR and "*.tmp" files from RAW_DIR;
    a file that cannot be deleted is skipped without raising.
    """
    for partial_download in ZIP_DIR.glob("*.crdownload"):
        try:
            partial_download.unlink()
        except Exception:
            pass

    for partial_extract in RAW_DIR.glob("*.tmp"):
        try:
            partial_extract.unlink()
        except Exception:
            pass


# Run the download for the configuration defined in this cell.
try:
    main()
    cleanup_local_workdir()

    print(
        "\n‚úÖ Done.",
        f"   Google Drive ZIPs ‚Üí {DRIVE_ZIP_DIR}",
        f"   Google Drive CSVs ‚Üí {DRIVE_RAW_DIR}",
        f"   Local workdir (should be mostly empty) ‚Üí {LOCAL_WORKDIR}",
        sep="\n",
    )

except (KeyboardInterrupt, Exception) as e:
    # Both failure paths clean up the same way. Only a real error (not a manual
    # interrupt) prints the error text and ends with a non-zero exit.
    interrupted = isinstance(e, KeyboardInterrupt)
    if interrupted:
        print("\n‚èπ Download interrupted by user")
    else:
        print(f"\nüí• Unexpected error: {e}")
    cleanup_local_workdir()
    print("üßπ Cleaned local temp leftovers")
    if not interrupted:
        sys.exit(1)


Starting BTS annual data download...
Dataset: T100D (Annual)
Navigating to: https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FMI&QO_fu146_anzr=Nv4%20Pn44vr4%20Sv0n0pvny
‚úì Page loaded successfully

üì• Processing 2005...
  Selecting year 2005
  üìã Using 'Select All' checkbox...
  ‚úì All fields selected via checkbox
  Clicking download button
    ‚úì Download button clicked (normal)
  Waiting for download to complete...
‚úì Download complete: T_F41SCHEDULE_P12_20260114_230731.zip (48664 bytes)
  Renamed to: 2005.zip
  Extracting files...
  Extracting 1 files with prefix 2005
  üìä Extracted: 1, Skipped: 0
‚úÖ 2005 completed successfully
  üßπ Synced to Drive + cleaned local for 2005 (moved 1 raw files)

üì• Processing 2006...
  Selecting year 2006
  üìã Using 'Select All' checkbox...
  ‚úì All fields already selected
  Clicking download button
    ‚úì Download button clicked (normal)
  Waiting for download to complete...
Download in progress...
‚úì Download complete: