In [1]:
import os
import xml.etree.ElementTree as ET
import zipfile
from io import BytesIO
from urllib.parse import quote

import requests


In [2]:
# Root URL of the public Divvy trip-data S3 bucket
BUCKET_ROOT = "https://divvy-tripdata.s3.amazonaws.com"

# Bucket listing endpoint (list-type=2 listing, at most 1000 keys per page)
LIST_URL = BUCKET_ROOT + "?list-type=2&max-keys=1000"

# Folder where extracted files will be saved. The default below is specific to
# one machine; set the DIVVY_DATA_DIR environment variable to use another
# folder without editing the notebook. An empty value falls back to the default.
TARGET_DIR = os.environ.get("DIVVY_DATA_DIR") or (
    r"C:/Users/anbun/Desktop/Portfolio projects/Google Data Analytics Capstone/Data/"
)


In [3]:
# Make sure the extraction folder exists before anything is downloaded
os.makedirs(TARGET_DIR, exist_ok=True)

# One shared HTTP session that retries failed requests with backoff
from requests.adapters import HTTPAdapter
from requests.adapters import Retry

session = requests.Session()
session.headers["User-Agent"] = "divvy-downloader/1.0"

# Up to 5 attempts overall (and per connect/read error); the listed
# HTTP status codes are also treated as retryable.
retries = Retry(
    status_forcelist=(429, 500, 502, 503, 504),
    backoff_factor=0.5,
    total=5,
    connect=5,
    read=5,
)
session.mount("https://", HTTPAdapter(max_retries=retries))

def list_all_zip_keys():
    """List every object key in the public S3 bucket that ends with ``.zip``.

    Requests the listing at ``LIST_URL`` page by page, following
    ``NextContinuationToken`` for as long as a page reports
    ``IsTruncated`` = ``true``.

    Returns:
        list[str]: Keys ending in ``.zip`` (case-insensitive), in listing order.

    Raises:
        requests.HTTPError: If a listing request returns an error status.
        xml.etree.ElementTree.ParseError: If a response body is not valid XML.
        RuntimeError: If a page is marked truncated but has no continuation token.
    """
    keys = []
    next_token = None

    while True:
        # Pass the token through ``params`` so requests percent-encodes it;
        # the token is opaque and not guaranteed to be URL-safe as-is.
        params = {"continuation-token": next_token} if next_token else None
        r = session.get(LIST_URL, params=params, timeout=60)
        r.raise_for_status()

        root = ET.fromstring(r.content)
        # The listing uses a default XML namespace. Derive the "{uri}" tag
        # prefix from the root tag; it is empty for un-namespaced documents.
        ns = root.tag[: root.tag.index("}") + 1] if root.tag.startswith("{") else ""

        for key_el in root.findall(f"{ns}Contents/{ns}Key"):
            key = (key_el.text or "").strip()
            if key.lower().endswith(".zip"):
                keys.append(key)

        # Read the element text directly. Testing an Element for truthiness is
        # wrong here: an element without children is falsy even when present,
        # which would make every page look like the last one.
        truncated = (root.findtext(f"{ns}IsTruncated") or "").strip().lower() == "true"
        if not truncated:
            break

        next_token = (root.findtext(f"{ns}NextContinuationToken") or "").strip()
        if not next_token:
            raise RuntimeError(
                "Bucket listing is truncated but no NextContinuationToken was returned"
            )

    return keys

def download_and_extract(key, target_dir=None):
    """Download one zip archive from the bucket and unpack it.

    Args:
        key: Object key of the archive inside the bucket.
        target_dir: Folder to extract into. Defaults to ``TARGET_DIR``.

    Raises:
        requests.HTTPError: If the download returns an error status.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    if target_dir is None:
        target_dir = TARGET_DIR

    # Percent-encode the key (keeping "/" separators) so characters such as
    # spaces, "+", "#" or "?" are sent as part of the path instead of being
    # reinterpreted by the URL parser or the server.
    url = f"{BUCKET_ROOT}/{quote(key)}"
    fname = os.path.basename(key)
    print(f"Downloading {fname} ...")
    resp = session.get(url, timeout=300)
    resp.raise_for_status()

    # The whole archive is held in memory while it is unpacked.
    with zipfile.ZipFile(BytesIO(resp.content)) as zf:
        print(f"  Extracting {fname} -> {target_dir}")
        # extractall() overwrites files that already exist in target_dir.
        zf.extractall(target_dir)

def main():
    """List every zip archive in the bucket, then download and extract each one.

    A failure on one archive is reported and does not stop the run; the
    remaining archives are still processed. The closing summary names every
    archive that failed instead of claiming overall success.
    """
    print("Listing files from S3 bucket...")
    zip_keys = list_all_zip_keys()
    total = len(zip_keys)
    print(f"Found {total} zip files.\n")

    failed = []
    for i, key in enumerate(zip_keys, start=1):
        print(f"[{i}/{total}]")
        try:
            download_and_extract(key)
        except Exception as e:
            # Best effort: remember the failure and keep going.
            print(f"❌ Problem with {key}: {e}")
            failed.append(key)

    if failed:
        print(f"\n⚠️ Finished with errors: {len(failed)} of {total} archives failed:")
        for key in failed:
            print(f"  - {key}")
        print("Archives that succeeded were extracted to:")
    else:
        print("\n✅ Done! All archives downloaded and extracted to:")
    print(TARGET_DIR)

# Entry point. A notebook kernel runs cells with __name__ set to "__main__",
# so executing this cell starts the full download right away.
if __name__ == "__main__":
    main()


Listing files from S3 bucket...
Found 84 zip files.

[1/84]
Downloading 202004-divvy-tripdata.zip ...
  Extracting 202004-divvy-tripdata.zip -> C:/Users/anbun/Desktop/Portfolio projects/Google Data Analytics Capstone/Data/
[2/84]
Downloading 202005-divvy-tripdata.zip ...
  Extracting 202005-divvy-tripdata.zip -> C:/Users/anbun/Desktop/Portfolio projects/Google Data Analytics Capstone/Data/
[3/84]
Downloading 202006-divvy-tripdata.zip ...
  Extracting 202006-divvy-tripdata.zip -> C:/Users/anbun/Desktop/Portfolio projects/Google Data Analytics Capstone/Data/
[4/84]
Downloading 202007-divvy-tripdata.zip ...
  Extracting 202007-divvy-tripdata.zip -> C:/Users/anbun/Desktop/Portfolio projects/Google Data Analytics Capstone/Data/
[5/84]
Downloading 202008-divvy-tripdata.zip ...
  Extracting 202008-divvy-tripdata.zip -> C:/Users/anbun/Desktop/Portfolio projects/Google Data Analytics Capstone/Data/
[6/84]
Downloading 202009-divvy-tripdata.zip ...
  Extracting 202009-divvy-tripdata.zip -> C:/Use