In [5]:
import os
import time
import requests
import pandas as pd
from tqdm import tqdm

# ===================== CONFIG =====================

# Mapbox access token, read from the environment so it is never hard-coded.
# os.getenv returns None when the variable is unset, in which case the request
# URL built in fetch_image would carry the literal text "access_token=None".
MAPBOX_TOKEN = os.getenv("MAPBOX_TOKEN")  # set via environment variable
# Style identifier interpolated into the request path (".../styles/v1/{STYLE}/static/").
STYLE = "mapbox/satellite-v9"

# Requested size; sent together with SCALE as "{IMAGE_SIZE}@{SCALE}x".
# NOTE(review): with SCALE = 2 the returned image is presumably 448x448 pixels,
# not 224x224 -- confirm against the Mapbox Static Images API documentation.
IMAGE_SIZE = "224x224"   # CNN-friendly
ZOOM = 17                # good for neighborhood-level context
SCALE = 2                # high-resolution (retina)

# Seconds passed to time.sleep after every download attempt.
SLEEP_TIME = 0.15        # prevents hitting rate limits

# ==================================================


def fetch_image(lat, lon, save_path):
    """
    Fetch a satellite image for given latitude & longitude
    and save it to disk.

    Parameters
    ----------
    lat, lon
        Coordinates of the image centre; interpolated into the request URL
        as ``{lon},{lat}``.
    save_path
        Destination path for the raw response body.

    Returns
    -------
    bool
        True when the API answered with HTTP 200 and the file was written.
        False when the request itself failed (connection error, timeout, ...)
        or any other status code was returned.

    Raises
    ------
    OSError
        If the response body cannot be written to ``save_path``.
    """
    url = (
        f"https://api.mapbox.com/styles/v1/{STYLE}/static/"
        f"{lon},{lat},{ZOOM}/"
        f"{IMAGE_SIZE}@{SCALE}x"
        f"?access_token={MAPBOX_TOKEN}"
    )

    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # A single timeout or dropped connection must not abort a long batch;
        # report it as a failed download instead.
        return False

    if response.status_code != 200:
        return False

    # Write to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated file under the final name (callers treat
    # an existing file as "already downloaded").
    tmp_path = f"{save_path}.part"
    try:
        with open(tmp_path, "wb") as f:
            f.write(response.content)
        os.replace(tmp_path, save_path)
    finally:
        # Only still present when the write or the rename did not complete.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return True


def download_images(csv_or_excel_path, output_dir, id_col="id"):
    """
    Download satellite images for all rows in a dataset.

    Parameters
    ----------
    csv_or_excel_path
        Path to the dataset. Paths ending in ".csv" (any letter case) are read
        with ``pd.read_csv``; everything else is read with ``pd.read_excel``.
    output_dir
        Directory that receives one ``{id}.png`` file per row; created if
        it does not exist.
    id_col
        Name of the column whose value is used as the image file name.

    Rows whose image file already exists are skipped, so an interrupted run
    can be resumed. A summary of successful, failed and skipped rows is
    printed at the end; nothing is returned.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Load dataset (str() so pathlib.Path inputs work as well)
    if str(csv_or_excel_path).lower().endswith(".csv"):
        df = pd.read_csv(csv_or_excel_path)
    else:
        df = pd.read_excel(csv_or_excel_path)

    assert "lat" in df.columns and "long" in df.columns, \
        "Dataset must contain 'lat' and 'long' columns"

    success, failed, skipped = 0, 0, 0

    # Iterate over the columns directly rather than df.iterrows(): iterrows
    # converts each row to a single dtype, which turns integer ids into floats
    # on all-numeric datasets and yields file names such as "123.0.png".
    rows = zip(df[id_col], df["lat"], df["long"])

    for prop_id, lat, lon in tqdm(rows, total=len(df)):
        save_path = os.path.join(output_dir, f"{prop_id}.png")

        # Skip already downloaded images
        if os.path.exists(save_path):
            skipped += 1
            continue

        if fetch_image(lat, lon, save_path):
            success += 1
        else:
            failed += 1

        time.sleep(SLEEP_TIME)

    print("\nDownload Summary")
    print("----------------")
    print(f"Successful: {success}")
    print(f"Failed:     {failed}")
    print(f"Skipped:    {skipped}")


if __name__ == "__main__":
    # Example usage: fetch imagery for the training split first, then the test split.
    example_runs = (
        ("data/train.csv", "data/images/train"),
        ("data/test.csv", "data/images/test"),
    )
    for dataset_path, image_dir in example_runs:
        download_images(
            csv_or_excel_path=dataset_path,
            output_dir=image_dir,
            id_col="id",
        )


100%|██████████| 16209/16209 [00:10<00:00, 1522.27it/s]



Download Summary
----------------
Successful: 0
Failed:     0


100%|██████████| 5404/5404 [1:28:24<00:00,  1.02it/s]


Download Summary
----------------
Successful: 4071
Failed:     0





Check that all images are present, and re-download any that are missing or too small to be valid

In [2]:
import os
import time
import requests
import pandas as pd
from tqdm import tqdm

# ===================== CONFIG =====================

# Mapbox access token read from the environment; os.getenv returns None when
# the variable is unset.
MAPBOX_TOKEN = os.getenv("MAPBOX_TOKEN")
# Style identifier interpolated into the request path built in fetch_image.
STYLE = "mapbox/satellite-v9"

# Image request parameters; sent as "{lon},{lat},{ZOOM}/{IMAGE_SIZE}@{SCALE}x".
# NOTE(review): with SCALE = 2 the returned image is presumably 448x448 pixels,
# not 224x224 -- confirm against the Mapbox Static Images API documentation.
IMAGE_SIZE = "224x224"
ZOOM = 17
SCALE = 2

# Seconds passed to time.sleep after every re-download attempt.
SLEEP_TIME = 0.15
# Files smaller than this are treated as invalid by is_image_valid.
MIN_FILE_SIZE = 10_000  # bytes, filters broken downloads

# ==================================================


def fetch_image(lat, lon, save_path):
    """
    Best-effort download of one satellite image to ``save_path``.

    Parameters
    ----------
    lat, lon
        Coordinates of the image centre; interpolated into the request URL
        as ``{lon},{lat}``.
    save_path
        Destination path for the raw response body.

    Returns
    -------
    bool
        True when the API answered with HTTP 200 and the file was written.
        False on a network error or timeout, on any other status code, or
        when the file could not be written.

    Only network errors (``requests.RequestException``) and file-system errors
    (``OSError``) are turned into a False result; any other exception is a
    programming error and is allowed to propagate.
    """
    url = (
        f"https://api.mapbox.com/styles/v1/{STYLE}/static/"
        f"{lon},{lat},{ZOOM}/"
        f"{IMAGE_SIZE}@{SCALE}x"
        f"?access_token={MAPBOX_TOKEN}"
    )

    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return False

    if response.status_code != 200:
        return False

    # Write to a sibling temp file and rename it into place so that a failed
    # write never leaves a truncated file under the final name.
    tmp_path = f"{save_path}.part"
    try:
        with open(tmp_path, "wb") as f:
            f.write(response.content)
        os.replace(tmp_path, save_path)
    except OSError:
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # the temp file was never created or is already gone
        return False

    return True


def is_image_valid(path):
    """Return True when ``path`` exists and holds at least ``MIN_FILE_SIZE`` bytes."""
    # Size is the only signal inspected here; the file contents are never decoded.
    return os.path.exists(path) and os.path.getsize(path) >= MIN_FILE_SIZE


def verify_and_redownload(data_path, image_dir, id_col="id"):
    """
    Check that every row of a dataset has a valid image and re-download the rest.

    Parameters
    ----------
    data_path
        Path to the dataset. Paths ending in ".csv" (any letter case) are read
        with ``pd.read_csv``; everything else is read with ``pd.read_excel``.
    image_dir
        Directory expected to hold one ``{id}.png`` file per row; created if
        it does not exist.
    id_col
        Name of the column whose value is used as the image file name.

    Every image that ``is_image_valid`` rejects is fetched again with
    ``fetch_image`` using the row's "lat" and "long" values. Progress and
    failures are printed; nothing is returned.
    """
    # str() so pathlib.Path inputs work as well
    if str(data_path).lower().endswith(".csv"):
        df = pd.read_csv(data_path)
    else:
        df = pd.read_excel(data_path)

    os.makedirs(image_dir, exist_ok=True)

    # Iterate over the id column directly rather than df.iterrows(): iterrows
    # converts each row to a single dtype, which turns integer ids into floats
    # on all-numeric datasets and makes the lookup use names like "123.0.png".
    missing_positions = [
        pos
        for pos, prop_id in enumerate(tqdm(df[id_col], total=len(df)))
        if not is_image_valid(os.path.join(image_dir, f"{prop_id}.png"))
    ]

    print(f"\nMissing or corrupted images: {len(missing_positions)}")

    # Coordinates are only looked up when something actually has to be fetched.
    if missing_positions:
        todo = df.iloc[missing_positions]
        jobs = list(zip(todo[id_col], todo["lat"], todo["long"]))
    else:
        jobs = []

    still_bad = 0

    for prop_id, lat, lon in tqdm(jobs, desc="Re-downloading"):
        save_path = os.path.join(image_dir, f"{prop_id}.png")

        ok = fetch_image(lat, lon, save_path)
        time.sleep(SLEEP_TIME)

        # A successful response can still carry an undersized body, so the
        # freshly written file is validated again before it counts as fixed.
        if not (ok and is_image_valid(save_path)):
            still_bad += 1
            print(f"Failed: {prop_id}")

    if jobs:
        print(f"\nStill missing or corrupted after re-download: {still_bad}")

    print("\nVerification complete.")


if __name__ == "__main__":
    # Verify the training split first, then the test split.
    verification_runs = (
        ("data/train.csv", "data/images/train"),
        ("data/test.csv", "data/images/test"),
    )
    for dataset_path, split_image_dir in verification_runs:
        verify_and_redownload(
            data_path=dataset_path,
            image_dir=split_image_dir,
            id_col="id",
        )


100%|██████████| 16209/16209 [00:16<00:00, 1012.50it/s]



Missing or corrupted images: 0


Re-downloading: 0it [00:00, ?it/s]



Verification complete.


100%|██████████| 5404/5404 [00:05<00:00, 1002.71it/s]



Missing or corrupted images: 0


Re-downloading: 0it [00:00, ?it/s]


Verification complete.



