# Nighttime Light Remote Sensing Data Download (Fixed Version)

## Key Fixes:
1. Generate unique `event_key` at download time: `{county}_{startYYYYMMDDHHMM}_{endYYYYMMDDHHMM}`
2. Apply scale factor (0.1) to NTL band
3. **Snow pixels are KEPT** (not masked), but tracked in `snow_px` for filtering in EDA
4. Simplified file naming using event_key

In [1]:
import os
import re
import time
from datetime import datetime, timedelta
from pathlib import Path

import ee
import pandas as pd



# 0) Initialize GEE

In [2]:
# Cloud project used for Earth Engine API calls. Set the EE_PROJECT environment
# variable to run this notebook against a different project; the default is the
# project ID the notebook was originally written with.
EE_PROJECT = os.environ.get("EE_PROJECT", "deductive-tempo-485113-n8")
ee.Initialize(project=EE_PROJECT)

# 1) Read outage CSV and generate unique event_key

In [3]:
CSV_PATH = "../data/processed/eaglei_outages_with_events_2014_2023_severeweather_top10_duration_dallas_harris_counties_texas_only.csv"
df = pd.read_csv(CSV_PATH, parse_dates=["start_time", "end_time"])

# Rows lacking either timestamp cannot form a usable key: strftime yields NaN
# for NaT, the concatenated key becomes NaN, and every such row would then be
# merged into a single one by drop_duplicates. Remove them up front and report
# how many were removed so the loss is visible.
n_rows_loaded = len(df)
df = df.dropna(subset=["start_time", "end_time"]).copy()
n_rows_without_time = n_rows_loaded - len(df)
if n_rows_without_time:
    print(f"Dropped {n_rows_without_time} row(s) with missing start_time/end_time")

# Generate unique event_key: county + start_time + end_time
# (timestamps rendered to the minute as YYYYMMDDHHMM)
df["event_key"] = (
    df["county"].astype(str) + "_" +
    df["start_time"].dt.strftime("%Y%m%d%H%M") + "_" +
    df["end_time"].dt.strftime("%Y%m%d%H%M")
)

# Deduplicate by event_key; the first row for each key is the one kept
df = df.drop_duplicates(subset=["event_key"]).reset_index(drop=True)

print("Data shape after deduplication:", df.shape)
print("\nUnique event_keys:")
display(df[["event_key", "county", "event_id", "start_time", "end_time"]])

Data shape after deduplication: (23, 15)

Unique event_keys:


Unnamed: 0,event_key,county,event_id,start_time,end_time
0,Dallas_201505290500_201505300645,Dallas,Texas-6,2015-05-29 05:00:00,2015-05-30 06:45:00
1,Dallas_201504190045_201504200030,Dallas,Texas-2,2015-04-19 00:45:00,2015-04-20 00:30:00
2,Dallas_201505271900_201505281030,Dallas,Texas-5,2015-05-27 19:00:00,2015-05-28 10:30:00
3,Dallas_201504250015_201504251530,Dallas,Texas-4,2015-04-25 00:15:00,2015-04-25 15:30:00
4,Dallas_201505270015_201505271345,Dallas,Texas-5,2015-05-27 00:15:00,2015-05-27 13:45:00
5,Harris_201708281215_201709011700,Harris,Texas-3,2017-08-28 12:15:00,2017-09-01 17:00:00
6,Harris_201805271415_201805290930,Harris,Texas-8,2018-05-27 14:15:00,2018-05-29 09:30:00
7,Harris_201811031330_201811041030,Harris,Texas-10,2018-11-03 13:30:00,2018-11-04 10:30:00
8,Harris_201801152200_201801161500,Harris,Texas-1,2018-01-15 22:00:00,2018-01-16 15:00:00
9,Harris_201801162215_201801171415,Harris,Texas-1,2018-01-16 22:15:00,2018-01-17 14:15:00


In [4]:
# Resume checkpoint: (event_key, date) pairs listed in a previously exported
# cloud summary are treated as finished and skipped by the export loop.
CLOUD_SUMMARY_PATH = Path("../data/raw/imgs/GEE_VNP46A2_outages_tables/cloud_fraction_summary_v2.csv")

done_set = set()
if not CLOUD_SUMMARY_PATH.exists():
    print("No existing cloud summary found. Will process all dates.")
else:
    df_done = pd.read_csv(CLOUD_SUMMARY_PATH, parse_dates=["date"])
    done_keys = df_done["event_key"].astype(str)
    done_days = df_done["date"].dt.date  # datetime.date, same type the loop builds
    done_set = set(zip(done_keys, done_days))
    print(f"Loaded done_set: {len(done_set)} (event_key, date) records")

No existing cloud summary found. Will process all dates.


# 2) Prepare ROI: TIGER Counties boundaries

In [5]:
# TIGER 2018 county features; get_county_geom filters these by STATEFP and NAME.
counties_fc = ee.FeatureCollection("TIGER/2018/Counties")

def get_county_geom(county_name: str, state_fips: str = "48"):
    """Return the geometry of one county from ``counties_fc``.

    Args:
        county_name: Value matched exactly against the feature's ``NAME`` property.
        state_fips: Value matched exactly against ``STATEFP``. Defaults to
            ``"48"`` (Texas), the only state this notebook previously supported.

    Returns:
        The geometry of the first feature matching both filters.

    Note:
        No check is made here that a matching feature exists; the lookup is
        only described client-side and resolved by Earth Engine later.
    """
    feat = counties_fc.filter(ee.Filter.eq("STATEFP", state_fips)) \
                      .filter(ee.Filter.eq("NAME", county_name)) \
                      .first()
    return ee.Feature(feat).geometry()

# Region of interest per county, keyed by the county name used in the outage CSV.
ROI_BY_COUNTY = {
    county_name: get_county_geom(county_name)
    for county_name in ("Dallas", "Harris")
}

# 3) VNP46A2 Dataset Configuration

In [6]:
# Image collection the daily images are pulled from, and the bands read from it.
VNP46A2 = ee.ImageCollection("NASA/VIIRS/002/VNP46A2")
NTL_BAND = "Gap_Filled_DNB_BRDF_Corrected_NTL"  # band that gets exported
CLOUD_BAND = "QF_Cloud_Mask"  # bit-packed QA band decoded in build_cloud_masks
SNOW_BAND = "Snow_Flag"  # compared against 1 in build_cloud_masks to flag snow

# FIX: Scale factor for NTL band
# NOTE(review): exported values are multiplied by this factor; confirm 0.1 against
# the band's documented scale in the Earth Engine catalog entry for this collection.
NTL_SCALE_FACTOR = 0.1

SCALE = 500  # passed as `scale` to reduceRegion and to the image export
MAX_PIXELS = 1e13  # passed as `maxPixels` to reduceRegion and to the image export

# 4) Export folder configuration

In [7]:
BASE_IMAGE_FOLDER = "GEE_VNP46A2_outages_v2"
# One image export folder per county, named "<base>_<county>".
IMAGE_FOLDER_BY_COUNTY = {
    county_name: f"{BASE_IMAGE_FOLDER}_{county_name}"
    for county_name in ("Dallas", "Harris")
}

# Folder and file prefix used for the cloud summary table export.
TABLE_FOLDER = "GEE_VNP46A2_outages_tables"
TABLE_NAME = "cloud_fraction_summary_v2"

# 5) Utility functions

In [8]:
def extract_bits(img: ee.Image, band: str, start_bit: int, end_bit: int) -> ee.Image:
    """Return the value held in bits ``start_bit``..``end_bit`` (inclusive) of ``band``.

    The band is shifted right by ``start_bit`` and then AND-ed with a mask of
    ``end_bit - start_bit + 1`` low bits, so the result starts at bit 0.
    """
    width = end_bit - start_bit + 1
    low_bits = (1 << width) - 1
    shifted = img.select(band).rightShift(start_bit)
    return shifted.bitwiseAnd(low_bits)


def build_cloud_masks(img: ee.Image):
    """
    Derive the pixel masks used for cloud statistics and for export.

    All flags are decoded from the CLOUD_BAND bit field; snow additionally
    consults SNOW_BAND. Snow does NOT restrict the export mask - it is
    returned separately so it can be counted and filtered later in EDA.

    Returns (in this order):
        base_valid: night pixels (bit 0 == 0) with cloud-mask quality
            (bits 4-5) of at least 2
        cloudy_mask: base_valid pixels whose cloud state (bits 6-7) is >= 2
        export_valid: base_valid pixels with cloud state <= 1 and with neither
            the shadow bit (8) nor the cirrus bit (9) set (SNOW KEPT)
        snow_mask: pixels with bit 10 set or with SNOW_BAND equal to 1
            (for tracking only)
    """
    def qa_bits(first_bit: int, last_bit: int) -> ee.Image:
        # Shorthand for reading a bit range out of the QA band of this image
        return extract_bits(img, CLOUD_BAND, first_bit, last_bit)

    # Night + acceptable cloud-mask quality
    is_night = qa_bits(0, 0).eq(0)
    quality_ok = qa_bits(4, 5).gte(2)
    base_valid = is_night.And(quality_ok)

    # Cloud detection result: <= 1 is treated as clear, >= 2 as cloudy
    detection = qa_bits(6, 7)
    cloudy_mask = base_valid.And(detection.gte(2))

    # Export mask: clear, no shadow, no cirrus - snow deliberately not excluded
    no_shadow = qa_bits(8, 8).eq(1).Not()
    no_cirrus = qa_bits(9, 9).eq(1).Not()
    export_valid = base_valid.And(detection.lte(1)).And(no_shadow).And(no_cirrus)

    # Snow/ice from either source (tracked, never used to mask the export)
    snow_mask = qa_bits(10, 10).eq(1).Or(img.select(SNOW_BAND).eq(1))

    return base_valid, cloudy_mask, export_valid, snow_mask


def cloud_fraction_and_counts(img: ee.Image, roi: ee.Geometry):
    """
    Count base-valid, cloudy and snow pixels inside ``roi`` and derive the cloud fraction.

    The fraction is cloudy_count divided by base_count, with the denominator
    floored at 1 so that a region without any base-valid pixel gives 0 instead
    of a division by zero. Snow is counted only where the pixel is also base-valid.

    Returns: (cloud_frac, base_count, cloudy_count, snow_count)
    """
    base_valid, cloudy_mask, _, snow_mask = build_cloud_masks(img)
    unit = ee.Image.constant(1).rename("ones")

    def count_pixels(mask: ee.Image):
        # Sum a constant-1 image over the unmasked pixels of the region
        totals = unit.updateMask(mask).reduceRegion(
            ee.Reducer.sum(), roi, SCALE, maxPixels=MAX_PIXELS, bestEffort=True
        )
        return totals.get("ones")

    base_count = count_pixels(base_valid)
    cloudy_count = count_pixels(cloudy_mask)
    snow_count = count_pixels(base_valid.And(snow_mask))

    denominator = ee.Number(base_count).max(1)
    frac = ee.Number(cloudy_count).divide(denominator)
    return frac, base_count, cloudy_count, snow_count


def sanitize(s: str, max_len: int = 140) -> str:
    """Sanitize string for safe file naming.

    Args:
        s: Value to clean; it is converted with ``str()`` first.
        max_len: Maximum length of the result. Defaults to 140, the limit that
            was previously hard-coded.

    Returns:
        ``s`` with every run of characters other than ASCII letters, digits,
        ``_`` and ``-`` replaced by a single ``_``, cut to ``max_len`` characters.
    """
    cleaned = re.sub(r"[^A-Za-z0-9_\-]+", "_", str(s))
    return cleaned[:max_len]


def date_range_days(start_dt: datetime, end_dt: datetime):
    """Yield every calendar date from ``start_dt``'s date through ``end_dt``'s date, inclusive.

    Times of day are ignored. Nothing is yielded when the end date precedes
    the start date.
    """
    first_day = start_dt.date()
    span_days = (end_dt.date() - first_day).days
    for offset in range(span_days + 1):
        yield first_day + timedelta(days=offset)

# 6) Preview: Check events and estimate exports

In [9]:
# Number of extra days downloaded before the outage start / after the outage end.
PRE_DAYS = 5
POST_DAYS = 5

preview_cols = ["event_key", "county", "event_id", "start_time", "end_time"]
print("Events after dedup:", len(df))
print(df[preview_cols].head(10))

# Estimate exports: calendar days touched by each event, then the buffered window.
# This mirrors the inclusive day range the export loop iterates over.
start_day = df["start_time"].dt.normalize()
end_day = df["end_time"].dt.normalize()
df["_n_days"] = (end_day - start_day).dt.days + 1
df["_n_days_ext"] = df["_n_days"] + PRE_DAYS + POST_DAYS

total_images = int(df["_n_days_ext"].sum())
print(f"\nEstimated total images (with +/-{PRE_DAYS} day buffer):", total_images)
print("\nSummary by county:")
county_summary = df.groupby("county")["_n_days_ext"].agg(["count", "sum", "mean", "max"])
print(county_summary)

Events after dedup: 23
                          event_key  county  event_id          start_time  \
0  Dallas_201505290500_201505300645  Dallas   Texas-6 2015-05-29 05:00:00   
1  Dallas_201504190045_201504200030  Dallas   Texas-2 2015-04-19 00:45:00   
2  Dallas_201505271900_201505281030  Dallas   Texas-5 2015-05-27 19:00:00   
3  Dallas_201504250015_201504251530  Dallas   Texas-4 2015-04-25 00:15:00   
4  Dallas_201505270015_201505271345  Dallas   Texas-5 2015-05-27 00:15:00   
5  Harris_201708281215_201709011700  Harris   Texas-3 2017-08-28 12:15:00   
6  Harris_201805271415_201805290930  Harris   Texas-8 2018-05-27 14:15:00   
7  Harris_201811031330_201811041030  Harris  Texas-10 2018-11-03 13:30:00   
8  Harris_201801152200_201801161500  Harris   Texas-1 2018-01-15 22:00:00   
9  Harris_201801162215_201801171415  Harris   Texas-1 2018-01-16 22:15:00   

             end_time  
0 2015-05-30 06:45:00  
1 2015-04-20 00:30:00  
2 2015-05-28 10:30:00  
3 2015-04-25 15:30:00  
4 2015-05

# 7) Create Export Tasks

In [10]:
# Builds one cloud-summary feature per (event, day) and starts one image export
# per day that has an image, then exports all summary features as a single CSV.
cloud_features = []
image_tasks = []

for _, row in df.iterrows():
    county = row["county"]
    event_key = row["event_key"]
    event_id = str(row["event_id"])
    st = row["start_time"].to_pydatetime()
    et = row["end_time"].to_pydatetime()

    # Extended window: PRE_DAYS before the start through POST_DAYS after the end
    extended_start = st - timedelta(days=PRE_DAYS)
    extended_end = et + timedelta(days=POST_DAYS)

    roi = ROI_BY_COUNTY[county]
    drive_folder = IMAGE_FOLDER_BY_COUNTY[county]

    for d in date_range_days(extended_start, extended_end):
        # Skip (event_key, date) pairs already present in the cloud summary
        key = (event_key, d)
        if key in done_set:
            continue

        d0 = datetime(d.year, d.month, d.day)
        d1 = d0 + timedelta(days=1)
        day_str = d0.strftime("%Y-%m-%d")

        ic_day = VNP46A2.filterDate(day_str, d1.strftime("%Y-%m-%d"))

        # Client-side availability check (one request per date)
        try:
            exists_client = ic_day.size().getInfo() > 0
        except Exception as exc:
            # Best effort: keep going as if the image exists, but do not hide
            # the failure - the export for this day may fail later.
            print(f"WARNING: availability check failed for {event_key} {day_str}: {exc!r}; assuming image exists")
            exists_client = True

        if exists_client:
            img = ee.Image(ic_day.first())

            # Cloud stats (server-side). The If() guard is kept for the case
            # where the client check failed and existence was only assumed.
            img_exists = ee.Number(ic_day.size()).gt(0)
            stats = cloud_fraction_and_counts(img, roi)
            frac = ee.Number(ee.Algorithms.If(img_exists, stats[0], -1))
            base_cnt = ee.Number(ee.Algorithms.If(img_exists, stats[1], 0))
            cloudy_cnt = ee.Number(ee.Algorithms.If(img_exists, stats[2], 0))
            snow_cnt = ee.Number(ee.Algorithms.If(img_exists, stats[3], 0))
        else:
            # Known to be missing: record the same placeholder values directly
            # instead of building a computation on a non-existent image
            # (ee.Algorithms.If may evaluate both of its alternatives).
            img = None
            img_exists = 0
            frac, base_cnt, cloudy_cnt, snow_cnt = -1, 0, 0, 0

        # Cloud summary feature (geometry-less; one row of the exported CSV)
        cloud_features.append(
            ee.Feature(None, {
                "event_key": event_key,
                "county": county,
                "event_id": event_id,
                "date": day_str,
                "img_exists": img_exists,
                "base_valid_px": base_cnt,
                "cloudy_px": cloudy_cnt,
                "snow_px": snow_cnt,
                "cloud_frac": frac
            })
        )

        # Image export
        if exists_client:
            _, _, export_valid, _ = build_cloud_masks(img)  # Snow is KEPT

            # Apply scale factor, mask out non-exportable pixels, cut to the county
            masked_ntl = (
                img.select(NTL_BAND)
                .multiply(NTL_SCALE_FACTOR)
                .updateMask(export_valid)
                .clip(roi)
            )

            # File naming with event_key
            file_prefix = sanitize(f"{event_key}_{d0.strftime('%Y%m%d')}_VNP46A2_ntl")

            task = ee.batch.Export.image.toDrive(
                image=masked_ntl,
                description=file_prefix,
                folder=drive_folder,
                fileNamePrefix=file_prefix,
                region=roi,
                scale=SCALE,
                maxPixels=MAX_PIXELS
            )
            task.start()
            image_tasks.append(task)
            print("Started:", file_prefix)

            # Brief pause between task submissions
            time.sleep(0.2)

        done_set.add(key)

# Export cloud summary table
cloud_fc = ee.FeatureCollection(cloud_features)
if cloud_features:
    table_task = ee.batch.Export.table.toDrive(
        collection=cloud_fc,
        description=TABLE_NAME,
        folder=TABLE_FOLDER,
        fileNamePrefix=TABLE_NAME,
        fileFormat="CSV"
    )
    table_task.start()
    print(f"\nStarted table export: {TABLE_NAME}")
else:
    # Everything was already in done_set: an export would carry no records
    table_task = None
    print("\nNo new (event_key, date) records; skipping table export.")
print(f"Total image tasks: {len(image_tasks)}")

Started: Dallas_201505290500_201505300645_20150524_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150525_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150526_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150527_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150528_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150529_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150530_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150531_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150601_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150602_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150603_VNP46A2_ntl
Started: Dallas_201505290500_201505300645_20150604_VNP46A2_ntl
Started: Dallas_201504190045_201504200030_20150414_VNP46A2_ntl
Started: Dallas_201504190045_201504200030_20150415_VNP46A2_ntl
Started: Dallas_201504190045_201504200030_20150416_VNP46A2_ntl
Started: Dallas_201504190045_201504200030_20150417_VNP4