In [6]:
import os
import requests

# Folder that receives the downloaded WorldPop rasters (created when absent)
output_dir = r"C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\Population\WorldPop_SouthAsia"
os.makedirs(output_dir, exist_ok=True)

# South Asia countries to fetch: ISO3 code -> country label.
# Only the keys are iterated by the download loop; labels use "_" instead of spaces.
south_asia_iso3 = dict(
    AFG='Afghanistan',
    BGD='Bangladesh',
    BTN='Bhutan',
    IND='India',
    MDV='Maldives',
    NPL='Nepal',
    PAK='Pakistan',
    LKA='Sri_Lanka',
)

# URL base by year and structure
def build_url(year, iso3):
    """Build the download location of one WorldPop 1 km UN-adjusted raster.

    Args:
        year: Integer year. Values above 2020 use the
            ``Global_2021_2022_1km_UNadj/unconstrained`` tree; all other values
            use the ``Global_2000_2020_1km_UNadj`` tree.
        iso3: Country code. It is placed in the URL folder exactly as given and
            lower-cased for the file name.

    Returns:
        A ``(url, filename)`` tuple, where ``url`` is the folder URL followed by
        ``filename``.
    """
    code = iso3.lower()

    if year > 2020:
        folder = f"https://data.worldpop.org/GIS/Population/Global_2021_2022_1km_UNadj/unconstrained/{year}/{iso3}/"
        name = f"{code}_ppp_{year}_1km_UNadj.tif"
    else:
        folder = f"https://data.worldpop.org/GIS/Population/Global_2000_2020_1km_UNadj/{year}/{iso3}/"
        name = f"{code}_ppp_{year}_1km_Aggregated_UNadj.tif"

    full_url = folder + name
    return full_url, name

# Download loop
# Fetch one raster per (year, country). Files already on disk are skipped, so the
# cell can be re-run to resume after a failure.
for year in range(2010, 2023):
    for iso3 in south_asia_iso3:
        url, filename = build_url(year, iso3)
        save_path = os.path.join(output_dir, f"{iso3}_{year}.tif")

        if os.path.exists(save_path):
            print(f"✔️ Already exists: {filename}")
            continue

        print(f"⬇️ Downloading {filename} from {url}...")

        # Stream into a ".part" file and rename only once the transfer has
        # finished: an interrupted download must never leave a truncated .tif
        # behind, because the exists-check above would skip it on the next run.
        part_path = save_path + ".part"
        try:
            # (connect, read) timeouts so a stalled server cannot hang the cell;
            # the context manager releases the connection in every case.
            with requests.get(url, stream=True, timeout=(10, 120)) as r:
                if r.status_code != 200:
                    print(f"❌ Failed to download {filename} (Status code: {r.status_code})")
                    continue
                with open(part_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(part_path, save_path)
            print(f"✅ Saved to: {save_path}")
        except (requests.RequestException, OSError) as exc:
            # Report the failure and carry on with the remaining files
            # instead of aborting the whole batch.
            if os.path.exists(part_path):
                os.remove(part_path)
            print(f"❌ Failed to download {filename} ({exc})")



✔️ Already exists: afg_ppp_2010_1km_Aggregated_UNadj.tif
✔️ Already exists: bgd_ppp_2010_1km_Aggregated_UNadj.tif
✔️ Already exists: btn_ppp_2010_1km_Aggregated_UNadj.tif
✔️ Already exists: ind_ppp_2010_1km_Aggregated_UNadj.tif
✔️ Already exists: mdv_ppp_2010_1km_Aggregated_UNadj.tif
✔️ Already exists: npl_ppp_2010_1km_Aggregated_UNadj.tif
✔️ Already exists: pak_ppp_2010_1km_Aggregated_UNadj.tif
✔️ Already exists: lka_ppp_2010_1km_Aggregated_UNadj.tif
✔️ Already exists: afg_ppp_2011_1km_Aggregated_UNadj.tif
✔️ Already exists: bgd_ppp_2011_1km_Aggregated_UNadj.tif
✔️ Already exists: btn_ppp_2011_1km_Aggregated_UNadj.tif
✔️ Already exists: ind_ppp_2011_1km_Aggregated_UNadj.tif
✔️ Already exists: mdv_ppp_2011_1km_Aggregated_UNadj.tif
✔️ Already exists: npl_ppp_2011_1km_Aggregated_UNadj.tif
✔️ Already exists: pak_ppp_2011_1km_Aggregated_UNadj.tif
✔️ Already exists: lka_ppp_2011_1km_Aggregated_UNadj.tif
✔️ Already exists: afg_ppp_2012_1km_Aggregated_UNadj.tif
✔️ Already exists: bgd_ppp_2012

In [None]:
# The PM2.5 data cannot be downloaded with a script; it is hosted on Box.
# https://sites.wustl.edu/acag/datasets/surface-pm2-5/#V5.GL.05.02
# https://wustl.app.box.com/v/ACAG-V5GL0502-GWRPM25/folder/293382100161
# Download the files manually.

In [10]:
from netCDF4 import Dataset

# Path to the NetCDF file to inspect
nc_path = r"C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\PM25\V5GL0502\V5GL0502.HybridPM25.Asia.201001-201012.nc"

# Open read-only inside a context manager so the file handle is released
# as soon as the inspection is done (the original left it open).
with Dataset(nc_path, mode='r') as ds:
    # List all variables
    print("Variables in file:", list(ds.variables.keys()))

Variables in file: ['lon', 'lat', 'GWRPM25']


In [2]:
import os
import numpy as np
import rasterio
from netCDF4 import Dataset
from rasterio.transform import from_bounds
from rasterio.merge import merge
from rasterio.crs import CRS

# Years to process
years = [str(y) for y in range(2010, 2023)]

# PM2.5 paths
pm25_nc_dir = r"C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\PM25\V5GL0502"
pm25_tif_dir = os.path.join(pm25_nc_dir, "tifs")
os.makedirs(pm25_tif_dir, exist_ok=True)

# Population paths
pop_dir = r"C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\Population\WorldPop_SouthAsia"
pop_tif_dir = os.path.join(pop_dir, "tifs")
os.makedirs(pop_tif_dir, exist_ok=True)

# ISO3 list for South Asia
iso3_list = ['AFG', 'BGD', 'BTN', 'IND', 'MDV', 'NPL', 'PAK', 'LKA']

# Value stored in cells without a PM2.5 estimate; also declared as the GeoTIFF nodata
PM25_NODATA = -999

# Loop over years: each iteration writes one PM2.5 GeoTIFF and one population mosaic
for year in years:
    print(f"\n🔄 Processing year {year}...")

    # === PM2.5 NetCDF to TIFF ===
    nc_file = os.path.join(pm25_nc_dir, f"V5GL0502.HybridPM25.Global.{year}01-{year}12.nc")
    # The context manager closes the NetCDF handle even if a read fails
    with Dataset(nc_file) as ds:
        pm25_data = ds.variables['GWRPM25'][:, :]
        lats = np.asarray(ds.variables['lat'][:], dtype=np.float64)
        lons = np.asarray(ds.variables['lon'][:], dtype=np.float64)

    # netCDF4 can return a masked array; write the nodata value into masked cells
    # explicitly so the pixels on disk agree with the nodata tag set below.
    # (No-op when the variable is a plain ndarray.)
    pm25_data = np.ma.filled(pm25_data, PM25_NODATA)

    # A north-up GeoTIFF stores the northernmost row first and the westernmost
    # column first; reorder the grid to match whatever order the file uses.
    if lats[0] < lats[-1]:
        pm25_data = pm25_data[::-1, :]
    if lons[0] > lons[-1]:
        pm25_data = pm25_data[:, ::-1]
    # The flips above produce strided views; hand rasterio a contiguous buffer
    pm25_data = np.ascontiguousarray(pm25_data)

    height, width = pm25_data.shape

    # NOTE(review): lat/lon are assumed to be cell-centre coordinates on a regular
    # grid (the usual NetCDF convention) — confirm against the dataset docs.
    # from_bounds() expects the OUTER edges, so pad each side by half a cell.
    x_res = (lons.max() - lons.min()) / (width - 1)
    y_res = (lats.max() - lats.min()) / (height - 1)
    # Argument order is (west, south, east, north, width, height)
    transform = from_bounds(
        lons.min() - x_res / 2,
        lats.min() - y_res / 2,
        lons.max() + x_res / 2,
        lats.max() + y_res / 2,
        width,
        height,
    )

    pm25_output_path = os.path.join(pm25_tif_dir, f"pm25_{year}.tif")
    with rasterio.open(
        pm25_output_path,
        'w',
        driver='GTiff',
        height=height,
        width=width,
        count=1,
        dtype=pm25_data.dtype,
        crs=CRS.from_epsg(4326),
        transform=transform,
        nodata=PM25_NODATA
    ) as dst:
        dst.write(pm25_data, 1)

    print(f"✅ PM2.5 saved to: {pm25_output_path}")

    # === Population Mosaic to TIFF ===
    src_files = []
    try:
        # Open every country raster available for this year
        for iso3 in iso3_list:
            pop_file = os.path.join(pop_dir, f"{iso3}_{year}.tif")
            if os.path.exists(pop_file):
                src_files.append(rasterio.open(pop_file))
            else:
                print(f"⚠️ Missing: {pop_file}")

        if src_files:
            # Cells not covered by any input raster are written as 0 (the output nodata)
            mosaic, out_transform = merge(src_files, nodata=0)
            out_meta = src_files[0].meta.copy()
            out_meta.update({
                "driver": "GTiff",
                "height": mosaic.shape[1],
                "width": mosaic.shape[2],
                "transform": out_transform,
                "nodata": 0,
                "crs": CRS.from_epsg(4326)
            })

            pop_output_path = os.path.join(pop_tif_dir, f"pop_south_asia_{year}.tif")
            with rasterio.open(pop_output_path, "w", **out_meta) as dest:
                dest.write(mosaic)

            print(f"✅ Population mosaic saved to: {pop_output_path}")
        else:
            print(f"❌ No population data found for {year}")
    finally:
        # Release the input rasters even when merging or writing fails
        for src in src_files:
            src.close()
  



🔄 Processing year 2010...
✅ PM2.5 saved to: C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\PM25\V5GL0502\tifs\pm25_2010.tif
✅ Population mosaic saved to: C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\Population\WorldPop_SouthAsia\tifs\pop_south_asia_2010.tif

🔄 Processing year 2011...
✅ PM2.5 saved to: C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\PM25\V5GL0502\tifs\pm25_2011.tif
✅ Population mosaic saved to: C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\Population\WorldPop_SouthAsia\tifs\pop_south_asia_2011.tif

🔄 Processing year 2012...
✅ PM2.5 saved to: C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\PM25\V5GL0502\tifs\pm25_2012.tif
✅ Population mosaic saved to: C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\Population\WorldPop_SouthAsia\tifs\pop_south_asia_2012.tif

🔄 Processing year 2013...
✅ PM2.5 saved to: C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\PM25\V5GL0502\tifs\pm25_2013.tif
✅ Population mosaic saved to: C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data

In [1]:
import os
import numpy as np
import rasterio
from rasterio.warp import reproject, Resampling
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# Paths
pop_dir = r"C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\Population\WorldPop_SouthAsia\tifs"
pm25_dir = r"C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\PM25\V5GL0502\tifs"
shapefile_path = r"C:\Users\vgald\OneDrive\Desktop\SAR_DATA\1. Data\Shapefile\WB_GAD\WB_GAD_ADM2_SAR_Clean.shp"
output_dir = r"C:\Users\vgald\OneDrive\Desktop\SAR_DATA\3. Output\PM25"
os.makedirs(output_dir, exist_ok=True)

# A cell counts as "exposed" when its PM2.5 value is strictly above this level.
# NOTE(review): presumably the WHO annual guideline in µg/m³ — confirm the source.
PM25_THRESHOLD = 5

# Load ADM2 shapefile
adm2 = gpd.read_file(shapefile_path).to_crs("EPSG:4326")

# Attribute columns (plus geometry) carried through the spatial join
adm2_join_cols = ['globalid', 'L0_CODE', 'L0_NAME', 'L1_CODE', 'L1_NAME', 'L2_CODE', 'L2_NAME',
                  'wb_status', 'sovereign', 'Disputed', 'geometry']

# Years to process
years = [str(y) for y in range(2010, 2023)]

# Each iteration writes two CSVs: one row per shapefile feature ("full") and
# one row per L2_CODE ("aggregated").
for year in years:
    print(f"\n🔄 Processing year {year}...")

    # Load population; nodata cells become 0 so the "> 0" filter below drops them
    pop_path = os.path.join(pop_dir, f"pop_south_asia_{year}.tif")
    with rasterio.open(pop_path) as pop_src:
        pop_data = pop_src.read(1, masked=True).filled(0)
        pop_transform = pop_src.transform
        pop_crs = pop_src.crs

    # Resample PM2.5 onto the population grid. Passing the band (not an array)
    # lets GDAL read only what it needs instead of loading the whole raster.
    # Destination cells with no valid PM2.5 are set to NaN explicitly; without
    # dst_nodata they would hold an ordinary number, slip past the isnan filter
    # and be counted as unexposed population.
    pm_path = os.path.join(pm25_dir, f"pm25_{year}.tif")
    pm_resampled = np.full(pop_data.shape, np.nan, dtype=np.float32)
    with rasterio.open(pm_path) as pm_src:
        reproject(
            source=rasterio.band(pm_src, 1),
            destination=pm_resampled,
            src_nodata=pm_src.nodata,
            dst_transform=pop_transform,
            dst_crs=pop_crs,
            dst_nodata=np.nan,
            resampling=Resampling.bilinear
        )

    # Populated cells that also have a PM2.5 estimate
    valid_mask = (pop_data > 0) & (~np.isnan(pm_resampled))
    rows, cols = np.nonzero(valid_mask)

    # float64 keeps the per-district sums from accumulating float32 rounding error
    pop_vals = pop_data[rows, cols].astype(np.float64)
    exp_vals = np.where(pm_resampled[rows, cols] > PM25_THRESHOLD, pop_vals, 0.0)

    # Cell-centre coordinates of the valid cells only (no full-grid meshgrid).
    # Assumes an axis-aligned transform, as the original coordinate maths did.
    x = pop_transform.c + (cols + 0.5) * pop_transform.a
    y = pop_transform.f + (rows + 0.5) * pop_transform.e

    # Build GeoDataFrame with one point per valid cell
    gdf = gpd.GeoDataFrame({
        'pop': pop_vals,
        'exposed_pop': exp_vals
    }, geometry=gpd.points_from_xy(x, y), crs="EPSG:4326")

    # Spatial join with ADM2 zones
    joined = gpd.sjoin(gdf, adm2[adm2_join_cols], how='inner', predicate='intersects')

    # --- Aggregate by ADM2 unit ---
    grouped = joined.groupby('globalid').agg({
        'pop': 'sum',
        'exposed_pop': 'sum'
    }).reset_index().rename(columns={'pop': 'total_pop'})
    grouped['percent_exposed'] = 100 * grouped['exposed_pop'] / grouped['total_pop']

    # Merge with ADM2 shapefile and save full dataset. year/geo_level are set
    # after the left merge so units without any populated cell are labelled too
    # (they used to come out as NaN).
    merged_df = adm2.merge(grouped, on='globalid', how='left').drop(columns='geometry', errors='ignore')
    merged_df['year'] = year
    merged_df['geo_level'] = 2
    output_csv_full = os.path.join(output_dir, f"pm25_exposure_by_admin2_full_{year}.csv")
    merged_df.to_csv(output_csv_full, index=False)
    print(f"✅ Full dataset saved to {output_csv_full}")

    # --- Aggregated by L2_CODE ---
    agg_df = merged_df.groupby('L2_CODE').agg({
        'L0_CODE': 'first',
        'L0_NAME': 'first',
        'L1_CODE': 'first',
        'L1_NAME': 'first',
        'L2_NAME': 'first',
        'total_pop': 'sum',
        'exposed_pop': 'sum',
        'percent_exposed': 'mean',  # placeholder to keep the column position; replaced below
        'geo_level': 'first',
        'wb_status': 'first',
        'sovereign': 'first',
        'Disputed': 'first',
        'year': 'first'
    }).reset_index()
    # An unweighted mean of per-feature percentages ignores how many people live
    # in each feature; derive the share from the summed populations instead.
    # Groups with zero total population give 0/0 -> NaN.
    agg_df['percent_exposed'] = 100 * agg_df['exposed_pop'] / agg_df['total_pop']

    output_csv_agg = os.path.join(output_dir, f"pm25_exposure_by_admin2_aggregated_{year}.csv")
    agg_df.to_csv(output_csv_agg, index=False)
    print(f"✅ Aggregated dataset saved to {output_csv_agg}")




🔄 Processing year 2010...
✅ Full dataset saved to C:\Users\vgald\OneDrive\Desktop\SAR_DATA\3. Output\PM25\pm25_exposure_by_admin2_full_2010.csv
✅ Aggregated dataset saved to C:\Users\vgald\OneDrive\Desktop\SAR_DATA\3. Output\PM25\pm25_exposure_by_admin2_aggregated_2010.csv

🔄 Processing year 2011...
✅ Full dataset saved to C:\Users\vgald\OneDrive\Desktop\SAR_DATA\3. Output\PM25\pm25_exposure_by_admin2_full_2011.csv
✅ Aggregated dataset saved to C:\Users\vgald\OneDrive\Desktop\SAR_DATA\3. Output\PM25\pm25_exposure_by_admin2_aggregated_2011.csv

🔄 Processing year 2012...
✅ Full dataset saved to C:\Users\vgald\OneDrive\Desktop\SAR_DATA\3. Output\PM25\pm25_exposure_by_admin2_full_2012.csv
✅ Aggregated dataset saved to C:\Users\vgald\OneDrive\Desktop\SAR_DATA\3. Output\PM25\pm25_exposure_by_admin2_aggregated_2012.csv

🔄 Processing year 2013...
✅ Full dataset saved to C:\Users\vgald\OneDrive\Desktop\SAR_DATA\3. Output\PM25\pm25_exposure_by_admin2_full_2013.csv
✅ Aggregated dataset saved to 