In [None]:
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import earthaccess
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import haversine_distances
import sys
sys.path.append("/home/jovyan/PACEfun")
import helper
from dask.distributed import Client
import importlib, helper; importlib.reload(helper)

# TODO: INSTEAD OF LOADING EACH REGION SEPARATELY, LOAD THE DATA ONCE FOR ALL REGIONS, THEN COLLOCATE AND SELECT



<module 'helper' from '/home/jovyan/PACEfun/helper.py'>

In [None]:
# Create a Dask distributed client; later cells submit work through client.map / client.gather.
client = Client()
# Authenticate with NASA Earthdata.
# NOTE(review): the first positional parameter of earthaccess.login is `strategy`
# (documented values include "all", "interactive", "netrc", "environment"), so
# "login.netrc" may not be interpreted as a path to a netrc file — confirm.
auth = earthaccess.login("login.netrc")




In [16]:
# --- TIME ---
# Start / end of the study period; handed to the granule search as `temporal`.
tspan = ("2024-03-01 00:00", "2025-09-01 00:00")

# --- REGION ---
# Plant table; the columns read below are Name, Country, Latitude and Longitude.
df = pd.read_csv("/home/jovyan/PACEfun/balkan_plants.csv")

# haversine_distances expects (latitude, longitude) pairs in radians and
# returns pairwise great-circle distances as angles on the unit sphere.
coords = df[['Latitude', 'Longitude']].to_numpy()
coords_rad = np.radians(coords)
dist_matrix = haversine_distances(coords_rad)

# Linking distance for DBSCAN: 50 km converted to an angle in radians.
earth_radius_km = 6371
max_distance_km = 50
epsilon = max_distance_km / earth_radius_km

# The distance matrix is already computed, hence metric='precomputed'.
# With min_samples=1 a single plant can form its own cluster, so the noise
# label (-1) is not expected; it is still handled below as a safeguard.
db = DBSCAN(eps=epsilon, min_samples=1, metric='precomputed')
clusters = db.fit_predict(dist_matrix)

# Attach the cluster label of every plant to the table.
df['Cluster'] = clusters

# Arithmetic mean latitude / longitude per cluster (noise excluded).
cluster_means = (
    df.loc[df['Cluster'].ne(-1)]
    .groupby('Cluster')[['Latitude', 'Longitude']]
    .mean()
    .reset_index()
)

# Report every cluster (ascending label order) with its centroid and members.
print("Clustered Coal Plants:")
centroid_by_id = cluster_means.set_index('Cluster')
for cluster_id, cluster_df in df.groupby('Cluster'):
    if cluster_id == -1:
        print("\nNoise (unclustered):")
    else:
        print(f"\nCluster {cluster_id}:")
        if cluster_id in centroid_by_id.index:
            mean_lat = centroid_by_id.at[cluster_id, 'Latitude']
            mean_lon = centroid_by_id.at[cluster_id, 'Longitude']
            print(f"Mean Coordinates: Latitude = {mean_lat:.6f}, Longitude = {mean_lon:.6f}")
    members = zip(
        cluster_df['Name'],
        cluster_df['Country'],
        cluster_df['Latitude'],
        cluster_df['Longitude'],
    )
    for name, country, lat, lon in members:
        print(f"- {name} ({country}) at ({lat}, {lon})")


Clustered Coal Plants:

Cluster 0:
Mean Coordinates: Latitude = 40.401946, Longitude = 21.856229
- Agios Dimitrios power station (Greece) at (40.394185, 21.924886)
- Ptolemaïda power station (Greece) at (40.409707, 21.787573)

Cluster 1:
Mean Coordinates: Latitude = 44.575826, Longitude = 20.225978
- Nikola Tesla power station (Serbia) at (44.671341, 20.158418)
- Kolubara A power station (Serbia) at (44.480312, 20.293538)

Cluster 2:
Mean Coordinates: Latitude = 42.446471, Longitude = 23.055765
- Republika power station (Bulgaria) at (42.607124, 23.078699)
- Bobov Dol power station (Bulgaria) at (42.285818, 23.03283)

Cluster 3:
Mean Coordinates: Latitude = 42.684787, Longitude = 21.070962
- Kosovo A power station (Kosovo) at (42.676454, 21.085922)
- Kosovo B power station (Kosovo) at (42.69312, 21.056002)

Cluster 4:
Mean Coordinates: Latitude = 41.058320, Longitude = 21.484266
- Bitola power station (North Macedonia) at (41.05832, 21.484266)

Cluster 5:
Mean Coordinates: Latitude = 4

In [None]:
# Radius multipliers: each selection disc has radius `factor * grid resolution`.
factors = [2, 4, 6, 12]

# Accumulators filled by the per-cluster loop: one inner list per cluster,
# holding one value per entry of `factors`.
aod_mean, aod_std, ae_mean, ae_std = [], [], [], []
        

def area_plot(da, lon_res, lat_res, mid, factor=1):
    """Keep only the part of `da` lying inside a disc centred on `mid`.

    Parameters
    ----------
    da
        Object exposing `longitude` / `latitude` attributes and a
        `where(cond, drop=True)` method (an xarray DataArray in this notebook).
    lon_res
        Accepted for signature compatibility; the disc selection does not use
        it (only an earlier rectangular-box variant did).
    lat_res
        Grid spacing that scales the disc radius.
    mid
        Disc centre given as `(lon, lat)`.
    factor
        Radius multiplier: the radius is `factor * lat_res`, in coordinate
        units (approximately degrees here — assumed from the notebook context).

    Returns
    -------
    `da.where(inside, drop=True)`, where `inside` marks the points whose
    squared lat/lon offset from `mid` is strictly smaller than radius**2.
    """
    centre_lon = mid[0]
    centre_lat = mid[1]
    radius = factor * lat_res

    # Plain Euclidean distance in lat/lon space; no cos(latitude) scaling is applied.
    lat_offset = da.latitude - centre_lat
    lon_offset = da.longitude - centre_lon
    inside = (lat_offset**2 + lon_offset**2) < radius**2

    return da.where(inside, drop=True)

# Loop-invariant settings, hoisted out of the per-cluster loop.
ext = 1.0  # half-width of the search box around each cluster centre (added to lon/lat)
wv = 550   # wavelength passed to helper.get_wv_idx / helper.crs_template for "aot"


def _regrid_stack(paths, times, crs, shape, transform, var, wv_idx):
    """Regrid `var` from every granule onto one grid and stack the results along "time".

    Each path is handed to helper.grid_match through the Dask client; the
    gathered results are concatenated along a "time" dimension whose
    coordinate is then replaced by `times`.
    """
    futures = client.map(
        helper.grid_match,
        paths,
        dst_crs=crs,
        dst_shape=shape,
        dst_transform=transform,
        var=var,
        wv_idx=wv_idx,
    )
    stacked = xr.combine_nested(client.gather(futures), concat_dim="time")
    stacked["time"] = times
    return stacked


def _disc_stats(da, lon_res, lat_res, mid, factor):
    """Return (mean, std) of `da` inside the area_plot disc.

    The spatial mean / std is computed for each time step first and then
    averaged over time, so both results are scalars (0-d arrays).
    """
    disc = area_plot(da, lon_res, lat_res, mid, factor)
    spatial_dims = ["longitude", "latitude"]
    mean_val = disc.mean(dim=spatial_dims).mean(dim="time").values
    std_val = disc.std(dim=spatial_dims).mean(dim="time").values
    return mean_val, std_val


for _, row in cluster_means.iterrows():

    cluster_id = int(row['Cluster'])
    mean_lat = row['Latitude']
    mean_lon = row['Longitude']
    print(f"\nCluster {cluster_id}:")
    print(f"Mean Coordinates: Latitude = {mean_lat:.6f}, Longitude = {mean_lon:.6f}")

    # Centre as (lon, lat) and bounding box as (west, south, east, north).
    mid = (mean_lon, mean_lat)
    region = (mid[0] - ext, mid[1] - ext, mid[0] + ext, mid[1] + ext)

    results_spx = earthaccess.search_data(
        short_name="PACE_SPEXONE_L2_AER_RTAPLAND",
        cloud_hosted=True,
        bounding_box=region,
        temporal=tspan,
    )

    if not results_spx:
        # Without granules there is no template file to index below. Record NaNs
        # so the accumulator rows stay aligned with the rows of cluster_means.
        print(f"No granules found for cluster {cluster_id}; recording NaN statistics.")
        aod_mean.append([np.nan] * len(factors))
        aod_std.append([np.nan] * len(factors))
        ae_mean.append([np.nan] * len(factors))
        ae_std.append([np.nan] * len(factors))
        continue

    paths_spx = earthaccess.open(results_spx)

    # The first granule serves as the template for the wavelength index and the
    # CRS / grid definition; the template grid is then passed together with
    # `region` to helper.grid_aligned_subset (presumably cropping it to the
    # search box — confirm against helper.py).
    wv_idx = helper.get_wv_idx(paths_spx[0], wv)
    crs, shape_tmp, transform_tmp = helper.crs_template(paths_spx[0], "aot", wv)
    shape, transform, _ = helper.grid_aligned_subset(region, transform_tmp, shape_tmp)

    # Open the granules once: the resulting "time" coordinate is shared by both
    # variables, so there is no need to read the file set a second time.
    attrs = xr.open_mfdataset(
        paths_spx,
        preprocess=helper.time_from_attr,
        combine="nested",
        concat_dim="time",
    )

    da_spx_aod = _regrid_stack(
        paths_spx, attrs["time"], crs, shape, transform, "aot", wv_idx
    )
    da_spx_ae = _regrid_stack(
        paths_spx, attrs["time"], crs, shape, transform, "angstrom_440_670", None
    )

    # Original naming convention kept on purpose.
    # NOTE(review): if `transform` follows the usual affine coefficient order,
    # index 0 is the x (longitude) pixel size and index 4 the y (latitude)
    # pixel size, so these names look swapped. area_plot only uses lat_res, so
    # this is harmless when pixels are square — confirm.
    lat_res = np.abs(transform[0])
    lon_res = np.abs(transform[4])

    aod_mean_f, aod_std_f = [], []
    ae_mean_f, ae_std_f = [], []

    for f in factors:
        aod_m, aod_s = _disc_stats(da_spx_aod, lon_res, lat_res, mid, f)
        aod_mean_f.append(aod_m)
        aod_std_f.append(aod_s)

        ae_m, ae_s = _disc_stats(da_spx_ae, lon_res, lat_res, mid, f)
        ae_mean_f.append(ae_m)
        ae_std_f.append(ae_s)

    aod_mean.append(aod_mean_f)
    aod_std.append(aod_std_f)
    ae_mean.append(ae_mean_f)
    ae_std.append(ae_std_f)


Cluster 0:
Mean Coordinates: Latitude = 40.401946, Longitude = 21.856229


QUEUEING TASKS | :   0%|          | 0/87 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/87 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/87 [00:00<?, ?it/s]

  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,



Cluster 1:
Mean Coordinates: Latitude = 44.575826, Longitude = 20.225978


QUEUEING TASKS | :   0%|          | 0/90 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/90 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/90 [00:00<?, ?it/s]



KeyboardInterrupt: 

2025-09-29 19:43:40,024 - distributed.nanny - ERROR - Worker process died unexpectedly
Process Dask Worker process (from Nanny):
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.11/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.11/asyncio/base_events.py", line 654, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "/srv/conda/envs/notebook/lib/python3.11/site-packages/distributed/nanny.py", line 985, in run
    await worker.finished()
  File "/srv/conda/envs/notebook/lib/python3.11/site-packages/distributed/core.py", line 494, in finished
    await self._event_finished.wait()
  File "/srv/conda/envs/notebook/lib/python3.11/asyncio/locks.py", line 213, in wait
    await fut
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most rece

In [23]:
# Display the accumulated mean AOD: one inner list per processed cluster, one value per radius factor.
aod_mean

[[array(0.14319861, dtype=float32),
  array(0.14049861, dtype=float32),
  array(0.14388081, dtype=float32),
  array(0.14016625, dtype=float32)]]

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Plot the statistics collected by the per-cluster loop. That loop fills
# `aod_mean` / `aod_std` (indexed [cluster][factor]); the previously used names
# `aod_cluster_means` / `aod_cluster_stds` are not defined anywhere in this
# notebook. A separate index name is used so the DBSCAN label array `clusters`
# is not overwritten.
cluster_idx = list(range(len(aod_mean)))

fig, ax = plt.subplots(figsize=(10, 6))

# One error-bar series per radius factor, with clusters along the x axis.
for f_idx, f in enumerate(factors):
    means = [aod_mean[c][f_idx] for c in cluster_idx]
    stds = [aod_std[c][f_idx] for c in cluster_idx]

    ax.errorbar(
        cluster_idx,
        means,
        yerr=stds,
        marker="o",
        capsize=5,
        label=f"factor={f}",
    )

ax.set_xticks(cluster_idx)
ax.set_xticklabels([f"Cluster {c}" for c in cluster_idx])
ax.set_xlabel("Cluster")
ax.set_ylabel("AOD (mean ± std)")
ax.set_title("AOD mean per cluster for each factor")
ax.legend()
ax.grid()
plt.show()