In [7]:
import matplotlib.pyplot as plt
%matplotlib inline 

In [8]:
import dask.array as da
from dask.distributed import Client
import xarray as xr
import rioxarray
import numpy as np
from sklearn.cluster import MiniBatchKMeans
import planetary_computer as pc
import pystac_client

# Initialize Dask client.
# Reuse an already-running client when there is one (for example when this cell
# is re-run in the same kernel) rather than starting another local cluster each
# time. Client.current() raises ValueError when no client exists yet.
try:
    client = Client.current()
except ValueError:
    client = Client()

def get_satellite_data(bbox, date_range):
    """Fetch one Landsat scene and return its RGB bands as a stacked dask array.

    Searches the ``landsat-c2-l2`` collection of the Planetary Computer STAC
    API for items matching ``bbox`` and ``date_range``, takes the first item
    the search yields, signs the URLs of its ``red``, ``green`` and ``blue``
    assets, and opens each one with rioxarray.

    Parameters
    ----------
    bbox : sequence of float
        Bounding box, passed unchanged to the STAC search.
    date_range : str
        Datetime filter, passed unchanged to the STAC search.

    Returns
    -------
    dask.array.Array
        The three bands stacked along a new leading axis in red, green, blue
        order, each chunked 2048 x 2048 along ``x`` and ``y``.

    Raises
    ------
    ValueError
        If the search yields no items.
    """
    bands = ('red', 'green', 'blue')

    catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
    search = catalog.search(
        collections=["landsat-c2-l2"],
        bbox=bbox,
        datetime=date_range,
        limit=1,      # page size only; does not cap the number of results
        max_items=1,  # the actual cap: stop after the first matching item
    )

    # `items()` supersedes the deprecated `get_items()`; fall back to the old
    # name so older pystac-client releases keep working.
    fetch_items = getattr(search, "items", None) or search.get_items
    item = next(iter(fetch_items()), None)
    if item is None:
        raise ValueError(
            f"No landsat-c2-l2 items found for bbox={bbox!r}, datetime={date_range!r}"
        )

    # Sign the asset URLs of that single item.
    signed_hrefs = {band: pc.sign(item.assets[band].href) for band in bands}

    # Load and stack bands directly with optimal chunking
    stacked_data = da.stack([
        rioxarray.open_rasterio(signed_hrefs[band]).chunk({'x': 2048, 'y': 2048})
        for band in bands
    ])

    return stacked_data

def cluster_and_visualize(data, n_clusters=10):
    """Cluster the values of ``data`` that are greater than zero.

    This version performs no plotting. It is the definition in effect for the
    call at the bottom of this cell; a later cell redefines the function with
    histograms.

    Parameters
    ----------
    data : array-like
        Must support boolean-mask indexing whose result exposes ``.compute()``
        (presumably the dask array returned by ``get_satellite_data``).
    n_clusters : int, default 10
        Number of clusters given to ``MiniBatchKMeans``.

    Returns
    -------
    tuple
        ``(labels, model)``: the labels returned by ``fit_predict`` and the
        fitted ``MiniBatchKMeans`` instance.
    """
    # Keep only entries > 0 and materialise the selection.
    positive_values = data[data > 0].compute()

    # scikit-learn expects (n_samples, n_features): every selected value
    # becomes its own single-feature sample.
    samples = positive_values.reshape(-1, 1)

    model = MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    labels = model.fit_predict(samples)

    return labels, model

# Main execution
# Search window handed to the STAC query.
# NOTE(review): bbox is presumably (min lon, min lat, max lon, max lat) - confirm.
bbox = [
    -105.0, 39.5,
    -104.5, 40.0,
]
date_range = "2023-01-01/2023-12-31"

# Fetch the stacked bands, then cluster them.
data = get_satellite_data(bbox=bbox, date_range=date_range)
clusters, model = cluster_and_visualize(data=data)


Perhaps you already have a cluster running?
Hosting the HTTP server on port 46281 instead


In [9]:
def cluster_and_visualize(data, n_clusters=10):
    """Cluster the values of ``data`` greater than zero and plot histograms.

    Every selected value is treated as an independent one-feature sample for
    ``MiniBatchKMeans``. Two histograms are drawn side by side: all selected
    values, and the same values split by assigned cluster.

    Parameters
    ----------
    data : array-like
        Must support boolean-mask indexing whose result exposes ``.compute()``
        (presumably the dask array returned by ``get_satellite_data``).
    n_clusters : int, default 10
        Number of clusters given to ``MiniBatchKMeans``.

    Returns
    -------
    tuple
        ``(clusters, kmeans)``: the labels returned by ``fit_predict`` and the
        fitted ``MiniBatchKMeans`` instance.

    Raises
    ------
    ValueError
        If no value in ``data`` is greater than zero.
    """
    valid_mask = data > 0
    valid_data = data[valid_mask].compute()
    print("Data processed. Shape:", valid_data.shape)

    # Fail early with a clear message instead of an opaque estimator error.
    if valid_data.size == 0:
        raise ValueError("No values greater than 0 found; nothing to cluster.")

    # Reshape the data to 2D array (n_samples, 1) for sklearn
    samples = valid_data.reshape(-1, 1)

    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(samples)

    # One set of bin edges for every histogram, so the per-cluster bars share
    # the same bin widths and their frequencies are directly comparable.
    bin_edges = np.histogram_bin_edges(samples, bins=50)

    fig, (ax_all, ax_by_cluster) = plt.subplots(1, 2, figsize=(12, 6))

    # Plot 1: Original Data Distribution
    ax_all.hist(samples[:, 0], bins=bin_edges)
    ax_all.set_title('Original Data Distribution')
    ax_all.set_xlabel('Pixel Values')
    ax_all.set_ylabel('Frequency')

    # Plot 2: Data Distribution by Cluster
    for i in range(n_clusters):
        ax_by_cluster.hist(
            samples[clusters == i, 0],
            bins=bin_edges,
            alpha=0.5,
            label=f'Cluster {i}',
        )
    ax_by_cluster.set_title('Data Distribution by Cluster')
    ax_by_cluster.set_xlabel('Pixel Values')
    ax_by_cluster.set_ylabel('Frequency')
    ax_by_cluster.legend()

    fig.tight_layout()
    plt.show()

    return clusters, kmeans