In [1]:
import pandas as pd
import random
import os

def create_sampled_locations(no_sites, time_index, valid_coordinates, output_path=None, random_seed=1):
    """
    Draw a reproducible random sample of coordinate pairs and tag every row with one time index.

    Parameters:
        no_sites (int): Number of locations to draw (sampling is without replacement).
        time_index (int): Time index written to every sampled row.
        valid_coordinates (list or array): Iterable of (x, y) coordinate pairs to sample from.
        output_path (str, optional): If provided, the sample is also written as a CSV file.
                                     The actual file name is this path with
                                     "_sites{no_sites}_time{time_index}" inserted before the extension.
        random_seed (int): Seed for the sampler. Default is 1.

    Returns:
        pd.DataFrame: Sampled locations with columns ['x', 'y', 'time_idx'].

    Raises:
        ValueError: If no_sites is negative or exceeds the number of valid coordinates.
    """
    # Materialise the input so it has a length and can be sampled by position.
    valid_coordinates = list(valid_coordinates)

    if no_sites < 0:
        raise ValueError("Number of sites must be non-negative.")

    # Sampling is without replacement, so the request cannot exceed the pool.
    if no_sites > len(valid_coordinates):
        raise ValueError("Number of sites exceeds the available valid coordinates.")

    # Use a private generator instead of random.seed(): same draw for the same seed,
    # but the process-wide RNG state used by other code is left untouched.
    rng = random.Random(random_seed)
    sampled_coords = rng.sample(valid_coordinates, no_sites)

    # Create the DataFrame
    sampled_locations = pd.DataFrame(sampled_coords, columns=['x', 'y'])
    sampled_locations['time_idx'] = time_index  # Same time index for every location

    # Save the DataFrame to the specified path if provided
    if output_path:
        # Append no_sites and time_index to the filename, keeping the original extension
        base, ext = os.path.splitext(output_path)
        output_file = f"{base}_sites{no_sites}_time{time_index}{ext}"
        sampled_locations.to_csv(output_file, index=False)
        print(f"Sampled locations saved to {output_file}")

    return sampled_locations

In [2]:
import sys

# Directory that holds the project-local `fetch_sinmod_data` module.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
SINMOD_SRC_DIR = '/cluster/home/haroldh/spGDMM/3_src/1_data'

# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if SINMOD_SRC_DIR not in sys.path:
    sys.path.append(SINMOD_SRC_DIR)

from fetch_sinmod_data import fetch_sinmod_data


In [9]:
import xarray as xr

sinmod_path = "/cluster/home/haroldh/spGDMM/1_data/1_raw/biostates_surface_normalised.nc"
# 'detritus_slow' and 'detritus_fast' are currently left out of the fetched variables.
target_variables = ['diatoms', 'flagellates', 'ciliates', 'HNANO', 'bacteria', 'calanus_finmarchicus', 'calanus_glacialis']

# Open the file only long enough to pull one 2-D slice into memory, then close it,
# so no NetCDF handle stays open for the rest of the kernel session.
with xr.open_dataset(sinmod_path) as sinmod_dataset:
    ds = sinmod_dataset['diatoms'].isel(time=0).load()

# True wherever the first time step of 'diatoms' holds a value.
# NOTE(review): the mask comes from time=0 but sampling below uses time_idx = 285;
# this assumes the set of valid cells does not change over time — confirm.
valid_mask = ds.notnull()

# Stack the dimensions into pairs
stacked = valid_mask.stack(z=('xc', 'yc'))

# Filter valid locations without using `.where()`: boolean-index the stacked mask
# and keep the (xc, yc) labels of the cells that are True.
valid_coords = stacked[stacked.values].z.values

time_idx = 285
no_sites = 100

sampled_locations = create_sampled_locations(no_sites, time_idx, valid_coords)

result = fetch_sinmod_data(sinmod_path, sampled_locations, target_variables)

In [10]:
# Show the DataFrame returned by create_sampled_locations (columns x, y, time_idx).
sampled_locations

Unnamed: 0,x,y,time_idx
0,110400.0,284800.0,285
1,524000.0,398400.0,285
2,751200.0,286400.0,285
3,709600.0,257600.0,285
4,49600.0,201600.0,285
...,...,...,...
95,505600.0,336800.0,285
96,649600.0,415200.0,285
97,722400.0,225600.0,285
98,622400.0,287200.0,285


In [11]:
# Show what fetch_sinmod_data returned for the sampled sites; per the recorded output,
# it carries x, y, time_idx plus one column per entry in target_variables.
result

Unnamed: 0,x,y,time_idx,diatoms,flagellates,ciliates,HNANO,bacteria,calanus_finmarchicus,calanus_glacialis
0,110400.0,284800.0,285,0.193077,0.009574,0.498352,0.168727,0.128705,0.001565,0.0
1,524000.0,398400.0,285,0.142859,0.013030,0.478044,0.179093,0.080709,0.106263,0.0
2,751200.0,286400.0,285,0.162423,0.007004,0.422058,0.171850,0.106216,0.130449,0.0
3,709600.0,257600.0,285,0.171848,0.007156,0.514734,0.174456,0.096182,0.035625,0.0
4,49600.0,201600.0,285,0.174835,0.006675,0.467747,0.144424,0.099168,0.107150,0.0
...,...,...,...,...,...,...,...,...,...,...
95,505600.0,336800.0,285,0.175347,0.007929,0.491165,0.172515,0.087215,0.065829,0.0
96,649600.0,415200.0,285,0.186577,0.012977,0.404714,0.193230,0.112954,0.089550,0.0
97,722400.0,225600.0,285,0.178834,0.009000,0.530740,0.159911,0.116373,0.005143,0.0
98,622400.0,287200.0,285,0.157100,0.010568,0.388035,0.160822,0.095396,0.188078,0.0
