In [3]:
import xarray as xr
import pandas as pd
import glob
import os
import numpy as np
import re

# Columns that identify one (station, year, month) row; everything else in the
# wide table is a per-variable value column.
_METADATA_COLUMNS = [
    'station_id', 'year', 'month', 'station_latitude', 'station_longitude',
    'model_latitude', 'model_longitude', 'grid_distance_degrees',
]


def _parse_holaps_filename(filename):
    """Return ``(year, variable)`` parsed from *filename*, or ``None``.

    The year is the first run of four digits in the name and the variable is
    the token between ``HOLAPS-`` and ``-Europe``.  ``None`` is returned when
    either part is missing.
    """
    year_match = re.search(r'(\d{4})', filename)
    var_match = re.search(r'HOLAPS-([A-Za-z0-9]+)-Europe', filename)
    if not year_match or not var_match:
        return None
    return int(year_match.group(1)), var_match.group(1)


def _monthly_values(values):
    """Return ``[(month, value), ...]`` for the non-missing entries of *values*.

    *values* is flattened to 1-D.  Entries are labelled 1..N by position,
    except when there are exactly 13 entries: then the first entry is dropped
    and the remaining twelve are labelled 1..12.
    NOTE(review): months come from array position, not from the file's time
    coordinate, and the meaning of the extra 13th step is assumed - confirm
    against the data set's time axis.
    """
    flat = np.atleast_1d(values).ravel()
    # For 13-step series index 0 maps to "month 0", which is discarded below.
    offset = 0 if len(flat) == 13 else 1
    pairs = []
    for idx, value in enumerate(flat):
        month = idx + offset
        if month < 1 or pd.isna(value):
            continue
        pairs.append((month, float(value)))
    return pairs


def create_station_csvs_final(
    netcdf_folder=r"C:\Deepak\HOLAPS\monthly",
    stations_csv="plot_sites_coordinates2.csv",
    output_folder="station_data2",
):
    """Write one wide-format CSV per station from the NetCDF files in a folder.

    For every row of *stations_csv* (columns ``SITE_ID``, ``LATITUDE``,
    ``LONGITUDE``) each ``*.nc`` file in *netcdf_folder* whose name yields a
    year and variable is opened, the grid cell nearest to the station is
    selected, and the first data variable whose name does not contain
    ``bnds`` is read.  The values are pivoted so that each variable becomes a
    column and saved to ``<output_folder>/<SITE_ID>_all_variables.csv``.
    The selected grid coordinates and their distance to the station (plain
    Euclidean distance in degrees) are stored alongside the values.

    Args:
        netcdf_folder: Directory searched (non-recursively) for ``*.nc`` files.
        stations_csv: Path of the CSV file listing the stations.
        output_folder: Directory for the output CSVs; created if missing.

    Returns:
        None.  Progress and per-file errors are reported with ``print``; a
        file that fails to load is skipped rather than aborting the run.
    """
    os.makedirs(output_folder, exist_ok=True)
    stations_df = pd.read_csv(stations_csv)

    # The file listing does not depend on the station, so build it once.
    # Sorting makes the processing order independent of the file system.
    all_nc_files = sorted(glob.glob(os.path.join(netcdf_folder, "*.nc")))

    for _, station in stations_df.iterrows():
        station_id = station['SITE_ID']
        station_lat = station['LATITUDE']
        station_lon = station['LONGITUDE']

        print(f"Processing station: {station_id}")
        print(f"  Found {len(all_nc_files)} NetCDF files")
        all_station_data = []

        for file_path in all_nc_files:
            filename = os.path.basename(file_path)
            parsed = _parse_holaps_filename(filename)
            if parsed is None:
                continue
            year, var = parsed

            try:
                print(f"  Reading {filename}")
                # The context manager closes the file even if selection or
                # value extraction raises.
                with xr.open_dataset(file_path) as ds:
                    closest_data = ds.sel(
                        latitude=station_lat,
                        longitude=station_lon,
                        method='nearest'
                    )

                    # Coordinates of the grid cell that was actually selected.
                    model_lat = float(closest_data.latitude.values)
                    model_lon = float(closest_data.longitude.values)
                    distance = np.sqrt(
                        (model_lat - station_lat) ** 2
                        + (model_lon - station_lon) ** 2
                    )

                    # Skip bounds variables; use the first remaining one.
                    data_vars = [v for v in ds.data_vars if 'bnds' not in v]
                    if not data_vars:
                        continue

                    # Read the values while the file is still open.
                    series = np.atleast_1d(
                        closest_data[data_vars[0]].values
                    ).ravel()

                time_size = len(series)
                print(f"    Extracted {time_size} time steps for {var} {year}")
                print(f"    Model grid: ({model_lat:.4f}, {model_lon:.4f}), Distance: {distance:.4f}°")

                for month, value in _monthly_values(series):
                    all_station_data.append({
                        'station_id': station_id,
                        'year': year,
                        'month': month,
                        'variable': var,
                        'value': value,
                        'station_latitude': station_lat,    # Original station coordinates
                        'station_longitude': station_lon,   # Original station coordinates
                        'model_latitude': model_lat,        # Actual model grid point used
                        'model_longitude': model_lon,       # Actual model grid point used
                        'grid_distance_degrees': distance   # Distance between station and grid point
                    })

            except Exception as e:
                # Best effort: one unreadable file must not stop the others.
                print(f"Error with {file_path}: {e}")
                continue

        # Save station data
        if all_station_data:
            df = pd.DataFrame(all_station_data)

            try:
                df_wide = df.pivot_table(
                    index=_METADATA_COLUMNS,
                    columns='variable',
                    values='value'
                ).reset_index()

                output_file = os.path.join(output_folder, f"{station_id}_all_variables.csv")
                df_wide.to_csv(output_file, index=False)
                print(f"✓ Saved: {output_file} with {len(df_wide)} records")

                # Show summary including grid info
                years = df_wide['year'].unique()
                avg_distance = df_wide['grid_distance_degrees'].mean()
                variables = [
                    col for col in df_wide.columns
                    if col not in _METADATA_COLUMNS
                ]

                print(f"  Years: {len(years)} ({min(years)}-{max(years)})")
                print(f"  Variables: {variables}")
                print(f"  Avg grid distance: {avg_distance:.4f}°")

            except Exception as e:
                print(f"Error creating wide format: {e}")
        else:
            print(f"✗ No data found for station {station_id}")

        print()

# Run the extraction for every station listed in the stations CSV, using the
# function's built-in input/output locations.
create_station_csvs_final()

Processing station: AT-Neu
  Found 260 NetCDF files
  Reading HOLAPS-GHF-Europe-2001_monthly.nc
    Extracted 13 time steps for GHF 2001
    Model grid: (47.1000, 11.3069), Distance: 0.0211°
  Reading HOLAPS-GHF-Europe-2002_monthly.nc
    Extracted 13 time steps for GHF 2002
    Model grid: (47.1000, 11.3069), Distance: 0.0211°
  Reading HOLAPS-GHF-Europe-2003_monthly.nc
    Extracted 13 time steps for GHF 2003
    Model grid: (47.1000, 11.3069), Distance: 0.0211°
  Reading HOLAPS-GHF-Europe-2004_monthly.nc
    Extracted 13 time steps for GHF 2004
    Model grid: (47.1000, 11.3069), Distance: 0.0211°
  Reading HOLAPS-GHF-Europe-2005_monthly.nc
    Extracted 13 time steps for GHF 2005
    Model grid: (47.1000, 11.3069), Distance: 0.0211°
  Reading HOLAPS-GHF-Europe-2006_monthly.nc
    Extracted 13 time steps for GHF 2006
    Model grid: (47.1000, 11.3069), Distance: 0.0211°
  Reading HOLAPS-GHF-Europe-2007_monthly.nc
    Extracted 13 time steps for GHF 2007
    Model grid: (47.1000, 11.