In [1]:
import xarray as xr
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
from cartopy.feature import NaturalEarthFeature, LAND, COASTLINE
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
import pygeos
import math
from shapely.geometry import Polygon
import netCDF4

## Import data

In [2]:
# Surf data on correct (WRF) grid
wrf_surfdat_path = "surfdata_wrf_CA_hist_16pfts_CMIP6_1981_c220715.nc"

# Surf data with correct values
surfdat_path = "CA_surfdat_211202.nc"

# Domain
target_domain = gpd.read_file("target_domain_111623.shp") # CZ transect

# California boundary
path_to_ca = '/home/adam/cloud/gdrive/fire_project/local_data/ca-state-boundary/CA_State_TIGER2016.shp'

## Create bounding box of Polly's surface data

In [4]:
# top_left = (surfdat["LATIXY"].values.max(), surfdat["LONGXY"].values.min())  # Latitude and longitude for the top-left corner
# bottom_right = (surfdat["LATIXY"].values.min(), surfdat["LONGXY"].values.max())  # Latitude and longitude for the bottom-right corner

# # Create a Polygon representing the bounding box
# bounding_box = Polygon([(top_left[1], top_left[0]), (bottom_right[1], top_left[0]),
#                         (bottom_right[1], bottom_right[0]), (top_left[1], bottom_right[0])])

## Visualize Polly's surface data

In [5]:
# ca = gpd.read_file(path_to_ca).to_crs(target_domain.crs)

# fig, ax = plt.subplots(figsize=(15, 15))

# # Add california
# ca.plot(ax = ax, facecolor="none", edgecolor='black', lw=0.7)

# # Add target domain
# target_domain.plot(ax = ax,facecolor="none", edgecolor='black', lw=0.7)

# # Add bounding box of Polly's surface data
# gdf_polly_bounding = gpd.GeoDataFrame({'geometry': [bounding_box]})
# gdf_polly_bounding.plot(ax = ax, alpha = 0.1)

# # Points to view on map
# points = gpd.points_from_xy([-119.978773,-119.988773,-119.26531432946383],
#                             [38.17940086346834,38.18940086346834,37.032763507337556], z=None, crs="EPSG:4326")

# # Create geodataframe of points to put on map
# df_shell = pd.DataFrame({'id':[0,1,2]})
# gdf_points = gpd.GeoDataFrame(df_shell, geometry=points)
# gdf_points.plot(ax = ax)

## Functions

In [None]:
# def find_nearest_index(lat, lon, lat_array, lon_array):
#     """
#     Find the nearest lsmlat and lsmlon indices for a given latitude and longitude.

#     :param lat: Latitude of the point to find.
#     :param lon: Longitude of the point to find.
#     :param lat_array: 2D array of latitudes from the NetCDF file.
#     :param lon_array: 2D array of longitudes from the NetCDF file.
#     :return: Tuple of (lsmlat_index, lsmlon_index).
#     """
#     # Calculate the square of the Euclidean distance
#     dist_sq = (lat_array - lat)**2 + (lon_array - lon)**2

#     # Find the index of the minimum distance
#     lsmlat_index, lsmlon_index = np.unravel_index(np.argmin(dist_sq), dist_sq.shape)
#     return lsmlat_index, lsmlon_index

# # Load NetCDF file
# nc_file = netCDF4.Dataset('CA_surfdat_211202.nc', 'r')

# # Extract LATIXY and LONGXY
# lat_array = nc_file.variables['LATIXY'][:]
# lon_array = nc_file.variables['LONGXY'][:] - 360

# # Example usage
# latitude = 37.0311  # Replace with your latitude
# longitude = -119.256599  # Replace with your longitude
# nearest_lsmlat, nearest_lsmlon = find_nearest_index(latitude, longitude, lat_array, lon_array)

# print("Nearest lsmlat index:", nearest_lsmlat)
# print("Nearest lsmlon index:", nearest_lsmlon)

In [28]:
# def find_nearest_wrf_indices(wrf_grid_data,target_lat,target_lon):
    
#     # Get shape of WRF grid
#     wrf_grid_2D_shape = wrf_grid_data['LATIXY'].values.shape
#     print(wrf_grid_2D_shape)
#     wrf_grid_1D_shape = wrf_grid_2D_shape[0] * wrf_grid_2D_shape[1]
    
#     # numpy array of wrf coordinate (LAT,LONG)
#     wrf_coordinates = np.array((wrf_grid_data['LATIXY'].values.reshape(wrf_grid_1D_shape),
#                                 wrf_grid_data['LONGXY'].values.reshape(wrf_grid_1D_shape)))

#     distances = np.sqrt( (wrf_coordinates[0,:] - target_lat)**2  \
#                         + (wrf_coordinates[1,:] - target_lon)**2 \
#                        )
    
#     # Get index of closest point
#     wrf_1D_index = np.argmin(distances)
#     wrf_2D_row_index = wrf_1D_index // wrf_grid_2D_shape[0]
#     wrf_2D_col_index = wrf_1D_index % wrf_grid_2D_shape[0]
    
#     return wrf_2D_row_index,wrf_2D_col_index

def find_nearest_indices(nc_file_path, target_lat, target_lon):
    """
    Find the nearest lsmlat and lsmlon indices for a given latitude and longitude in a NetCDF file.

    :param nc_file_path: Path to the NetCDF file.
    :param target_lat: Target latitude.
    :param target_lon: Target longitude.
    :return: Tuple of (lsmlat_index, lsmlon_index).
    """
    # Load NetCDF file
    with netCDF4.Dataset(nc_file_path, 'r') as nc_file:
        # Extract LATIXY and LONGXY
        lat_array = nc_file.variables['LATIXY'][:]
        lon_array = nc_file.variables['LONGXY'][:]

    if np.max(lon_array) > 180:
        lon_array = lon_array - 360
        print("Made conversion to -180 to 180 scale")
        
    # Calculate the square of the Euclidean distance
    dist_sq = (lat_array - target_lat)**2 + (lon_array - target_lon)**2

    # Find the index of the minimum distance
    lsmlat_index, lsmlon_index = np.unravel_index(np.argmin(dist_sq), dist_sq.shape)
    return lsmlat_index, lsmlon_index


def is_point_in_polygon(point_coordinates, polygon):
    # Create a Point object for the coordinates
    point = Point(point_coordinates)
    
    # Check if the Point is within the Polygon
    return point.within(polygon)


def update_surf_dat_in_wrf_surf_file(wrf_surfdat_path,
                                     surfdat_path,
                                     target_domain,
                                     vars_to_subtitute = ['SOIL_COLOR','PCT_SAND','PCT_CLAY','ORGANIC']):

    
    '''
    
    The function substitutes values of surface data varibales from a data set with correct values (surfdat_path)
    to a dataset that is already correctly formated on a wrf grid.
    
    param wrf_surfdat_path: path to the surface data that is on a wrf grid that you want to modify.
    param surfdat_path: path to the surface data with the correct values you want to use
    param target_domain: geopandas.geodataframe.GeoDataFrame holding a polygon with the target domain
                         for substituting data.
    param vars_substitute: list of variable names that you want substituded
    
    '''
    
    # Import wrf surface data
    wrf_surfdat = xr.open_dataset(wrf_surfdat_path)
    
    # Rescale from 0:360 to -180:180
    if np.max(wrf_surfdat['LONGXY'].values) > 180:
        wrf_surfdat['LONGXY'] = wrf_surfdat['LONGXY'] - 360
        print("Converting to -180 to 180 scale")
    
    # Create new wrf surface data to alter
    new_wrf_surfdat = wrf_surfdat.copy()
    
    # Import surface data with correct values
    surfdat = xr.open_dataset(surfdat_path)
    
    # Rescale from 0:360 to -180:180
    if np.max(surfdat['LONGXY'].values) > 180:
        surfdat['LONGXY'] = surfdat['LONGXY'] - 360
        print("Converting to -180 to 180 scale")
    
    
    # Loop through wrf data points within the target domain
    # to update each point with the correct surface data
    
    
    new_wrf_surf_dat_2D_shape = new_wrf_surfdat['LATIXY'].values.shape
    i = 0
    
    for wrf_row in range(new_wrf_surf_dat_2D_shape[0]):
        for wrf_col in range(new_wrf_surf_dat_2D_shape[1]):

            target_lat = new_wrf_surfdat["LATIXY"].values[wrf_row,wrf_col]
            target_lon = new_wrf_surfdat["LONGXY"].values[wrf_row,wrf_col]

            # Check if lat/lon is in target domain
            if is_point_in_polygon((target_lon,target_lat),target_domain.geometry[0]):
                i = i + 1
                
                print("Working on point",i,"in target domain")
                
                print("Target lat:",target_lat)
                print("Target log:",target_lon)
                
                # Get the surface data indices where the data should be substituted into the new WRF data
                surf_row, surf_col = find_nearest_indices(surfdat_path,target_lat,target_lon)
                
                for var in vars_to_subtitute:
                    
                    if surfdat[var].dims == ('nlevsoi', 'lsmlat', 'lsmlon'):
                        print("Getting data from surface data",surf_row, surf_col,"to put in wrf",wrf_row,wrf_col)
                        new_values = surfdat[var].values[:,surf_row,surf_col]
                        new_wrf_surfdat[var].values[:,wrf_row,wrf_col] = new_values

                    elif surfdat[var].dims == ('lsmlat', 'lsmlon'):
                        new_values = surfdat[var].values[surf_row,surf_col]
                        new_wrf_surfdat[var].values[wrf_row,wrf_col] = new_values
                        
                    else:
                        print("Variable",var,"has unknown dimensions. Skipping")
                        continue
            else:
                continue
    return new_wrf_surfdat

## Apply to new wrf surf data

In [29]:
%%time
new_wrf_xds = update_surf_dat_in_wrf_surf_file(wrf_surfdat_path,
                                 surfdat_path,
                                 target_domain)

Converting to -180 to 180 scale
Working on point 1 in target domain
Target lat: 36.67720413208008
Target log: -119.38523864746094
Made conversion to -180 to 180 scale
Getting data from surface data 102 121 to put in wrf 71 70
Getting data from surface data 102 121 to put in wrf 71 70
Getting data from surface data 102 121 to put in wrf 71 70
Working on point 2 in target domain
Target lat: 36.7250862121582
Target log: -119.3009033203125
Made conversion to -180 to 180 scale
Getting data from surface data 103 123 to put in wrf 71 71
Getting data from surface data 103 123 to put in wrf 71 71
Getting data from surface data 103 123 to put in wrf 71 71
Working on point 3 in target domain
Target lat: 36.74477767944336
Target log: -119.44505310058594
Made conversion to -180 to 180 scale
Getting data from surface data 103 120 to put in wrf 72 70
Getting data from surface data 103 120 to put in wrf 72 70
Getting data from surface data 103 120 to put in wrf 72 70
Working on point 4 in target domai

Made conversion to -180 to 180 scale
Getting data from surface data 117 126 to put in wrf 76 76
Getting data from surface data 117 126 to put in wrf 76 76
Getting data from surface data 117 126 to put in wrf 76 76
Working on point 44 in target domain
Target lat: 37.35068893432617
Target log: -119.09103393554688
Made conversion to -180 to 180 scale
Getting data from surface data 118 128 to put in wrf 76 77
Getting data from surface data 118 128 to put in wrf 76 77
Getting data from surface data 118 128 to put in wrf 76 77
Working on point 45 in target domain
Target lat: 37.398406982421875
Target log: -119.00559997558594
Made conversion to -180 to 180 scale
Getting data from surface data 119 130 to put in wrf 76 78
Getting data from surface data 119 130 to put in wrf 76 78
Getting data from surface data 119 130 to put in wrf 76 78
Working on point 46 in target domain
Target lat: 37.44606018066406
Target log: -118.92002868652344
Made conversion to -180 to 180 scale
Getting data from surfa

CPU times: user 3.32 s, sys: 44.5 ms, total: 3.36 s
Wall time: 3.37 s


## Write to new netcdf

In [30]:
new_wrf_xds.to_netcdf("surfdata_wrf_CA_hist_16pfts_CMIP6_1981_c220715_updated_ahb_CZ2_domain_122123.nc", format = "NETCDF3_64BIT", mode = "w")

## Understanding why the soil data is different at single site CZ2 versus the WRF cell overlapping CZ2 in the regional simulation

Answer: The coordinates at the center of the WRF grid that overlaps CZ2 are slightly different to the coordinates of CZ2 itself. The curstom algorithm above therefore pulls data from the surface data at an adjacent (but not the same) cell as if you input CZ2 coordinates themselves. This is OK and expected. We have to pull soil data algorithmically based on the nearest neighbor to the center of each WRF grid cell. The fact that the vegetation dyanmcis are so different with this small change in soil data is problematic from a model structure perspective.

### Closest point to CZ2 in the surface data

In [40]:
find_nearest_indices(surfdat_path,37.0311,-119.256599)

Made conversion to -180 to 180 scale


(110, 124)

In [41]:
surfdat = xr.open_dataset(surfdat_path)
surfdat.PCT_SAND.values[:,110, 124]

array([60., 60., 60., 60., 60., 60., 60., 60., 60., 60.])

This matches what I used at single point site

### Closest point to CZ2 in the updated wrf surface data

In [42]:
find_nearest_wrf_indices("surfdata_wrf_CA_hist_16pfts_CMIP6_1981_c220715_updated_ahb_CZ2_domain_122123.nc", 37.0311,-119.256599)

(74, 74)

In [47]:
new_wrf_surf = xr.open_dataset("surfdata_wrf_CA_hist_16pfts_CMIP6_1981_c220715_updated_ahb_CZ2_domain_122123.nc")
new_wrf_surf.PCT_SAND.values[:,74, 74]

array([69.38775635, 69.38775635, 69.38775635, 69.38775635, 69.38775635,
       69.38775635, 69.38775635, 69.38775635, 69.38775635, 69.38775635])

The soil data above is different to the single site CZ2 data because when we search through all the wrf grid points we want to update the data for, the coordinates in the center of the wrf grid cell that is closest to CZ2 is not the direct coordinates we use for CZ2. Therefore, the nearest neighbor in the surface data file is different.

In [51]:
## Closest lat/lon to CZ2 in the wrf grid is
print(find_nearest_wrf_indices(wrf_surfdat_path,37.0311,-119.256599))
wrf_surfdat = xr.open_dataset(wrf_surfdat_path)
print(wrf_surfdat.LATIXY.values[74,74])
print(wrf_surfdat.LONGXY.values[74,74])

(74, 74)
37.07170104980469
-119.22645568847656


In [53]:
#Which has different soil data
surfdat = xr.open_dataset(surfdat_path)
print(find_nearest_indices(surfdat_path,37.07170104980469,-119.22645568847656))
print(surfdat.PCT_SAND.values[:,111,125])

Made conversion to -180 to 180 scale
(111, 125)
[69.38775635 69.38775635 69.38775635 69.38775635 69.38775635 69.38775635
 69.38775635 69.38775635 69.38775635 69.38775635]
