## Method to fill observed meteorological data with Met Nordic gridded data

Testing out MICE for gap-filling using Met Nordic.

In [18]:
import numpy as np
import pandas as pd
import xarray as xr
from fancyimpute import IterativeImputer
from sklearn.preprocessing import StandardScaler


In [19]:
# test example:

data = {
    'Temperature': [15.0, 14.2, np.nan, np.nan, 17.0, np.nan, 13.1, 12.5, 14.8, np.nan],
    'Humidity': [80, 78, np.nan, np.nan, 90, np.nan, 75, np.nan, 82, 81],
    'WindSpeed': [5.0, np.nan, np.nan, np.nan, 5.1, 4.5, 3.8, 4.0, np.nan, 4.3]
}

df = pd.DataFrame(data)
print("Original Data:")
print(df)

# Initialize MICE Imputer
imputer = IterativeImputer(max_iter=10, random_state=0)

# Perform imputation
df_imputed = imputer.fit_transform(df)

# Convert back to DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

print("\nImputed Data:")
print(df_imputed)


Original Data:
   Temperature  Humidity  WindSpeed
0         15.0      80.0        5.0
1         14.2      78.0        NaN
2          NaN       NaN        NaN
3          NaN       NaN        NaN
4         17.0      90.0        5.1
5          NaN       NaN        4.5
6         13.1      75.0        3.8
7         12.5       NaN        4.0
8         14.8      82.0        NaN
9          NaN      81.0        4.3

Imputed Data:
   Temperature   Humidity  WindSpeed
0    15.000000  80.000000   5.000000
1    14.200000  78.000000   4.473759
2    14.464116  79.556835   4.429090
3    14.464116  79.556835   4.429090
4    17.000000  90.000000   5.100000
5    14.478003  79.353229   4.500000
6    13.100000  75.000000   3.800000
7    12.500000  71.101454   4.000000
8    14.800000  82.000000   4.258958
9    14.634923  81.000000   4.300000


In [None]:
import numpy as np
import pandas as pd
import xarray as xr
from fancyimpute import IterativeImputer
from sklearn.preprocessing import StandardScaler

# Load point observation data
df = pd.read_csv("../data/Tuddal_data.csv", index_col=0, na_values=np.nan)

# Load NetCDF reanalysis data
nc = xr.open_dataset("../data/met_analysis_1_0km_nordic_v3_yr_20161108_20240528_formatSURFEXnewTuddal.nc")

# Specify variables of interest from each dataset (original names)
vars_df = ["precip_int_h_D", "air_pressure", "wind_dir", "RH", "specific_humidity",
           "wind_speed", "R_LW_in_corr", "R_SW_in", "air_temperature"]

vars_nc = list(nc.keys())
vars_nc_to_remove = ['Rainf', 'Snowf', 'FRC_TIME_STP', 'LON', 'LAT', 'ZS', 'SCA_SWdown', 'CO2air', 'ZREF', 'UREF']
vars_nc = list(set(vars_nc) - set(vars_nc_to_remove))

# Extract data from NetCDF file and reset index
nc_data = nc[vars_nc].to_dataframe().reset_index()
df = df[vars_df].reset_index()

# Resample the NetCDF data to 30-minute intervals by interpolating
nc_data_resampled = nc_data.set_index('time').resample('30min').interpolate(method='linear').reset_index()

# Define the new column names based on your naming convention
rename_mapping_df = {
    "precip_int_h_D": "precipitation",
    "air_pressure": "pressure",
    "wind_dir": "wind_direction",
    "RH": "relative_humidity",
    "specific_humidity": "humidity",
    "wind_speed": "wind_speed",
    "R_LW_in_corr": "longwave_radiation",
    "R_SW_in": "shortwave_radiation",
    "air_temperature": "temperature"
}

rename_mapping_nc = {
    original: new + '_nc'
    for original, new in rename_mapping_df.items()
}

# Rename columns in point observation DataFrame
df.rename(columns=rename_mapping_df, inplace=True)

# Rename columns in resampled NetCDF DataFrame
nc_data_resampled.rename(columns=rename_mapping_nc, inplace=True)

# Merge point observations with NetCDF data on the time index
df['timestamp'] = pd.to_datetime(df['timestamp'])
nc_data_resampled['time'] = pd.to_datetime(nc_data_resampled['time'])
merged_df = pd.merge(df, nc_data_resampled, left_on='timestamp', right_on='time', suffixes=('_point', '_nc'))

# Drop the time column extracted from NetCDF as the 'timestamp' column serves the same purpose
merged_df.drop(columns=['time'], inplace=True)

# Update impute_vars to use the new column names
impute_vars = list(rename_mapping_df.values()) + list(rename_mapping_nc.values())

print("Column names of merged DataFrame after renaming:")
print(merged_df.columns)

# Continue with scaling and imputation as before
scaler = StandardScaler()
scaled_data = scaler.fit_transform(merged_df[impute_vars])

imputer = IterativeImputer(max_iter=10, random_state=0)
imputed_data = imputer.fit_transform(scaled_data)

imputed_data = scaler.inverse_transform(imputed_data)

for i, var in enumerate(impute_vars):
    merged_df[var] = imputed_data[:, i]

# Separate point observations and reanalysis data if needed
df_filled = merged_df[['timestamp'] + list(rename_mapping_df.values())]
nc_filled = merged_df[['timestamp'] + list(rename_mapping_nc.values())]

print("Imputed Point Observations Data:")
print(df_filled.head())

print("\nImputed NetCDF Data:")
print(nc_filled.head())


In [None]:
# Load point observation data
df = pd.read_csv("../data/Tuddal_data.csv", index_col=0, na_values=np.nan)

# Load NetCDF reanalysis data
nc = xr.open_dataset("../data/met_analysis_1_0km_nordic_v3_yr_20161108_20240528_formatSURFEXnewTuddal.nc")

# Specify variables of interest from each ds
vars_df = ["precip_int_h_D", "air_pressure", "wind_dir", "RH", "specific_humidity",
           "wind_speed", "R_LW_in_corr", "R_SW_in", "air_temperature"]

vars_nc = list(nc.keys())
vars_nc_to_remove = ['Rainf', 'Snowf', 'FRC_TIME_STP', 'LON', 'LAT', 'ZS', 'SCA_SWdown', 'CO2air', 'ZREF', 'UREF']
vars_nc = list(set(vars_nc) - set(vars_nc_to_remove))

nc_data = nc[vars_nc].to_dataframe().reset_index()
df = df[vars_df].reset_index()

# Resample the NetCDF data to 30-minute intervals by interpolating
nc_data_resampled = nc_data.set_index('time').resample('30min').interpolate(method='linear').reset_index()

# rename both ds to the following:
new_var_names = ["precipitation", "air_pressure", "wind_dir", "RH", "specific_humidity",
           "wind_speed", "LW_in", "SW_in", "air_temperature"]

# rename ds:


# Merge point observations with NetCDF data on the time index
df['timestamp'] = pd.to_datetime(df['timestamp'])
nc_data['time'] = pd.to_datetime(nc_data['time'])
merged_df = pd.merge(df, nc_data_resampled, left_on='timestamp', right_on='time', suffixes=('_point', '_nc'))

# Drop the time column extracted from NetCDF as the 'timestamp' column serves the same purpose
merged_df.drop(columns=['time'], inplace=True)

# Selecting only the relevant variables for imputation
impute_vars = vars_df + [var + '_nc' for var in vars_df]


In [50]:
# Scale the data before imputation
scaler = StandardScaler()
scaled_data = scaler.fit_transform(merged_df[impute_vars])

# Apply IterativeImputer (MICE)
imputer = IterativeImputer(max_iter=10, random_state=0)
imputed_data = imputer.fit_transform(scaled_data)

# Inverse transform to return to original scale
imputed_data = scaler.inverse_transform(imputed_data)

# Update the DataFrame with the imputed values
for i, var in enumerate(impute_vars):
    merged_df[var] = imputed_data[:, i]

# If needed, separate point observations and reanalysis data after imputation
df_filled = merged_df[['timestamp'] + vars_df]
nc_filled = merged_df[['timestamp'] + [var + '_nc' for var in vars_df]]

print("Imputed Point Observations Data:")
print(df_filled.head())

print("\nImputed NetCDF Data:")
print(nc_filled.head())



KeyError: "['RH', 'precip_int_h_D_nc', 'air_pressure_nc', 'wind_dir_nc', 'specific_humidity_nc', 'wind_speed_nc', 'R_LW_in_corr_nc', 'R_SW_in_nc', 'air_temperature_nc'] not in index"