In [2]:
from pickle import load, dump
import numpy as np
import xarray as xr

In [30]:
# Short station names handled by this notebook, in processing order
stations = ['wernig', 'redlen', 'muOsna', 'braunl']

# Station short name -> station code; the code is the file name (without
# extension) of that station's rain record
station_codes = dict(
    muOsna='01766',
    wernig='05490',
    redlen='13713',
    braunl='00656',
)

# Values of the 'ini' and 'lea' fields that the preprocessing keeps
# (presumably forecast initialisation and lead times - TODO confirm units)
initial_time = [0]
lead_time = [3, 4, 5]

### Retrieves precipitation records from DWD

In [31]:
def add_precipitation_fixed(forecast_data, station_code):
    """Attach the observed station precipitation to a forecast dataset.

    Reads the NetCDF record of the rain station `station_code`, discards the
    entries without a precipitation value, keeps only the forecast entries
    whose time also occurs in the station record and stores the matching
    precipitation as the variable 'rain_precipitation_height'.

    Parameters
    ----------
    forecast_data : xarray.Dataset (assumed from the methods used)
        Forecast samples carrying a 'time' coordinate.
    station_code : str
        Station identifier; it is the NetCDF file name without '.nc'.

    Returns
    -------
    The subset of `forecast_data` that has a station measurement, sorted by
    time, with the additional variable 'rain_precipitation_height'.
    """
    ## Station record: expose the measurement date as 'time', drop unnecessary variables
    netcdf_file = "/p/project/deepacf/deeprain/rojascampos1/data/rain_stations/netcdf/" + station_code + ".nc"
    station_record = xr.open_dataset(netcdf_file)
    station_record['time'] = station_record['measurement_date']
    station_record = station_record.drop('measurement_date')
    station_record = station_record.drop(('quality', 'has_rain', 'precipitation_category_WR'))

    ## Discard station entries whose precipitation is NaN
    has_value = ~np.isnan(station_record["precipitation_height"])
    station_record = station_record.where(has_value, drop=True)

    ## Keep only the forecasts that have a station measurement at the same time
    is_observed = forecast_data['time'].isin(station_record['time'])
    forecast_data = forecast_data.where(is_observed, drop=True)

    ## Sort both datasets by time
    station_record = station_record.sortby(station_record['time'])
    forecast_data = forecast_data.sortby(forecast_data['time'])

    ## Look up the measured precipitation for every remaining forecast time
    heights = np.array([
        station_record.sel(time=stamp)['precipitation_height'].values
        for stamp in forecast_data['time']
    ])

    ## Wrap the values in a data array indexed by forecast time and attach it
    forecast_data['rain_precipitation_height'] = xr.DataArray(
        data=heights,
        coords=[('time', forecast_data['time'])],
    )
    return forecast_data

### Preprocessing routine 

In [32]:
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards."""
    # SECURITY: unpickling can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as handle:
        return load(handle)


def _select_samples(dataset, lead_times):
    """Keep entries with 'ini' in `initial_time` and 'lea' in `lead_times`, cut to 3 < x, y < 9."""
    dataset = dataset.where(dataset['ini'].isin(initial_time), drop=True)
    dataset = dataset.where(dataset['lea'].isin(lead_times), drop=True)
    # Cuts data to 5x5 area (five values per axis, assuming integer grid coordinates)
    in_area = (dataset.x > 3) & (dataset.x < 9) & (dataset.y > 3) & (dataset.y < 9)
    return dataset.where(in_area, drop=True)


def _to_feature_matrix(dataset):
    """Stack every variable except the precipitation target into a 2-D array."""
    features = dataset.drop(['rain_precipitation_height']).to_array().values
    # to_array() puts the variables on axis 0; move them last so that the
    # dataset's own first axis indexes the rows of the flattened matrix
    features = np.moveaxis(features, 0, -1)
    return features.reshape(features.shape[0], -1)


# For every station: load the train/test pickles, filter and crop them, attach
# the observed precipitation, standardise the features with the training
# statistics and save the resulting arrays under data/<station>/.
for station in stations:

    ### Load pickles provided by DWD
    train_path = f'/p/project/deepacf/deeprain/rojascampos1/data/pickles/{station}.an90pct.121g.2011to7.i00to21.l01to21.seAnly60dWdw.pickle'
    test_path  = f'/p/project/deepacf/deeprain/rojascampos1/data/pickles/{station}.an10pct.121g.2011to7.i00to21.l01to21.seAnly60dWdw.pickle'
    train_dataset = _load_pickle(train_path)
    test_dataset  = _load_pickle(test_path)

    ## Filter by initial and lead time, then cut to the 5x5 area
    train_dataset = _select_samples(train_dataset, lead_time)
    # NOTE(review): the test set keeps only lead_time[1] while the training set
    # keeps every entry of lead_time - kept as is, confirm this is intended.
    test_dataset  = _select_samples(test_dataset, [lead_time[1]])

    ## Add precipitation
    train_dataset = add_precipitation_fixed(train_dataset, station_codes[station])
    test_dataset  = add_precipitation_fixed(test_dataset, station_codes[station])

    train_dataset = train_dataset.drop(['ini', 'lea'])
    test_dataset  = test_dataset.drop(['ini', 'lea'])

    ## From dataset to np.array
    train_x = _to_feature_matrix(train_dataset)
    test_x  = _to_feature_matrix(test_dataset)

    ## z score transformation, using the training statistics for both sets
    train_x_mean = np.mean(train_x, axis=0)
    train_x_std  = np.std(train_x, axis=0)
    # A constant feature has zero std; divide by 1 instead so that it maps to
    # 0 rather than NaN/inf
    train_x_std  = np.where(train_x_std == 0, 1.0, train_x_std)

    train_x = (train_x - train_x_mean)/train_x_std
    test_x  = (test_x - train_x_mean)/train_x_std

    train_y = train_dataset['rain_precipitation_height'].values
    test_y  = test_dataset['rain_precipitation_height'].values

    trn_time = train_dataset['time'].values
    tst_time = test_dataset['time'].values

    ## Save features, targets and timestamps of both splits
    np.save('data/' + station + '/trn_x.npy', train_x)
    np.save('data/' + station + '/trn_y.npy', train_y)
    np.save('data/' + station + '/tst_x.npy', test_x)
    np.save('data/' + station + '/tst_y.npy', test_y)
    np.save('data/' + station + '/trn_t.npy', trn_time)
    np.save('data/' + station + '/tst_t.npy', tst_time)

    print(station, 'trn_x.shape =', train_x.shape, 'trn_y.shape =', train_y.shape, 'trn_time.shape =', trn_time.shape)
    print(station, 'tst_x.shape =', test_x.shape, 'tst_y.shape =', test_y.shape, 'tst_time.shape =', tst_time.shape)

wernig trn_x.shape = (6812, 3575) trn_y.shape = (6812,) trn_time.shape = (6812,)
wernig tst_x.shape = (265, 3575) tst_y.shape = (265,) tst_time.shape = (265,)
redlen trn_x.shape = (6937, 3575) trn_y.shape = (6937,) trn_time.shape = (6937,)
redlen tst_x.shape = (234, 3575) tst_y.shape = (234,) tst_time.shape = (234,)
muOsna trn_x.shape = (6909, 3575) trn_y.shape = (6909,) trn_time.shape = (6909,)
muOsna tst_x.shape = (246, 3575) tst_y.shape = (246,) tst_time.shape = (246,)
braunl trn_x.shape = (6894, 3575) trn_y.shape = (6894,) trn_time.shape = (6894,)
braunl tst_x.shape = (237, 3575) tst_y.shape = (237,) tst_time.shape = (237,)
