In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
from pickle import load, dump

from scipy import stats
from sklearn.metrics import *
from sklearn.model_selection import KFold
from validate import validate
import matplotlib.pyplot as plt

def ld(f):
    """Load and return the pickled object stored in the file at path ``f``.

    The file is opened in binary mode inside a ``with`` block so the handle
    is closed as soon as unpickling finishes or fails.

    NOTE: unpickling can execute arbitrary code; only use this on trusted files.
    """
    with open(f, 'rb') as fh:
        return load(fh)  # pickle.load
def dp(what, fP):
    """Pickle ``what`` into the file at path ``fP``, overwriting any existing file.

    The file is opened in binary mode inside a ``with`` block so the data is
    flushed and the handle closed before the function returns.
    """
    with open(fP, 'wb') as fh:
        dump(what, fh)  # pickle.dump

## Station-specific classification

In [None]:
## Number of independent training repetitions per experiment.
n_runs = 20

## Root folder holding the per-station arrays and predictor index files.
data_dir = './data'

## Identifiers of the available stations.
locations = [
    'muOsna',
    'wernig',
    'braunl',
    'redlen',
]

## Station used by the station-specific section below.
station = locations[3]
print(station)

In [None]:
## Network architecture per station (plus the station-agnostic 'general'
## model). Values were obtained from arch_search.ipynb.
archs = {
    name: {'depth': depth, 'units': units}
    for name, depth, units in (
        ('muOsna', 6, 64),
        ('wernig', 8, 64),
        ('braunl', 8, 32),
        ('redlen', 6, 64),
        ('general', 8, 64),
    )
}

## Architecture of the currently selected station.
best_depth, best_units = (archs[station][key] for key in ('depth', 'units'))

### Training phase

In [None]:
## Load the dataset
## Predictor column indices are stored 1-based (Matlab convention), hence the -1.
id_predictors = np.loadtxt(f'{data_dir}/id_predictors/{station}.txt', dtype=np.int32, delimiter=',') - 1

## Keep only the predictor columns selected for this station.
trn_x = np.load(f'{data_dir}/{station}/trn_x.npy')[:, id_predictors]
trn_y = np.load(f'{data_dir}/{station}/trn_y.npy')

## Boolean rain / no rain: 1.0 where the target is at least 0.1, else 0.0
trn_y = (trn_y >= 0.1).astype(float)
print('Train dataset:', trn_x.shape, '---> ', trn_y.shape)

In [None]:
## Train `n_runs` station-specific models. Each run performs a 10-fold
## cross-validation, keeps the fold model with the lowest validation loss,
## and writes that model plus its loss curves to
## results/probabilistic_classification/specific/<station>/.

## Settings shared by every fold of every run; defined once because they
## never change inside the loops.
batch_size = 64
epochs = 128
activation = 'relu'
## Loss: negative log-likelihood of the labels under the predicted distribution.
negloglik = lambda y, dist: -dist.log_prob(y)

for r in range(n_runs):

    print('Run', r+1, '/', n_runs)

    cross_validation_models = []
    cross_validation_losses = []
    cross_validation_histos = []

    ## The split is reshuffled without a fixed seed, so it differs between runs.
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, val_index in kf.split(trn_x, trn_y):

        ## Fresh optimizer and early-stopping callback for every model so no
        ## state carries over from one fold to the next.
        optimizer = tf.optimizers.Adam(learning_rate=0.0001)
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5)

        model = tf.keras.Sequential()

        ## `input_shape` is documented as a shape tuple. `(n)` without a
        ## trailing comma is a plain int, which not every Keras version accepts.
        model.add(tf.keras.layers.InputLayer(input_shape=(trn_x.shape[1],)))

        for _layer in range(best_depth):
            model.add(tf.keras.layers.Dense(best_units, activation=activation))

        ## Probabilistic output: a single unit feeding a Bernoulli distribution
        model.add(tf.keras.layers.Dense(1))
        model.add(tfp.layers.DistributionLambda(lambda t: tfp.distributions.Bernoulli(t[..., :1])))

        model.compile(optimizer, negloglik)

        hist = model.fit(trn_x[train_index], trn_y[train_index], batch_size, epochs, validation_data=(trn_x[val_index], trn_y[val_index]), shuffle=True, verbose=0, callbacks=[early_stop])
        loss = model.evaluate(trn_x[val_index], trn_y[val_index], verbose=0)

        cross_validation_models.append(model)
        cross_validation_losses.append(loss)
        cross_validation_histos.append(hist)


    ## Pick the best model from cross validation
    idx_best = np.argmin(cross_validation_losses)

    best_model = cross_validation_models[idx_best]
    best_model.save('results/probabilistic_classification/specific/' + station + '/model' + str(r).zfill(2) + '.h5')
    best_hist = cross_validation_histos[idx_best]

    ## Training and validation loss curves of the selected model.
    fig, ax = plt.subplots()
    ax.plot(best_hist.history['loss'], label='loss')
    ax.plot(best_hist.history['val_loss'], label='val_loss')
    fig.legend()

    fig.savefig('results/probabilistic_classification/specific/' + station + '/loss' + str(r).zfill(2) + '.png')
    ## The PNG on disk is the record; close the figure so one open figure per
    ## run does not accumulate in memory.
    plt.close(fig)

### Testing phase

In [None]:
## Test split of the selected station. `tst_t` is used further down to pick
## the matching rows of the COSMO reference data.
tst_t = np.load(f'{data_dir}/{station}/tst_t.npy')
## Same predictor columns as used for training.
tst_x = np.load(f'{data_dir}/{station}/tst_x.npy')[:, id_predictors]
tst_y = np.load(f'{data_dir}/{station}/tst_y.npy')

## Boolean rain / no rain as integers: 1 where the target is at least 0.1
tst_y = (tst_y >= 0.1).astype(int)
print('Test dataset:', tst_x.shape, '--->', tst_y.shape)

In [None]:
## Collect the test-set predictions of every saved station-specific model.
results = []

for r in range(n_runs):

    model = tf.keras.models.load_model('results/probabilistic_classification/specific/' + station + '/model' + str(r).zfill(2) + '.h5', compile=False)
    ## Calling the model yields a distribution object; its mean is used as
    ## the predicted value for each test sample.
    pred = model(tst_x)
    pred = pred.mean()
    ## Drop the singleton output axis left by the last layer.
    pred = np.squeeze(pred)
    results.append(pred)

results = np.array(results)
## One row per run, one column per test sample.
print('Results [runs x predictions] =', results.shape)

In [None]:
## Brier skill score of every run relative to the COSMO reference forecast.

## Get COSMO performance. This does not depend on the run, so the pickle is
## loaded and scored once per station, not once per run.
px = './data/pickles/'
obsAndRef = ld(px+f'{station}.forValid.1x1.y1to7.l1to21.pickle')
## Keep rows with ini == 0 and lea == 4 whose `tista` value is in the test set.
## NOTE(review): 'ini' / 'lea' are presumably initialisation time and lead
## time — confirm against the code that produced the pickle.
obsAndRefLocl = obsAndRef[obsAndRef['ini'].isin([0])]
obsAndRefLocl = obsAndRefLocl[obsAndRefLocl['lea'].isin([4])]
## NOTE(review): `isin` only filters rows; it does not reorder them to follow
## tst_t, so the rows are assumed to be in the same order as tst_y — confirm.
obsAndRefLocl = obsAndRefLocl[obsAndRefLocl.tista.isin(tst_t)]
varis = ['TOT_PREC_delta__median']
obsAndRefLocl = obsAndRefLocl.filter(varis)
# obsAndRefLocl = obsAndRefLocl.filter(regex='^TOT_PREC_delta__p')
cosmo_ensemble = obsAndRefLocl.to_numpy()
cosmo_ensemble = np.squeeze(cosmo_ensemble)
## Same rain / no-rain threshold as applied to the observations.
cosmo_ensemble = np.array(cosmo_ensemble >= 0.1, dtype=float)
print(cosmo_ensemble.shape)

brier_cosmo = brier_score_loss(tst_y, cosmo_ensemble)

skill_all = []

for r in range(n_runs):

    brier_preds = brier_score_loss(tst_y, results[r])

    ## Skill > 0 means the model's Brier score is lower than the reference's.
    brier_skill = 1 - (brier_preds / brier_cosmo)
    skill_all.append(brier_skill)


print(station, 'Results:', 'min =', np.min(skill_all), 'median = ', np.median(skill_all), 'max =', np.max(skill_all))
np.save('results/probabilistic_classification/specific/' + station + '/results_brier_skill.npy', skill_all)

## General probabilistic classification

In [None]:
## Switch to the architecture of the station-agnostic ('general') model.
best_depth, best_units = (archs['general'][key] for key in ('depth', 'units'))

### Merge all datasets

In [None]:
## Stack the training arrays of every station into one dataset, in the order
## given by `locations`. Comprehensions are used so that the notebook-level
## `station` selection is not overwritten by a loop variable.
trn_x = np.concatenate([np.load(data_dir + '/' + loc + '/trn_x.npy') for loc in locations])
trn_y = np.concatenate([np.load(data_dir + '/' + loc + '/trn_y.npy') for loc in locations])

## Boolean rain / no rain
trn_y = np.array(trn_y >= 0.1, dtype=float)
print('Train dataset:', trn_x.shape, '---> ', trn_y.shape)

In [None]:
## Train `n_runs` general models on the merged dataset. Each run performs a
## 10-fold cross-validation, keeps the fold model with the lowest validation
## loss, and writes that model plus its loss curves to
## results/probabilistic_classification/general/.

## Settings shared by every fold of every run; defined once because they
## never change inside the loops.
batch_size = 64
epochs = 256
activation = 'relu'
## Loss: negative log-likelihood of the labels under the predicted distribution.
negloglik = lambda y, dist: -dist.log_prob(y)

for r in range(n_runs):

    print('Run', r+1, '/', n_runs)

    cross_validation_models = []
    cross_validation_losses = []
    cross_validation_histos = []

    ## The split is reshuffled without a fixed seed, so it differs between runs.
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, val_index in kf.split(trn_x, trn_y):

        ## Fresh optimizer and early-stopping callback for every model so no
        ## state carries over from one fold to the next.
        optimizer = tf.optimizers.Adam(learning_rate=0.001)
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5)

        model = tf.keras.Sequential()

        ## `input_shape` is documented as a shape tuple. `(n)` without a
        ## trailing comma is a plain int, which not every Keras version accepts.
        model.add(tf.keras.layers.InputLayer(input_shape=(trn_x.shape[1],)))

        for _layer in range(best_depth):
            model.add(tf.keras.layers.Dense(best_units, activation=activation))

        ## Probabilistic output: a single unit feeding a Bernoulli distribution
        model.add(tf.keras.layers.Dense(1))
        model.add(tfp.layers.DistributionLambda(lambda t: tfp.distributions.Bernoulli(t[..., :1])))

        model.compile(optimizer, negloglik)

        hist = model.fit(trn_x[train_index], trn_y[train_index], batch_size, epochs, validation_data=(trn_x[val_index], trn_y[val_index]), shuffle=True, verbose=0, callbacks=[early_stop])
        loss = model.evaluate(trn_x[val_index], trn_y[val_index], verbose=0)

        cross_validation_models.append(model)
        cross_validation_losses.append(loss)
        cross_validation_histos.append(hist)


    ## Pick the best model from cross validation
    idx_best = np.argmin(cross_validation_losses)

    best_model = cross_validation_models[idx_best]
    best_model.save('results/probabilistic_classification/general/model' + str(r).zfill(2) + '.h5')
    best_hist = cross_validation_histos[idx_best]

    ## Training and validation loss curves of the selected model.
    fig, ax = plt.subplots()
    ax.plot(best_hist.history['loss'], label='loss')
    ax.plot(best_hist.history['val_loss'], label='val_loss')
    fig.legend()

    fig.savefig('results/probabilistic_classification/general/loss' + str(r).zfill(2) + '.png')
    ## The PNG on disk is the record; close the figure so one open figure per
    ## run does not accumulate in memory.
    plt.close(fig)

### Test the models

In [None]:
## Evaluate every saved general model on each station's test set and store
## the Brier skill scores relative to the COSMO reference forecast.

## Folder of the pickled COSMO reference data (the same for every station).
px = './data/pickles/'

## `loc` is the loop variable so the notebook-level `station` selection is
## not overwritten by this cell.
for loc in locations:

    ## The general models take all predictor columns (no per-station selection).
    tst_t = np.load(data_dir + '/' + loc + '/tst_t.npy')
    tst_x = np.load(data_dir + '/' + loc + '/tst_x.npy')
    tst_y = np.load(data_dir + '/' + loc + '/tst_y.npy')

    ## Boolean rain / no rain
    tst_y = np.array(tst_y >= 0.1, dtype=int)
    print('Test dataset:', tst_x.shape, '--->', tst_y.shape)

    ## Load model and get predictions
    results = []

    for r in range(n_runs):

        model = tf.keras.models.load_model('results/probabilistic_classification/general/model' + str(r).zfill(2) + '.h5', compile=False)
        ## Calling the model yields a distribution object; its mean is used
        ## as the predicted value for each test sample.
        pred = model(tst_x)
        pred = pred.mean()
        pred = np.squeeze(pred)
        results.append(pred)

    results = np.array(results)
    ## One row per run, one column per test sample.
    print('Results [runs x predictions] =', results.shape)


    ## Get COSMO performance (loaded once per station)
    obsAndRef = ld(px+f'{loc}.forValid.1x1.y1to7.l1to21.pickle')
    ## Keep rows with ini == 0 and lea == 4 whose `tista` value is in the test set.
    ## NOTE(review): `isin` only filters rows; it does not reorder them to
    ## follow tst_t, so the rows are assumed to be in tst_y order — confirm.
    obsAndRefLocl = obsAndRef[obsAndRef['ini'].isin([0])]
    obsAndRefLocl = obsAndRefLocl[obsAndRefLocl['lea'].isin([4])]
    obsAndRefLocl = obsAndRefLocl[obsAndRefLocl.tista.isin(tst_t)]
    varis = ['TOT_PREC_delta__median']
    obsAndRefLocl = obsAndRefLocl.filter(varis)
    cosmo_ensemble = obsAndRefLocl.to_numpy()
    cosmo_ensemble = np.squeeze(cosmo_ensemble)
    ## Same rain / no-rain threshold as applied to the observations.
    cosmo_ensemble = np.array(cosmo_ensemble >= 0.1, dtype=float)
    print(cosmo_ensemble.shape)
    brier_cosmo = brier_score_loss(tst_y, cosmo_ensemble)


    ## Evaluate predictions
    skill_all = []
    for r in range(n_runs):

        brier_preds = brier_score_loss(tst_y, results[r])
        ## Skill > 0 means the model's Brier score is lower than the reference's.
        brier_skill = 1 - (brier_preds / brier_cosmo)
        skill_all.append(brier_skill)

    print(loc, 'Results:', 'min =', np.min(skill_all), 'median = ', np.median(skill_all), 'max =', np.max(skill_all))
    np.save('results/probabilistic_classification/general/' + loc + '_results_brier_skill.npy', skill_all)