In [None]:
import yaml
import tqdm
import gc
import bridgescaler
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from cartopy import crs as ccrs
from cartopy import feature as cfeature
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.callbacks import Callback, ModelCheckpoint, CSVLogger 
from tensorflow.python.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from callbacks import get_callbacks
from metrics import average_acc
from seed import seed_everything
from utils import read_config
from copy import deepcopy
from collections import OrderedDict
from plotting import plot_confusion_matrix
from reliability import reliability_diagram, reliability_diagrams, compute_calibration

## mPING data class

In [None]:
class mpingData:
    """Split, scale and save mPING data for model training.

    Parameters
    ----------
    conf : dict
        Configuration mapping. Keys read by this class: ``ptypes``,
        ``scale_groups`` (names of further keys in ``conf``, each holding the
        feature columns of one scaling group), ``seed``, ``save_loc``,
        ``mping_path``, ``n_splits``, ``train_size`` and ``case_studies``
        (mapping of case-study name -> collection of datetimes).

    Attributes
    ----------
    xTrain, yTrain, xValid, yValid, xTest, yTest
        ``None`` until ``_split_data`` runs; afterwards the feature / label
        subsets of the train, validation and test partitions.
    scaler
        ``None`` until ``_scale_data`` runs; afterwards the scaler that was
        fitted on the training features.
    """

    def __init__(self, 
                 conf):
        
        # variables and groupings
        self.ptypes = conf['ptypes']
        self.scaleGroups = conf['scale_groups'] 
        self.varGroups = [conf[group] for group in self.scaleGroups]
        self.features = self._flatten_groups(self.varGroups)
            
        # data parameters
        self.seed = conf['seed']
        self.savePath = conf['save_loc']
        self.mpingPath = conf['mping_path']
        self.nSplits = conf['n_splits']
        self.trainSize = conf['train_size']
        
        self.xTrain = None
        self.yTrain = None
        self.xValid = None
        self.yValid = None
        self.xTest = None
        self.yTest = None
        self.scaler = None
            
        # case study parameters
        self.caseStudies = conf['case_studies']

    @staticmethod
    def _flatten_groups(groups):
        """Return every feature name in ``groups`` as one flat 1-D array.

        Groups are concatenated one by one, so groups of different lengths
        are supported (``np.array(groups).ravel()`` only flattens when all
        groups have the same length). A group given as a bare string counts
        as a single feature.
        """
        flat = []
        for group in groups:
            if isinstance(group, str):
                flat.append(group)
            else:
                flat.extend(group)
        return np.array(flat)
            
    def _split_data(self, 
                    qc, 
                    wet_bulb,
                    split_date):
        """Read the parquet file and fill the train/valid/test attributes.

        Parameters
        ----------
        qc : bool
            If true, keep only rows whose ``wetbulb{wet_bulb}_filter`` column
            equals 0.0 and whose ``usa`` column equals 1.0.
        wet_bulb : str
            Interpolated into the name of the wet-bulb filter column.
        split_date
            Rows with ``datetime >= split_date`` go to the test set; earlier
            rows are divided between training and validation.

        Rows whose ``datetime`` is listed in any case study always go to the
        test set. The remaining early rows are split by calendar day, so one
        day never appears in both training and validation.
        """
        seed_everything(self.seed)
        all_data = pd.read_parquet(self.mpingPath)
        
        # Calendar-day key used as the group label for the train/valid split.
        all_data['day'] = all_data['datetime'].apply(lambda x: str(x).split(' ')[0])
        
        if qc:
            all_data = all_data[(all_data[f"wetbulb{wet_bulb}_filter"] == 0.0) & (all_data['usa'] == 1.0)]
            
        case_study_dates = [date
                            for case_study in self.caseStudies
                            for date in self.caseStudies[case_study]]
        
        in_case_study = all_data['datetime'].isin(case_study_dates)
        test_data = all_data[in_case_study]
        data = all_data[~in_case_study]
        
        # Everything on or after split_date is held out for testing as well.
        late_data = data[data['datetime'] >= split_date]
        test_data = pd.concat([test_data, late_data], ignore_index=True)
        data = data[data['datetime'] < split_date]

        splitter = GroupShuffleSplit(n_splits=self.nSplits, 
                                     train_size=self.trainSize, 
                                     random_state=self.seed)
        # Only the first of the n_splits partitions is used.
        train_idx, valid_idx = next(splitter.split(data, groups=data['day']))
        train_data, valid_data = data.iloc[train_idx], data.iloc[valid_idx]
        
        self.xTrain = train_data[self.features]
        self.yTrain = train_data[self.ptypes]
        self.xValid = valid_data[self.features]
        self.yValid = valid_data[self.ptypes]
        self.xTest = test_data[self.features]
        self.yTest = test_data[self.ptypes]
    
    def _scale_data(self, 
                    scale_type):
        """Fit a scaler on the training features and apply it to all splits.

        Parameters
        ----------
        scale_type : str
            Key into the table of supported scalers; only
            ``'GroupStandardScaler'`` is available. Unknown keys raise
            ``KeyError``.

        The fitted scaler is kept on ``self.scaler`` so the same transform
        can be applied to other data later.
        """
        scale_types = {'GroupStandardScaler': bridgescaler.group.GroupStandardScaler()}
        scaler = scale_types[scale_type] 
        
        self.xTrain = scaler.fit_transform(x=self.xTrain, groups=self.varGroups)
        self.xValid = scaler.transform(x=self.xValid)
        self.xTest = scaler.transform(x=self.xTest)
        self.scaler = scaler
    
    def _save_splits(self):
        """Write the six split arrays to ``.npy`` files under ``savePath``.

        ``savePath`` is used as a plain string prefix, so it must end with a
        path separator for the files to land inside that directory.
        """
        splits = {"xtrain": self.xTrain,
                  "ytrain": self.yTrain,
                  "xvalid": self.xValid,
                  "yvalid": self.yValid,
                  "xtest": self.xTest,
                  "ytest": self.yTest}
        
        for name, values in splits.items():
            # np.asarray accepts both DataFrames and ndarrays, so this works
            # whether or not the scaling step ran (and whatever container the
            # scaler returned).
            np.save(f"{self.savePath}{name}.npy", np.asarray(values))
        
    def preprocess(self,
                   split=True,
                   qc=True,
                   wet_bulb='5.0',
                   split_date='2021-06-01',
                   scale=True,
                   scale_type='GroupStandardScaler',
                   save=True):
        """Run the enabled preprocessing stages in order: split, scale, save.

        Parameters
        ----------
        split : bool
            Run ``_split_data(qc, wet_bulb, split_date)``.
        qc, wet_bulb, split_date
            Forwarded to ``_split_data``.
        scale : bool
            Run ``_scale_data(scale_type)``.
        scale_type : str
            Forwarded to ``_scale_data``.
        save : bool
            Run ``_save_splits()``.
        """
        if split:
            print("splitting data...")
            self._split_data(qc, wet_bulb, split_date)
            print("completed")

        if scale:
            print("scaling data...")
            self._scale_data(scale_type)
            print("completed")

        if save:
            print("saving data...")
            self._save_splits()
            print("completed")

## Data preprocessing

In [None]:
# Drop any `mping` object left over from a previous run of this cell so the
# data it holds can be reclaimed before a new one is built.
try:
    del mping
except NameError:
    # First run in this kernel: there is nothing to clean up.
    print('pass')
else:
    gc.collect()

conf = read_config("config/ptype.yml")

mping = mpingData(conf)
mping.preprocess(wet_bulb='5.0')
# Alternative: skip the quality-control filter with
# mping.preprocess(qc=False)

## MLP class

In [None]:
class multiLayerPerceptron:
    """Build, train, reload and apply a dense Keras classifier.

    Parameters
    ----------
    conf : dict
        Configuration mapping. Top-level keys read here: ``save_loc``,
        ``ptypes``, ``scale_groups`` (names of further keys holding feature
        lists), ``metric``, ``direction`` and ``model``. ``conf['model']``
        supplies ``hidden_layers``, ``hidden_neurons``, ``use_dropout``,
        ``dropout_alpha`` (read only when ``use_dropout == 1``),
        ``batch_size``, ``epochs``, ``lr``, ``activation``,
        ``output_activation``, ``run_eagerly``, ``optimizer`` and ``loss``.
        The whole ``conf`` is also passed to ``get_callbacks``.
    """

    def __init__(self, 
                 conf):
        
        # variables, groupings, data
        self.savePath = conf['save_loc']
        self.ptypes = conf['ptypes']
        self.scaleGroups = conf['scale_groups'] 
        self.varGroups = [conf[group] for group in self.scaleGroups]
        self.features = self._flatten_groups(self.varGroups)
        
        # metric parameters
        metrics = {'average_acc': average_acc}
        
        self.metric = metrics[conf['metric']]
        self.direction = conf['direction']  # stored; not used inside this class
        
        # model parameters
        model_conf = conf['model']
        
        self.hiddenLayers = model_conf['hidden_layers']
        self.hiddenNeurons = model_conf['hidden_neurons']
        self.useDropout = model_conf['use_dropout']
        
        if self.useDropout == 1:
            self.dropoutAlpha = model_conf['dropout_alpha']
        
        self.batchSize = model_conf['batch_size']
        self.epochs = model_conf['epochs']
        self.learningRate = model_conf['lr']
        self.activation = model_conf['activation']
        self.outputActivation = model_conf['output_activation']
        self.runEagerly = model_conf['run_eagerly']
        
        optimizers = {'adam': tf.keras.optimizers.Adam(self.learningRate)}
        losses = {'categorical_crossentropy': tf.keras.losses.CategoricalCrossentropy()}
        
        self.optimizer = optimizers[model_conf['optimizer']]
        self.loss = losses[model_conf['loss']]
        
        # callback parameters
        self.callbacks = get_callbacks(conf)

    @staticmethod
    def _flatten_groups(groups):
        """Return every feature name in ``groups`` as one flat 1-D array.

        Groups are concatenated one by one, so groups of different lengths
        are supported (``np.array(groups).ravel()`` only flattens when all
        groups have the same length). A group given as a bare string counts
        as a single feature.
        """
        flat = []
        for group in groups:
            if isinstance(group, str):
                flat.append(group)
            else:
                flat.extend(group)
        return np.array(flat)

    def _add_dense(self, model, units):
        """Append one dense layer of ``units`` neurons with the configured activation.

        ``'leaky'`` is handled as a separate ``LeakyReLU`` layer after a
        linear ``Dense``; any other value is passed to ``Dense`` directly.
        """
        if self.activation == 'leaky':
            model.add(tf.keras.layers.Dense(units))
            model.add(tf.keras.layers.LeakyReLU())
        else:
            model.add(tf.keras.layers.Dense(units, activation=self.activation))
    
    def _build_mlp_model(self):
        """Assemble the Sequential model, print its summary and return it.

        Layer order: one dense layer as wide as the input, then
        ``hiddenLayers`` dense layers of ``hiddenNeurons`` units, then the
        output layer with one unit per entry of ``ptypes``. Dropout follows
        each hidden layer only when dropout is enabled and there is more than
        one hidden layer.
        """
        input_size = self._flatten_groups(self.varGroups).shape[0]
        output_size = len(self.ptypes)
        
        model = tf.keras.models.Sequential()
        self._add_dense(model, input_size)
        
        add_dropout = self.hiddenLayers != 1 and self.useDropout == 1
        for _ in range(self.hiddenLayers):
            self._add_dense(model, self.hiddenNeurons)
            if add_dropout:
                model.add(tf.keras.layers.Dropout(self.dropoutAlpha))
        
        model.add(tf.keras.layers.Dense(output_size, activation=self.outputActivation))
        model.build((self.batchSize, input_size))
        model.summary()
    
        return model
    
    def train(self, 
              x_train, 
              y_train, 
              x_valid, 
              y_valid):
        """Build a fresh model, compile it, fit it and return it.

        ``x_valid`` / ``y_valid`` are passed to ``fit`` as validation data.
        The configured callbacks, batch size and epoch count are used.
        """
        # TODO: add option to load previous model weights
        
        model = self._build_mlp_model()
        
        model.compile(loss=self.loss, 
                      optimizer=self.optimizer, 
                      metrics=[self.metric],
                      run_eagerly=self.runEagerly)
        
        model.fit(x_train, 
                  y_train, 
                  validation_data=(x_valid, y_valid), 
                  callbacks=self.callbacks,
                  batch_size=self.batchSize,  
                  epochs=self.epochs)
        
        return model
    
    def load_saved_model(self,
                        load_path=None):
        """Rebuild the architecture and load weights from ``load_path``.

        When ``load_path`` is ``None`` the weights are read from
        ``f"{savePath}best_weights.h5"``.
        """
        if load_path is None:
            load_path = f"{self.savePath}best_weights.h5"
            
        print(f"loading from {load_path}")
        
        model = self._build_mlp_model()
        model.load_weights(load_path)
        
        return model
    
    def predict(self, 
                model, 
                x_test, 
                y_test):
        """Run ``model`` on ``x_test`` and reduce the outputs to class indices.

        Returns
        -------
        labels : array
            ``argmax`` of ``y_test`` along axis 1.
        preds : array
            ``argmax`` of the model output along axis 1.
        probs : array
            Maximum of the model output along axis 1.
        """
        predictions = model.predict(x_test)

        probs = np.max(predictions, 1)
        preds = np.argmax(predictions, 1)
        labels = np.argmax(y_test, 1)
        
        return labels, preds, probs

## Model Training

In [None]:
# Release the previous model (if this cell was already run in this kernel)
# before building a new one.
try:
    del mlp
except NameError:
    # First run in this kernel: nothing to release.
    print('pass')
else:
    # The model is built through tf.keras, so the session is cleared through
    # the matching tf.keras backend.
    tf.keras.backend.clear_session()
    gc.collect()

conf = read_config("config/ptype.yml")

x_train = np.load(f"{conf['save_loc']}xtrain.npy")
y_train = np.load(f"{conf['save_loc']}ytrain.npy")
x_valid = np.load(f"{conf['save_loc']}xvalid.npy")
y_valid = np.load(f"{conf['save_loc']}yvalid.npy")

print(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)

mlp = multiLayerPerceptron(conf)

model = mlp.train(x_train,
                  y_train,
                  x_valid,
                  y_valid)

## Model Evaluation

### Define variables, load previous model weights, predict on test set 

In [None]:
# Evaluation settings shared by the cells below.
qc = True
wet_bulb = '5.0'
dataset = 'mping2021wb5'
# NOTE(review): absolute, user-specific output directory; figures are written
# here by the plotting cells, so it must exist before they run.
image_path = f"/glade/u/home/jwillson/winter-ptype/images/{dataset}/"
class_names = ['ra', 'sn', 'pl', 'fzra']

# Release the previous model (if one exists in this kernel) before reloading.
try:
    del mlp
except NameError:
    # No earlier model in this kernel: nothing to release.
    print('pass')
else:
    # The model is built through tf.keras, so the session is cleared through
    # the matching tf.keras backend.
    tf.keras.backend.clear_session()
    gc.collect()

conf = read_config("config/ptype.yml")

x_test = np.load(f"{conf['save_loc']}xtest.npy")
y_test = np.load(f"{conf['save_loc']}ytest.npy")

print(x_test.shape, y_test.shape)

mlp = multiLayerPerceptron(conf)
model = mlp.load_saved_model()

labels, preds, probs = mlp.predict(model,
                                   x_test,
                                   y_test)

### Confusion matrices

In [None]:
np.set_printoptions(precision=2)

# One entry per figure, in drawing order: raw counts first, then the two
# normalized variants. Each entry holds the keyword arguments that differ
# between the calls.
cm_variants = [
    dict(title=f"{dataset} Confusion Matrix",
         filename='cm.png'),
    dict(normalize=True,
         axis=1,
         title=f"{dataset} Confusion Matrix (normalized axis=1)",
         filename='cm_norm1.png'),
    dict(normalize=True,
         axis=0,
         title=f"{dataset} Confusion Matrix (normalized axis=0)",
         filename='cm_norm0.png'),
]

for cm_kwargs in cm_variants:
    plot_confusion_matrix(labels, preds, class_names, dataset, **cm_kwargs)

plt.show()

### Reliability Diagrams

In [None]:
# Collect the test-set predictions in one frame; later cells reuse `test_data`.
test_data = pd.DataFrame({
    "pred_labels": preds,
    "true_labels": labels,
    "pred_conf": probs,
})

true_labels_arr = test_data["true_labels"].to_numpy()
pred_labels_arr = test_data["pred_labels"].to_numpy()
pred_conf_arr = test_data["pred_conf"].to_numpy()

# Reliability diagram over the whole test set.
fig = reliability_diagram(
    true_labels_arr,
    pred_labels_arr,
    pred_conf_arr,
    num_bins=10,
    draw_ece=True,
    draw_bin_importance="alpha",
    draw_averages=True,
    title=f"{dataset} Reliability Diagram",
    figsize=(5, 5),
    dpi=300,
    return_fig=True,
)

plt.savefig(f'{image_path}reliability.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Display names, one per class index 0..3; also used as legend entries later.
ptypes = [f'{dataset} Rain', f'{dataset} Snow', f'{dataset} Ice Pellets', f'{dataset} Freezing Rain']

# Row masks selecting each true class; later cells reuse cond0..cond3.
cond0 = (test_data["true_labels"] == 0)
cond1 = (test_data["true_labels"] == 1)
cond2 = (test_data["true_labels"] == 2)
cond3 = (test_data["true_labels"] == 3)

# One results entry per class, keyed by display name, in class-index order.
results = OrderedDict()
for ptype_name, class_mask in zip(ptypes, (cond0, cond1, cond2, cond3)):
    class_rows = test_data[class_mask]
    results[ptype_name] = {
        "true_labels": class_rows["true_labels"].values,
        "pred_labels": class_rows["pred_labels"].values,
        "confidences": class_rows["pred_conf"].values,
    }

fig = reliability_diagrams(results,
                           num_bins=10,
                           draw_bin_importance="alpha",
                           num_cols=2,
                           dpi=300,
                           return_fig=True)

plt.savefig(f'{image_path}class_reliability.png', dpi=300, bbox_inches='tight')
plt.show()

### Cumulative accuracy vs. confidence plots

In [None]:
def compute_cov(df, 
                col = "pred_conf", 
                quan = "uncertainty", 
                ascending = False):
    """Return a sorted copy of ``df`` with running means and coverage added.

    The copy is sorted by ``col`` (descending unless ``ascending`` is true)
    and gains four columns:

    * ``dummy`` - constant 1, the per-row counter;
    * ``cu_{quan}`` - running mean of ``quan`` over the rows seen so far;
    * ``cu_{col}`` - running mean of ``col`` over the rows seen so far;
    * ``{col}_cov`` - fraction of all rows seen so far.

    The input frame is left unmodified.
    """
    ranked = df.copy().sort_values(col, ascending=ascending)

    # Running row count 1, 2, ..., n in sorted order.
    ranked["dummy"] = 1
    rows_seen = ranked["dummy"].cumsum()

    ranked[f"cu_{quan}"] = ranked[quan].cumsum() / rows_seen
    ranked[f"cu_{col}"] = ranked[col].cumsum() / rows_seen
    ranked[f"{col}_cov"] = rows_seen / len(ranked)

    return ranked

# Per-row correctness flag, then cumulative accuracy over rows ordered from
# most to least confident.
test_data["acc"] = (test_data["pred_labels"] == test_data["true_labels"]).to_numpy()
test_data_sorted = compute_cov(test_data, col = "pred_conf", quan = "acc")

fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(1, 1, 1)
ax.plot(
    test_data_sorted["pred_conf_cov"],
    test_data_sorted["cu_acc"]
)

# Fix the tick positions before relabelling them. set_xticklabels assigns
# labels to ticks by position, so without fixed ticks the text depends on
# whichever ticks the automatic locator happens to choose.
# NOTE(review): the bottom-axis labels are simply 1 - (data fraction); they
# are not the confidence values found at those fractions - confirm this is
# the intended reading of the "Confidence" axis.
coverage_ticks = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
ax.set_xticks(coverage_ticks)
ax.set_xticklabels([1.0, 0.8, 0.6, 0.4, 0.2, 0.0])

# Twin axis on top shows the same curve against the true x values.
ax2 = ax.twiny()
ax2.set_xlabel("Test Data Fraction", fontsize=14)
ax2.plot(
    test_data_sorted["pred_conf_cov"],
    test_data_sorted["cu_acc"]
)
ax.set_ylabel("Cumulative Accuracy", fontsize=14)
ax.set_xlabel("Confidence", fontsize=14)

plt.savefig(f'{image_path}acc_vs_cov.png',
            dpi=300, 
            bbox_inches='tight')
plt.show()

In [None]:
# Cumulative accuracy curves, one per true class.
fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(1, 1, 1)
ax2 = ax.twiny()
for c in [cond0, cond1, cond2, cond3]:
    _test_data_sorted = compute_cov(test_data[c], col = "pred_conf", quan = "acc")
    ax.plot(
        _test_data_sorted["pred_conf_cov"],
        _test_data_sorted["cu_acc"]
    )
    ax2.plot(
        _test_data_sorted["pred_conf_cov"],
        _test_data_sorted["cu_acc"]
    )

# Fix the tick positions before relabelling them. set_xticklabels assigns
# labels to ticks by position, so without fixed ticks the text depends on
# whichever ticks the automatic locator happens to choose.
# NOTE(review): the bottom-axis labels are simply 1 - (data fraction); they
# are not the confidence values found at those fractions - confirm this is
# the intended reading of the "Confidence" axis.
coverage_ticks = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
ax.set_xticks(coverage_ticks)
ax.set_xticklabels([1.0, 0.8, 0.6, 0.4, 0.2, 0.0])

ax.set_ylabel("Cumulative Accuracy", fontsize=14)
ax2.set_xlabel("Test Data Fraction", fontsize=14)
ax.set_xlabel("Confidence", fontsize=14)
ax.legend(ptypes)
plt.savefig(f'{image_path}class_acc_vs_cov.png', 
            dpi=300, 
            bbox_inches='tight')
plt.show()

### Case Studies

In [None]:
ptype_list = conf['ptypes']
case_studies = conf['case_studies']

# Reload the full observation table and apply the same quality-control
# filter that is selected by `qc` / `wet_bulb`.
data = pd.read_parquet(conf['mping_path'])
if qc:
    passes_wetbulb = data[f"wetbulb{wet_bulb}_filter"] == 0.0
    in_usa = data['usa'] == 1.0
    data = data[passes_wetbulb & in_usa]

# Every datetime listed under any case study, in configuration order.
case_study_dates = []
for case_study in case_studies:
    case_study_dates.extend(case_studies[case_study])

# Keep only the rows that belong to some case study.
data = data[data['datetime'].isin(case_study_dates)]

plot_types = ["true_label", "pred_label", "pred_conf"]

def plot_case_study(df,
                    case_study,
                    plot_type, 
                    savefig=False, 
                    image_path=""):
    """Draw one case-study map on a Lambert conformal projection.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to plot. Columns read: ``datetime``, ``lon``, ``lat``,
        ``pred_label`` and the column named by ``plot_type``.
    case_study : str
        Case-study name; used only in the output file name.
    plot_type : str
        ``"true_label"`` or ``"pred_label"`` colours the points by class and
        adds a class legend; ``"pred_conf"`` shades the points by confidence
        and adds a colour bar.
    savefig : bool
        If true, save the figure as
        ``f"{image_path}{case_study}_{plot_type}.png"``.
    image_path : str
        String prefix for the saved file.

    Notes
    -----
    The title reads the notebook-level variable ``dataset``. 360 is
    subtracted from ``lon`` before plotting - presumably the column is
    stored on a 0-360 grid; confirm against the data.
    """
    latN = 54.0
    latS = 20.0
    lonW = -125.0
    lonE = -63.0
    cLat = (latN + latS)/2
    cLon = (lonW + lonE )/2
    colors = {0:'lime', 1:'dodgerblue', 2:'red', 3:'black'}
    class_labels = ["Rain", "Snow", "Ice Pellets", "Freezing Rain"]
    
    proj = ccrs.LambertConformal(central_longitude=cLon, central_latitude=cLat)
    res = '50m'  # Medium resolution; '110m' is coarsest/quickest and '10m' is finest/slowest.
    fig = plt.figure(figsize=(18, 12))
    ax = plt.subplot(1, 1, 1, projection=proj)
    ax.set_extent([lonW, lonE, latS, latN])
    ax.add_feature(cfeature.LAND.with_scale(res))
    ax.add_feature(cfeature.OCEAN.with_scale(res))
    ax.add_feature(cfeature.COASTLINE.with_scale(res))
    ax.add_feature(cfeature.LAKES.with_scale(res), alpha=0.5)
    ax.add_feature(cfeature.STATES.with_scale(res))
    
    first_day = str(min(df['datetime'])).split(' ')[0]
    last_day = str(max(df['datetime'])).split(' ')[0]

    # Draw order per class index; a higher zorder is drawn on top.
    zorder = [1,2,4,3]
    if plot_type == "pred_conf":
        for i in range(4):
            is_class = df["pred_label"] == i
            sc = ax.scatter(df["lon"][is_class]-360,
                            df["lat"][is_class],
                            c=df[plot_type][is_class],
                            s=60, 
                            transform=ccrs.PlateCarree(), 
                            cmap='Greys', 
                            vmin=df[plot_type].min(), 
                            vmax=df[plot_type].max())
        
        # All four scatters share vmin/vmax, so the last one can stand in
        # for the colour bar.
        cbar = plt.colorbar(sc, 
                            orientation="horizontal", 
                            pad=0.025, 
                            shrink=0.9325)
        
        cbar.set_label('Confidence', size=20)
    
    else:
        class_handles = []
        for i in range(4):
            is_class = df[plot_type] == i
            class_handles.append(
                ax.scatter(df["lon"][is_class]-360,
                           df["lat"][is_class],
                           c=df[plot_type][is_class].map(colors),
                           s=60, 
                           alpha=0.2,
                           transform=ccrs.PlateCarree(), 
                           zorder=zorder[i]))

        # Pass the scatter artists themselves as handles. Colour-name strings
        # are not artists, and mixing a positional argument with `labels=`
        # makes matplotlib warn and discard the positional argument.
        plt.legend(handles=class_handles, 
                   labels=class_labels, 
                   fontsize=24, 
                   markerscale=3, 
                   loc="lower right")
    
    titles = {"true_label": "True Labels",
             "pred_label": "Pred Labels",
             "pred_conf": "Confidences"}
    
    plt.title(f"{dataset} {first_day} to {last_day} {titles[plot_type]}", 
              fontsize=30)
    
    if savefig:
        plt.savefig(f'{image_path}{case_study}_{plot_type}.png', 
                    dpi=300, 
                    bbox_inches='tight')
    plt.show()
    # This function is called repeatedly in a loop; close the large figure so
    # it does not stay in memory.
    plt.close(fig)
    

# Predict and plot every case study. Loop-local names (case_*) are used so the
# test-set `x_test`, `y_test`, `labels`, `preds` and `probs` from the
# evaluation cells are not overwritten.
for case_study in case_studies:
    # .copy() gives an independent frame, so adding the prediction columns
    # below does not write into a slice of `data`.
    df = data[data['datetime'].isin(case_studies[case_study])].copy()

    # NOTE(review): these features come straight from the parquet file. The
    # arrays written by mpingData.preprocess are scaled by default, so if the
    # model was trained on them, the same fitted scaler should probably be
    # applied here before predicting - confirm.
    case_x = np.array(df[mlp.features])
    case_y = np.array(df[mlp.ptypes])
    
    case_labels, case_preds, case_probs = mlp.predict(model,
                                                      case_x,
                                                      case_y)
    df["pred_label"] = case_preds
    df["true_label"] = case_labels 
    df["pred_conf"] = case_probs
    
    for plot_type in plot_types:
        plot_case_study(df,
                        case_study,
                        plot_type,
                        savefig=True,
                        image_path=image_path)