In [44]:
# Modules
import numpy as np
import xarray as xr
import pandas as pd

from data import data_loader
from data import gesla_preprocessing
from data import era5_preprocessing
from data import preprocessing

In [45]:
# ---
# Preprocessing
# ---

# Settings passed to the preprocessing routine
season = "winter"
predictor = "sp"
percentile = 0.95

# Returns predictor (X), predictand (Y) and a third value (t);
# presumably t is the shared time axis -- confirm in preprocessing.preprocessing1.
X, Y, t = preprocessing.preprocessing1(season, predictor, percentile)

# Missing values:
# Overwrite every NaN in X (in place) with the sentinel -999, a number
# that is assumed not to occur in the real data, so the model can
# hopefully learn to treat it separately.
nan_mask = np.isnan(X)
X[nan_mask] = -999

# Keep the lengths of the first three axes of X so the flattened model
# output can be interpreted later.
# NOTE(review): axes 1 and 2 are presumably lat and lon, axis 0 the samples.
ndim, nlat, nlon = X.shape[0], X.shape[1], X.shape[2]

# Bring the data into the 2D layout expected by the model:
# first axis stays as is, all remaining axes are flattened into one.
X = X.reshape(ndim, -1) # (ndim, nlat * nlon) for a 3D input
y = Y[:, 0] # Only the first column of Y, i.e. a single station

Load ERA5-Predictor: sp in region: lon-0530_lat7040 for years: 1999-2008 in season: winter
Load Predictand from GESLA
Applied one-hot-encoding
Get overlapping timeseries of ERA5 and GESLA


In [47]:
#---
# Train Model
#---
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Train-Test Split: 25% of the samples are held out for testing.
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Hyperparameters
max_depth = 3

# Setup Model
model = RandomForestClassifier(
    n_estimators=100,  # number of trees in the forest
    criterion="gini",
    max_depth=max_depth,
    class_weight="balanced",
    oob_score=True,
    random_state=0,  # fixed seed, to compare results when changing hyperparameters
)

# Fit on the training part only; as the last expression of the cell the
# fitted estimator is also displayed.
model.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=3, oob_score=True,
                       random_state=0)

In [54]:
#---
# Evaluate model / Diagnostic
#---
# Importance & Score
# model.score is evaluated on both splits so train and test can be compared.
importances = model.feature_importances_
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"test_score: {test_score}")
print(f"train_score: {train_score}")
print(f"importances: {importances}")

# Prediction on the test split: predicted labels and class probabilities
y_pred = model.predict(X_test)
proba = model.predict_proba(X_test)

# Show false predictions
# false_idx holds the positions where prediction and truth disagree;
# false_pred holds the TRUE labels at those positions (not the predicted ones).
is_mismatch = y_pred != y_test
false_idx = np.where(is_mismatch)
false_pred = y_test[false_idx]

print(f"True values at index of false prediction:\n {false_pred}")

test_score: 0.8539823008849557
train_score: 0.8552437223042836
importances: [0.00081332 0.         0.         ... 0.00075431 0.         0.        ]
True values at index of false prediction:
 [0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 1. 0. 0. 0. 1. 0. 0. 0. 0.]


In [None]:
# Postprocessing

In [None]:
# Diagnostics

In [None]:
# Visualization