In [2]:
# Modules
import numpy as np
import xarray as xr
import pandas as pd

from data import data_loader
from data import gesla_preprocessing
from data import era5_preprocessing
from data import preprocessing

# Work if there's time

1. Describe all modules in .py file <br>

2. Do we need to minimize the risk of misclassification (see Olli>Lernverfahren>05_classification)? <br>

# Next Steps

1. How do we optimize automatic search? <br>

2. Function that saves test and train statistics during the model run to analyze overfitting.

3. Use Learning Curves to analyze overfitting during fitting process of hyperparameters?

4. Evaluation module <br>

4.1 Confusion Matrix (done) <br> 

4.2 Error rate (#errors / #instances) --> Success rate / accuracy = 1 - error rate <br>

5. Do I need a validation set? Currently I only have a train/test split, but one could use separate train, validation and test sets.

6. When using combined predictors, I need to normalize features, right (see Scaling in ML-Course Udemy)? I did not do that in previous runs (rf005, rf006)

7. Use Gradient boosting? (see meta learning, Lernverfahren)

8. Regression using Random Forests

9. Use SVM with Kernel for classification?

10. Add Pipeline to GridSearchCV?

# Working Area

In [None]:
#---
# Scale data
#---
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only and apply the
# same transformation to both splits.
s = StandardScaler()
X_train = s.fit_transform(X_train)
X_test = s.transform(X_test)

In [None]:
#---
# Learning Curve
#---

#- Train & Test split
X = df.drop('success', axis = 1).values #- Copies DF
y = df['success'].values

#- Plot Learning Curve
from sklearn.model_selection import learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import numpy as np

X, y = shuffle(X, y) #- random selection of data. Good if you dont know if data is ordered
train_sizes_abs, train_scores, test_scores = learning_curve(LogisticRegression(), X, y)
%matplotlib inline

import matplotlib.pyplot as plt

plt.plot(train_sizes_abs, np.mean(train_scores, axis = 1)) #- learning curve macht automatisch k-fold crossvalidation. deswegen mean
plt.plot(train_sizes_abs, np.mean(test_scores, axis = 1)) #- learning curve macht automatisch k-fold crossvalidation. deswegen mean

plt.show()

#- Note: Do this more often to get mean. Sometimes Curves look weird.

In [None]:
#---
# Pipeline GridSearchCV: 
# Add this to modelfit.py?
#---
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC 

# Scaling is a pipeline step, so it is fitted together with the classifier
# on whatever data the pipeline is fitted on.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC()),
])

from sklearn.model_selection import GridSearchCV

# "<step name>__<parameter>" addresses a parameter of one pipeline step
clf = GridSearchCV(pipeline, param_grid = {
    "svc__C": [0.001, 0.01, 0.1, 1, 10,],
    "svc__gamma": [0.001, 0.01, 0.1, 1, 10,],
})

clf.fit(X_train, y_train)

print(clf.best_params_)

# The notebook only creates a train/test split (no validation set), so the
# held-out score is computed on the test data.
print(clf.score(X_test, y_test)) # Accuracy on basis of test data

print(clf.best_score_) # Accuracy based on k-fold cross-validation

# Modularized Preprocessing

In [5]:
#---
# Modularize Preprocessing
#---

# Settings that select the predictor and predictand timeseries.
# They are read by the loading / preprocessing cells further down.
# A trailing list in brackets shows the alternatives noted for that setting.
season = "winter" # ["winter", "autumn",]
predictors = ["sp", "tp", "u10", "v10",] # ERA5 predictor names; one of them is picked when loading
percentile = 0.95 # [0.95, 0.99,] passed to the one-hot encoding of the predictand
preprocess = "preprocess1" # ["preprocess1"]
range_of_years = "1999-2008" # ["1999-2008", "2009-2018", "2019-2022",]
subregion = "lon-0530_lat7040" # ["lon-0530_lat7040"]
station_names = ["hanko-han-fin-cmems",] # GESLA stations to load

In [10]:
# Load already preprocessed Era5 Data
# Preprocessing done with cdo
#---
# Only the first entry of `predictors` is used in this run.
predictor = predictors[0]
era5_predictor = data_loader.load_daymean_era5(range_of_years, subregion, season, predictor, preprocess)

# Show the array shape as cell output (presumably (time, lat, lon) -- TODO confirm axis order)
era5_predictor.shape

(903, 121, 141)

In [7]:
#---
# Preprocess GESLA Data
#---
# Every step below rebinds `gesla_predictand`, so the statements have to run
# in exactly this order and the cell cannot be re-run halfway through.

# Load Predictand
#---
gesla_predictand = data_loader.load_gesla(station_names)

# Select a season
#---
gesla_predictand = gesla_preprocessing.select_season(gesla_predictand, season)

# Select only sea_level analysis data
#---
gesla_predictand = gesla_preprocessing.get_analysis(gesla_predictand)

# Subtract mean of data grouped by station
#---
gesla_predictand = gesla_predictand["sea_level"] # Detrend expects pd.Series
gesla_predictand = gesla_preprocessing.detrend(gesla_predictand, level="station")

# Apply one hot encoding
# NOTE(review): presumably values above the given percentile become 1 and all
# others 0 -- confirm in gesla_preprocessing.apply_dummies
gesla_predictand = gesla_preprocessing.apply_dummies(gesla_predictand, percentile=percentile, level="station")
print(f"Applied one-hot-encoding with Percentile: {percentile}")

# Convert to DataArray
# nan values: no measurement at that timestamp for specific station
gesla_predictand = gesla_predictand.to_xarray()

Load Predictand from GESLA
Applied one-hot-encoding with Percentile: 0.95


In [11]:
#---
# Get overlapping time-series
#---
# Returns the predictor values X, the predictand values Y and the shared
# time axis t (names as used by the following cells).
X, Y, t = preprocessing.intersect_time(era5_predictor, gesla_predictand)

# Sanity check: all three must have the same length along their first axis
print(f"X: {X.shape}")
print(f"Y: {Y.shape}")
print(f"t: {t.shape}")

Get overlapping timeseries of ERA5 and GESLA
X: (903, 121, 141)
Y: (903, 1)
t: (903,)


In [12]:
# Reshape for model input
#---
ndim = t.shape[0] # number of timesteps, i.e. number of samples
X = X.reshape(ndim, -1) # (n_samples, n_features): all remaining axes are flattened into one feature axis
y = Y[:, 0] # Select one station (first column of Y)
print(X.shape)
print(y.shape)

(903, 17061)
(903,)


In [13]:
#---
# Handle NaN Values
#---

# Replace every NaN by a sentinel number that does not occur in the data,
# hoping that the model learns to treat it as "missing".
X[np.isnan(X)] = -999

# Modularized Model Setup

In [42]:
#---
# Train Model
#---
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out a quarter of the samples for testing.
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
X_train.shape

(677, 17061)

In [44]:
#---
#  Optimize Hyperparameters
#---
from models import modelfit

# Unfitted estimator that is handed to the hyperparameter search
clf = RandomForestClassifier()
# Name of the search strategy; passed to modelfit.optimize_hyperparameter
optimizer = "RandomSearchCV"
k = 3 # k-fold cross-validation
n_iter = 100 # number of combinations

# Search space for the random forest hyperparameters.
# Every list holds the candidate values for one constructor argument.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
# NOTE(review): newer scikit-learn releases no longer accept 'auto' for
# classifiers -- check against the installed version before re-running.
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree: 10, 20, ..., 110 and None (unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

# Echo the search space so that it is recorded in the cell output
print(f"Tested Hyperparameters: {param_grid}")

# Search on the training split only; the result is used as a dict of
# constructor arguments when the final model is built.
# NOTE(review): n_jobs=-1 is presumably forwarded to the scikit-learn search
# (use all cores) -- confirm in modelfit.optimize_hyperparameter
best_params = modelfit.optimize_hyperparameter(X_train, y_train, clf, optimizer, param_grid, k, n_iter=n_iter, n_jobs=-1)

Tested Hyperparameters: {'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Optimize Hyperparameters using RandomSearchCV
Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [54]:
#---
#  Save hyperparameters
#---
from data import saver

# Target directory and identifier of this model run
folder = "results/random_forest/"
model_run = "rf004"

# Persist the best hyperparameters of the search for this predictor / percentile
saver.save_hpdict(best_params, predictor, model_run, percentile, folder)

In [46]:
#---
# Fit the model
#---
# Build the final forest from the best hyperparameters of the search.
# Out-of-bag scoring needs bootstrap resampling. The search space also
# contains bootstrap=False, so oob_score follows the selected value instead
# of being forced on (scikit-learn raises a ValueError for
# oob_score=True together with bootstrap=False).
model = RandomForestClassifier(criterion='gini',
n_estimators=best_params["n_estimators"], #- nTrees 
max_depth=best_params["max_depth"], 
max_features=best_params["max_features"],
min_samples_leaf=best_params["min_samples_leaf"],
min_samples_split=best_params["min_samples_split"],
bootstrap=best_params["bootstrap"],
random_state=0, # To compare results when changing hyperparameters
class_weight="balanced",
oob_score=bool(best_params["bootstrap"]),
)

model.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=20,
                       max_features='sqrt', min_samples_leaf=2,
                       n_estimators=600, oob_score=True, random_state=0)

In [47]:
#---
# Saving the model
#---
filename = f'RandomForest_{optimizer}.sav'

# The context manager closes the file even if pickling fails
with open(f'{folder}{filename}', 'wb') as model_file:
    pickle.dump(model, model_file)

In [50]:
#---
# Load the model from disk
#---
# The model was written to f'{folder}{filename}', so read it from there.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open(f'{folder}{filename}', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Mean accuracy of the restored model on the held-out test data
result = loaded_model.score(X_test, y_test)

<bound method BaseEstimator.get_params of RandomForestClassifier(class_weight='balanced', max_depth=20,
                       max_features='sqrt', min_samples_leaf=2,
                       n_estimators=600, oob_score=True, random_state=0)>

# Evaluate Model

In [None]:
#---
# Evaluate model / Diagnostic
#--- 
print("Evaluate Model \n")

# Score & Importance
#---
# Score on held-out and on training data; compare both to judge overfitting
test_score = model.score(X_test, y_test)
train_score = model.score(X_train, y_train)
importance = model.feature_importances_

print(f"test_score: {test_score}")
print(f"train_score: {train_score}")
print(f"importance: {importance}")

# np.save appends ".npy" when the name lacks it. Spell the extension out so
# that the reported path is the file that is actually written.
fname = f"importance_{predictor}{str(percentile)[-2:]}.npy"
np.save(f"{folder}{fname}", importance)
print(f"saved importance to : {folder}{fname}")

# Confusion matrix
#---
# Format: 
# Reality / Model: Negative, Positive
# Negative    Right Negative, False Positive 
# Positive    False Negative, Right Positive

from sklearn.metrics import confusion_matrix
from models import evaluation

print("Show Confusion Matrix \n")

cfm_fig = evaluation.plot_cf(model, X_test, y_test)
cfm_fig.show()

# Save CFM
fname = f"{folder}cf_matrix_{predictor}{str(percentile)[-2:]}.jpg"
cfm_fig.savefig(fname)
print(f"saved cf matrix to : {fname}")

# Calculate CFM-Metrics
metrics = evaluation.cfm_metrics(model, X_test, y_test)

# Build the full path once, so the log message names the file that is written
# (same convention as the other "saved ... to" messages of this cell)
fname = f"{folder}cf_metrics_{predictor}{str(percentile)[-2:]}.pkl"

with open(fname, 'wb') as f:
    pickle.dump(metrics, f)

print(f"saved cf metrics to : {fname}")


# AUROC
# Receiver Operating Characteristics & Area Under the Curve
#---
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

print("Show AUROC \n")

# predict_proba returns one column per class (0 and 1); keep only the second column
y_test_proba = model.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_test, y_test_proba)
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

print(f'AUC: {auc}')

# ROC curve: true positive rate against false positive rate
fig, ax = plt.subplots(tight_layout=True)
ax.plot(fpr, tpr)
ax.set(xlabel="FPR", ylabel="TPR", title=f"AUROC with AUC = {auc}")

fig.show()

fname = f"{folder}AUROC_{predictor}{str(percentile)[-2:]}.jpg"
fig.savefig(fname)
print(f"saved AUROC to : {fname}")

#---
# Visualize importance maps
#---
# Load lat lons
# Same selection arguments as used for loading the ERA5 predictor
lats, lons = preprocessing.get_lonlats(
    range_of_years,
    subregion,
    season,
    predictor,
    preprocess,
)

# Plot importance-map
from models import evaluation

# Tag built from predictor name and the last two characters of the percentile, e.g. "sp95"
tflag = f"{predictor}{str(percentile)[-2:]}"

fig = evaluation.importance_map(importance, lons, lats, tflag=tflag)

# Save importance-map
# NOTE(review): this rebinds `folder` to a run-specific sub-directory. The
# earlier saves of this cell used the previous value of `folder`, and every
# cell executed afterwards sees the new one -- confirm that this is intended.
folder = f"results/random_forest/{model_run}/"
fname = f"importance_map_{predictor}{str(percentile)[-2:]}"
fig.savefig(f"{folder}{fname}.jpg")

## Evaluate the model

# Not Modularized Examples

In [2]:
# ---
# Preprocessing
# ---

model_run = "rf002"

# Settings that select the predictor / predictand timeseries
season = "winter"
predictors = ["sp", "tp", "u10", "v10",]
predictor = "sp"
percentile = 0.95


X, Y, t = preprocessing.preprocessing1(season, predictor, percentile)

# Handle NaN values:
# Replace them by a sentinel number that does not occur in the data,
# hoping that the model learns to treat it as "missing".
X[np.isnan(X)] = -999

# Remember the grid dimensions for interpreting model output later
ndim, nlat, nlon = X.shape[:3]

# Flatten the grid so that every sample becomes one row of features
X = X.reshape(ndim, -1)
y = Y[:, 0] # Select only one station

Load ERA5-Predictor: sp in region: lon-0530_lat7040 for years: 1999-2008 in season: winter
Load Predictand from GESLA
Applied one-hot-encoding
Get overlapping timeseries of ERA5 and GESLA


In [None]:
#---
# Load Hyperparameters rf002 and rf003
#---

import pickle


def _print_hyperparameters(model_run, predictors, percentile):
    """Print the stored hyperparameters of one model run.

    For every predictor the file
    ``models/random_forest/<model_run>/<model_run>_<predictor><pflag>.pkl``
    is unpickled and printed, where ``pflag`` is the last two characters of
    ``str(percentile)`` (e.g. "99" for 0.99).

    Parameters
    ----------
    model_run : str
        Identifier of the model run, e.g. "rf002".
    predictors : list of str
        Predictor names whose hyperparameter files are read.
    percentile : float
        Percentile the run was trained with; only used for the file name.

    Raises
    ------
    FileNotFoundError
        If one of the expected pickle files does not exist.
    """
    pflag = str(percentile)[-2:]
    run_folder = f"models/random_forest/{model_run}/"

    print(f"model_run: {model_run}, percentile: {percentile}")
    for name in predictors:
        fname = f"{model_run}_{name}{pflag}.pkl"

        # Only unpickle files you created yourself: pickle.load can run arbitrary code
        with open(f"{run_folder}{fname}", 'rb') as f:
            data = pickle.load(f)

        print(name)
        print(data)


predictors = ["sp", "tp", "u10", "v10",]
percentile = 0.99

# The loop lives inside the helper, so the notebook-level names `predictor`
# and `folder` used by other cells are no longer overwritten here.
for model_run in ("rf002", "rf003"):
    _print_hyperparameters(model_run, predictors, percentile)


In [None]:
# Postprocessing