# Description

- Rerun rf004 with optimized Hyperparameter ranges and scaled (!) predictors
- Use new savings of metrics, e.g. confusion matrix for train-set, relative accuracy etc.

# Next Steps

- Modularize the hyperparameter and modelfit optimization, s.t. I can choose the classifier for each run and give a param grid to the run depending on the classifier (done)

- Check if folders are there when creating it (done)

- Check if GridSearchCV or RandomSearchCV need StandardScaler() as argument to scale the training data! Very important! Maybe I need to use Pipelines for this and implement the Pipeline in the fitting module. (I scaled the data beforehand, that should be sufficient). (done)

- implement this: use_params (dict): Dictionary of hyperparameters that should be used for model. If None, hyperparameters are optimized. (Defaults: None). done


# Working Area

In [3]:
#---
# Initialize
#---
from models.random_forest import rf008
from sklearn.ensemble import RandomForestClassifier
import numpy as np

model_run = "rf008"
season = "winter" # ["winter", "autumn",] 
percentile = 0.95 # [0.95, 0.99,] 
station_names = ["hanko-han-fin-cmems",]
preds = [
    ["sp", "u10",], # run_id 0
    ["sp", "tp", "u10",], # run_id 1,...
    ["sp", "tp",],
    ["tp","u10"],
    ["sp", "tp", "u10", "v10"],
]

clf = RandomForestClassifier
optimizer = "RandomSearchCV" #["RandomSearchCV", "GridSearchCV"]
n_iter = 100 
k = 3
#---
# Build Hyperparameter Grid to optimize from.
# For this run, use exactly the same as in rf004 to see if 
# scaling of predictor data leads to any changes.
#---
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 0, stop = 1000, num = 10)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 55, num = 5)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Create the random grid
hparam_grid = {'n_estimators': n_estimators, # hparam grid if optimization is needed
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'criterion' : ['gini',],
            'random_state' : [0,], # To compare results when changing hyperparameters
            'class_weight' : ["balanced",],
            'oob_score' : [True,],
            }

for run_id, predictors in enumerate(preds):
    rf008.run(season, predictors, percentile, station_names, 
    clf, hparam_grid, optimizer, 
    run_id, model_run, 
    k, n_iter, is_optimized=True, is_scaled=True)

Load Predictand from GESLA
Applied one-hot-encoding with Percentile: 0.95
Add predictor sp to model input features
Get overlapping timeseries of ERA5 and GESLA
Add predictor u10 to model input features
Get overlapping timeseries of ERA5 and GESLA
Assert that timeinterval of all predictors are the same
Time-interval is the same
Time-interval is the same
All Time-intervals are the same
Prepare input data for model training
Data is prepared as follows
X.shape : (903, 34122)
y.shape : (903,)
Start Model Training
Do Train-Test-Split
Scale training data
Optimize Hyperparameters using RandomSearchCV
Tested Hyperparameters: {'n_estimators': [0, 111, 222, 333, 444, 555, 666, 777, 888, 1000], 'max_depth': [5, 17, 30, 42, 55, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'criterion': ['gini'], 'random_state': [0], 'class_weight': ['balanced'], 'oob_score': [True]}
Optimize Hyperparameters using RandomSearchCV
Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [29]:
#---
# Goal: Check if clf = classifier of my choice is adjustable or not like 
# model = clf(param_grid)
# Use best_params from modelfit.optimize_hyperparameter output to apply params to model.
#---

#---
# Load example data 
#---
import pandas as pd
import zipfile
from sklearn.model_selection import train_test_split


with zipfile.ZipFile('../courses/machine_learning/Kursmaterialien.zip', 'r') as source:
    with source.open('Kursmaterialien/Abschnitt 26 - Entscheidungsbaeume/classification.csv') as file:
        df = pd.read_csv(file, low_memory=False)

df.head()

#- Train & Test split
X = df.drop('success', axis = 1).values #- Copies DF
y = df['success'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

# Example parameters for model fit
#---
clf = RandomForestClassifier
best_params = {
    "n_estimators" : 1,
    "max_depth" : 4, 
    "criterion" : "entropy",
    "min_samples_leaf" : 2, 
    "min_samples_split" : 2, 
    "random_state" : 0, 
    "class_weight" : "balanced",
    "oob_score" : True,
}

#---
# Fit the model
#---
model1 = clf(**best_params) # One can set parameters afterwards via model.set_params()

model2 = RandomForestClassifier(criterion='gini',
n_estimators=best_params["n_estimators"], #- nTrees 
max_depth=best_params["max_depth"], 
min_samples_leaf=best_params["min_samples_leaf"],
min_samples_split=best_params["min_samples_split"],
random_state=0, # To compare results when changing hyperparameters
class_weight="balanced",
oob_score=True,
) 

print(f"Used model: {model1}")
print("Fit model")
model1.fit(X_train, y_train)
print(f"Testscore: {model1.score(X_test, y_test)}")
print(f"Trainscore: {model1.score(X_train, y_train)}")

Used model: RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=4, min_samples_leaf=2, n_estimators=1,
                       oob_score=True, random_state=0)
Fit model
Testscore: 0.9333333333333333
Trainscore: 0.9459459459459459


  warn(


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from models import modelfit

type(LogisticRegression())
modelfit.optimize_hyperparameter?

[1;31mSignature:[0m
[0mmodelfit[0m[1;33m.[0m[0moptimize_hyperparameter[0m[1;33m([0m[1;33m
[0m    [0mX_train[0m[1;33m,[0m[1;33m
[0m    [0my_train[0m[1;33m,[0m[1;33m
[0m    [0mclf[0m[1;33m,[0m[1;33m
[0m    [0moptimizer[0m[1;33m,[0m[1;33m
[0m    [0mparam_grid[0m[1;33m,[0m[1;33m
[0m    [0mk[0m[1;33m,[0m[1;33m
[0m    [0mn_iter[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;33m-[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Description: 
    Return best hyperparameters for a model based on chosen optimizer
Parameters:
    X_train (): Predictor train data
    y_train (): Predictand train data
    clf (): Base Model
    optimizer (): GridSearchCV or RandomizedSearchCV
    param_grid (dict): Dictionary with hyperparameter ranges
    k (int): k-fold Cross-Validation
    n_iter (int): Number of combinations used for RandomizedSearchCV (Defaults:None)
    

In [None]:
#---
# Scale data
#---
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
s.fit(X_train)
X_train = s.transform(X_train)
X_test = s.transform(X_test)

In [None]:
#---
# Learning Curve
#---

#- Train & Test split
X = df.drop('success', axis = 1).values #- Copies DF
y = df['success'].values

#- Plot Learning Curve
from sklearn.model_selection import learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import numpy as np

X, y = shuffle(X, y) #- random selection of data. Good if you dont know if data is ordered
train_sizes_abs, train_scores, test_scores = learning_curve(LogisticRegression(), X, y)
%matplotlib inline

import matplotlib.pyplot as plt

plt.plot(train_sizes_abs, np.mean(train_scores, axis = 1)) #- learning curve macht automatisch k-fold crossvalidation. deswegen mean
plt.plot(train_sizes_abs, np.mean(test_scores, axis = 1)) #- learning curve macht automatisch k-fold crossvalidation. deswegen mean

plt.show()

#- Note: Do this more often to get mean. Sometimes Curves look weird.

In [None]:
#---
# Pipeline GridSearchCV: 
# Add this to modelfit.py?
#---
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC 

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC()),
])

from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(pipeline, param_grid = {
    "svc__C": [0.001, 0.01, 0.1, 1, 10,],
    "svc__gamma": [0.001, 0.01, 0.1, 1, 10,],
})

clf.fit(X_train, y_train)

print(clf.best_params_)

print(clf.score(X_validation, y_validation)) # Accuracy on basis of test data

print(clf.best_score_) # Accuracy based on k-fold cross-validation