In [2]:
#---
# Load Modules
#---
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from models.random_forest import software

#---
# Run configuration
#---
model_run = "rf023"       # Label of this model run; results are saved into folders carrying it
run_id = 0                # First run_id used for labeling
season = "winter"         # Season of the GESLA data
detrend_type = "linear"   # Either "constant" or "linear"

# Note: when prefilling ("pf") is used, it must always be listed after the ERA5 predictors.
# NOTE(review): every entry below is commented out, so this list is empty — confirm
# that an empty predictor list is what the run is supposed to receive.
predictors_of_model = [
    # ["sp", "tp", "pf"],
    # ["pf"],
]

# Note: if "pf" is the only predictor, timelags must be given in hours instead of days.
# NOTE(review): also empty at the moment — confirm together with predictors_of_model.
timelags_of_model = [
    # [0, 2, 7],
    # [7 * 24],
]

models_path = []
percentile = 0.95
clf = RandomForestClassifier  # The class itself is handed over, not an instance

# One list of station names per run. The entries line up by position with
# model_run_flag below, so both lists must be edited together.
station_names_of_model = [
    ["kalixstoron-kal-swe-cmems"],   # North-Sweden
    ["hanko-han-fin-cmems"],         # Finland
    ["hamina-ham-fin-cmems"],        # Finland deep in bay
    ["daugavgriva-dau-lva-cmems"],   # Latvia (Riga) 2005 - 2020
    ["travemuende-tra-deu-cmems"],   # Germany (Travemuende) 2005 - 2020
    ["oskarshamn-osk-swe-cmems"],    # West-Sweden (South)
    ["forsmark-for-swe-cmems"],      # West-Sweden
]

# Short tag per station entry above; appended to model_run to build the run code.
model_run_flag = ["NSWE", "FIN", "FINBAY", "LVA", "DEU", "WSWE", "WSWE2"]

#---
# Build Hyperparameter Grid to optimize from.
#---
# Number of trees in the random forest.
# np.linspace(0, 1000, 4) yields 0, 333.3, 666.7, 1000. A forest needs at least
# one tree, so the 0 candidate is dropped to keep the search from spending fits
# on a configuration that can only fail. Remaining candidates: [333, 666, 1000].
n_estimators = [
    int(x) for x in np.linspace(start=0, stop=1000, num=4) if int(x) >= 1
]
max_depth = [int(x) for x in np.linspace(1, 3, num=3)]  # Maximum number of levels in tree -> [1, 2, 3]
random_state = 0
# min_samples_split = [2, 5,] # Minimum number of samples required to split a node
# min_samples_leaf = [1, 2, 4] # Minimum number of samples required at each leaf node

# Every value is a list of candidates, including the ones that are held fixed.
hparam_grid = {
    'n_estimators': n_estimators,  # hparam grid if optimization is needed
    'max_depth': max_depth,
    # 'min_samples_split': min_samples_split,
    # 'min_samples_leaf': min_samples_leaf,
    'criterion': ['gini',],
    'random_state': [random_state,],  # To compare results when changing hyperparameters
    'class_weight': ["balanced",],  # alternative: "balanced_subsample"
    'oob_score': [True,],
}

optimizer = "RandomSearchCV"  # "RandomSearchCV", "GridSearchCV"
k = 3  # k-fold Cross Validation
n_iter = 100  # Iterations of validation
test_size = 0.25  # For train-test split
is_optimized = True  # Optimizing Hyperparameters?
is_scaled = True  # Scale training data?
is_overlay_importance = True  # Plot top 1% importance into predictor maps?
is_station_name = False  # Plot station names on top of marker?

In [1]:
# station_names_of_model and model_run_flag are paired by position. Check that
# they line up before the first run starts, instead of hitting an IndexError
# part-way through (flags too short) or silently ignoring entries (flags too long).
if len(station_names_of_model) != len(model_run_flag):
    raise ValueError(
        f"station_names_of_model has {len(station_names_of_model)} entries but "
        f"model_run_flag has {len(model_run_flag)}; they must match one-to-one."
    )

# One call to software.run per station entry, labeled "<model_run>_<flag>".
for station_names, run_flag in zip(station_names_of_model, model_run_flag):
    model_run_code = f"{model_run}_{run_flag}"
    print(model_run_code)  # Show which run is being started
    # Arguments are passed positionally; keep this order as it is.
    software.run(
        model_run_code,
        run_id,
        season,
        station_names,
        detrend_type,
        predictors_of_model,
        timelags_of_model,
        models_path,
        percentile,
        clf,
        hparam_grid,
        optimizer,
        k,
        n_iter,
        random_state,
        test_size,
        is_optimized,
        is_scaled,
        is_overlay_importance,
        is_station_name,
    )