In [1]:
%run ../src/dataset.py
%run ../src/augmentation.py

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

random_grid = {
    'n_estimators': list(range(100, 2000, 100)),
    'criterion': ["gini", "entropy"],
    'max_features': ["auto", "log2", "sqrt"],
    'max_depth': list(range(10, 110, 10)) + [None],
    'min_samples_split': list(range(2, 20)),
    'min_samples_leaf': list(range(1, 10)),
    'bootstrap': [True, False]
}

results = {}
for scenario in (1, 2):
    for n in (100, 500, 1000, 2000):
        transform = Compose(Sampler(n=n), HillNumbers(q_step=0.25))
        data = PresimulatedDataset.load(
            f"../data/scenario-{scenario}-trainset-new.npz", transform=transform)
        X, theta = data.dataset, data.theta
        y = (theta[:, 0].numpy() != 0).astype(int)

        # rf = ExtraTreesClassifier()
        rf = RandomForestClassifier()
        rf_random = RandomizedSearchCV(
            estimator=rf, 
            param_distributions=random_grid, 
            n_iter=500, 
            cv=5,
            scoring="roc_auc",
            verbose=1, 
            random_state=42, 
            n_jobs=20
        )

        rf_random.fit(X, y)
        results[scenario, n] = rf_random

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Fitting 5 folds for each of 500 candidates, totalling 2500 fits


In [6]:
import json

params = {}
for (scenario, n), estimator in results.items():
    params[f"{scenario}-{n}"] = estimator.best_params_

with open("clf_params-rf.json", "w") as out:
    json.dump(params, out, indent=4)

In [4]:
params

{'1-100': {'n_estimators': 700,
  'min_samples_split': 9,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': 10,
  'criterion': 'gini',
  'bootstrap': True},
 '1-500': {'n_estimators': 1500,
  'min_samples_split': 18,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': 20,
  'criterion': 'gini',
  'bootstrap': True},
 '1-1000': {'n_estimators': 700,
  'min_samples_split': 9,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': 10,
  'criterion': 'gini',
  'bootstrap': True},
 '1-2000': {'n_estimators': 1500,
  'min_samples_split': 18,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': 20,
  'criterion': 'gini',
  'bootstrap': True},
 '2-100': {'n_estimators': 700,
  'min_samples_split': 6,
  'min_samples_leaf': 8,
  'max_features': 'sqrt',
  'max_depth': 90,
  'criterion': 'entropy',
  'bootstrap': True},
 '2-500': {'n_estimators': 1500,
  'min_samples_split': 18,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': 