## Remayn usage with Grid Search

### 1. Running some experiments with GridSearchCV and saving the results

A Logistic Regression model and a Ridge Classifier are each tuned using a grid search with cross-validation (`GridSearchCV`). The results are then saved, including the best parameters found and the best fitted model.

In [2]:
from remayn.result import make_result
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import time
from shutil import rmtree
from remayn.result_set import ResultFolder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:

# Clean up the results folder if exists
rmtree('./results', ignore_errors=True)

# Generate the sample dataset once: it uses a fixed random_state, so it is
# identical for every seed and every model and does not need to be rebuilt
# inside the loops.
X, y = make_classification(n_samples=1000, n_features=20, n_informative=2, n_redundant=2, n_clusters_per_class=2, random_state=0)

# Repeat the experiment 10 times with different random seeds.
# The seed controls the train/test split.
for seed in range(10):
    # Split the dataset into training and test sets. The split depends only on
    # the seed, so both models of the same seed share the same partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    for model, param_grid in [(LogisticRegression, {'C': [0.1, 1, 10], 'max_iter': [50, 100, 150]}),
                              (RidgeClassifier, {'alpha': [0.1, 1, 10], 'max_iter': [50, 100, 150]})]:
        # Train the model, timing the whole grid search with a monotonic clock
        # so the measurement is not affected by system clock adjustments.
        start_time = time.perf_counter()

        gs = GridSearchCV(model(), param_grid=param_grid, cv=5)
        gs.fit(X_train, y_train)

        train_time = time.perf_counter() - start_time

        # Make predictions
        y_train_pred = gs.predict(X_train)
        y_test_pred = gs.predict(X_test)

        # Prepare estimator config that is going to be saved
        estimator_config = gs.get_params()
        # Remove the 'estimator' key from the config, as it is not serializable
        estimator_config.pop('estimator')

        # Create a dictionary that represents the config of this experiment.
        # Any information relevant for the experiment can be included here.
        # In this case, all the hyperparameters of the estimator are included.
        experiment_config = {
            "estimator_config": estimator_config,
            "estimator_name": model.__name__,
            "seed": seed,
        }

        # Save the results of the experiment
        make_result(
            base_path='./results',
            config=experiment_config,
            targets=y_test,
            predictions=y_test_pred,
            train_targets=y_train,
            train_predictions=y_train_pred,
            time=train_time,

            # Save the best hyperparameters and the best model
            best_params=gs.best_params_,
            best_model=gs.best_estimator_
        ).save()



### 2. Loading the results folder and visualizing the results

In [4]:
# Open the folder that holds the saved results and show its summary
rf = ResultFolder('./results')
print(rf)

# Walk through the results, counting from 1, and stop once 3 have been shown
for shown, result in enumerate(rf, start=1):
    print(result)

    if shown == 3:
        break

ResultSet with 20 results
Config: {
    "estimator_config": {
        "cv": 5,
        "error_score": NaN,
        "estimator__C": 1.0,
        "estimator__class_weight": null,
        "estimator__dual": false,
        "estimator__fit_intercept": true,
        "estimator__intercept_scaling": 1,
        "estimator__l1_ratio": null,
        "estimator__max_iter": 100,
        "estimator__multi_class": "auto",
        "estimator__n_jobs": null,
        "estimator__penalty": "l2",
        "estimator__random_state": null,
        "estimator__solver": "lbfgs",
        "estimator__tol": 0.0001,
        "estimator__verbose": 0,
        "estimator__warm_start": false,
        "n_jobs": null,
        "param_grid": {
            "C": [
                0.1,
                1,
                10
            ],
            "max_iter": [
                50,
                100,
                150
            ]
        },
        "pre_dispatch": "2*n_jobs",
        "refit": true,
        "return_trai

Note that in the above example, only the config of each result is loaded. The predictions are only loaded when they are required or when we force it.

In [4]:
# Take the first result of the folder. next(iter(...)) stops after one item
# instead of building a list with every result just to index its first entry.
first_result = next(iter(rf))
# Load all the data for the first result
first_result_data = first_result.get_data()

# Print the shape of test predictions
print(f"Predictions shape: {first_result_data.predictions.shape}")

# Print the shape of test targets
print(f"Targets shape: {first_result_data.targets.shape}")

# Print the best hyperparameters
print(f"Best hyperparameters: {first_result_data.best_params}")


Predictions shape: (200,)
Targets shape: (200,)
Best hyperparameters: {'C': 0.1, 'max_iter': 50}


Note that if we print the result now, all the information is already loaded:

In [5]:
# Display the result again; get_data() has already been called on this object
first_result

Config: {
    "estimator_config": {
        "cv": 5,
        "error_score": NaN,
        "estimator__C": 1.0,
        "estimator__class_weight": null,
        "estimator__dual": false,
        "estimator__fit_intercept": true,
        "estimator__intercept_scaling": 1,
        "estimator__l1_ratio": null,
        "estimator__max_iter": 100,
        "estimator__multi_class": "auto",
        "estimator__n_jobs": null,
        "estimator__penalty": "l2",
        "estimator__random_state": null,
        "estimator__solver": "lbfgs",
        "estimator__tol": 0.0001,
        "estimator__verbose": 0,
        "estimator__warm_start": false,
        "n_jobs": null,
        "param_grid": {
            "C": [
                0.1,
                1,
                10
            ],
            "max_iter": [
                50,
                100,
                150
            ]
        },
        "pre_dispatch": "2*n_jobs",
        "refit": true,
        "return_train_score": false,
        "

A specific result can also be obtained using the filter function:

In [6]:
def filter_fn(result):
    """Tell whether a result should be kept in the filtered results.

    Receives the result and returns true only for the LogisticRegression
    experiment that was run with seed 0.
    """
    config = result.config
    if config['estimator_name'] != 'LogisticRegression':
        return False
    return config['seed'] == 0

# Keep only the results accepted by the filter function
filtered_rs = rf.filter(filter_fn)

print(filtered_rs)

# Print every result that passed the filter
for kept in filtered_rs:
    print(kept)

# Or get the first result (the only one in this case)
list(filtered_rs)[0]

ResultSet with 1 result
Config: {
    "estimator_config": {
        "cv": 5,
        "error_score": NaN,
        "estimator__C": 1.0,
        "estimator__class_weight": null,
        "estimator__dual": false,
        "estimator__fit_intercept": true,
        "estimator__intercept_scaling": 1,
        "estimator__l1_ratio": null,
        "estimator__max_iter": 100,
        "estimator__multi_class": "auto",
        "estimator__n_jobs": null,
        "estimator__penalty": "l2",
        "estimator__random_state": null,
        "estimator__solver": "lbfgs",
        "estimator__tol": 0.0001,
        "estimator__verbose": 0,
        "estimator__warm_start": false,
        "n_jobs": null,
        "param_grid": {
            "C": [
                0.1,
                1,
                10
            ],
            "max_iter": [
                50,
                100,
                150
            ]
        },
        "pre_dispatch": "2*n_jobs",
        "refit": true,
        "return_train_

Config: {
    "estimator_config": {
        "cv": 5,
        "error_score": NaN,
        "estimator__C": 1.0,
        "estimator__class_weight": null,
        "estimator__dual": false,
        "estimator__fit_intercept": true,
        "estimator__intercept_scaling": 1,
        "estimator__l1_ratio": null,
        "estimator__max_iter": 100,
        "estimator__multi_class": "auto",
        "estimator__n_jobs": null,
        "estimator__penalty": "l2",
        "estimator__random_state": null,
        "estimator__solver": "lbfgs",
        "estimator__tol": 0.0001,
        "estimator__verbose": 0,
        "estimator__warm_start": false,
        "n_jobs": null,
        "param_grid": {
            "C": [
                0.1,
                1,
                10
            ],
            "max_iter": [
                50,
                100,
                150
            ]
        },
        "pre_dispatch": "2*n_jobs",
        "refit": true,
        "return_train_score": false,
        "

Also, if we have the exact config of an experiment, we can use it to retrieve that experiment's result. The ResultSet (or ResultFolder) behaves like a dictionary where the key can be a config dictionary (or the serialized config as a string) or even the Result object.

In [7]:
# Get the config from the first experiment in our result folder
first_result_config = list(rf)[0].config

# Find this experiment in the result folder using the config
found_result = rf[first_result_config]

print(found_result)

Config: {
    "estimator_config": {
        "cv": 5,
        "error_score": NaN,
        "estimator__C": 1.0,
        "estimator__class_weight": null,
        "estimator__dual": false,
        "estimator__fit_intercept": true,
        "estimator__intercept_scaling": 1,
        "estimator__l1_ratio": null,
        "estimator__max_iter": 100,
        "estimator__multi_class": "auto",
        "estimator__n_jobs": null,
        "estimator__penalty": "l2",
        "estimator__random_state": null,
        "estimator__solver": "lbfgs",
        "estimator__tol": 0.0001,
        "estimator__verbose": 0,
        "estimator__warm_start": false,
        "n_jobs": null,
        "param_grid": {
            "C": [
                0.1,
                1,
                10
            ],
            "max_iter": [
                50,
                100,
                150
            ]
        },
        "pre_dispatch": "2*n_jobs",
        "refit": true,
        "return_train_score": false,
        "

### 3. Use the best model saved to compute some predictions for new data

In [8]:
# Build a new dataset with the same settings as the training one but a
# different random_state
X_new, y_new = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=2,
    n_clusters_per_class=2,
    random_state=1,
)

# Reuse the model stored with the first result to predict on the new samples
best_model = first_result_data.best_model
y_new_pred = best_model.predict(X_new)

# Show the shape of the predictions, then the shape of the targets
for labels in (y_new_pred, y_new):
    print(labels.shape)

(1000,)
(1000,)


### 4. Define a function to compute some metrics for the saved results


In [9]:
def compute_metrics(targets, predictions):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        targets: True labels.
        predictions: Predicted labels to compare against ``targets``.

    Returns:
        A dict with the keys "accuracy", "precision", "recall" and "f1",
        each mapped to the value returned by the matching scorer.
    """
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(targets, predictions) for name, scorer in scorers}

### 5. Create a dataframe with all the results and the requested metrics

In [10]:
# First, define the columns from the config that we want to include in the dataframe
# (the name `config_colums` is also referenced by a later cell)
config_colums = [
    "estimator_name",

    # Nested values can be accessed using dot notation
    "estimator_config.param_grid.C",
    "estimator_config.param_grid.max_iter",
]

# Define also the best parameters that we want to include.
# NOTE: the RidgeClassifier grid tunes "alpha" instead of "C", so its rows
# have no value for the C-related columns.
best_params_columns = [
    "C",
    "max_iter",
]

# Build one dataframe with the selected config values, the best parameters
# and the metrics returned by compute_metrics
df = rf.create_dataframe(
    config_columns=config_colums,
    metrics_fn=compute_metrics,
    # By default only the test metrics are included
    include_train=True,
    best_params_columns=best_params_columns,
)

# Display the dataframe
df

Unnamed: 0,config_estimator_name,config_estimator_config.param_grid.C,config_estimator_config.param_grid.max_iter,best_C,best_max_iter,accuracy,precision,recall,f1,train_accuracy,train_precision,train_recall,train_f1,time
0,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.95,0.963855,0.91954,0.941176,0.94875,0.957921,0.941606,0.949693,0.470979
1,RidgeClassifier,,"[50, 100, 150]",,50,0.925,0.907216,0.93617,0.921466,0.9525,0.959799,0.945545,0.952618,0.266672
2,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",10.0,50,0.93,0.935185,0.935185,0.935185,0.95875,0.963636,0.951282,0.957419,0.391975
3,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.945,0.959184,0.930693,0.944724,0.9475,0.951654,0.942065,0.946835,0.374334
4,RidgeClassifier,,"[50, 100, 150]",,50,0.96,0.962617,0.962617,0.962617,0.94125,0.945596,0.933504,0.939511,0.283837
5,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.93,0.908163,0.946809,0.927083,0.955,0.962312,0.94802,0.955112,0.388845
6,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",1.0,50,0.945,0.972477,0.929825,0.950673,0.945,0.952128,0.932292,0.942105,0.399152
7,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.96,0.962617,0.962617,0.962617,0.9425,0.950392,0.930946,0.940568,0.388529
8,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.925,0.929412,0.897727,0.913295,0.9575,0.954106,0.963415,0.958738,0.387736
9,RidgeClassifier,,"[50, 100, 150]",,50,0.945,0.952381,0.91954,0.935673,0.94,0.950372,0.931873,0.941032,0.345243


### 6. Create another dataframe applying a filter based on the config and the best parameters

In [11]:
def filter_fn(result):
    """Keep LogisticRegression results whose best value found for C is 0.1."""
    if result.config['estimator_name'] != 'LogisticRegression':
        # Rejected from the config alone, so get_data() is not called here
        return False
    return result.get_data().best_params['C'] == 0.1


# Same dataframe as before, restricted to the results accepted by filter_fn
df2 = rf.create_dataframe(
    config_columns=config_colums,
    metrics_fn=compute_metrics,
    # Train metrics are opt-in; only the test metrics are included by default
    include_train=True,
    best_params_columns=best_params_columns,
    # Only results for which filter_fn returns true are used
    filter_fn=filter_fn,
)

df2

Unnamed: 0,config_estimator_name,config_estimator_config.param_grid.C,config_estimator_config.param_grid.max_iter,best_C,best_max_iter,accuracy,precision,recall,f1,train_accuracy,train_precision,train_recall,train_f1,time
0,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.95,0.963855,0.91954,0.941176,0.94875,0.957921,0.941606,0.949693,0.470979
1,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.945,0.959184,0.930693,0.944724,0.9475,0.951654,0.942065,0.946835,0.374334
2,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.93,0.908163,0.946809,0.927083,0.955,0.962312,0.94802,0.955112,0.388845
3,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.96,0.962617,0.962617,0.962617,0.9425,0.950392,0.930946,0.940568,0.388529
4,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.925,0.929412,0.897727,0.913295,0.9575,0.954106,0.963415,0.958738,0.387736
5,LogisticRegression,"[0.1, 1, 10]","[50, 100, 150]",0.1,50,0.95,0.938776,0.958333,0.948454,0.95125,0.957179,0.945274,0.951189,0.391026
