## Remayn basic usage

### 1. Running some experiments and saving the results

In this example, a scikit-learn `LogisticRegression` classifier is trained on synthetic data, and the results of each experiment are saved using remayn.

In [2]:
from remayn.result import make_result
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import time
from remayn.result_set import ResultFolder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from shutil import rmtree

In [3]:


# NOTE(review): results are written into './results'. Anything already stored
# there by an earlier run is presumably picked up as well when the folder is
# loaded later on - start from an empty folder for reproducible output
# (TODO confirm).

# Generate the sample dataset once: its random_state is fixed, so every
# repetition works on exactly the same data and only the split changes.
X, y = make_classification(n_samples=1000, n_features=20, n_informative=2, n_redundant=2, n_clusters_per_class=2, random_state=0)

# Repeat the experiment 10 times with different random seeds
for seed in range(10):
    # Split the dataset into training and test sets (the seed drives the split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Train a logistic regression model and time only the fit.
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()

    lr = LogisticRegression()
    lr.fit(X_train, y_train)

    train_time = time.perf_counter() - start_time

    # Make predictions on both partitions so train and test metrics can be
    # computed later from the saved results
    y_train_pred = lr.predict(X_train)
    y_test_pred = lr.predict(X_test)

    # Create a dictionary that represents the config of this experiment.
    # Any information relevant for the experiment can be included here.
    # In this case, all the hyperparameters of the estimator are included.
    experiment_config = {
        "estimator_config": lr.get_params(),
        "estimator_name": "LogisticRegression",
        "seed": seed,
    }

    # Save the results of the experiment
    make_result(
        base_path='./results',
        config=experiment_config,
        targets=y_test,
        predictions=y_test_pred,
        train_targets=y_train,
        train_predictions=y_train_pred,
        time=train_time,
    ).save()



### 2. Loading the results folder and visualizing the results

In [4]:
# Open the folder the experiments above were saved to.
# `rf` is reused by the following cells.
rf = ResultFolder('./results')
print(rf)

# Iterate over the stored results and print each one
for result in rf:
    print(result)

ResultSet with 19 results
Config: {
    "estimator_config": {
        "cv": 5,
        "error_score": NaN,
        "estimator__C": 1.0,
        "estimator__class_weight": null,
        "estimator__dual": false,
        "estimator__fit_intercept": true,
        "estimator__intercept_scaling": 1,
        "estimator__l1_ratio": null,
        "estimator__max_iter": 100,
        "estimator__multi_class": "auto",
        "estimator__n_jobs": null,
        "estimator__penalty": "l2",
        "estimator__random_state": null,
        "estimator__solver": "lbfgs",
        "estimator__tol": 0.0001,
        "estimator__verbose": 0,
        "estimator__warm_start": false,
        "n_jobs": null,
        "param_grid": {
            "C": [
                0.1,
                1,
                10
            ],
            "max_iter": [
                50,
                100,
                150
            ]
        },
        "pre_dispatch": "2*n_jobs",
        "refit": true,
        "return_trai

Note that in the above example, only the config of each result is loaded. The predictions are only loaded when they are required or when we explicitly request them.

In [5]:
# Take the first result of the folder and request its data explicitly
all_results = list(rf)
first_result = all_results[0]
first_result_data = first_result.get_data()

# Shapes of the test predictions and of the test targets
predictions_shape = first_result_data.predictions.shape
targets_shape = first_result_data.targets.shape

print(f"Predictions shape: {predictions_shape}")
print(f"Targets shape: {targets_shape}")


Predictions shape: (200,)
Targets shape: (200,)


Note that if we display the result now, all of its information is already loaded.

In [6]:
# Display the same result object again, now that get_data() has been called on it
first_result

Config: {
    "estimator_config": {
        "cv": 5,
        "error_score": NaN,
        "estimator__C": 1.0,
        "estimator__class_weight": null,
        "estimator__dual": false,
        "estimator__fit_intercept": true,
        "estimator__intercept_scaling": 1,
        "estimator__l1_ratio": null,
        "estimator__max_iter": 100,
        "estimator__multi_class": "auto",
        "estimator__n_jobs": null,
        "estimator__penalty": "l2",
        "estimator__random_state": null,
        "estimator__solver": "lbfgs",
        "estimator__tol": 0.0001,
        "estimator__verbose": 0,
        "estimator__warm_start": false,
        "n_jobs": null,
        "param_grid": {
            "C": [
                0.1,
                1,
                10
            ],
            "max_iter": [
                50,
                100,
                150
            ]
        },
        "pre_dispatch": "2*n_jobs",
        "refit": true,
        "return_train_score": false,
        "

### 3. Define a function to compute some metrics for the saved results


In [7]:
def compute_metrics(targets, predictions):
    """Compute a fixed set of classification metrics for one result.

    Parameters
    ----------
    targets
        Ground-truth labels, passed as the first argument to every scorer.
    predictions
        Predicted labels, passed as the second argument to every scorer.

    Returns
    -------
    dict
        Maps "accuracy", "precision", "recall" and "f1" (in that order) to
        the value returned by the corresponding sklearn.metrics function.
    """
    # (metric name, scorer) pairs; the order defines both the call order and
    # the key order of the returned dictionary.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(targets, predictions) for name, scorer in scorers}

### 4. Create a dataframe with all the results and the requested metrics

In [8]:
# Config entries that should become columns of the dataframe.
# Nested values can be accessed using dot notation.
config_columns = ["estimator_name", "estimator_config.C", "estimator_config.max_iter"]

# Build the dataframe. By default only the test metrics are included;
# include_train=True requests the train metrics as well.
df = rf.create_dataframe(
    metrics_fn=compute_metrics,
    config_columns=config_columns,
    include_train=True,
)

df

Unnamed: 0,config_estimator_name,config_estimator_config.C,config_estimator_config.max_iter,accuracy,precision,recall,f1,train_accuracy,train_precision,train_recall,train_f1,time
0,LogisticRegression,,,0.97,0.980392,0.961538,0.970874,0.945,0.953368,0.93401,0.94359,0.466302
1,LogisticRegression,,,0.95,0.938776,0.958333,0.948454,0.95125,0.957179,0.945274,0.951189,0.492846
2,LogisticRegression,1.0,100.0,0.945,0.972477,0.929825,0.950673,0.945,0.952128,0.932292,0.942105,0.007456
3,LogisticRegression,,,0.945,0.972477,0.929825,0.950673,0.945,0.952128,0.932292,0.942105,0.417536
4,LogisticRegression,,,0.93,0.935185,0.935185,0.935185,0.95875,0.963636,0.951282,0.957419,0.51441
5,LogisticRegression,1.0,100.0,0.93,0.935185,0.935185,0.935185,0.9575,0.963542,0.948718,0.956072,0.006919
6,LogisticRegression,1.0,100.0,0.94,0.92,0.958333,0.938776,0.95,0.954774,0.945274,0.95,0.006627
7,LogisticRegression,1.0,100.0,0.965,0.971698,0.962617,0.967136,0.94,0.950131,0.925831,0.937824,0.013046
8,LogisticRegression,,,0.95,0.963855,0.91954,0.941176,0.94875,0.957921,0.941606,0.949693,0.421934
9,LogisticRegression,1.0,100.0,0.95,0.963855,0.91954,0.941176,0.9475,0.957816,0.939173,0.948403,0.006912


In [9]:
# Clean up: delete the './results' directory tree created by this notebook.
# rmtree raises if the directory does not exist, so run this cell only once.
rmtree('./results')