# Logging Experiments

``rubicon_ml``'s core functionality is centered around logging **experiments** to explain and explore various
model runs throughout the model development lifecycle. This example will take a quick look at how we can log
model metadata to ``rubicon_ml`` in the context of a simple classification project.

We'll leverage the ``palmerpenguins`` dataset collected by Dr. Kristen Gorman as our training/testing data. More
information on the dataset can be found here:
> https://allisonhorst.github.io/palmerpenguins/

In [None]:
! pip install palmerpenguins

In [None]:
from palmerpenguins import load_penguins

# Load the palmerpenguins data used throughout this example.
penguins_df = load_penguins()

# Capture the distinct species labels before any encoding is applied.
target_values = penguins_df["species"].unique()
print(f"target classes (species): {target_values}")

# Preview the first rows of the raw data.
penguins_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with encoded labels. A fresh encoder is
# fitted per column, so the codes of one column are unrelated to another's.
categorical_columns = ("species", "island", "sex")
for column_name in categorical_columns:
    column_encoder = LabelEncoder()
    penguins_df[column_name] = column_encoder.fit_transform(penguins_df[column_name])

# Show what the target column looks like after encoding.
encoded_species = penguins_df["species"].unique()
print(f"target classes (species): {encoded_species}")
penguins_df.head()

In [None]:
from sklearn.model_selection import train_test_split

target_name = "species"

# Hold out 30% of the rows as the test set.
train_penguins_df, test_penguins_df = train_test_split(penguins_df, test_size=0.30)

# Every column except the target is used as a feature.
feature_names = [
    column_name
    for column_name in train_penguins_df.columns
    if column_name != target_name
]

X_train = train_penguins_df[feature_names]
y_train = train_penguins_df[target_name]
X_test = test_penguins_df[feature_names]
y_test = test_penguins_df[target_name]

# Display the shapes of the four resulting pieces.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Hyperparameters live in named variables so the same values can be reused
# outside of the pipeline definition.
imputer_strategy = "mean"
classifier_n_neighbors = 5

imputer = SimpleImputer(strategy=imputer_strategy)
classifier = KNeighborsClassifier(n_neighbors=classifier_n_neighbors)

# Two-step pipeline: imputation ("si") followed by the classifier ("kn").
steps = [("si", imputer), ("kn", classifier)]
penguin_pipeline = Pipeline(steps=steps)
penguin_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out data and display the result.
score = penguin_pipeline.score(X_test, y_test)
score

In [None]:
from rubicon_ml import Rubicon

# Configure filesystem persistence under ./rubicon-root with automatic git
# tracking enabled.
rubicon = Rubicon(
    persistence="filesystem",
    root_dir="./rubicon-root",
    auto_git_enabled=True,
)
project = rubicon.get_or_create_project(name="classifying penguins")
experiment = project.log_experiment()

# One feature entry per input column.
for feature_name in feature_names:
    experiment.log_feature(name=feature_name)

# Parameters are logged in insertion order: "strategy" first, then "n_neighbors".
logged_parameters = {
    "strategy": imputer_strategy,
    "n_neighbors": classifier_n_neighbors,
}
for parameter_name, parameter_value in logged_parameters.items():
    experiment.log_parameter(name=parameter_name, value=parameter_value)

experiment.log_metric(name="accuracy", value=score)

# Read everything back from the experiment to confirm what was logged.
print(experiment)
print()
print("git info:")
print(f"\tbranch name: {experiment.branch_name}\n\tcommit hash: {experiment.commit_hash}")

feature_summary = [f.name for f in experiment.features()]
print(f"features: {feature_summary}")

parameter_summary = [(p.name, p.value) for p in experiment.parameters()]
print(f"parameters: {parameter_summary}")

metric_summary = [(m.name, m.value) for m in experiment.metrics()]
print(f"metrics: {metric_summary}")

In [None]:
from sklearn.base import clone

# Every (imputer strategy, neighbor count) combination, strategy-major so the
# iteration order matches a nested loop over strategies then neighbor counts.
search_grid = [
    (strategy, n_neighbors)
    for strategy in ["mean", "median", "most_frequent"]
    for n_neighbors in [5, 10, 15, 20]
]

for imputer_strategy, classifier_n_neighbors in search_grid:
    # Start from an unfitted copy of the base pipeline for each combination.
    pipeline = clone(penguin_pipeline)
    pipeline.set_params(
        si__strategy=imputer_strategy,
        kn__n_neighbors=classifier_n_neighbors,
    )

    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)

    # One tagged experiment per combination so they can be retrieved as a group.
    experiment = project.log_experiment(tags=["parameter search"])

    for feature_name in feature_names:
        experiment.log_feature(name=feature_name)

    experiment.log_parameter(name="strategy", value=imputer_strategy)
    experiment.log_parameter(name="n_neighbors", value=classifier_n_neighbors)
    experiment.log_metric(name="accuracy", value=score)

# List every experiment carrying the search tag along with what it logged.
print("experiments:")
for experiment in project.experiments(tags=["parameter search"]):
    parameter_pairs = [(p.name, p.value) for p in experiment.parameters()]
    metric_pairs = [(m.name, m.value) for m in experiment.metrics()]
    print(
        f"\tid: {experiment.id}, "
        f"parameters: {parameter_pairs}, "
        f"metrics: {metric_pairs}"
    )

In [None]:
import pickle
import pandas as pd
from sklearn.metrics import confusion_matrix

# Attach the artifacts to the last experiment returned for the search tag.
experiment = project.experiments(tags=["parameter search"])[-1]

# Log the fitted classifier (the pipeline's final "kn" step) through the
# public `named_steps` accessor instead of the private `_final_estimator`.
trained_model = pipeline.named_steps["kn"]
experiment.log_artifact(data_bytes=pickle.dumps(trained_model), name="trained model")

# `target_values` holds the species names in order of first appearance, but
# `LabelEncoder` assigned the integer codes in sorted label order. Sort the
# names so that position i in `class_names` is the species encoded as i;
# otherwise the matrix axes can be labelled with the wrong species.
class_names = sorted(target_values)
class_codes = list(range(len(class_names)))

y_pred = pipeline.predict(X_test)
confusion_matrix_df = pd.DataFrame(
    # Explicit labels keep the matrix square and aligned with `class_names`
    # even if some class is absent from this particular test split.
    confusion_matrix(y_test, y_pred, labels=class_codes),
    columns=class_names,
    index=class_names,
)
experiment.log_dataframe(confusion_matrix_df, name="confusion matrix")

# Read both items back to confirm the round trip. Unpickling is acceptable
# here only because the bytes were written by this notebook itself.
print(pickle.loads(experiment.artifact(name="trained model").data))
experiment.dataframe(name="confusion matrix").data