# data prep

In [None]:
! pip install palmerpenguins

In [None]:
from palmerpenguins import load_penguins

# Load the Palmer penguins dataset and preview its first five rows.
penguins_df = load_penguins()
penguins_df.head(n=5)

In [None]:
import plotly.express as px

# Flipper length against bill length, one color per species.
px.scatter(
    data_frame=penguins_df,
    x="flipper_length_mm",
    y="bill_length_mm",
    color="species",
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column. A single encoder that is refit for every
# column only remembers the classes of the last one, which makes it impossible
# to map the integer codes of the other columns back to their labels.
# Use `penguin_encoders[column].inverse_transform(codes)` to recover labels.
penguin_encoders = {}

# NOTE(review): missing values in these columns are encoded as-is, before any
# imputation happens — confirm how the installed scikit-learn's LabelEncoder
# treats them.
for column in ["species", "island", "sex"]:
    # After the loop `penguin_encoder` is the encoder fitted on the last
    # column, the same state a single shared encoder would have ended in.
    penguin_encoder = LabelEncoder()
    penguins_df[column] = penguin_encoder.fit_transform(penguins_df[column])
    penguin_encoders[column] = penguin_encoder

penguins_df.head()

# pipeline setup

In [None]:
from sklearn.model_selection import train_test_split

target_column = "species"

# Hold out 30% of the rows for evaluation.
train_penguins_df, test_penguins_df = train_test_split(penguins_df, test_size=0.30)

# Every column except the target is used as a feature.
feature_columns = [name for name in train_penguins_df.columns if name != target_column]

X_train = train_penguins_df[feature_columns]
y_train = train_penguins_df[target_column]
X_test = test_penguins_df[feature_columns]
y_test = test_penguins_df[target_column]

# Shapes of the four splits, in the order train X, train y, test X, test y.
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Mean-impute missing feature values, then classify with 5 nearest neighbors.
steps = [
    ("si", SimpleImputer(strategy="mean")),
    ("kn", KNeighborsClassifier(n_neighbors=5)),
]

# `Pipeline.fit` returns the pipeline itself, so building and fitting chain.
penguin_pipeline = Pipeline(steps=steps).fit(X_train, y_train)

# Score on the held-out split; shown as the cell output.
score = penguin_pipeline.score(X_test, y_test)
score

# add rubicon-ml manually

In [None]:
from rubicon_ml import Rubicon

# Filesystem-backed rubicon-ml repository rooted at ./rubicon-root.
rubicon = Rubicon(persistence="filesystem", root_dir="./rubicon-root", auto_git_enabled=True)
project = rubicon.get_or_create_project(name="demo")

# Log the pipeline's settings and its score by hand. The parameter values are
# typed in here and must be kept in sync with the `steps` definition manually.
experiment = project.log_experiment(name="classifying penguins")

parameter_strategy = experiment.log_parameter(
    name="strategy",
    value="mean",
)
parameter_n_neighbors = experiment.log_parameter(
    name="n_neighbors",
    value=5,
)
metric_accuracy = experiment.log_metric(
    name="accuracy",
    value=score,
)

In [None]:
# Show the experiment, a blank line, then its parameters and metrics as
# (name, value) pairs.
print(experiment, end="\n\n")
print([(parameter.name, parameter.value) for parameter in experiment.parameters()])
print([(metric.name, metric.value) for metric in experiment.metrics()])

# again with RubiconPipeline

In [None]:
from rubicon_ml.sklearn import RubiconPipeline

# Same steps as the plain pipeline, wrapped in a RubiconPipeline bound to `project`.
rubicon_penguin_pipeline = RubiconPipeline(
    steps=steps,
    project=project,
    experiment_kwargs={"name": "KNeighborsClassifier", "tags": ["knn"]},
)

rubicon_penguin_pipeline.fit(X_train, y_train)

# Keep a handle on the pipeline's experiment for inspection in the next cell.
pipeline_experiment = rubicon_penguin_pipeline.experiment

rubicon_penguin_pipeline.score(X_test, y_test)

In [None]:
# Show the pipeline's experiment, a blank line, then its parameters and
# metrics as (name, value) pairs.
print(pipeline_experiment, end="\n\n")
print([(parameter.name, parameter.value) for parameter in pipeline_experiment.parameters()])
print([(metric.name, metric.value) for metric in pipeline_experiment.metrics()])

# grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 imputation strategies x 6 neighbor counts.
parameters = {
    "si__strategy": ["mean", "median", "most_frequent"],
    "kn__n_neighbors": [2, 4, 8, 16, 32, 64],
}

# Point the pipeline at a dedicated project for the search.
grid_search_project = rubicon.get_or_create_project(name="grid search demo")
rubicon_penguin_pipeline.project = grid_search_project

grid_search = GridSearchCV(
    estimator=rubicon_penguin_pipeline,
    param_grid=parameters,
    cv=2,
    refit=False,
    verbose=True,
)
grid_search.fit(X_train, y_train)

# List the experiments now stored in the grid search project.
grid_search_project.experiments()

# visualizations

In [None]:
from rubicon_ml.viz import ExperimentsTable

# Show every experiment in the grid search project in the experiments table.
ExperimentsTable(
    experiments=grid_search_project.experiments(),
).show()

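You can also publish experiments programmatically. `publish` takes a list of experiments and the path of the catalog file to write:

```python
from rubicon_ml import publish

publish([my_experiment], output_filepath=output_path)
```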

# sharing

In [None]:
import intake

catalog = intake.open_catalog("./rubicon-ml-catalog.yml")
source_names = list(catalog)

# First pass: call discover() on every source in the catalog.
for source in source_names:
    catalog[source].discover()

# Second pass: read each source into an experiment.
shared_experiments = [catalog[name].read() for name in source_names]

# (experiment id, value of its "score" metric) for each shared experiment.
[
    (shared.id, shared.metric(name="score").value)
    for shared in shared_experiments
]

# trying a new model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same imputer as before, with a random forest in place of k-nearest neighbors.
new_steps = [
    ("si", SimpleImputer(strategy="mean")),
    ("rf", RandomForestClassifier(n_estimators=100)),
]

# Search space: 3 imputation strategies x 5 forest sizes.
new_parameters = {
    "si__strategy": ["mean", "median", "most_frequent"],
    "rf__n_estimators": [25, 50, 100, 200, 400],
}

# A separate repository root keeps these experiments apart from ./rubicon-root.
new_rubicon = Rubicon(persistence="filesystem", root_dir="./new-rubicon-root", auto_git_enabled=True)
new_project = new_rubicon.get_or_create_project(name="demo")

new_pipeline = RubiconPipeline(
    steps=new_steps,
    project=new_project,
    experiment_kwargs={"name": "RandomForestClassifier", "tags": ["rf"]},
)

new_grid_search = GridSearchCV(
    estimator=new_pipeline,
    param_grid=new_parameters,
    cv=2,
    refit=False,
)

new_grid_search.fit(X_train, y_train)

In [None]:
from rubicon_ml.viz import MetricCorrelationPlot

# Plot the random forest experiments against the two searched parameters.
MetricCorrelationPlot(
    parameter_names=["si__strategy", "rf__n_estimators"],
    experiments=new_project.experiments(),
).show()

In [None]:
from rubicon_ml import publish

# Publish the shared experiments together with the new random forest ones.
# The destination is passed by keyword so it cannot bind to the wrong parameter.
# NOTE(review): some rubicon-ml releases appear to take a different second
# positional argument to `publish` — confirm against the installed version.
combined_catalog = publish(
    shared_experiments + new_project.experiments(),
    output_filepath="./combined-catalog.yml",
)

# custom estimators and loggers

In [None]:
from sklearn.base import BaseEstimator

class ComboEstimator(BaseEstimator):
    """Blend a k-nearest-neighbors classifier with a random forest.

    Both sub-estimators are fit on the same data. ``score`` returns a weighted
    average of their individual scores in which the random forest counts twice
    as much as the k-nearest-neighbors model.

    Parameters
    ----------
    n_neighbors : int, default=2
        Passed to ``KNeighborsClassifier`` as ``n_neighbors``.
    n_estimators : int, default=25
        Passed to ``RandomForestClassifier`` as ``n_estimators``.

    Attributes
    ----------
    knn : KNeighborsClassifier
        The wrapped k-nearest-neighbors model.
    rf : RandomForestClassifier
        The wrapped random forest model.
    """

    def __init__(self, n_neighbors=2, n_estimators=25):
        super().__init__()

        self.n_neighbors = n_neighbors
        self.n_estimators = n_estimators

        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        self.rf = RandomForestClassifier(n_estimators=n_estimators)

    def fit(self, X, y):
        """Fit both sub-estimators on ``(X, y)`` and return ``self``."""
        # `set_params` on this wrapper only updates `n_neighbors` and
        # `n_estimators` here, so push the current values down to the
        # sub-estimators before fitting.
        self.knn.set_params(n_neighbors=self.n_neighbors)
        self.rf.set_params(n_estimators=self.n_estimators)

        self.knn.fit(X, y)
        self.rf.fit(X, y)

        return self

    def score(self, X, y):
        """Return the weighted mean of both scores on ``(X, y)``.

        The random forest's score is weighted 2:1 against the
        k-nearest-neighbors score.
        """
        knn_score = self.knn.score(X, y)
        rf_score = self.rf.score(X, y)

        return (knn_score + (rf_score * 2)) / 3

In [None]:
import pickle

from rubicon_ml.sklearn.estimator_logger import EstimatorLogger

class ModelLogger(EstimatorLogger):
    """Estimator logger that also stores the estimator's sub-models as artifacts."""

    def log_parameters(self):
        """Run the inherited parameter logging, then pickle ``knn`` and ``rf``.

        Each sub-model is logged to the experiment as an artifact named after
        the attribute it came from.
        """
        super().log_parameters()

        for attribute in ("knn", "rf"):
            model = getattr(self.estimator, attribute)
            self.experiment.log_artifact(data_bytes=pickle.dumps(model), name=attribute)

In [None]:
from rubicon_ml.sklearn import make_pipeline

# A bare estimator is passed on its own; a custom logger is attached by
# pairing it with its estimator in an (estimator, logger) tuple.
make_pipeline_steps = [
    SimpleImputer(strategy="mean"),
    (
        ComboEstimator(n_neighbors=16, n_estimators=100),
        ModelLogger(),
    ),
]

another_pipeline = make_pipeline(new_project, *make_pipeline_steps)
another_pipeline.fit(X_train, y_train)

# (name, artifact) for every artifact on the pipeline's experiment.
[
    (artifact.name, artifact)
    for artifact in another_pipeline.experiment.artifacts()
]

In [None]:
# Unpickle the first artifact on the experiment. Only unpickle artifacts you
# logged yourself: loading untrusted pickle data can execute arbitrary code.
first_artifact = another_pipeline.experiment.artifacts()[0]
pickle.loads(first_artifact.data)