# data prep

In [None]:
! pip install palmerpenguins

In [None]:
from palmerpenguins import load_penguins

# Load the Palmer penguins dataset and preview its first five rows.
penguins_df = load_penguins()
penguins_df.head(n=5)

In [None]:
import plotly.express as px

# Exploratory scatter of two measurements, colored by species. A title and
# human-readable axis/legend labels let the figure stand on its own.
px.scatter(
    penguins_df,
    x="flipper_length_mm",
    y="bill_length_mm",
    color="species",
    title="Bill length vs. flipper length by species",
    labels={
        "flipper_length_mm": "Flipper length (mm)",
        "bill_length_mm": "Bill length (mm)",
        "species": "Species",
    },
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns, keeping one fitted LabelEncoder per
# column. Re-fitting a single shared encoder overwrites its learned classes on
# every iteration, which would leave no way to map the encoded "species" or
# "island" values back to their original labels.
#
# NOTE: this overwrites the listed columns of `penguins_df` in place.
# NOTE(review): if any of these columns contains missing values, LabelEncoder
# may encode them as an ordinary class (or raise, depending on the
# scikit-learn version), so a later imputation step would not see them —
# confirm this is intended.
penguin_encoders = {}

for column in ["species", "island", "sex"]:
    penguin_encoder = LabelEncoder()
    penguins_df[column] = penguin_encoder.fit_transform(penguins_df[column])
    # e.g. penguin_encoders["species"].inverse_transform(...) recovers labels.
    penguin_encoders[column] = penguin_encoder

penguins_df.head()

# pipeline setup

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. The fixed `random_state` makes the
# split, and therefore every score computed from it, reproducible across
# kernel restarts.
train_penguins_df, test_penguins_df = train_test_split(
    penguins_df,
    test_size=0.30,
    random_state=42,
)

# Every column except the target is used as a feature.
target_column = "species"
feature_columns = [c for c in train_penguins_df.columns if c != target_column]

X_train, y_train = train_penguins_df[feature_columns], train_penguins_df[target_column]
X_test, y_test = test_penguins_df[feature_columns], test_penguins_df[target_column]

# Display the shapes as a quick sanity check of the split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Baseline model: mean-impute missing feature values, then classify with
# k-nearest neighbors (k=5).
penguin_pipeline = Pipeline(
    [
        ("si", SimpleImputer(strategy="mean")),
        ("kn", KNeighborsClassifier(n_neighbors=5)),
    ]
)

# `fit` returns the pipeline itself, so fitting and scoring can be chained.
score = penguin_pipeline.fit(X_train, y_train).score(X_test, y_test)

score

# add rubicon-ml manually

In [None]:
from rubicon_ml import Rubicon

# Create a Rubicon client using the "memory" persistence option and a project
# to hold the logged experiments.
rubicon = Rubicon(persistence="memory")
project = rubicon.get_or_create_project(name="demo")

experiment = project.log_experiment(name="classifying penguins")

# Read the hyperparameters off the pipeline that produced `score` instead of
# repeating them as literals, so the logged values cannot drift from the
# model that was actually trained.
parameter_strategy = experiment.log_parameter(
    name="strategy",
    value=penguin_pipeline.named_steps["si"].strategy,
)
parameter_n_neighbors = experiment.log_parameter(
    name="n_neighbors",
    value=penguin_pipeline.named_steps["kn"].n_neighbors,
)
metric_accuracy = experiment.log_metric(name="accuracy", value=score)

In [None]:
# Show the manually logged experiment, then a blank line, then its
# (name, value) pairs for parameters and metrics.
print(experiment, end="\n\n")
print([(parameter.name, parameter.value) for parameter in experiment.parameters()])
print([(metric.name, metric.value) for metric in experiment.metrics()])

# again with RubiconPipeline

In [None]:
from rubicon_ml.sklearn import RubiconPipeline

# Same two steps as the plain scikit-learn pipeline, but wrapped in a
# RubiconPipeline that is attached to `project`.
# NOTE(review): presumably the wrapper logs step parameters during `fit` and
# the score during `score` — confirm against the rubicon-ml documentation.
rubicon_penguin_pipeline = RubiconPipeline(
    project=project,
    steps=[
        ("si", SimpleImputer(strategy="mean")),
        ("kn", KNeighborsClassifier(n_neighbors=5)),
    ],
)

# The experiment is read from the pipeline only after fitting; keep a handle
# to it so it can be inspected in a later cell.
rubicon_penguin_pipeline.fit(X_train, y_train)
pipeline_experiment = rubicon_penguin_pipeline.experiment

# Last expression of the cell: the test-set score is displayed as its output.
rubicon_penguin_pipeline.score(X_test, y_test)

In [None]:
# Show the experiment taken from the RubiconPipeline, then a blank line, then
# its (name, value) pairs for parameters and metrics.
print(pipeline_experiment, end="\n\n")
print([(parameter.name, parameter.value) for parameter in pipeline_experiment.parameters()])
print([(metric.name, metric.value) for metric in pipeline_experiment.metrics()])

# grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Point the existing RubiconPipeline at a separate project so the grid-search
# runs are kept apart from the earlier "demo" experiments.
grid_search_project = rubicon.get_or_create_project(name="grid search demo")
rubicon_penguin_pipeline.project = grid_search_project

# 3 imputation strategies x 6 neighbor counts = 18 candidate settings, keyed
# by "<step name>__<parameter name>".
parameters = dict(
    si__strategy=["mean", "median", "most_frequent"],
    kn__n_neighbors=[2, 4, 8, 16, 32, 64],
)

# 2-fold cross-validation over every candidate; no final refit on the full
# training set.
grid_search = GridSearchCV(
    estimator=rubicon_penguin_pipeline,
    param_grid=parameters,
    cv=2,
    refit=False,
    verbose=True,
)
grid_search.fit(X_train, y_train)

# Display the experiments now present in the grid-search project.
grid_search_project.experiments()

# visualizations

In [None]:
from rubicon_ml.viz import ExperimentsTable

# Render the grid-search project's experiments in rubicon-ml's table view.
grid_search_experiments = grid_search_project.experiments()
experiments_table = ExperimentsTable(experiments=grid_search_experiments)
experiments_table.show()