# Weights & Biases

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/adamelliotfields/ml/blob/main/notebooks/wandb.ipynb)
[![Render nbviewer](https://img.shields.io/badge/render-nbviewer-f37726)](https://nbviewer.org/github/adamelliotfields/ml/blob/main/notebooks/wandb.ipynb)

Experiment tracking with [W&B](https://wandb.ai). This notebook uses a couple of scikit-learn estimators to demonstrate the basics.

**Resources**

* [Alerts](https://docs.wandb.ai/guides/runs/alert)
* [Environment variables](https://docs.wandb.ai/guides/track/environment-variables)
* Artifacts:
  - [TTL](https://docs.wandb.ai/guides/artifacts/ttl)
  - [Webhooks](https://docs.wandb.ai/guides/artifacts/project-scoped-automations)
* Integrations:
  - [🤗 Transformers](https://docs.wandb.ai/guides/integrations/huggingface)
  - [🤗 Diffusers](https://docs.wandb.ai/guides/integrations/diffusers)
  - [Keras](https://docs.wandb.ai/guides/integrations/keras)
  - [TensorBoard](https://docs.wandb.ai/guides/integrations/tensorboard)
  - [Lightning](https://docs.wandb.ai/guides/integrations/lightning)
  - [LightGBM](https://docs.wandb.ai/guides/integrations/lightgbm)
  - [Sklearn](https://docs.wandb.ai/guides/integrations/scikit)
  - [OpenAI](https://docs.wandb.ai/guides/integrations/openai-api)

In [None]:
import subprocess
import sys
import os

from importlib.util import find_spec

# Environment for the rest of the notebook. The TensorFlow/Keras variables are
# set here, ahead of the `import tensorflow` in the next cell.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["KERAS_BACKEND"] = "tensorflow"

# install wandb on demand when it is not importable in this environment
if find_spec("wandb") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "wandb"])

# `find_spec` imports the parent package of a dotted name, so it raises
# ModuleNotFoundError (rather than returning None) when no `google` package
# is installed at all; treat that case as "not running in Colab"
try:
    in_colab = find_spec("google.colab") is not None
except ModuleNotFoundError:
    in_colab = False

if in_colab:
    from google.colab import userdata

    # disable saving notebook if scratchpad
    # os.environ["WANDB_DISABLE_CODE"] = "true"
    os.environ["WANDB_DISABLE_GIT"] = "true"
    # read the API key from the Colab secrets store
    os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")

# can also use `wandb.login` for interactive login
# raise explicitly instead of `assert`, which is stripped when Python runs with -O
if not os.environ.get("WANDB_API_KEY"):
    raise RuntimeError("missing WANDB_API_KEY")

In [None]:
import io
import wandb

import numpy as np
import pandas as pd
import tensorflow as tf
import plotly.express as px
import matplotlib.pyplot as plt

from PIL import Image as PILImage
from yellowbrick.classifier import ConfusionMatrix

from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_diabetes, load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
)

In [None]:
# @title Config
# W&B entity and project; both are passed to every `wandb.init` call below.
# The trailing `# @param` annotations are part of the Colab form markup, so
# keep them on the same line as the assignment.
WANDB_ENTITY = "adamelliotfields"  # @param {type:"string"}
WANDB_PROJECT = "test"  # @param {type:"string"}

## Iris Classification

In [None]:
# load the dataset and keep separate handles on the features and the labels
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# hold out 20% of the samples; the fixed seed keeps the split reproducible
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(
    X_iris, y_iris, test_size=0.2, random_state=42
)

# one dataframe with every feature column plus the label, for plotting and logging
iris_df = pd.DataFrame(
    data=np.column_stack((X_iris, y_iris)),
    columns=[*iris.feature_names, "target"],
)
# swap the numeric class codes for a categorical of the class names
iris_df["target"] = pd.Categorical.from_codes(y_iris, categories=iris.target_names)

In [None]:
# sepal width vs. sepal length, coloured by class, with an OLS trendline and
# marginal distributions (box plot on x, violin plot on y)
fig = px.scatter(
    iris_df,
    x="sepal width (cm)",
    y="sepal length (cm)",
    color="target",
    marginal_x="box",
    marginal_y="violin",
    trendline="ols",
)
fig.show()

In [None]:
# alternative estimators to try instead of KNN:
# classifier = DecisionTreeClassifier()
# classifier = RandomForestClassifier(n_estimators=300, min_samples_split=5, min_samples_leaf=2, random_state=42)
classifier = KNeighborsClassifier(n_neighbors=3)

# train on the training split
classifier.fit(X_iris_train, y_iris_train)

# class probabilities on the held-out split (used by the ROC and PR plots later)
y_probas = classifier.predict_proba(X_iris_test)

In [None]:
# yellowbrick confusion matrix
# `classifier` was already trained in the previous cell, hence `is_fitted=True`
# (presumably this stops the visualizer from refitting it — confirm in the yellowbrick docs)
cm = ConfusionMatrix(classifier, classes=iris.target_names, cmap="Blues", is_fitted=True)
cm.fit(X_iris_train, y_iris_train)
cm.score(X_iris_test, y_iris_test)

# save as PIL image
# write the current matplotlib figure to an in-memory PNG *before* `plt.show()`;
# the order of the statements below matters
buf = io.BytesIO()
plt.savefig(buf, format="png")
plt.show()
buf.seek(0)  # rewind so PIL reads the PNG from the first byte
img = PILImage.open(buf)  # `img` is logged with `wandb.Image` in a later cell

In [None]:
# start a run; `wandb.init` returns the run instance, also reachable as `wandb.run`
wandb.init(
    entity=WANDB_ENTITY,
    project=WANDB_PROJECT,
    group="iris",
    job_type="train",
    tags=["CPU"],
    notes="KNN classifier",
    config=classifier.get_params(),
)

# extend the config with settings that are not estimator hyperparameters
wandb.config.update({"test_size": 0.2, "model": "KNeighborsClassifier"})

# `wandb.log` is shorthand for `wandb.run.log`
# the Plotly figure is rendered interactively in the dashboard
wandb.log({"Plotly": wandb.Plotly(fig)})

# the confusion matrix is rendered as a static image
wandb.log({"Confusion Matrix": wandb.Image(img)})

# dataset artifact: the dataframe as a W&B table, plus the raw CSV file
iris_df.to_csv("iris.csv", index=False)
iris_table = wandb.Table(dataframe=iris_df)
iris_artifact = wandb.Artifact("data", type="dataset")
iris_artifact.add(iris_table, "table")
iris_artifact.add_file("iris.csv")
wandb.log({"data": iris_table})
wandb.log_artifact(iris_artifact)

# wandb's built-in scikit-learn plots
plot_class_proportions(y_iris_train, y_iris_test, iris.target_names)
plot_learning_curve(classifier, X_iris_train, y_iris_train, random_state=42)
plot_roc(y_iris_test, y_probas, iris.target_names)
plot_precision_recall(y_iris_test, y_probas, iris.target_names)
# plot_feature_importances(classifier, iris.feature_names)  # only for trees

# without a context manager, the run must be finished explicitly in a notebook
wandb.finish()

## Diabetes Regression

In [None]:
# load the dataset and keep separate handles on the features and the target
diabetes = load_diabetes()
X_diabetes = diabetes.data
y_diabetes = diabetes.target

# seeded train/test split; no explicit test size is passed here
X_diabetes_train, X_diabetes_test, y_diabetes_train, y_diabetes_test = train_test_split(
    X_diabetes, y_diabetes, random_state=42
)

# one dataframe with every feature column plus the target
diabetes_df = pd.DataFrame(
    data=np.column_stack((X_diabetes, y_diabetes)),
    columns=[*diabetes.feature_names, "target"],
)

In [None]:
# gradient-boosted trees with a fixed seed
regressor = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=2,
    min_samples_split=10,
    min_samples_leaf=2,
    subsample=0.9,
    random_state=42,
)

# train, then predict on the held-out split
regressor.fit(X_diabetes_train, y_diabetes_train)
y_pred = regressor.predict(X_diabetes_test)

# scalar metrics on the held-out split; logging these will automatically plot them
mae = mean_absolute_error(y_diabetes_test, y_pred)
mse = mean_squared_error(y_diabetes_test, y_pred)
mape = mean_absolute_percentage_error(y_diabetes_test, y_pred)
r2 = r2_score(y_diabetes_test, y_pred)

In [None]:
# use a context manager so you don't need to call `finish`
with wandb.init(
    tags=["CPU"],
    job_type="train",
    group="diabetes",
    entity=WANDB_ENTITY,
    project=WANDB_PROJECT,
    config=regressor.get_params(),
    notes="GradientBoostingRegressor",
) as run:
    # log additional information that is not an estimator hyperparameter
    wandb.config.update(
        {
            # the diabetes `train_test_split` call passes no `test_size`, so
            # scikit-learn's default of 0.25 applies (not 0.2 as in the iris split)
            "test_size": 0.25,
            "model": "GradientBoostingRegressor",
        }
    )

    # use a slash to group
    run.log({"metrics/R2": r2, "metrics/MSE": mse, "metrics/MAE": mae, "metrics/MAPE": mape})