In [1]:
%load_ext lab_black

import altair as alt
import numpy as np
import pandas as pd

# 2. Introducing Scikit-Learn
## Application: Exploring Hand-written Digits

In [2]:
from sklearn.datasets import load_digits

# Load the digits dataset once; the cells below read its `data`, `target`
# and `target_names` attributes.
digits = load_digits()

In [3]:
# Draw the first 30 digit images as small heatmaps, laid out 10 per row.
(
    pd.DataFrame(digits.data[:30])
    # Long format: one row per (image, flat pixel position) pair.
    .stack()
    .to_frame("color")
    .reset_index()
    .rename(columns={"level_0": "image", "level_1": "index"})
    # Turn the flat pixel position into coordinates on an 8-wide grid.
    .assign(
        column=lambda pixels: pixels["index"] % 8,
        row=lambda pixels: pixels["index"] // 8,
    )
    .pipe(
        lambda pixels: alt.Chart(pixels, height=80, width=80)
        .mark_rect()
        .encode(
            alt.X("column:O", axis=None),
            alt.Y("row:O", axis=None),
            alt.Color(
                "color:Q",
                legend=None,
                scale=alt.Scale(scheme=alt.SchemeParams("greys", extent=[0, 1])),
            ),
            # One facet per image.
            alt.Facet("image:N", columns=10, title=None),
        )
    )
)

In [4]:
from sklearn.manifold import Isomap

# Project the digits onto two Isomap coordinates and scatter them, coloring
# (and labelling on hover) each point by its target digit.
alt.Chart(
    pd.DataFrame(Isomap().fit_transform(digits.data), columns=["x", "y"]).assign(
        color=digits.target
    ),
    width=600,
    height=400,
).mark_point().encode(x="x:Q", y="y:Q", color="color:N", tooltip=["color:N"])

In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out a quarter of the samples for evaluation; the seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.25,
    random_state=0,
)

# Fit Gaussian naive Bayes on the training split and label the held-out split.
y_pred = GaussianNB().fit(X_train, y_train).predict(X_test)

# `cm` is reused by the confusion-matrix heatmap cell.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Cell output: accuracy on the held-out split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8333333333333334

In [6]:
# Heatmap of the confusion matrix: one colored cell per (true, predicted) pair.
(
    pd.DataFrame(cm, index=digits.target_names, columns=digits.target_names)
    # Long format: the former row label becomes "true", the former column
    # label becomes "predicted", and the cell value becomes "count".
    .reset_index()
    .melt(id_vars="index")
    .rename(columns={"index": "true", "variable": "predicted", "value": "count"})
    .pipe(
        lambda counts: alt.Chart(counts, height=50, width=50)
        .mark_rect()
        .encode(
            # The grid itself is built from row/column facets, one rect each.
            alt.Column("predicted:O", title="Predicted"),
            alt.Row("true:O", title="True"),
            alt.Color("count:Q", scale=alt.Scale(scheme="purples", type="symlog")),
            alt.Tooltip(["count:Q", "true:O", "predicted:O"]),
        )
        # No gap between facets so the cells read as one matrix.
        .configure_facet(spacing=0)
    )
)

# 3. Hyperparameters and Model Validation
## Thinking about Model Validation

In [7]:
from scipy import stats
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

# Load the iris dataset; the cross-validation cells below reuse `iris`.
iris = load_iris()

# Score a 1-nearest-neighbour classifier on the very samples it was fitted on.
# This is a training-set score, not an estimate of how the model generalizes;
# the cross-validated scores in the following cells are the fairer measure.
accuracy_score(
    iris.target,
    KNeighborsClassifier(n_neighbors=1).fit(iris.data, iris.target).predict(iris.data),
)

1.0

In [8]:
# Same 1-nearest-neighbour model, now scored with 5-fold cross-validation:
# one score per fold.
cross_val_score(
    estimator=KNeighborsClassifier(n_neighbors=1),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ])

In [9]:
# Leave-one-out cross-validation of the same model, condensed into summary
# statistics (count, min/max, mean, variance, ...) over the per-fold scores.
stats.describe(
    cross_val_score(
        estimator=KNeighborsClassifier(n_neighbors=1),
        X=iris.data,
        y=iris.target,
        cv=LeaveOneOut(),
    )
)

DescribeResult(nobs=150, minmax=(0.0, 1.0), mean=0.96, variance=0.038657718120805366, skewness=-4.694855340334425, kurtosis=20.041666666666682)

## Selecting the Best Model

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Synthetic regression problem, seeded for reproducibility: 100 points with
# x drawn as a squared uniform sample and y = 10 - 1/(x + 0.1) plus
# standard-normal noise.
rng = np.random.RandomState(0)
data = (
    pd.DataFrame({"x": rng.rand(100) ** 2})
    .eval("y = 10 - 1/(x+.1)")
    .assign(y=lambda df: df.y + rng.randn(100))
)

# Polynomial degrees to sweep. Also used to label the validation-curve rows so
# the plotted "Degree" axis does not rely on row position matching the degree.
degrees = np.arange(0, 31)

# `vc` is the (train_scores, test_scores) pair, one row per degree.
vc = validation_curve(
    make_pipeline(PolynomialFeatures(), LinearRegression()),
    # scikit-learn expects a 2-D feature matrix. Indexing a Series with
    # `[:, None]` is no longer supported by pandas, so select the column as a
    # one-column frame and convert it to an array instead.
    data[["x"]].to_numpy(),
    data.y,
    # Both arguments are keyword-only in current scikit-learn releases.
    param_name="polynomialfeatures__degree",
    param_range=degrees,
)

# Left: the raw data. Right: median train/test score per polynomial degree.
(
    alt.Chart(data, title="Data").mark_point().encode(x="x:Q", y="y:Q")
    | (
        pd.DataFrame(
            # Median along axis 1 gives one value per degree.
            {"train": np.median(vc[0], 1), "test": np.median(vc[1], 1)},
            index=degrees,
        )
        # The degree index becomes the "index" column used by the encodings.
        .reset_index()
        .melt(id_vars="index")
        .pipe(
            lambda df: alt.Chart(df, title="Validation Curve")
            .mark_line(point=True)
            .encode(
                alt.X("index:O", title="Degree"),
                alt.Y("value:Q", title="Score"),
                alt.Color("variable:N", sort=None),
                alt.Tooltip(["index:O", "variable:N", "value:Q"]),
            )
        )
    )
)

## Learning Curves

In [11]:
# Same synthetic problem as the validation-curve cell, but with 1000 points;
# the generator is re-seeded so the cell is reproducible on its own.
rng = np.random.RandomState(0)
data = (
    pd.DataFrame({"x": rng.rand(1000) ** 2})
    .eval("y = 10 - 1/(x+.1)")
    .assign(y=lambda df: df.y + rng.randn(1000))
)

# One small frame per polynomial degree (2 through 10), each holding the median
# train/test score at every training-set size, concatenated and plotted as one
# facet per degree.
pd.concat(
    pd.DataFrame(
        {"degree": degree, "train": np.median(lc[1], 1), "test": np.median(lc[2], 1)}
    )
    for degree in np.arange(2, 11)
    # The `if` is only a vehicle for the walrus: it binds this degree's
    # learning_curve result to `lc` before the frame above is built. The
    # returned tuple is non-empty and therefore truthy, so nothing is filtered.
    if (
        lc := learning_curve(
            make_pipeline(PolynomialFeatures(degree), LinearRegression()),
            # scikit-learn expects a 2-D feature matrix. Indexing a Series with
            # `[:, None]` is no longer supported by pandas, so select the
            # column as a one-column frame and convert it to an array instead.
            data[["x"]].to_numpy(),
            data.y,
            cv=7,
            train_sizes=np.linspace(0.1, 1, 10),
        )
    )
).pipe(
    lambda df: alt.Chart(
        # After reset_index, "index" is the position of the training-set size
        # within `train_sizes` (it repeats for every degree).
        df.reset_index().melt(id_vars=["index", "degree"]),
        height=150,
        width=300,
    )
    .mark_line(point=True)
    .encode(
        alt.X("index:O", axis=None, title=None),
        alt.Y("value:Q", scale=alt.Scale(zero=False), title="Score"),
        alt.Color("variable:N", sort=None, title="Set"),
        alt.Facet("degree:O", columns=3, title="Degree"),
        alt.Tooltip(["variable:N", "value:Q"]),
    )
)