In [1]:
%load_ext lab_black

import altair as alt
import numpy as np
import pandas as pd

# Introducing Scikit-Learn
## Application: Exploring Hand-written Digits

In [2]:
from sklearn.datasets import load_digits

# Load scikit-learn's digits dataset; later cells read its `data`, `target`
# and `target_names` attributes.
digits = load_digits()

In [3]:
# Draw the first 30 digits as a grid of small greyscale images, one facet each.
# The frame is built straight from the array slice instead of going through
# Python lists and a per-row `apply(pd.Series)`. Stacking yields one record per
# (image, pixel) pair. The flattened pixel position is named "pixel" rather
# than "index" so it cannot be confused with the frame's own index, and the
# grid coordinates are derived with plain column arithmetic.
pd.DataFrame(digits.data[:30]).stack().to_frame("color").reset_index().rename(
    columns={"level_0": "image", "level_1": "pixel"}
).assign(
    # Flattened positions are unrolled 8 pixels per image row.
    column=lambda df: df["pixel"] % 8,
    row=lambda df: df["pixel"] // 8,
).pipe(
    lambda df: alt.Chart(df, height=80, width=80)
    .mark_rect()
    .encode(
        alt.X("column:O", axis=None),
        alt.Y("row:O", axis=None),
        alt.Color(
            "color:Q",
            legend=None,
            scale=alt.Scale(scheme=alt.SchemeParams("greys", extent=[0, 1])),
        ),
        alt.Facet("image:N", columns=10, title=None),
    )
)

In [4]:
from sklearn.manifold import Isomap

# Embed the digit feature vectors in two dimensions with Isomap and plot the
# result, coloured by the true digit label. `n_components` is spelled out so
# the embedding width is tied to the two column names below rather than to the
# estimator's default.
pd.DataFrame(
    Isomap(n_components=2).fit_transform(digits.data), columns=["x", "y"]
).assign(color=digits.target).pipe(
    lambda df: alt.Chart(df, width=600, height=400)
    .mark_point()
    .encode(x="x:Q", y="y:Q", color="color:N", tooltip=["color:N"])
)

In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out 25% of the samples (fixed seed for repeatability), fit a Gaussian
# naive Bayes classifier on the rest, and score it on the held-out part.
# `cm` is kept for the confusion-matrix plot; the accuracy is the cell output.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.25,
    random_state=0,
)
y_pred = (
    GaussianNB()
    .fit(X_train, y_train)
    .predict(X_test)
)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8333333333333334

In [6]:
# Render the confusion matrix as a grid of coloured cells: one facet row per
# true label and one facet column per predicted label. The matrix is reshaped
# to long form (one record per true/predicted pair) before plotting, and a
# symlog colour scale is used for the counts.
alt.Chart(
    pd.DataFrame(cm, index=digits.target_names, columns=digits.target_names)
    .reset_index()
    .melt(id_vars="index", var_name="predicted", value_name="count")
    .rename(columns={"index": "true"}),
    height=50,
    width=50,
).mark_rect().encode(
    alt.Column("predicted:O", title="Predicted"),
    alt.Row("true:O", title="True"),
    alt.Color("count:Q", scale=alt.Scale(scheme="purples", type="symlog")),
    alt.Tooltip(["count:Q", "true:O", "predicted:O"]),
).configure_facet(spacing=0)