In [1]:
%load_ext lab_black

import altair as alt
import numpy as np
import pandas as pd

# 2. Introducing Scikit-Learn
## Application: Exploring Hand-written Digits

In [2]:
from sklearn.datasets import load_digits

# Load the digits dataset shipped with scikit-learn. The cells below read
# `digits.data` (one flattened image per row), `digits.target` and
# `digits.target_names` from this object.
digits = load_digits()

In [3]:
# Draw the first 30 digit images as a faceted grid of small heatmaps.
# `pd.DataFrame(...).stack()` produces one row per (image, pixel) pair directly;
# the previous `Series(list-of-lists).apply(pd.Series)` built the same frame by
# constructing one Series per image, which is needlessly slow.
(
    pd.DataFrame(digits.data[:30])
    .stack()
    .to_frame("color")
    .reset_index()
    .rename(columns={"level_0": "image", "level_1": "index"})
    # `index` is the pixel's position in the flattened image; the image is laid
    # out in rows of 8 pixels, so modulo / floor-division recover column and row.
    .eval("column=index%8")
    .eval("row=index//8")
    .pipe(
        lambda df: alt.Chart(df, height=80, width=80)
        .mark_rect()
        .encode(
            alt.X("column:O", axis=None),
            alt.Y("row:O", axis=None),
            alt.Color(
                "color:Q",
                legend=None,
                scale=alt.Scale(scheme=alt.SchemeParams("greys", extent=[0, 1])),
            ),
            # One small panel per image, ten panels per line.
            alt.Facet("image:N", columns=10, title=None),
        )
    )
)

In [4]:
from sklearn.manifold import Isomap

# Project the digits with Isomap (default settings) and plot the two resulting
# coordinates, coloured by the true digit label.
alt.Chart(
    pd.DataFrame(
        Isomap().fit_transform(digits.data), columns=["x", "y"]
    ).assign(color=digits.target),
    width=600,
    height=400,
).mark_point().encode(x="x:Q", y="y:Q", color="color:N", tooltip=["color:N"])

In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out a quarter of the digits for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=0
)
y_pred = GaussianNB().fit(X_train, y_train).predict(X_test)
# `cm` is kept as a variable for later use; the accuracy is this cell's output.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8333333333333334

In [6]:
# Heatmap of the digits confusion matrix: the matrix is melted to one row per
# (true, predicted) pair and every pair is drawn as its own 50x50 facet cell.
alt.Chart(
    pd.DataFrame(cm, index=digits.target_names, columns=digits.target_names)
    .reset_index()
    .melt(id_vars="index")
    .rename(columns={"index": "true", "variable": "predicted", "value": "count"}),
    height=50,
    width=50,
).mark_rect().encode(
    alt.Column("predicted:O", title="Predicted"),
    alt.Row("true:O", title="True"),
    # A symlog colour scale keeps small off-diagonal counts distinguishable
    # next to the much larger diagonal counts.
    alt.Color("count:Q", scale=alt.Scale(scheme="purples", type="symlog")),
    alt.Tooltip(["count:Q", "true:O", "predicted:O"]),
).configure_facet(
    spacing=0
)

# 3. Hyperparameters and Model Validation
## Thinking about Model Validation

In [7]:
from scipy import stats
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()

# Score a 1-nearest-neighbour classifier on the very samples it was fitted on.
# With n_neighbors=1 each training sample is matched against itself, so this
# training-set accuracy is optimistic and not a measure of generalization.
# (A bare `iris` expression that used to sit here was removed: only the last
# expression of a cell is displayed, so it had no effect.)
accuracy_score(
    iris.target,
    KNeighborsClassifier(n_neighbors=1).fit(iris.data, iris.target).predict(iris.data),
)

1.0

In [8]:
# 5-fold cross-validation of the same 1-nearest-neighbour model: one score per fold.
cross_val_score(
    estimator=KNeighborsClassifier(n_neighbors=1), X=iris.data, y=iris.target, cv=5
)

array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ])

In [9]:
# Leave-one-out cross-validation: every split yields a single score, and
# `stats.describe` summarises the distribution of those per-split scores.
stats.describe(
    cross_val_score(
        estimator=KNeighborsClassifier(n_neighbors=1),
        X=iris.data,
        y=iris.target,
        cv=LeaveOneOut(),
    )
)

DescribeResult(nobs=150, minmax=(0.0, 1.0), mean=0.96, variance=0.038657718120805366, skewness=-4.694855340334425, kurtosis=20.041666666666682)

## Selecting the Best Model

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Synthetic 1-D regression problem: y = 10 - 1 / (x + 0.1) plus unit Gaussian noise.
rng = np.random.RandomState(0)
data = (
    pd.DataFrame({"x": rng.rand(100) ** 2})
    .eval("y = 10 - 1/(x+.1)")
    .assign(y=lambda df: df.y + rng.randn(100))
)

# Train/validation scores of a polynomial regression for degrees 0..30.
# * `param_name` / `param_range` are passed by keyword: current scikit-learn
#   releases no longer accept them positionally.
# * The feature column is built with `.to_numpy()[:, None]`; indexing a Series
#   with `[:, None]` directly is not supported by pandas >= 2.0.
vc = validation_curve(
    make_pipeline(PolynomialFeatures(), LinearRegression()),
    data.x.to_numpy()[:, None],
    data.y,
    param_name="polynomialfeatures__degree",
    param_range=np.arange(0, 31),
)

(
    alt.Chart(data, title="Data").mark_point().encode(x="x:Q", y="y:Q")
    | (
        # vc[0] / vc[1] hold the train / test scores, one row per degree and one
        # column per CV fold; the median over folds is plotted. The degree range
        # starts at 0, so the frame's positional index equals the degree.
        pd.DataFrame({"train": np.median(vc[0], 1), "test": np.median(vc[1], 1)})
        .reset_index()
        .melt(id_vars="index")
        .pipe(
            lambda df: alt.Chart(df, title="Validation Curve")
            .mark_line(point=True)
            .encode(
                alt.X("index:O", title="Degree"),
                alt.Y("value:Q", title="Score"),
                alt.Color("variable:N", sort=None),
                alt.Tooltip(["index:O", "variable:N", "value:Q"]),
            )
        )
    )
)

## Learning Curves

In [11]:
# Same synthetic problem as for the validation curve, with 1000 samples.
rng = np.random.RandomState(0)
data = (
    pd.DataFrame({"x": rng.rand(1000) ** 2})
    .eval("y = 10 - 1/(x+.1)")
    .assign(y=lambda df: df.y + rng.randn(1000))
)


def _median_learning_curve(degree):
    """Return median train/test scores per training-set size for one degree.

    Runs a 7-fold learning curve of a polynomial regression of the given
    degree on the global `data` frame over ten training-set fractions
    (10% .. 100%). The result has one row per training-set size and the
    columns `degree`, `train` and `test`.
    """
    # `.to_numpy()[:, None]` builds the (n_samples, 1) feature matrix; indexing
    # the Series itself with `[:, None]` is not supported by pandas >= 2.0.
    _, train_scores, test_scores = learning_curve(
        make_pipeline(PolynomialFeatures(degree), LinearRegression()),
        data.x.to_numpy()[:, None],
        data.y,
        cv=7,
        train_sizes=np.linspace(0.1, 1, 10),
    )
    return pd.DataFrame(
        {
            "degree": degree,
            "train": np.median(train_scores, 1),
            "test": np.median(test_scores, 1),
        }
    )


# One small panel per polynomial degree. The x position is the ordinal step of
# the training-set size (the frame's positional index), shown without an axis.
pd.concat(_median_learning_curve(degree) for degree in np.arange(2, 11)).pipe(
    lambda df: alt.Chart(
        df.reset_index().melt(id_vars=["index", "degree"]), height=150, width=300
    )
    .mark_line(point=True)
    .encode(
        alt.X("index:O", axis=None, title=None),
        alt.Y("value:Q", scale=alt.Scale(zero=False), title="Score"),
        alt.Color("variable:N", sort=None, title="Set"),
        alt.Facet("degree:O", columns=3, title="Degree"),
        alt.Tooltip(["variable:N", "value:Q"]),
    )
)

# 4. Feature Engineering
## Derived Features

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Five-point toy dataset, extended with the in-sample predictions of
# (a) a plain linear regression and (b) the best polynomial-feature pipeline
# found by a 2-fold grid search over degrees 0..9.
# The (n_samples, 1) feature matrix is built with `.to_numpy()[:, None]`;
# indexing a Series with `[:, None]` directly is not supported by pandas >= 2.0.
data = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [4, 2, 1, 3, 7]}).assign(
    y_pred_linear=(
        lambda data: LinearRegression(fit_intercept=True)
        .fit(data.x.to_numpy()[:, None], data.y)
        .predict(data.x.to_numpy()[:, None])
    ),
    y_pred_polynomial=(
        lambda data: GridSearchCV(
            make_pipeline(PolynomialFeatures(), LinearRegression(fit_intercept=True)),
            {"polynomialfeatures__degree": range(10)},
            cv=2,
        )
        .fit(data.x.to_numpy()[:, None], data.y)
        .best_estimator_.predict(data.x.to_numpy()[:, None])
    ),
)

# Base scatter of the raw points; each panel layers one fitted line on top.
chart = alt.Chart(data).mark_point().encode(x="x:Q", y="y:Q")
(
    (
        chart.properties(title="Linear Regression")
        + chart.mark_line().encode(alt.Y("y_pred_linear:Q", title=None))
    )
    | (
        chart.properties(title="Polynomial Features + Linear Regression")
        + chart.mark_line().encode(alt.Y("y_pred_polynomial:Q", title=None))
    )
).resolve_scale(y="shared")

# 5. In Depth: Naive Bayes Classification

> Naive Bayes models are a group of extremely fast and simple classification algorithms that are often suitable for very high-dimensional datasets.

> Because they are so fast and have so few tunable parameters, they end up being very useful as a quick-and-dirty baseline for a classification problem.

> Because naive Bayesian classifiers make such stringent assumptions about data, they will generally not perform as well as a more complicated model. That said, they have several advantages:
> - They are extremely fast for both training and prediction
> - They provide straightforward probabilistic prediction
> - They are often very easily interpretable
> - They have very few (if any) tunable parameters

> This means that clusters in high dimensions tend to be more separated, on average, than clusters in low dimensions, assuming the new dimensions actually add information. For this reason, simplistic classifiers like naive Bayes tend to work as well or better than more complicated classifiers as the dimensionality grows: once you have enough data, even a simple model can be very powerful.

\begin{align}
    P(L \mid \mathrm{features}) = \frac{ P(\mathrm{features} \mid L) P(L) }{ P(\mathrm{features}) }
\end{align}

\begin{align}
    \frac{ P(L_1 \mid \mathrm{features}) }{ P(L_2 \mid \mathrm{features}) } = \frac{ P(\mathrm{features} \mid L_1)P(L_1) }{ P(\mathrm{features} \mid L_2)P(L_2) }
\end{align}

> Such a model is called a *generative* model because it specifies the hypothetical random process that generates the data.

## Gaussian Naive Bayes

> In this classifier, the assumption is that *data from each label is drawn from a simple Gaussian distribution*.

In [13]:
from sklearn.datasets import make_blobs
from sklearn.naive_bayes import GaussianNB

X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)


def chart(data):
    """Scatter column `x0` against `x1`, coloured by the nominal label `y`."""
    return (
        alt.Chart(data, height=300, width=400)
        .mark_point(filled=True)
        .encode(x="x0", y="x1", color=alt.Color("y:N", legend=None))
    )


# Training points, overlaid on a faint grid of points spanning the data's
# bounding box and coloured by the label Gaussian naive Bayes predicts there.
chart(pd.DataFrame(X).add_prefix("x").assign(y=y)) + (
    chart(
        pd.DataFrame(
            np.array(
                np.meshgrid(
                    np.linspace(X[:, 0].min(), X[:, 0].max()),
                    np.linspace(X[:, 1].min(), X[:, 1].max()),
                )
            ).T.reshape(-1, 2)
        )
        .add_prefix("x")
        # The model is fitted on the bare array `X`, so predict on a bare array
        # as well: passing the frame (with column names "x0"/"x1") triggers
        # scikit-learn's feature-name mismatch warning.
        .assign(y=lambda df: GaussianNB().fit(X, y).predict(df.to_numpy()))
    ).mark_point(filled=True, opacity=0.25)
)

> We see a slightly curved boundary in the classifications—in general, the boundary in Gaussian naive Bayes is quadratic.

## Multinomial Naive Bayes

> Another useful example is multinomial naive Bayes, where the features are assumed to be generated from a simple multinomial distribution. The multinomial distribution describes the probability of observing counts among a number of categories, and thus multinomial naive Bayes is most appropriate for features that represent counts or count rates.

> One place where multinomial naive Bayes is often used is in text classification, where the features are related to word counts or frequencies within the documents to be classified.

In [14]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 newsgroups corpus with the loader's default arguments, then show
# the attributes of the returned object together with the list of newsgroup names.
data = fetch_20newsgroups()
dir(data), data.target_names

(['DESCR', 'data', 'filenames', 'target', 'target_names'],
 ['alt.atheism',
  'comp.graphics',
  'comp.os.ms-windows.misc',
  'comp.sys.ibm.pc.hardware',
  'comp.sys.mac.hardware',
  'comp.windows.x',
  'misc.forsale',
  'rec.autos',
  'rec.motorcycles',
  'rec.sport.baseball',
  'rec.sport.hockey',
  'sci.crypt',
  'sci.electronics',
  'sci.med',
  'sci.space',
  'soc.religion.christian',
  'talk.politics.guns',
  'talk.politics.mideast',
  'talk.politics.misc',
  'talk.religion.misc'])

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Keep only the newsgroups whose name mentions graphics, religion or space, and
# fetch the train and test subsets for that same selection.
train, test = (
    pd.Series(data.target_names)
    .pipe(lambda s: s[s.str.contains(r"graphics|religion|space")])
    .pipe(
        lambda categories: (
            fetch_20newsgroups(categories=categories, subset="train"),
            fetch_20newsgroups(categories=categories, subset="test"),
        )
    )
)

# TF-IDF features feeding a multinomial naive Bayes classifier. The integer
# targets are mapped to newsgroup names so the model predicts readable labels.
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(
    train.data, pd.Series(train.target_names)[train.target]
)

alt.Chart(
    pd.DataFrame(
        # `labels` pins the row/column order of the matrix to the very list used
        # for the frame's index and columns, instead of relying on the sorted
        # order of whichever labels happen to occur in the data.
        confusion_matrix(
            pd.Series(test.target_names)[test.target],
            model.predict(test.data),
            labels=test.target_names,
        ),
        columns=test.target_names,
        index=test.target_names,
    )
    .reset_index()
    .melt(id_vars="index")
    .rename(columns={"index": "true", "variable": "predicted", "value": "count"})
).mark_rect(height=100, width=100).encode(
    alt.Column("predicted:O", title="Predicted"),
    alt.Row("true:O", title="True"),
    alt.Color("count:Q", scale=alt.Scale(scheme="purples", type="symlog")),
    alt.Tooltip(["count:Q", "true:O", "predicted:O"]),
).configure_facet(
    spacing=0
)

In [16]:
# Classify a few hand-written snippets and show the per-class probabilities
# next to the predicted category.
pd.Series(
    [
        "sending a payload to the ISS",
        "discussing islam vs atheism",
        "determining the screen resolution",
        "black hole",
        "quasar",
    ]
).to_frame("text").assign(category=lambda df: model.predict(df.text)).pipe(
    lambda df: df.join(
        pd.DataFrame(
            # The columns of `predict_proba` follow `model.classes_`, so label
            # them from that attribute rather than from a separately held list
            # of names that merely happens to be in the same order.
            model.predict_proba(df.text),
            columns=model.classes_,
        ).add_prefix("proba: ")
    )
)

Unnamed: 0,text,category,proba: comp.graphics,proba: sci.space,proba: soc.religion.christian,proba: talk.religion.misc
0,sending a payload to the ISS,sci.space,0.199502,0.421163,0.250665,0.12867
1,discussing islam vs atheism,soc.religion.christian,0.254114,0.145384,0.395239,0.205263
2,determining the screen resolution,comp.graphics,0.581839,0.162995,0.148421,0.106745
3,black hole,soc.religion.christian,0.174071,0.301832,0.42316,0.100938
4,quasar,soc.religion.christian,0.271249,0.27543,0.278216,0.175105
