In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# Global seaborn theme used by the figures created in this notebook.
sns.set_theme(style="ticks", context="notebook", palette="muted")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the breast cancer data set shipped with scikit-learn and collect the
# feature columns and the class label ("target") in a single data frame.
data_set = load_breast_cancer()
data = pd.DataFrame(
    data_set["data"], columns=data_set["feature_names"]
).assign(target=data_set["target"])

In [None]:
# Show the data frame (feature columns plus "target") as the cell output.
data

In [None]:
# Readable names for the two classes. The keys are strings (not ints) because
# they are looked up with the legend's label text when the plots are styled.
class_names = {
    "0": "Malignant",
    "1": "Benign",
}

In [None]:
# The original data set contains many variables. For this example we select
# just 10 of them, namely the "mean ..." measurements:
variables = [
    f"mean {measurement}"
    for measurement in (
        "radius",
        "texture",
        "perimeter",
        "area",
        "smoothness",
        "compactness",
        "concavity",
        "concave points",
        "symmetry",
        "fractal dimension",
    )
]
# To use all variables, uncomment the next line:
# variables = [i for i in data.columns if i!= 'target']
print(variables)

In [None]:
# Scale the selected variables with scikit-learn's `scale` before running PCA.
X = scale(data[variables].to_numpy())

# Do PCA, but only ask for 4 principal components, and compute the scores
# (the samples projected onto those components).
pca = PCA(n_components=4)
scores = pca.fit_transform(X)

# Display the component matrix (transposed later to obtain the loadings).
pca.components_

In [None]:
# Plot the explained variance: per component (left panel) and cumulative
# (right panel).
variance = pca.explained_variance_ratio_
components = np.arange(1, len(variance) + 1)  # 1-based component numbers

fig, (ax1, ax2) = plt.subplots(
    ncols=2, figsize=(8, 4), sharex=True, constrained_layout=True
)

# Left panel: one bar per component, annotated with its percentage.
bar = ax1.bar(components, 100.0 * variance)
ax1.bar_label(bar, fmt="{:.1f}")
ax1.set(
    xlabel="Principal component no.",
    ylabel="Explained variance (%) per component",
)

# Right panel: running total, with a dotted reference line at 100 %.
ax2.plot(
    components, 100.0 * np.cumsum(variance), marker="o", markersize=8, lw=2
)
ax2.axhline(y=100, ls=":", color="k")
ax2.set(
    xlabel="Number of principal components",
    ylabel="Explained variance (%)",
)

sns.despine(fig=fig)

In [None]:
# Plot scores: every sample in the plane of the first two principal
# components, coloured by its class label.
fig, ax = plt.subplots(constrained_layout=True)
sns.scatterplot(
    data=data,
    x=scores[:, 0],
    y=scores[:, 1],
    hue="target",
    s=90,
    ax=ax,
)

# Everything below only styles the plot.
# Dotted lines through the origin:
ax.axhline(y=0, ls=":", c="k", lw=1)
ax.axvline(x=0, ls=":", c="k", lw=1)
# Axis labels including the share of variance explained by each component:
ax.set(
    xlabel=f"Scores PC1 ({pca.explained_variance_ratio_[0]*100:.2g}%)",
    ylabel=f"Scores PC2 ({pca.explained_variance_ratio_[1]*100:.2g}%)",
)

# Rebuild the legend with a title, then replace the raw target labels with
# the readable class names.
legend = ax.legend(title="Cell type:", alignment="left")
for lab in legend.get_texts():
    lab.set_text(class_names[lab.get_text()])

sns.despine(fig=fig)

In [None]:
def plot_loadings_arrows(pca, idxi=0, idxj=1, labels=None):
    """Plot the loadings of two principal components as arrows from the origin.

    Parameters
    ----------
    pca : object
        A fitted PCA-like object exposing ``components_``,
        ``explained_variance_ratio_`` and ``n_features_in_``.
    idxi : int, optional
        Zero-based index of the principal component shown on the x-axis.
    idxj : int, optional
        Zero-based index of the principal component shown on the y-axis.
    labels : sequence of str, optional
        Text to draw at the tip of each arrow, one entry per variable. When
        omitted, the variable's index is used.

    Notes
    -----
    A new figure is created for the plot; nothing is returned.
    """
    fig, ax = plt.subplots(constrained_layout=True)

    # Transpose so that rows are variables and columns are components.
    loadings = pca.components_.T

    load1 = loadings[:, idxi]
    load2 = loadings[:, idxj]

    var1 = pca.explained_variance_ratio_[idxi] * 100
    var2 = pca.explained_variance_ratio_[idxj] * 100

    # Label the axes with the components actually plotted (1-based), not a
    # hard-coded "PC1"/"PC2".
    ax.set(
        xlabel=f"Loadings PC{idxi + 1} ({var1:.2g}%)",
        ylabel=f"Loadings PC{idxj + 1} ({var2:.2g}%)",
    )
    ax.axhline(y=0, ls=":", color="k")
    ax.axvline(x=0, ls=":", color="k")

    ax.scatter(load1, load2, color="none")  # to adjust the axes

    for i in range(pca.n_features_in_):
        x, y = load1[i], load2[i]

        # An annotation with empty text is used only for its arrow, drawn
        # from the origin to the loading of variable i.
        ax.annotate(
            "",
            xy=(x, y),
            xytext=(0, 0),
            arrowprops=dict(
                arrowstyle="-|>", lw=2, color="red", mutation_scale=25
            ),
        )
        txt = labels[i] if labels is not None else f"{i}"
        ax.text(x, y, txt, fontsize="xx-small")

    # Keep the default +/-0.6 window, but widen it (with a 10 % margin) when a
    # loading would otherwise be clipped.
    max_abs = max(abs(load1).max(), abs(load2).max())
    limit = 0.6 if max_abs <= 0.6 else 1.1 * max_abs
    ax.set_xlim(-limit, limit)
    ax.set_ylim(-limit, limit)
    ax.set_aspect("equal")
    sns.despine(fig=fig)

In [None]:
# Draw the loadings of the first two principal components (indices 0 and 1),
# labelling each arrow with the name of its variable.
plot_loadings_arrows(pca, idxi=0, idxj=1, labels=variables)

In [None]:
# From the previous plot, it looks like we can separate (to some degree) by using
# just the mean area and the mean smoothness. Let us try this:
fig, ax = plt.subplots(constrained_layout=True)
# Pass the axes explicitly instead of relying on pyplot's implicit "current
# axes", in line with how the scores plot is drawn.
sns.scatterplot(
    data=data, x="mean area", y="mean smoothness", hue="target", s=90, ax=ax
)
ax.legend(title="Cell type:", alignment="left")
legend = ax.get_legend()
# Replace the raw target labels in the legend with the readable class names.
for lab in legend.texts:
    lab.set_text(class_names[lab.get_text()])
sns.despine(fig=fig)