In [None]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import scale
from psynlig import (
    pca_explained_variance,
    pca_explained_variance_bar,
    pca_2d_scores,
    pca_2d_loadings,
    pca_1d_loadings,
)
plt.style.use('seaborn-notebook')
%matplotlib notebook

In [None]:
# Fetch the breast cancer data set that ships with scikit-learn and wrap it in
# a DataFrame: one column per feature, plus the class label as 'target'.
data_set = load_breast_cancer()
data = pd.DataFrame(data=data_set.data, columns=data_set.feature_names)
data['target'] = data_set.target

In [None]:
# Preview the first few rows rather than rendering the whole frame.
data.head()

In [None]:
# Display name for each value found in the 'target' column; used to label the
# classes in the plots below.
class_names = {
    0: 'Malignant',
    1: 'Benign',
}

In [None]:
# The data set contains many more variables; for this example we keep only
# the ten "mean ..." measurements:
variables = ['mean ' + suffix for suffix in (
    'radius',
    'texture',
    'perimeter',
    'area',
    'smoothness',
    'compactness',
    'concavity',
    'concave points',
    'symmetry',
    'fractal dimension',
)]
# To use every variable instead, uncomment the next line:
# variables = [col for col in data.columns if col != 'target']
print(variables)

In [None]:
# Standardise the selected variables column-wise before the decomposition so
# that variables measured on larger scales do not dominate the components.
X = scale(data[variables].values)
# Ask for the first 4 principal components only. Depending on the
# scikit-learn version and the data shape, the automatically chosen solver may
# be a randomized one, so the seed is pinned to keep re-runs reproducible.
# (random_state is ignored by the deterministic solvers.)
pca = PCA(n_components=4, random_state=0)
scores = pca.fit_transform(X)
# Show the component vectors: one row per component, one column per variable.
pca.components_

In [None]:
# Plot the variance explained by the fitted principal components.
# The marker options are collected in one place and forwarded to the plotter.
marker_style = {'marker': 'o', 'markersize': 12, 'alpha': 0.8}
pca_explained_variance(pca, **marker_style);

In [None]:
# Bar-plot variant of the explained variance plot for the fitted PCA.
# The trailing semicolon keeps the call's return value out of the cell output.
pca_explained_variance_bar(pca);

In [None]:
# Plot the scores for two pairs of components: PC1 vs PC2 and PC1 vs PC3.
# The 'target' column and class_names tell the plotter which class each
# sample belongs to and what that class is called.
component_pairs = {(1, 2), (1, 3)}
pca_2d_scores(
    pca,
    scores,
    class_data=data['target'],
    class_names=class_names,
    select_components=component_pairs,
    s=150,
    alpha=0.8,
);

In [None]:
# Plot the loadings for PC1 and PC2.
# Settings forwarded to psynlig for the variable-name text in the plot.
# NOTE(review): 'show': False presumably hides the text so that only the
# legend identifies the variables; confirm against the psynlig documentation.
text_settings = {
    'fontsize': 'small',
    'outline': {'foreground': '0.5'},
    'show': False,
}

_, axes = pca_2d_loadings(
    pca,
    variables,
    select_components={(1, 2),},
    text_settings=text_settings,
    cmap='Spectral',
)
for axi in axes:
    leg = axi.legend(fontsize='small', ncol=2, loc='lower left')
    # Legend.legendHandles was renamed to Legend.legend_handles in
    # matplotlib 3.7 and the old attribute was removed in later releases.
    # Use the new name when present and fall back to the old one otherwise.
    handles = getattr(leg, 'legend_handles', None)
    if handles is None:
        handles = leg.legendHandles
    # Give every legend marker the same fixed size.
    # Assumes the handles are scatter collections (they must offer set_sizes).
    for legi in handles:
        legi.set_sizes([75.0])

In [None]:
# Draw scores and loadings in the same figure for PC1 vs PC2.
# loading_settings is handed to psynlig unchanged; it requests a biplot with
# a legend and without text.
loading_settings = dict(
    add_text=False,
    add_legend=True,
    biplot=True,
)

biplot_pairs = {(1, 2)}
pca_2d_scores(
    pca,
    scores,
    xvars=variables,
    class_data=data['target'],
    class_names=class_names,
    select_components=biplot_pairs,
    loading_settings=loading_settings,
    s=100,
    alpha=0.8,
);

In [None]:
# Bar plots of how much each variable contributes to PC1 and to PC2.
components_to_show = {1, 2}
pca_1d_loadings(pca, variables, select_components=components_to_show, plot_type='bar');

In [None]:
# The previous plot suggests that the classes can be separated, to some
# degree, with just two variables: the mean area and the mean smoothness.
# Scatter one against the other, one series per class, to check.
x, y = 'mean area', 'mean smoothness'

# Split the samples by their class label.
class0 = data.loc[data['target'] == 0]
class1 = data.loc[data['target'] == 1]

fig1, ax1 = plt.subplots(constrained_layout=True)
for label, subset in zip((0, 1), (class0, class1)):
    ax1.scatter(subset[x].values, subset[y].values, s=100, label=class_names[label])
ax1.legend()
ax1.set_xlabel(x)
ax1.set_ylabel(y)
# Save the figure next to the notebook.
fig1.savefig('two_variables.pdf', bbox_inches='tight')