# Example 3: Characterisation of volatile profiles in 50 native Peruvian chili pepper using solid phase microextraction–gas chromatography mass spectrometry (SPME–GCMS)

In the [original article](https://www.sciencedirect.com/science/article/pii/S0963996916303532?via%3Dihub), the authors characterised the volatile composition of 50 fresh Peruvian chili peppers. The authors found that [terpenes](https://en.wikipedia.org/wiki/Terpene), [esters](https://en.wikipedia.org/wiki/Ester), and [hydrocarbons](https://en.wikipedia.org/wiki/Hydrocarbon) were the major compounds present. Then, they used principal component analysis (PCA) to group the 50 chili peppers.

![](https://ars.els-cdn.com/content/image/1-s2.0-S0963996916303532-fx1.jpg)


Here, we only recreate the PCA shown in Fig. 1 of the original article.

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

# Select Matplotlib's interactive `notebook` backend for the figures below.
# NOTE(review): this backend targets the classic Notebook front end; on
# JupyterLab the usual interactive choice is `%matplotlib widget` (ipympl) --
# confirm which environment this notebook is meant to run in.
%matplotlib notebook
# Global seaborn theme: "ticks" axes style, "notebook" scaling, "muted" palette.
sns.set_theme(style="ticks", context="notebook", palette="muted")

In [None]:
# Read the measurements from "peppers.csv" (path is relative to the working
# directory) into a DataFrame; later cells refer to it as `data`.
data = pd.read_csv("peppers.csv")
# Bare last expression: show the table as the cell output.
data

In [None]:
# Per-column summary statistics, as a quick sanity check before running PCA.
data.describe()

In [None]:
# Prepare the PCA input needed to reproduce Fig. 1.
# The five compound-class columns are the PCA variables; the "Compound"
# column is kept aside because it is used later to label individual samples.
variables = ["Ketone", "Ester", "Aldehyde", "Hydrocarbon", "Terpene"]
peppers = data["Compound"].values
X = data.loc[:, variables].to_numpy()

# Fit a PCA with default settings and project every row of X onto the
# principal components; `scores` has one row per row of `data`.
pca = PCA()
scores = pca.fit_transform(X)

In [None]:
# Reproduce Fig. 1: loadings (left panel) and scores (right panel) for the
# first two principal components.
fig, (ax1, ax2) = plt.subplots(
    constrained_layout=True, ncols=2, figsize=(8, 4)
)

# Share of the total variance (in percent) captured by PC1 and PC2; computed
# once and reused in the axis labels of both panels.
pc1_var, pc2_var = 100 * pca.explained_variance_ratio_[:2]

# First plot the so-called loadings: row k of `pca.components_` is indexed
# below by variable position, giving each variable's weight on component k.
loadings = pca.components_
pca1 = loadings[0, :]
pca2 = loadings[1, :]

for i, vari in enumerate(variables):
    scat = ax1.scatter(pca1[i], pca2[i], s=100, edgecolor="k", zorder=2)
    # Name tag placed to the left of the marker; its outline reuses the
    # marker's face colour so tag and marker are visibly paired.
    ax1.text(
        pca1[i] - 0.1,
        pca2[i],
        vari,
        ha="right",
        va="center",
        bbox={
            "alpha": 0.9,
            "facecolor": "w",
            "edgecolor": scat.get_facecolor(),
            "lw": 1.0,
            "boxstyle": "round",
        },
    )
ax1.set_aspect("equal")
# Dotted reference lines through the origin, drawn below the markers.
ax1.axhline(y=0, ls=":", color="k", zorder=1)
ax1.axvline(x=0, ls=":", color="k", zorder=1)
ax1.set_xlim(-1, 1)
ax1.set_ylim(-1, 1)
ax1.set(
    xlabel=f"Loadings, PC1 ({pc1_var:.2f} %)",
    ylabel=f"Loadings, PC2 ({pc2_var:.2f} %)",
)

# Then plot the scores:
ax2.axhline(y=0, ls=":", color="k", zorder=1)
ax2.axvline(x=0, ls=":", color="k", zorder=1)
ax2.scatter(scores[:, 0], scores[:, 1])
ax2.set(
    xlabel=f"Scores, PC1 ({pc1_var:.2f} %)",
    ylabel=f"Scores, PC2 ({pc2_var:.2f} %)",
)

# Label two selected samples, identified by their row position in `data`
# (and therefore in `scores`). One loop replaces the previously duplicated
# annotation calls; add or remove row positions here to change the labels.
highlighted_rows = (17, 28)
bbox = {"facecolor": "w", "boxstyle": "round", "lw": 1, "edgecolor": "k"}
for row in highlighted_rows:
    ax2.text(
        scores[row, 0] + 5,  # shift the label to the right of its marker
        scores[row, 1],
        f"Pepper: {peppers[row]}",
        ha="left",
        va="center",
        bbox=bbox,
    )
ax2.set_aspect("equal")
sns.despine(fig=fig)

In [None]:
# Show the row(s) of `data` whose "Compound" entry equals the string "374".
# NOTE(review): the comparison is against a string; if `read_csv` parsed this
# column as integers the filter returns no rows -- confirm the column dtype.
data.loc[data["Compound"].eq("374")]

In [None]:
# Show the row(s) of `data` whose "Compound" entry equals the string "167".
# NOTE(review): the comparison is against a string; if `read_csv` parsed this
# column as integers the filter returns no rows -- confirm the column dtype.
data.loc[data["Compound"].eq("167")]