# PCA of Coffee Bean Spectra
- Originally completed for a graded assignment in August 2021
- Modified to remove assignment specific details

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# importing data
# None of the three CSV files is read with a header row (header=None), so
# pandas assigns default integer column labels unless names are supplied.
eigenvalues_path = 'coffee_eigenvalues.csv'
eigenvectors_path = 'coffee_eigenvectors.csv'
spectra_path = 'DS19hH2_dk0_FTIR_Spectra_instant_coffee.csv'

# single column, explicitly named 'eigenvalues'
eigenvalues_df = pd.read_csv(eigenvalues_path, header=None, names=['eigenvalues'])
eigenvectors_df = pd.read_csv(eigenvectors_path, header=None)
data_df = pd.read_csv(spectra_path, header=None)

In [3]:
# finding explained variance ratios
# Cumulative explained variance ratio: entry k-1 is the sum of the first k
# eigenvalues divided by the sum of the first N_COMPONENTS eigenvalues.
# N_COMPONENTS replaces the literal 56 that was previously repeated in the
# array size, the loop bound and (as 55) the normalising index.
N_COMPONENTS = 56
eigenvalues_series = pd.Series(eigenvalues_df['eigenvalues'].values.tolist())
eigenvalues_cumsum_array = np.array(eigenvalues_series.cumsum())
# One vectorised division instead of an element-by-element Python loop.
# Indexing [N_COMPONENTS - 1] still raises IndexError if fewer than
# N_COMPONENTS eigenvalues were loaded, exactly as the loop did.
explained_variance_ratio = (
    eigenvalues_cumsum_array[:N_COMPONENTS] / eigenvalues_cumsum_array[N_COMPONENTS - 1]
)

In [7]:
# plotting explained variance ratios
# Point k on the x axis shows the ratio stored at explained_variance_ratio[k-1],
# i.e. the value for the first k principal components. The x range is derived
# from the array length so the two can never fall out of step.
component_counts = range(1, len(explained_variance_ratio) + 1)
fig1 = px.line(
    x = component_counts,
    y = explained_variance_ratio,
    labels = {"x": "Number of Principal Components", "y": "Explained Variance Ratio"},
    title = "Explained Variance Ratios"
)

fig1.show()

Six principal components are sufficient to represent the spectra dataset, because the explained variance ratio for the first six principal components is greater than 0.99 (99%).

In [5]:
# projecting data onto principal components
# Rows 1-56 of data_df are taken as the spectra (row 0 is read as the
# wavelength axis by the spectrum plots further down the notebook).
spectra = np.asarray(data_df.iloc[1:57])

# centre every column on its mean; the means are kept for the reconstruction
column_means = spectra.mean(axis=0)
data_array = spectra - column_means

# the first six rows of eigenvectors_df are treated as the principal
# components; transposing gives one component per column
pc_array = np.asarray(eigenvectors_df.iloc[:6, :286]).T
projections_array = data_array @ pc_array

pc_names = ['PC%d' % k for k in range(1, 7)]
coffee_labels = ['Arabica'] * 29 + ['Robusta'] * 27
projections_df = pd.DataFrame(projections_array, columns=pc_names)
# the first 29 rows are labelled Arabica and the remaining 27 Robusta
projections_df['Coffee Type'] = coffee_labels

In [6]:
# plotting spectra projections
# Scatter-plot matrix of the six principal-component projections, with points
# coloured by coffee type.
pc_columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
fig2 = px.scatter_matrix(
    projections_df,
    dimensions = pc_columns,
    color = 'Coffee Type',
    title = "Principal Component Projections"  # spelling fixed (was "Prinicpal")
    )
# hide the diagonal panels of the matrix
fig2.update_traces(diagonal_visible=False)
fig2.show()

In [8]:
# plotting a pair of principal components
# PC3 goes on the x axis and PC1 on the y axis; points are coloured by coffee type
pc_pair_axes = {'x': 'PC3', 'y': 'PC1'}
fig3 = px.scatter(projections_df, color='Coffee Type', title="PC1 vs PC3", **pc_pair_axes)
fig3.show()

PC1 and PC3 are a pair of principal components that provide a good separation of the different coffee types when the spectra are projected into this subspace.

In [9]:
# reconstructing spectra
# Map the projections back through the transposed component matrix, then add
# back the column means that were subtracted before projecting.
back_projection = projections_array @ pc_array.T
reconstructed_spectra = back_projection + column_means
print(reconstructed_spectra.shape)
# first and last reconstructed rows, used as the Arabica and Robusta examples
arabica_row = reconstructed_spectra[0]
robusta_row = reconstructed_spectra[55]

(56, 286)


In [10]:
# plotting arabica spectra
# Long-format frame: the first 286 rows hold the original signal (row 1 of
# data_df) and the next 286 the reconstructed one, sharing the wavelength
# values taken from row 0 of data_df.
wavelengths = list(data_df.iloc[0, :])
arabica_original = list(data_df.iloc[1, :])
arabica_df = pd.DataFrame({
    'Wavelength': wavelengths + wavelengths,
    'Signal': arabica_original + list(arabica_row),
    'Signal Type': ['Original'] * 286 + ['Reconstructed'] * 286,
})
# one line per signal type
fig4 = px.line(arabica_df, x='Wavelength', y='Signal', color='Signal Type',
               title='Original vs Reconstructed Arabica Spectra')
fig4.show()

In [11]:
# plotting robusta spectra
# Long-format frame: the first 286 rows hold the original signal (row 56 of
# data_df) and the next 286 the reconstructed one, sharing the wavelength
# values taken from row 0 of data_df.
wavelengths = list(data_df.iloc[0, :])
robusta_original = list(data_df.iloc[56, :])
robusta_df = pd.DataFrame({
    'Wavelength': wavelengths + wavelengths,
    'Signal': robusta_original + list(robusta_row),
    'Signal Type': ['Original'] * 286 + ['Reconstructed'] * 286,
})
# one line per signal type
fig5 = px.line(robusta_df, x='Wavelength', y='Signal', color='Signal Type',
               title='Original vs Reconstructed Robusta Spectra')
fig5.show()