# PCA Applied to a Nutritional Dataset

This notebook is based on:
https://scikit-learn.org/stable/auto_examples/decomposition/plot_incremental_pca.html#sphx-glr-auto-examples-decomposition-plot-incremental-pca-py

This code loads the data from a spreadsheet and then visualises it after PCA. The first static plot is created using matplotlib, and the second one is interactive.

To run this notebook you need JupyterLab, which can be installed with:

`conda install -c conda-forge jupyterlab`

## Load the data

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Reading .xlsx files requires the openpyxl package: pip install openpyxl
df = pd.read_excel('data/CEqbBq1zDue_nutritional_data_english.xlsx')
# dropna() discards every row that contains at least one NaN, no matter how
# many valid values that row also holds.
df = df.dropna()

# Drop the object-dtype (categorical/text) columns, except for the
# 'Shrt_Desc' description column, which is kept.
to_be_dropped = df.select_dtypes(include=[object]).columns.drop('Shrt_Desc')
df = df.drop(columns=to_be_dropped)

df.head()

Unnamed: 0,NDB_No,Shrt_Desc,Water,Energ_Kcal,Protein,Lipd_Tot,Ash,Carbohydrt,Fiber_TD,Sugar_Tot,...,Beta_Crypt,Lycopene,Lut+Zea,FA_Sat,FA_Mono,FA_Poly,Cholestrl,GmWt_1,GmWt_2,Refuse_Pct
0,1001,"BUTTER,WITH SALT",15.87,717,0.85,81.110001,2.11,0.06,0.0,0.06,...,0.0,0.0,0.0,51.368,21.021,3.043,215.0,227.0,14.2,0.0
1,1002,"BUTTER,WHIPPED,WITH SALT",15.87,717,0.85,81.110001,2.11,0.06,0.0,0.06,...,0.0,0.0,0.0,50.488998,23.426001,3.012,219.0,151.0,9.4,0.0
2,1003,"BUTTER OIL,ANHYDROUS",0.24,876,0.28,99.480003,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,61.924,28.732,3.694,256.0,205.0,12.8,0.0
3,1004,"CHEESE,BLUE",42.41,353,21.4,28.74,5.11,2.34,0.0,0.5,...,0.0,0.0,0.0,18.669001,7.778,0.8,75.0,28.35,17.3,0.0
4,1005,"CHEESE,BRICK",41.110001,371,23.24,29.68,3.18,2.79,0.0,0.51,...,0.0,0.0,0.0,18.764,8.598,0.784,94.0,132.0,113.0,0.0


In [2]:
# For a large dataset, a random subset of the rows can be used instead:
# df = df.sample(frac=0.5)

# Summary statistics of the numeric columns, followed by the row count.
print(df.describe())
print("The number of rows is: " + str(df.shape[0]))

             NDB_No        Water   Energ_Kcal      Protein     Lipd_Tot  \
count   1834.000000  1834.000000  1834.000000  1834.000000  1834.000000   
mean   12388.346238    56.314886   219.244820    11.780125    10.903599   
std     6455.664486    30.289503   173.868759    11.441660    16.809104   
min     1001.000000     0.000000     0.000000     0.000000     0.000000   
25%     8116.250000    34.725000    72.000000     1.742500     0.450000   
50%    11948.500000    63.255001   194.500000     7.600000     4.650000   
75%    17110.500000    81.575001   309.000000    21.665000    14.267500   
max    44158.000000    99.900002   902.000000    85.599998   100.000000   

               Ash   Carbohydrt     Fiber_TD    Sugar_Tot      Calcium  ...  \
count  1834.000000  1834.000000  1834.000000  1834.000000  1834.000000  ...   
mean      1.915174    18.890703     1.726936     8.088272    70.759542  ...   
std       4.135405    25.446314     3.482732    16.363228   286.552759  ...   
min     

In [3]:
# Convert the DataFrame into NumPy arrays and standardise the features.

# Feature matrix: every column except 'NDB_No' (the item number, not a
# nutrient) and 'Shrt_Desc' (the text description). Selecting by name rather
# than by position keeps this correct if the column order ever changes.
X = df.drop(columns=['NDB_No', 'Shrt_Desc']).to_numpy()
# Names of the food items, aligned row-by-row with X.
X_names = df['Shrt_Desc'].to_numpy()

# Standardise each feature column: (x_i - mean) / stdev, so that features
# measured on large scales do not dominate the principal components.
X = StandardScaler().fit_transform(X)
# Show two rows (indices 1 and 2) of the standardised data, then its shape.
print(X[1:3, :])
print(X.shape)


[[-1.33564149  2.86360207 -0.9555525   4.17782821  0.04712441 -0.74021878
  -0.49599186 -0.49076267 -0.16322403 -0.66087865 -0.56901283 -0.43608346
  -0.47823362  0.33945392 -0.48451856 -0.39877228 -0.24982947 -0.30154344
  -0.12380911 -0.45823956 -0.51992394 -0.82364644 -0.48720927 -0.76747239
  -0.36239146 -0.28797329 -0.2709373  -0.39008752 -0.21951988  0.5442443
   0.98930052  1.02961434  0.50461451 -0.07100653 -0.11439706 -0.0566673
  -0.06507032 -0.10806672 -0.1648604   5.39059123  2.6512187   0.25605046
   1.34237708  0.05180403 -0.43648043 -0.49548896]
 [-1.85180258  3.77833434 -1.00538404  5.27098657 -0.46324281 -0.74257733
  -0.49599186 -0.49443042 -0.23303824 -0.72874149 -0.6061545  -0.4974457
  -0.51601385 -0.26999442 -0.49546395 -0.43746318 -0.25257097 -0.32258211
  -0.12380911 -0.46885765 -0.6163352  -0.83475462 -0.61253966 -0.77484408
  -0.39966717 -0.28797329 -0.31107346 -0.42313184 -0.2532316   0.7083029
   1.24059505  1.28220647  0.67217221 -0.04531618 -0.11439706 -0.

### TODO: now you can run PCA here and generate scatterplots
