In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

%matplotlib inline

#### Import the data cleaned in another notebook and set the NDB number (`NDB_No`) as the index.

In [2]:
# Load the nutrient matrix that was cleaned in a separate notebook.
matrix_path = r"../../data/Nutrients_Branded_Foods_2018/Matrix_Nutrients_Branded_Foods_2018.csv.gz"
nutrients_matrix = pd.read_csv(matrix_path)
# Report the shape while NDB_No is still a regular column (before it becomes the index).
print(nutrients_matrix.shape)
# Use the NDB number as the row index so only nutrient columns remain as features.
nutrients_matrix = nutrients_matrix.set_index("NDB_No")
nutrients_matrix.head()

(237837, 95)


Unnamed: 0_level_0,"Calcium, Ca","Carbohydrate, by difference",Cholesterol,Energy,"Fatty acids, total saturated","Fatty acids, total trans","Fiber, total dietary","Iron, Fe",Protein,"Sodium, Na",...,18:2 undifferentiated,Glucose (dextrose),Starch,Lutein + zeaxanthin,"Fluoride, F","18:2 n-6 c,c","Choline, from phosphotidyl choline","18:3 n-3 c,c,c (ALA)",Epigallocatechin-3-gallate,Vitamin D3 (cholecalciferol)
NDB_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45001524,50.0,35.0,25.0,200.0,3.75,0.0,0.0,0.0,2.5,75.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45001528,0.0,43.24,0.0,162.0,0.0,0.0,0.0,0.0,0.0,703.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45001529,0.0,41.18,0.0,176.0,0.0,0.0,0.0,0.0,0.0,676.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45001530,0.0,34.29,0.0,143.0,0.0,0.0,0.0,0.0,0.0,971.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45001531,0.0,45.95,0.0,189.0,0.0,0.0,0.0,0.0,0.0,757.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Create two scaled versions of the data for comparison: MinMax and Standard

In [9]:
# Rescale every nutrient column with MinMaxScaler (default settings).
# The fitted scaler is kept in `min_max_scaler` so it can be reused later.
min_max_scaler = MinMaxScaler()
nutrients_min_max_scaled = min_max_scaler.fit_transform(nutrients_matrix)

In [4]:
# Standardize every nutrient column with StandardScaler (default settings).
# The fitted scaler is kept in `standard_scaler` so it can be reused later.
standard_scaler = StandardScaler()
nutrients_standard_scaled = standard_scaler.fit_transform(nutrients_matrix)

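#### Loop through the three datasets (original, MinMax scaled and standard scaled) for different variance thresholds
We would like to create a table showing how many principal components each dataset needs at each variance threshold, so that we can compare them and choose the best strategy moving forward.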

In [6]:
# Gather the original, MinMax-scaled and standard-scaled matrices for side-by-side comparison.
data = [
    nutrients_matrix,
    nutrients_min_max_scaled,
    nutrients_standard_scaled,
]

In [13]:
#create a list of the three datasets for looping
data = [nutrients_matrix, nutrients_min_max_scaled, nutrients_standard_scaled]
#variance explanation thresholds to evaluate: 0.75, 0.80, 0.85, 0.90 and 0.95
#(endpoint=False keeps 1.0 out, since PCA needs 0 < n_components < 1 for this mode)
thresholds = np.linspace(0.75, 1, num=5, endpoint=False)
#list which will store our results for each dataset: one {threshold: n_components} dict
#per dataset, in the same order as `data`
results = []
for dataset in data:
    variance_columns = {}
    #reason (from sklearn docs):
    #If 0 < n_components < 1 and svd_solver == 'full', select the number of components such that the amount of
    #variance that needs to be explained is greater than the percentage specified by n_components.
    for threshold in thresholds:
        #set pca using the variance explanation requirement
        pca = PCA(n_components = threshold, svd_solver = 'full')
        pca.fit(dataset)
        #the fitted estimator already records how many components it selected, so there is
        #no need to project the whole matrix just to count the resulting columns
        variance_columns[threshold] = int(pca.n_components_)
    results.append(variance_columns)

In [21]:
# Human-readable labels: the threshold column plus one column per dataset,
# in the order the datasets were processed.
column_labels = {
    "index": "% Variance Explained",
    0: "Original DF",
    1: "MinMaxScaled Df",
    2: "StandardScaled Df",
}
# One row per variance threshold, one column per dataset.
g = (
    pd.DataFrame(results)
    .T
    .reset_index()
    .rename(columns = column_labels)
)
g

Unnamed: 0,% Variance Explained,Original DF,MinMaxScaled Df,StandardScaled Df
0,0.75,2,2,38
1,0.8,2,3,42
2,0.85,2,3,47
3,0.9,3,4,52
4,0.95,3,7,59


#### Moving forward with the MinMax-scaled data (PCA retaining 95% of the variance)

In [10]:
# Fit a PCA that keeps enough components to explain 95% of the variance
# of the MinMax-scaled data, without whitening.
pca = PCA(
    n_components = 0.95,
    svd_solver = 'full',
    whiten = False,
).fit(nutrients_min_max_scaled)
# Project the same data onto the retained components.
nutrients_min_max_scaled_pca = pca.transform(nutrients_min_max_scaled)

In [14]:
# Wrap the PCA output in a DataFrame so the notebook renders it as a table
# (one column per retained component; pandas abbreviates the rows in the middle).
pd.DataFrame(nutrients_min_max_scaled_pca)

Unnamed: 0,0,1,2,3,4,5,6
0,0.089476,-0.073159,0.115016,-0.010595,-0.023487,0.005746,0.007571
1,0.202965,-0.143760,0.133676,-0.028313,-0.028881,-0.013643,-0.002431
2,0.172633,-0.142605,0.123067,-0.029844,-0.026854,-0.011274,-0.002760
3,0.078219,-0.147424,0.103767,-0.035021,-0.021797,-0.005793,-0.002040
4,0.255032,-0.141242,0.163615,-0.022179,-0.033017,-0.019619,-0.004486
...,...,...,...,...,...,...,...
237832,0.099976,0.110562,-0.267158,-0.050215,0.009927,-0.111476,-0.000390
237833,-0.365158,-0.170391,0.047480,-0.053324,-0.000513,0.014407,-0.001684
237834,0.506957,-0.052078,0.089091,0.004148,-0.038740,-0.012413,0.009181
237835,-0.215881,-0.150864,0.077494,-0.034579,-0.008568,0.004986,0.002784


#### Apply DBSCAN Algorithm

In [15]:
# DBSCAN is imported with the other dependencies at the top of the notebook.
# eps and min_samples are scikit-learn's default values, written out so they are
# visible and easy to tune.
# NOTE(review): eps=0.5 looks large relative to the PCA coordinates displayed above
# (the previewed values lie roughly within [-0.5, 0.5]) — confirm, e.g. with a k-distance plot.
dbscan = DBSCAN(eps=0.5, min_samples=5)

In [None]:
# Cluster the PCA-reduced, MinMax-scaled data with the DBSCAN estimator configured above.
# NOTE(review): this cell has no execution count, so it may never have finished running — confirm.
model = dbscan.fit(nutrients_min_max_scaled_pca)

In [None]:
# Cluster label assigned to each row by the fitted model.
labels = model.labels_
# List the distinct labels to see which clusters were produced.
unique_labels = np.unique(labels)
print(unique_labels)