# Hierarchical clustering on food products

Goals:
    
Run agglomerative clustering on the centroids obtained from k-means on the scaled/PCA food products matrix, and save the resulting linkage matrix.

In [1]:
import numpy as np
import pandas as pd
import time

from sklearn.cluster import AgglomerativeClustering

In [2]:
# Locations of the input data, relative to the notebook's working directory.
data_p = r"../data/"

# CSV holding the k-means cluster centers that are clustered hierarchically below.
nutrient_matrix_centroids_p = "".join([data_p, "10000_cluster_centers.csv"])

In [3]:
# Read the centroid table; the first CSV column is used as the row index.
nutrient_matrix_centroids = pd.read_csv(
    nutrient_matrix_centroids_p,
    index_col=0,
)

# Report (rows, columns), then preview the first five rows.
print(nutrient_matrix_centroids.shape)
nutrient_matrix_centroids.head(n=5)

(10000, 65)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,-0.00769,-1.54465,0.107352,0.050278,0.056822,0.026857,0.230026,-0.002547,0.024334,0.046679,...,-0.016528,0.028247,-0.004425,-0.016682,0.007527,-0.014596,0.001947,-0.016372,-0.045266,0.029749
1,2781.483444,-2.25228,0.208276,-30.908771,-2.048539,-0.009266,0.02452,0.035365,0.470243,-0.048265,...,-0.036156,-0.098135,0.052281,0.10386,-0.133967,0.077619,-0.018594,0.21501,0.305988,0.150995
2,-0.011709,28.407393,-2.572315,-49.383371,711.784932,-1.048449,-9.489603,0.080867,3.083876,-0.543342,...,-0.511399,0.759729,-0.054034,-0.291077,-0.297037,-0.763396,0.22142,0.085775,0.620891,-0.067246
3,6.957604,8.825611,-0.857121,150.304901,10.068917,0.065692,-0.232524,-0.281654,-5.210221,2.005412,...,-0.091213,-1.760647,0.824565,1.182382,-1.781599,1.140641,-0.297284,2.107869,2.977343,1.503632
4,-0.009737,63.17701,892.732755,1.345903,0.652844,-325.665472,27.67172,0.290043,-0.126729,2.239691,...,-0.070477,2.081161,-1.119267,-1.016106,-2.356392,-4.189894,0.720082,0.42163,0.001571,0.395123


### Run hierarchical clustering and save the linkage matrix

In [4]:
# Output path of the .npy file that the clustering result is saved to at the end of the notebook.
fname = data_p + "hcluster_nutrients_10000centroids_kmeans.npy"

In [5]:
# Fit Ward-linkage agglomerative clustering on the k-means centroids and time the fit.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; unlike time.time() it is unaffected by system clock adjustments.
program_starts = time.perf_counter()
# compute_full_tree=True asks for the complete merge hierarchy instead of stopping
# early at n_clusters, so children_ describes every merge.
agglo_cluster = AgglomerativeClustering(linkage='ward', compute_full_tree=True).fit(nutrient_matrix_centroids)
now = time.perf_counter()
# Elapsed wall time of the fit, in seconds.
print("It has been {0} seconds since the loop started".format(now - program_starts))

It has been 5.88123083114624 seconds since the loop started


In [6]:
# children_ holds one row per merge: the pair of node ids that were joined.
# NOTE(review): this is a 2-column merge table, not a SciPy-style 4-column
# linkage matrix (it carries no merge distances or cluster sizes) — confirm
# that downstream consumers expect this format.
linkage_matrix = agglo_cluster.children_
linkage_matrix

array([[ 4355,  7799],
       [ 1138,  6078],
       [ 2052,  8967],
       ...,
       [   10, 19995],
       [19974, 19996],
       [    1, 19997]])

In [7]:
# Sanity check: expect one row per merge, i.e. (number of centroids - 1, 2).
linkage_matrix.shape

(9999, 2)

In [8]:
# Persist the merge table in NumPy's binary .npy format; the context manager
# guarantees the file handle is closed even if the write fails.
with open(fname, mode='wb') as out_file:
    np.save(out_file, linkage_matrix, fix_imports=False)