In [15]:
## K-Means Clustering
# Import Numpy, Pandas libraries and KMeans, Scaler and Imputer objects from Sklearn
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [3]:
# Read the Dataframe using read_csv and save it as Pandas dataframe
cereals = pd.read_csv('input/Cereals.csv')
cereals.shape

(77, 14)

In [4]:
# Understand the dimensions and observe the data using head and tail functions
cereals.head(10)

Unnamed: 0,name,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100%_Bran,70,4,1,130,10.0,5.0,6.0,280.0,25,3,1.0,0.33,68.402973
1,100%_Natural_Bran,120,3,5,15,2.0,8.0,8.0,135.0,0,3,1.0,1.0,33.983679
2,All-Bran,70,4,1,260,9.0,7.0,5.0,320.0,25,3,1.0,0.33,59.425505
3,All-Bran_with_Extra_Fiber,50,4,0,140,14.0,8.0,0.0,330.0,25,3,1.0,0.5,93.704912
4,Almond_Delight,110,2,2,200,1.0,14.0,8.0,,25,3,1.0,0.75,34.384843
5,Apple_Cinnamon_Cheerios,110,2,2,180,1.5,10.5,10.0,70.0,25,1,1.0,0.75,29.509541
6,Apple_Jacks,110,2,0,125,1.0,11.0,14.0,30.0,25,2,1.0,1.0,33.174094
7,Basic_4,130,3,2,210,2.0,18.0,8.0,100.0,25,3,1.33,0.75,37.038562
8,Bran_Chex,90,2,1,200,4.0,15.0,6.0,125.0,25,1,1.0,0.67,49.120253
9,Bran_Flakes,90,3,0,210,5.0,13.0,5.0,190.0,25,3,1.0,0.67,53.313813


In [5]:
cereals.describe()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
count,77.0,77.0,77.0,77.0,77.0,76.0,76.0,75.0,77.0,77.0,77.0,77.0,77.0
mean,106.883117,2.545455,1.012987,159.675325,2.151948,14.802632,7.026316,98.666667,28.246753,2.207792,1.02961,0.821039,42.665705
std,19.484119,1.09479,1.006473,83.832295,2.383364,3.907326,4.378656,70.410636,22.342523,0.832524,0.150477,0.232716,14.047289
min,50.0,1.0,0.0,0.0,0.0,5.0,0.0,15.0,0.0,1.0,0.5,0.25,18.042851
25%,100.0,2.0,0.0,130.0,1.0,12.0,3.0,42.5,25.0,1.0,1.0,0.67,33.174094
50%,110.0,3.0,1.0,180.0,2.0,14.5,7.0,90.0,25.0,2.0,1.0,0.75,40.400208
75%,110.0,3.0,2.0,210.0,3.0,17.0,11.0,120.0,25.0,3.0,1.0,1.0,50.828392
max,160.0,6.0,5.0,320.0,14.0,23.0,15.0,330.0,100.0,3.0,1.5,1.5,93.704912


In [6]:
cereals.describe(include='object')

Unnamed: 0,name
count,77
unique,77
top,Triples
freq,1


In [7]:
cereals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      77 non-null     object 
 1   calories  77 non-null     int64  
 2   protein   77 non-null     int64  
 3   fat       77 non-null     int64  
 4   sodium    77 non-null     int64  
 5   fiber     77 non-null     float64
 6   carbo     76 non-null     float64
 7   sugars    76 non-null     float64
 8   potass    75 non-null     float64
 9   vitamins  77 non-null     int64  
 10  shelf     77 non-null     int64  
 11  weight    77 non-null     float64
 12  cups      77 non-null     float64
 13  rating    77 non-null     float64
dtypes: float64(7), int64(6), object(1)
memory usage: 8.5+ KB


In [8]:
# Aggregate the columns ‘name’ shelf’ and ‘rating’ into one single column and name is as ‘label’.
# 100%_Bran (3 - 68.4)
cereals['label'] = cereals['name'] + ' (' + cereals['shelf'].astype(str) + ' - ' + round(cereals['rating'], 2).astype(str) +')'
cereals.drop(['name', 'shelf', 'rating'], axis=1, inplace=True)
cereals.head(1)

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,weight,cups,label
0,70,4,1,130,10.0,5.0,6.0,280.0,25,1.0,0.33,100%_Bran (3 - 68.4)


In [9]:
# To ensure the newly created label field is unique across along the dataframe, use the value_counts method. (this is one way, there can be a better way).
print('unique values: ',cereals['label'].nunique())
cereals['label'].value_counts

unique values:  77


<bound method IndexOpsMixin.value_counts of 0                     100%_Bran (3 - 68.4)
1            100%_Natural_Bran (3 - 33.98)
2                     All-Bran (3 - 59.43)
3     All-Bran_with_Extra_Fiber (3 - 93.7)
4               Almond_Delight (3 - 34.38)
                      ...                 
72                     Triples (3 - 39.11)
73                        Trix (2 - 27.75)
74                  Wheat_Chex (1 - 49.79)
75                    Wheaties (1 - 51.59)
76         Wheaties_Honey_Gold (1 - 36.19)
Name: label, Length: 77, dtype: object>

In [10]:
# To continue with clustering, remove the ‘label’ column and save it separately
cereals_label = cereals['label']
cereals.drop(['label'], axis=1, inplace=True)
cereals.head(1)

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,weight,cups
0,70,4,1,130,10.0,5.0,6.0,280.0,25,1.0,0.33


In [11]:
# Check the null values in each of the columns
cereals.isnull().sum()

calories    0
protein     0
fat         0
sodium      0
fiber       0
carbo       1
sugars      1
potass      2
vitamins    0
weight      0
cups        0
dtype: int64

In [12]:
# To ‘Impute’ the null values, initialize the ‘Imputer’ object with ‘mean’ as the imputation
# strategy. Then, use the Imputer object to impute the null values on the data frame
# from sklearn.impute import SimpleImputer # already imported in the import section
na_imputer = SimpleImputer(verbose=5)
na_imputer.fit(cereals)
cereals = pd.DataFrame(na_imputer.transform(cereals), columns=cereals.columns, index=cereals.index)
cereals.head()


Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,weight,cups
0,70.0,4.0,1.0,130.0,10.0,5.0,6.0,280.0,25.0,1.0,0.33
1,120.0,3.0,5.0,15.0,2.0,8.0,8.0,135.0,0.0,1.0,1.0
2,70.0,4.0,1.0,260.0,9.0,7.0,5.0,320.0,25.0,1.0,0.33
3,50.0,4.0,0.0,140.0,14.0,8.0,0.0,330.0,25.0,1.0,0.5
4,110.0,2.0,2.0,200.0,1.0,14.0,8.0,98.666667,25.0,1.0,0.75


In [13]:
# Check the null values again.
cereals.isna().sum()

calories    0
protein     0
fat         0
sodium      0
fiber       0
carbo       0
sugars      0
potass      0
vitamins    0
weight      0
cups        0
dtype: int64

In [14]:
# Standardize the data : Since clustering is based on distance between the elements, data is very sensitive to scale.
# from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(cereals)
scaled_data = pd.DataFrame(scaler.transform(cereals), columns=cereals.columns, index=cereals.index)
scaled_data.head()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,weight,cups
0,-1.905397,1.337319,-0.012988,-0.356306,3.314439,-2.542013,-0.237495,2.627053,-0.14627,-0.198067,-2.12387
1,0.677623,0.417912,3.987349,-1.737087,-0.064172,-1.764055,0.225316,0.526376,-1.27255,-0.198067,0.774053
2,-1.905397,1.337319,-0.012988,1.204578,2.892113,-2.023374,-0.468901,3.20655,-0.14627,-0.198067,-2.12387
3,-2.938605,1.337319,-1.013072,-0.236238,5.003745,-1.764055,-1.625929,3.351425,-0.14627,-0.198067,-1.388576
4,0.161019,-0.501495,0.987096,0.48417,-0.486498,-0.208138,0.225316,0.0,-0.14627,-0.198067,-0.307262


In [69]:
grid_search = GridSearchCV(estimator=KMeans(), param_grid={'n_clusters':list(range(2,10))}, verbose=10, n_jobs=-1)
# estimator = Algo
# param_grid = dict of parameter settings to try as values
# verbose = logger level
# n_jobs = number of processors to use in parallel; -1 to unleash all your processors
grid_search.fit(scaled_data)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0252s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  22 out of  40 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  27 out of  40 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  32 out of  40 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  37 out of  40 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.2s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KMeans(algorithm='auto', copy_x=True, init='k-means++',
                              max_iter=300, n_clusters=8, n_init=10,
                              n_jobs=None, precompute_distances='auto',
                              random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_clusters': [2, 3, 4, 5, 6, 7, 8, 9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

In [70]:
grid_search.best_estimator_

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [71]:
cereals['Category'] = grid_search.predict(scaled_data)
cereals

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,weight,cups,cluster_id,Category
0,70.0,4.0,1.0,130.0,10.0,5.0,6.0,280.000000,25.0,1.0,0.33,6,5
1,120.0,3.0,5.0,15.0,2.0,8.0,8.0,135.000000,0.0,1.0,1.00,7,2
2,70.0,4.0,1.0,260.0,9.0,7.0,5.0,320.000000,25.0,1.0,0.33,6,5
3,50.0,4.0,0.0,140.0,14.0,8.0,0.0,330.000000,25.0,1.0,0.50,6,5
4,110.0,2.0,2.0,200.0,1.0,14.0,8.0,98.666667,25.0,1.0,0.75,9,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,110.0,2.0,1.0,250.0,0.0,21.0,3.0,60.000000,25.0,1.0,0.75,1,1
73,110.0,1.0,1.0,140.0,0.0,13.0,12.0,25.000000,25.0,1.0,1.00,4,3
74,100.0,3.0,1.0,230.0,3.0,17.0,3.0,115.000000,25.0,1.0,0.67,10,0
75,100.0,3.0,1.0,200.0,3.0,17.0,3.0,110.000000,25.0,1.0,1.00,10,1


In [72]:
grid_search.best_estimator_.cluster_centers_

array([[-0.63110674,  0.17273709, -0.74638329, -0.79255263,  0.24553433,
         0.46609263, -0.85457683,  0.13875735, -0.44661146, -0.2738775 ,
        -0.22652371],
       [ 0.05769863,  0.23403089, -0.51303028,  1.14454414, -0.57096331,
         1.3737108 , -1.02427427, -0.66883617, -0.14627013, -0.19806746,
         1.04654403],
       [ 0.03186843,  0.87761585,  1.4871384 , -0.72401383,  0.15754967,
        -0.7493821 , -0.09210552,  0.30001045, -0.42784013, -0.19806746,
        -1.02633609],
       [ 0.18348048, -0.90123698,  0.03049382,  0.15006803, -0.63339417,
        -0.56893031,  0.93965501, -0.68384851, -0.14627013, -0.19806746,
         0.23433571],
       [ 0.88426508,  0.41791231, -0.01298811,  0.48417024,  1.20280744,
        -0.41559358,  1.24350063,  1.81575741,  0.52949787,  2.12974647,
        -0.29861135],
       [-2.24979938,  1.33731939, -0.34634956,  0.20401161,  3.73676565,
        -2.10981415, -0.77744163,  3.06167606, -0.14627013, -0.19806746,
        -1.878

In [73]:
grid_search.best_estimator_.labels_

array([5, 2, 5, 5, 3, 3, 3, 7, 0, 0, 3, 1, 3, 2, 3, 1, 1, 3, 3, 2, 0, 1,
       3, 0, 3, 3, 0, 4, 4, 3, 3, 3, 0, 0, 2, 3, 3, 3, 6, 6, 1, 2, 3, 0,
       7, 7, 7, 3, 3, 7, 0, 7, 4, 6, 8, 8, 2, 2, 4, 2, 0, 1, 1, 0, 0, 0,
       3, 1, 0, 6, 4, 6, 1, 3, 0, 1, 3], dtype=int32)

In [74]:
grid_search.best_estimator_.inertia_

257.4088031286972

In [None]:
wss= {}
for k in range(2, 21):
    kmeans_loop = KMeans(n_clusters=k,n_init=30,n_jobs=-1,random_state=1000,verbose=0).fit(scaled_cereals)
    clusters = kmeans_loop.labels_
    labels = kmeans_loop.predict(scaled_cereals)
    print('silhouette_score(scaled_cereals, labels):', silhouette_score(scaled_cereals, labels))
    wss[k] = kmeans_loop.inertia_