In [1]:
### Centroid-based clustering

## Euclidean distance

# The Euclidean distance is the length of the straight line between two points.
# It is the square root of the sum of the squared coordinate differences:
#
#   d = √((x1 - x2)^2 + (y1 - y2)^2)
#
# Worked example for the points (0, 1) and (2, 0):
#
#   d = √((0 - 2)^2 + (1 - 0)^2)
#     = √(4 + 1)
#     = √5

# the same calculation with numpy:
import numpy as np
a = np.array([0, 1])
b = np.array([2, 0])
squared_diffs = (a - b) ** 2  # element-wise squared differences: [4, 1]
print(np.sqrt(squared_diffs.sum())) # 2.23606797749979
print(np.sqrt(5)) # 2.23606797749979


2.23606797749979
2.23606797749979


In [2]:
### clustering wines

# load the wine dataset that ships with scikit-learn as 'data'
from sklearn.datasets import load_wine
data = load_wine()

# wrap the feature matrix in a pandas DataFrame called 'wine', one named column per feature
import pandas as pd
wine = pd.DataFrame(data.data, columns=data.feature_names)

# check shape and columns
print(wine.shape)
print(wine.columns)
print()
# another way to check the column names, plus the non-null count and dtype of each column.
# DataFrame.info() writes its report straight to stdout and returns None, so it is called
# on its own: wrapping it in print() would append a stray "None" line to the output.
wine.info()
print()

(178, 13)
Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins              

In [5]:
# summary statistics for the first 3 features
first_three = wine.iloc[:, :3]  # positional slice: all rows, columns 0 to 2
print(first_three.describe())
print()
# the same selection can be made by label with the .loc indexer instead of .iloc:
# print(wine.loc[:,'alcohol':'ash'].describe())

# plotting the data (scatter matrix, not shown)

          alcohol  malic_acid         ash
count  178.000000  178.000000  178.000000
mean    13.000618    2.336348    2.366517
std      0.811827    1.117146    0.274344
min     11.030000    0.740000    1.360000
25%     12.362500    1.602500    2.210000
50%     13.050000    1.865000    2.360000
75%     13.677500    3.082500    2.557500
max     14.830000    5.800000    3.230000



In [6]:
## Pre-processing: Standardization

# "centroid-based algorithms require one pre-processing step because k-means works better on data where each attribute is of similar scales. One way to achieve this is to standardize the data"

# Z = (X - mean) / std

# X is the raw data, mean is the per-feature average of X, std is the per-feature standard deviation of X
# Z is the scaled data: each feature is centered around 0 with a std of 1

# these 2 features were picked based on the scatter matrix
two_features = ['alcohol', 'total_phenols']
X = wine[two_features]

# import the scaler
from sklearn.preprocessing import StandardScaler

# instantiate the scaler and let it compute the mean and std it will use later for scaling
# (fit() returns the fitted scaler itself, so the two steps can be chained)
scale = StandardScaler().fit(X)

# check the learned statistics, one value per feature in the order of 'two_features':
#   first line  -> scale.mean_  : ['alcohol' mean, 'total_phenols' mean]
#   second line -> scale.scale_ : ['alcohol' std,  'total_phenols' std]
print(scale.mean_, scale.scale_, sep='\n')
print()

# transform the raw training data to scaled data
X_scaled = scale.transform(X)

# sanity check: the scaled data should be centered at 0 with a std of 1
# (axis=0 computes the statistic for 'alcohol' and 'total_phenols' separately;
#  the means come out as tiny floating-point residues rather than exact zeros)
print(X_scaled.mean(axis=0), X_scaled.std(axis=0), sep='\n')
print()

# fit and transform can also be combined into a single step with fit_transform()
X_scaled = scale.fit_transform(X)
print(X_scaled.mean(axis=0), X_scaled.std(axis=0), sep='\n')
print()

[13.00061798  2.29511236]
[0.80954291 0.62409056]

[ 7.84141790e-15 -1.95536471e-16]
[1. 1.]

[ 7.84141790e-15 -1.95536471e-16]
[1. 1.]



In [7]:
## K-means modelling: import -> instantiate -> fit -> predict

# import model
from sklearn.cluster import KMeans

# instantiate model
# - n_clusters: the number of clusters (k) to look for
# - random_state: k-means starts from randomly seeded centroids, so without a fixed seed the
#   cluster numbering (and occasionally the clusters themselves) can change between runs;
#   fixing it makes the printed labels, counts and centroids reproducible
# - n_init: number of differently seeded restarts, the best one is kept; set explicitly
#   rather than relying on the library default
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# "in K-means, random initial guess for the centroids can result in bad clustering. k-means++ algorithm addresses this problem by specifying a procedure to initialize the centroids before proceeding with the standard k-means algorithm. In scikit-learn, the initialization mechanism is set to k-means++, by default"

# fit
kmeans.fit(X_scaled)

# predict
y_pred = kmeans.predict(X_scaled)
print(y_pred) # 1d array with the cluster index (0, 1 or 2) assigned to each wine
print()

# check how many wines are in each group
# (the cluster indices are arbitrary labels: only the grouping itself is meaningful)
print(np.unique(y_pred, return_counts=True))
print()

# check coordinates of the 3 centroids (in standardized feature space, one row per cluster)
print(kmeans.cluster_centers_)
print()

# (better to visualize results with scatter plot)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 0 1 1 1 2 1 2 1 0 2 0 2
 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 2 1 1 1 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2]

(array([0, 1, 2], dtype=int32), array([54, 59, 65]))

[[ 0.05253603 -1.14020926]
 [-1.06183503  0.08414606]
 [ 0.92017418  0.87087204]]



In [8]:
## classifying a new wine

# NOTE: this cell relies on 'scale' and 'kmeans' being the objects fit on the two features
# 'alcohol' and 'total_phenols'. The later all-features cell rebinds both names, so re-run
# the standardization and k-means cells first if that cell has been executed.

# wine with 13 alcohol and 2.5 total phenols
X_new = np.array([[13, 2.5]])

# standardize the new wine data with the scaler that was already fit on the model data.
# The scaler was fit on a DataFrame, so it remembers the column names; handing it a bare
# array makes recent scikit-learn versions warn about missing feature names. Labelling the
# columns the same way as the training data avoids that without changing the numbers.
X_new_named = pd.DataFrame(X_new, columns=['alcohol', 'total_phenols'])
X_new_scaled = scale.transform(X_new_named)
print(X_new_scaled) # [[scaled 'alcohol', scaled 'total_phenols']]

# predict the cluster
print(kmeans.predict(X_new_scaled)) # index of the cluster whose centroid is nearest
print()

[[-0.00076337  0.32829793]]
[2]



In [10]:
## finding optimal k: the elbow method

# We can divide our dataset of n data points into any number of clusters (k) from 1 (all data points in one big cluster) to n (each data point is its own cluster).
# So how do we know how many clusters to choose?

# k-means partitions n data points into k tight sets such that the data points are closer to each other than to the data points in the other clusters.
# The tightness can be measured as the sum of squares of the distance from each data point to its nearest centroid, or inertia.

# inertia = sum((x - nearest centroid)^2)

# NOTE: 'X_scaled' is whatever was standardized last. The all-features cell further down
# rebinds it, so run this cell in notebook order if the two-feature inertia is wanted.

# inertia for k = 2, then for k = 3
# (a fixed random_state makes the reported inertia reproducible; n_init is the number of
#  restarts and is set explicitly rather than left to the library default.
#  After the loop 'kmeans' is the k = 3 model, as before.)
for k in (2, 3):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    print(kmeans.inertia_)
print()

# plot inertia for different values of k and find elbow to identify optimal k (not shown)


1659.0079672511501
1277.928488844642



In [11]:
## Modelling with all 13 features

# NOTE: this cell deliberately reuses the names X, scale, X_scaled, kmeans and y_pred, so
# after it runs they refer to the 13-feature versions. Re-run the earlier two-feature cells
# before using those names for the two-feature model again.

# feature selection: all of them!
X = wine

# standardize the features (fit and transform in one step)
scale = StandardScaler()
X_scaled = scale.fit_transform(X)

# plot inertia for different values of k and find elbow to identify optimal k (not shown)

# finalize model and obtain predictions
k_opt = 3 # the elbow plot showed us that 3 is still the optimal number of clusters
# random_state fixes the random centroid seeding so the labels below are reproducible;
# n_init (number of restarts) is set explicitly rather than left to the library default
kmeans = KMeans(n_clusters=k_opt, n_init=10, random_state=42)
kmeans.fit(X_scaled)
y_pred = kmeans.predict(X_scaled)
print(y_pred)
print()

# check how many wines are in each group
print(np.unique(y_pred, return_counts=True))

# compared to the predictions using only 2 features, the models produce very similar results. Only 13 out of 178 wines were clustered differently by the two models.

# which model is better? Clustering is an unsupervised learning method, which means we don't know the ground truth of the labels.
# We can't really tell which is better without access to external information. Features are often chosen by collaboration between data scientists and domain knowledge experts.

# code and comments by github.com/alandavidgrunberg


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]

(array([0, 1, 2], dtype=int32), array([66, 61, 51]))
