Imports

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 14 09:54:28 2021

Clustering exercise using fish measurements

@author: napi
"""

import pandas as pd
import numpy as np

import sklearn.cluster as cluster
import sklearn.metrics as skmet

import matplotlib.pyplot as plt
import cluster_tools as ct

%matplotlib inline

import map

ModuleNotFoundError: No module named 'map'

Inspecting the module.

In [None]:
# print the built-in help text of the cluster_tools module (imported as ct)
help(ct)

Read in and inspect

In [None]:
# Load the measurements into a dataframe. skiprows=(1, 2) drops the lines
# with 0-based index 1 and 2 of the file before parsing.
df_fish = pd.read_csv("fish_measurements.csv", skiprows=(1, 2))

# Summary statistics, followed by one empty line as separator
print(df_fish.describe(), end="\n\n")


Display heatmap and scatter plot.

The `coolwarm` colour map is used by the mapper. This and a few other colour maps are shown below.
![Colour maps](div_colormaps.png)

Combinations of columns shown in light blue or light red (i.e. with a low correlation) are good candidates for clustering.

In [None]:
# Correlation heatmap produced by the cluster_tools helper.
# NOTE(review): the second argument (9) is presumably the figure size -- confirm
# against the cluster_tools documentation.
ct.map_corr(df_fish, 9)

# Matrix of pairwise scatter plots for all columns of the dataframe
pd.plotting.scatter_matrix(frame=df_fish, figsize=(9.0, 9.0))
# tight_layout helps to keep the axis labels from overlapping
plt.tight_layout()
plt.show()

Total length vs. height has a low-ish correlation. The scatter plot confirms that this is a good choice, so that combination is picked.

Setting up and executing k-means clustering. A loop iterates over the number of clusters and calculates the silhouette score for each.

In [None]:
# extract the two columns used for fitting.
# .copy() prevents changes in df_fit from affecting df_fish, so the original
# measurements stay available for plotting on the original scale.
df_fit = df_fish[["total length", "height"]].copy()

# normalise the extracted columns and inspect the result.
# df_min and df_max are kept because ct.backscale needs them to convert
# cluster centres back to the original scale.
df_fit, df_min, df_max = ct.scaler(df_fit)
print(df_fit.describe())
print()

print("n   score")
# loop over trial numbers of clusters calculating the silhouette score
for ic in range(2, 7):
    # set up kmeans and fit.
    # random_state fixes the random initialisation of the cluster centres so
    # that the scores are reproducible from run to run. n_init is given
    # explicitly because its default differs between scikit-learn versions.
    kmeans = cluster.KMeans(n_clusters=ic, n_init=10, random_state=42)
    kmeans.fit(df_fit)

    # extract labels and calculate silhouette score
    labels = kmeans.labels_
    print(ic, skmet.silhouette_score(df_fit, labels))
    
    

Good results for 4 and 5 clusters. Plot both, using the cluster labels to colour the symbols in the scatter plot.

In [None]:
# Plot the solutions for four and five clusters.
# Both figures are produced by the same loop body, so they cannot drift apart
# the way two copy-pasted sections can.
for nc in (4, 5):    # nc = number of cluster centres
    # random_state fixes the random initialisation of the cluster centres so
    # that the figures are reproducible from run to run. n_init is given
    # explicitly because its default differs between scikit-learn versions.
    kmeans = cluster.KMeans(n_clusters=nc, n_init=10, random_state=42)
    kmeans.fit(df_fit)

    # extract labels and cluster centres
    labels = kmeans.labels_
    cen = kmeans.cluster_centers_

    plt.figure(figsize=(6.0, 6.0))
    # scatter plot with colours selected using the cluster numbers.
    # The qualitative colour map tab10 is used to get a clear contrast
    # between the colours of different clusters.
    plt.scatter(df_fit["total length"], df_fit["height"], c=labels, cmap="tab10")

    # show cluster centres; the columns of cen follow the column order of
    # df_fit (total length, height)
    xc = cen[:, 0]
    yc = cen[:, 1]
    plt.scatter(xc, yc, c="k", marker="d", s=80)
    # c = colour, s = size

    plt.xlabel("total length")
    plt.ylabel("height")
    plt.title("{} clusters".format(nc))
    plt.show()



Using 5 clusters splits the bottom right cluster. The 4-cluster solution looks more plausible. The distinction between clusters 4 and 5 is much smaller than between all the other cluster combinations.

Note that k-means initialises the cluster centres randomly, so results can differ between runs unless a random seed is fixed. Consider rerunning (or changing the seed) if the result does not look satisfactory.

#### Now plot it on the original scale

In [None]:
nc = 4    # number of cluster centres

# The fit is done on the normalised data in df_fit.
# random_state fixes the random initialisation of the cluster centres so
# that the clustering (and therefore the colouring) is reproducible from run
# to run. n_init is given explicitly because its default differs between
# scikit-learn versions.
kmeans = cluster.KMeans(n_clusters=nc, n_init=10, random_state=42)
kmeans.fit(df_fit)

# extract labels and cluster centres
labels = kmeans.labels_
cen = kmeans.cluster_centers_

plt.figure(figsize=(6.0, 6.0))
# scatter plot with colours selected using the cluster numbers,
# now using the original (not normalised) dataframe.
# The qualitative colour map tab10 is used to get a clear contrast between
# the colours of different clusters.
plt.scatter(df_fish["total length"], df_fish["height"], c=labels, cmap="tab10")

# The cluster centres were fitted on the normalised data: scale them back
# with the df_min and df_max returned by ct.scaler before plotting them.
scen = ct.backscale(cen, df_min, df_max)
xc = scen[:, 0]
yc = scen[:, 1]
plt.scatter(xc, yc, c="k", marker="d", s=80)
# c = colour, s = size

plt.xlabel("total length")
plt.ylabel("height")
plt.title("{} clusters".format(nc))
plt.show()

