In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy.cluster.hierarchy as hac
from scipy.cluster.hierarchy import fcluster

import sys  
sys.path.insert(0, '../../common/Cluster')
from plot_pie import plot_pie

from matplotlib.ticker import FixedLocator, FixedFormatter

from backup_outlier_detection_post_cluster import retirve_list_over_outliers

## **K-means, scoring when comparing with and without outliers**

This notebook compares k-means with six clusters before and after outlier removal, using silhouette_score, davies_bouldin_score and calinski_harabasz_score. The motivation for doing this is to analyze how big the effect of removing outlier net station is.
<br>
<br>
Outliers are detected using the function retrive_list_over_outliers. Several metric to use to detect outliers in retrive_list_over_outliers was tested in outlier_removal_metric_comparison. This notebook can be found in common -> Results -> feature_clustering. It has also been tested to remove outliers with two and three layers of clustering, see notebook hierarchical_clustering_combinations_for_outlier_removal.

Load data

In [3]:
df = pd.read_parquet('../parquet_files/weekly_load_all_trafos.parquet')

### Defining k-means clusters

In [4]:
weekly_t=df.transpose()

In [5]:
km1 = KMeans(n_clusters=6)

y_km1 = km1.fit_predict(weekly_t)

In [6]:
c1=weekly_t.loc[y_km1 == 0].transpose()
c2=weekly_t.loc[y_km1 == 1].transpose()
c3=weekly_t.loc[y_km1 == 2].transpose()
c4=weekly_t.loc[y_km1 == 3].transpose()
c5=weekly_t.loc[y_km1 == 4].transpose()
c6=weekly_t.loc[y_km1 == 5].transpose()

### K-means without outlier removal

In [7]:
x_kmeans_raw = df.T.iloc[:,:].values

In [8]:
#Create list that matches net station id to which cluster (1-6) net station is placed in.
#Used when computing score.

label_kmeans_raw = []
for idx in df.T.index:
    if idx in list(c1.columns):
        label_kmeans_raw.append(1)
    elif idx in list(c2.columns):
        label_kmeans_raw.append(2)
    elif idx in list(c3.columns):
        label_kmeans_raw.append(3)
    elif idx in list(c4.columns):
        label_kmeans_raw.append(4)
    elif idx in list(c5.columns):
        label_kmeans_raw.append(5)
    elif idx in list(c6.columns):
        label_kmeans_raw.append(6)
    else:
        print("ERROR: ", idx)

### K-means with outlier removal

In [9]:
outliers = (retirve_list_over_outliers(c1) + retirve_list_over_outliers(c2) +  
            retirve_list_over_outliers(c3) + retirve_list_over_outliers(c4) +
            retirve_list_over_outliers(c5) + retirve_list_over_outliers(c6))

In [10]:
#Remove net stations that have been detected as outliers
df_outliers_removed = df.drop(outliers, axis=1, )

In [11]:
x_kmeans_outliers_removed = df_outliers_removed.T.iloc[:,:].values

In [12]:
#Create list that matches net station id to which cluster (1-6) net station is placed in. 
#Used when computing score.

label_kmeans_outliers_removed = []
for idx in df_outliers_removed.T.index:
    if idx in list(c1.columns):
        label_kmeans_outliers_removed.append(1)
    elif idx in list(c2.columns):
        label_kmeans_outliers_removed.append(2)
    elif idx in list(c3.columns):
        label_kmeans_outliers_removed.append(3)
    elif idx in list(c4.columns):
        label_kmeans_outliers_removed.append(4)
    elif idx in list(c5.columns):
        label_kmeans_outliers_removed.append(5)
    elif idx in list(c6.columns):
        label_kmeans_outliers_removed.append(6)
    else:
        print("ERROR: ", idx)

## **Evaluation, scoring**

See common -> Results -> data_driven_clustering -> datadriven_clustering_results for a detailed description of each scoring metric. Included here is a short summary to help understand the results. 
The text description used here does not comply with copyright rules, and must be changed/removed if to be published. 

### **silhouette_score**

The Silhouette score is a measure of how well samples are clustered with samples that are similar to themselves. Clustering models with a high Silhouette score are said to be dense, where samples in the same cluster are similar to each other, and well separated, where samples in different clusters are not very similar to each other. 

In [13]:
from sklearn.metrics import silhouette_score

In [14]:
silhouette_score_with_outliers = silhouette_score(x_kmeans_raw, labels=label_kmeans_raw, metric='euclidean')

In [15]:
silhouette_score_without_outliers = silhouette_score(x_kmeans_outliers_removed, labels=label_kmeans_outliers_removed, metric='euclidean')

In [16]:
print("K-means with outliers, silhouette_score: ", silhouette_score_with_outliers)
print("K-means without outliers, silhouette_score: ", silhouette_score_without_outliers)

K-means with outliers, silhouette_score:  0.1946145329361984
K-means without outliers, silhouette_score:  0.19850465075563548


### **davies_bouldin_score**

This index signifies the average ‘similarity’ between clusters, where the similarity is a measure that compares the distance between clusters with the size of the clusters themselves. Zero is the lowest possible score. Values closer to zero indicate a better partition.

In [17]:
from sklearn.metrics import davies_bouldin_score

In [18]:
davies_bouldin_score_with_outliers = davies_bouldin_score(x_kmeans_raw, labels=label_kmeans_raw)

In [19]:
davies_bouldin_score_without_outliers = davies_bouldin_score(x_kmeans_outliers_removed, labels=label_kmeans_outliers_removed)

In [20]:
print("K-means with outliers, davies_bouldin_score: ", davies_bouldin_score_with_outliers)
print("K-means without outliers, davies_bouldin_score: ", davies_bouldin_score_without_outliers)

K-means with outliers, davies_bouldin_score:  1.3820489730147425
K-means without outliers, davies_bouldin_score:  1.363087260915682


### **calinski_harabasz_score**

A higher Calinski-Harabasz score relates to a model with better defined clusters. The index is the ratio of the sum of between-clusters variation and of inter-cluster variation for all cluster.

In [21]:
from sklearn.metrics import calinski_harabasz_score

In [22]:
calinski_harabasz_score_with_outliers = calinski_harabasz_score(x_kmeans_raw, labels=label_kmeans_raw)

In [23]:
calinski_harabasz_score_without_outliers = calinski_harabasz_score(x_kmeans_outliers_removed, labels=label_kmeans_outliers_removed)

In [24]:
print("K-means with outliers, calinski_harabasz_score: ", calinski_harabasz_score_with_outliers)
print("K-means without outliers, calinski_harabasz_score: ", calinski_harabasz_score_without_outliers)

K-means with outliers, calinski_harabasz_score:  1722.156857190652
K-means without outliers, calinski_harabasz_score:  1787.9731518976866
