In [3]:
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import sklearn.datasets as datasets
from matplotlib.colors import rgb2hex
from matplotlib.cm import get_cmap

In [5]:
#Function for plotting
from mpl_toolkits.mplot3d import Axes3D

def scatterPlot(x, y, points_labels, algoName, size = (4,3), x_label ='1. główna składowa',
                y_label = '2. główna składowa', show_legend =True):
    """Draw a 2-D scatter plot with one colour per distinct label.

    Parameters
    ----------
    x, y : array-like
        Point coordinates. Converted with ``np.asarray`` and indexed by
        position, so lists and arrays are handled the same way.
    points_labels : array-like
        Label of every point; each distinct value gets its own colour and
        legend entry.
    algoName : str
        Text used as the axes title.
    size : tuple
        Figure size forwarded to ``plt.figure(figsize=...)``.
    x_label, y_label : str
        Axis labels.
    show_legend : bool
        If true, a legend is added with ``loc='best'``; if false, no legend
        is drawn at all.

    Returns
    -------
    The axes object the points were drawn on.
    """
    # Positional, element-wise indexing below needs real arrays; a plain list
    # would make ``points_labels == g`` a single scalar comparison.
    x = np.asarray(x)
    y = np.asarray(y)
    points_labels = np.asarray(points_labels)

    fig = plt.figure(figsize = size)
    ax = fig.add_subplot(111)

    groups = np.unique(points_labels)
    n = len(groups)
    if n <= 1:
        # A single group (or no data) is drawn in black.
        colors = [rgb2hex((0, 0, 0))]
    else:
        # Exactly n evenly spaced positions in [0, 1] on the 'tab20' colormap.
        # linspace avoids the float-step pitfalls of arange.
        colormap = plt.get_cmap('tab20')
        colors = [rgb2hex(colormap(pos)) for pos in np.linspace(0, 1, n)]

    for color, g in zip(colors, groups):
        ix = np.where(points_labels == g)[0]
        ax.scatter(x[ix], y[ix], c = color, label = g, s = 5)

    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(algoName)

    # Shrink current axis by 20%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    # Only draw the legend when requested, and only when there is something
    # to list (an empty legend just triggers a Matplotlib warning).
    if show_legend and n > 0:
        ax.legend(loc='best')

    return ax

In [6]:
def plotDendrogram(model, **kwargs):
    """Build a SciPy linkage matrix from a fitted model and plot its dendrogram.

    Parameters
    ----------
    model : object
        Must expose ``children_`` (one row of two child indices per merge),
        ``distances_`` (one value per merge) and ``labels_`` (one entry per
        sample). NOTE(review): this looks like a fitted scikit-learn
        ``AgglomerativeClustering`` -- confirm against the caller.
    **kwargs
        Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.

    Raises
    ------
    AttributeError
        If ``model`` has no ``distances_`` attribute.
    """
    # Imported here so the function works even if the notebook never imported
    # ``dendrogram`` in an earlier cell.
    from scipy.cluster.hierarchy import dendrogram

    if not hasattr(model, "distances_"):
        raise AttributeError(
            "model has no 'distances_' attribute; merge distances are required "
            "to build the linkage matrix"
        )

    # Create the counts of samples under each node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # Indices >= n_samples refer to an earlier merge, whose size
                # has already been stored in ``counts``.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # Columns: child a, child b, merge distance, number of samples in the node.
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [7]:
def analyzeClusters(labels, real_labels):
    """Summarise clusters: their size and the percentage of each real label inside.

    Parameters
    ----------
    labels : array-like
        Cluster label assigned to every object.
    real_labels : array-like
        Reference label of every object, same length as ``labels``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct cluster (in ``np.unique`` order) with columns
        ``'klaster'`` (cluster label), ``'liczba obiektów'`` (cluster size)
        and one column per distinct real label holding the percentage of the
        cluster's objects carrying that label (0 when absent). Numeric values
        are rounded to one decimal place.

    Raises
    ------
    ValueError
        If ``labels`` and ``real_labels`` differ in length.
    """
    # Arrays are required for element-wise handling; plain lists are accepted.
    labels_arr = np.asarray(labels)
    real_arr = np.asarray(real_labels)
    if len(labels_arr) != len(real_arr):
        raise ValueError("labels and real_labels must have the same length")

    clusters, counts = np.unique(labels_arr, return_counts = True)
    # Built column-wise so the counts keep an integer dtype even when the
    # cluster labels are strings.
    clusters_table = pd.DataFrame({'klaster': clusters, 'liczba obiektów': counts})

    if len(labels_arr) == 0:
        return clusters_table.round(1)

    # Contingency table: rows are clusters, columns are real labels, cells are
    # object counts (0 where a label does not occur in a cluster).
    membership = pd.crosstab(labels_arr, real_arr)
    # Row-wise percentages, computed as 100 * count / cluster size.
    real_lab_perc = membership.mul(100).div(membership.sum(axis=1), axis=0)
    # Drop crosstab's axis names and align on a positional index so the two
    # tables can be placed side by side.
    real_lab_perc = real_lab_perc.rename_axis(index=None, columns=None)
    real_lab_perc = real_lab_perc.reset_index(drop=True)

    df = pd.concat([clusters_table, real_lab_perc], axis=1)
    return df.round(1)

In [8]:
def cutSmallest(labels, cut_num):
    """Relabel every cluster with at most ``cut_num`` members as ``-1``.

    Parameters
    ----------
    labels : sequence
        Cluster label of every object (e.g. a list or a NumPy array).
    cut_num : int
        Clusters whose member count is ``<= cut_num`` are relabelled.

    Returns
    -------
    A copy of ``labels`` (same container type) with the members of the small
    clusters replaced by ``-1``. The input itself is left unmodified.
    """
    import copy

    labels_arr = np.asarray(labels)
    clusters, counts = np.unique(labels_arr, return_counts = True)
    small_clusters = clusters[counts <= cut_num]

    # Work on a copy: assigning into ``labels`` directly would overwrite the
    # caller's data.
    cutoff_labels = copy.copy(labels)
    for idx in np.flatnonzero(np.isin(labels_arr, small_clusters)):
        cutoff_labels[idx] = -1
    return cutoff_labels

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score