# Klasterinė analizė: Pagrindinės sąvokos 
# *(Cluster analysis: Basic concepts)*

Klasterinė analizė priskiriama prie mokymosi be mokytojo *("Unsupervised learning")* metodų, t. y. nėra iš anksto nustatytų klasių.

In [1]:
                                import os
                                import pandas                         as pd
                                import matplotlib.pyplot              as plt
                                import numpy                          as np
# from sklearn                    import cluster
# from sklearn                    import mixture
# from collections                import defaultdict
# from sklearn.metrics.cluster    import normalized_mutual_info_score
# from sklearn.metrics.cluster    import adjusted_rand_score

from sklearn.cluster            import DBSCAN
from sklearn.cluster            import Birch
from sklearn                    import metrics
                                import hdbscan
                                import itertools

In [2]:
# Comment this if the data visualisations doesn't work
%matplotlib inline
plt.style.use('seaborn-whitegrid')
plt.rcParams['grid.linestyle'] = ':'
plt.rcParams['grid.color'] = '#474545'
plt.rcParams['axes.edgecolor'] = '#474545'

In [3]:
import os

# The notebook name is set by hand
# (NOTE(review): presumably because the kernel does not provide __file__ — confirm).
__file__ = 'Clusterization.ipynb'
# Directory part of the notebook path; the relative name is resolved by
# os.path.realpath against the current working directory.
__path__ = os.path.dirname(os.path.realpath(__file__))

# Report the resolved location so the relative data paths used below can be checked.
print('File path: %s' % __path__)
print('File name: %s' %__file__)

File path: /workspaces/optimisation_shortest_path/notebooks
File name: Clusterization.ipynb


In [4]:
# Load the prepared data set and use its 'ID' column as the row index.
df = pd.read_csv('../output/mungy/data_nm.csv').set_index('ID')
# Notebook display: (rows, columns) of the loaded frame.
df.shape

(265, 357)

Methods for clustering:
    
* [HDBSCAN](http://hdbscan.readthedocs.io/en/latest/index.html)
* DBSCAN
* BIRCH

[Working with dictionaries](https://stackoverflow.com/a/8381589/7347438)

## Artumo matas *(Proximity measure)*


In [5]:
from sklearn.metrics.pairwise   import euclidean_distances
from sklearn.metrics.pairwise   import cosine_distances
from sklearn.metrics.pairwise   import manhattan_distances

In [6]:
# Expected number of clusters; in the cells visible here it is only printed below.
kClusters = 8
# Row identifiers of the data set, in frame order.
idx = list(df.index.values)
# Result container keyed by column name, seeded with the row identifiers.
results = {'Product': idx}

# Pairwise distance matrices between all rows of df, one per metric.
euclidean = euclidean_distances(df, df)
manhattan = manhattan_distances(df, df)
cosine = cosine_distances(df, df)

print('Expected clusters qty: %s' % kClusters)
print('Lenght of dataframe: %s' % len(idx))

Expected clusters qty: 8
Lenght of dataframe: 265


### Euclidean distances

In [7]:
# distance_matrix = pd.DataFrame(euclidean_distances(df, df), 
#                          index = labels, 
#                          columns = labeals)
# distance_matrix.to_csv('output/{}.csv'.format(euclidean_distances.__name__))

## Klasterizavimo metodai *(Clusterisation algorithms)*


### DBSCAN

In [8]:
def split_grid(parameter_grid):
    """Expand a parameter grid into every combination of its values.

    Input: {x: [y]}, Output: [{x: y}]

    Args:
        parameter_grid: Mapping of option name to an iterable of candidate
            values for that option.

    Returns:
        A list with one dict per element of the Cartesian product of the
        value iterables, in ``itertools.product`` order (the last key varies
        fastest). An empty grid yields ``[{}]``; a grid containing an empty
        value iterable yields ``[]``.
    """
    names = list(parameter_grid)
    # product() is consumed lazily; only the resulting dicts are materialised.
    return [dict(zip(names, combination))
            for combination in itertools.product(*parameter_grid.values())]
        
def list_of_toople_to_dic(values):
    """Build a dict from an iterable of (key, value) pairs.

    Input: [(x, y)], Output: {x: y}. When a key occurs more than once the
    last pair wins.
    """
    mapping = {}
    for name, setting in values:
        mapping[name] = setting
    return mapping

def add_metrics(model, values, dic):
    """Append clustering quality metrics for a fitted model to ``dic``.

    Args:
        model: Fitted clustering estimator exposing a ``labels_`` attribute.
            The label ``-1`` is treated as noise.
        values: The samples the model was fitted on; used to compute the
            silhouette score.
        dic: Mapping of column name to list. It must already contain the keys
            'Estimated number of clusters', 'Estimated number of noise points',
            'Silhouette Coefficient' and 'Minimum cluster size'.

    Returns:
        The same ``dic`` object, with one value appended to each of the four
        metric lists.
    """
    labels = np.asarray(model.labels_)
    # One pass yields both the distinct labels and their sizes.
    unique_labels, label_counts = np.unique(labels, return_counts=True)

    n_noise_ = int(np.count_nonzero(labels == -1))
    # Noise (-1) is a label but not a cluster.
    n_clusters_ = len(unique_labels) - (1 if n_noise_ else 0)

    # silhouette_score raises unless 2 <= number of labels <= n_samples - 1,
    # so a single label (all noise or one cluster) or one label per sample
    # gets the worst score, -1, instead of aborting the whole grid search.
    if 1 < len(unique_labels) < len(labels):
        silhouette = metrics.silhouette_score(values, labels)
    else:
        silhouette = -1

    dic['Estimated number of clusters'].append(n_clusters_)
    dic['Estimated number of noise points'].append(n_noise_)
    dic['Silhouette Coefficient'].append(silhouette)
    # Smallest group among all labels; the noise group (-1), if present, counts too.
    dic['Minimum cluster size'].append(int(label_counts.min()))

    return dic

def count_matching(condition, seq):
    """Return how many items of ``seq`` compare equal to ``condition``."""
    matches = 0
    for item in seq:
        if condition == item:
            matches += 1
    return matches

# Column-oriented accumulator for the grid-search runs: each key is a result
# column and each run appends exactly one value to every column's own list.
clust_results = {
    column: []
    for column in (
        'Clasterisation method',
        'Maximum distance between two samples = eps',
        'The number of samples = min_sample',
        'Metric for Distance between instances = metric',
        'Minimum cluster size',
        'Estimated number of clusters',
        'Estimated number of noise points',
        'Silhouette Coefficient',
    )
}


In [9]:
# DBSCAN grid search: fit one model per (eps, min_samples, metric) combination
# and record its settings and quality metrics in clust_results.

parameter_grid = {
    'eps': np.round(np.arange(0.1, 7, 0.1), decimals=1),
    'min_samples': np.round(np.arange(1, 10, 1), decimals=0),
    'metric': ['euclidean'],  # 'manhattan' is currently left out
}

options_grid = split_grid(parameter_grid)

for options in options_grid:

    model = DBSCAN(n_jobs=-1, **options)
    model.fit(df.values)
    labels = model.labels_

    # Settings of this run, one value per settings column.
    clust_results['Metric for Distance between instances = metric'].append(options['metric'])
    clust_results['The number of samples = min_sample'].append(options['min_samples'])
    clust_results['Maximum distance between two samples = eps'].append(options['eps'])
    clust_results['Clasterisation method'].append('DBSCAN')

    # Quality metrics of this run, appended to the remaining columns.
    clust_results = add_metrics(model, df.values, clust_results)

### HDBSCAN

HDBSCAN – this method consists of several steps:

    1. Transform the space according to the density/sparsity.
    2. Build the minimum spanning tree of the distance weighted graph.
    3. Construct a cluster hierarchy of connected components.
    4. Condense the cluster hierarchy based on minimum cluster size.
    5. Extract the stable clusters from the condensed tree.

In [10]:


# HDBSCAN grid search: fit one model per (min_cluster_size, metric)
# combination and record its settings and quality metrics in clust_results.
parameter_grid = dict(
    # Plain Python ints 5..19.
    min_cluster_size = list(range(5, 20)),
    metric = ['euclidean'], # 'manhattan' is currently left out
)

options_grid = split_grid(parameter_grid)

for options in options_grid:

    model = hdbscan.HDBSCAN(gen_min_span_tree=True, **options).fit(df.values)

    clust_results['Clasterisation method'].append('HDBSCAN')
    # eps and min_samples are not part of this grid, so those columns get None.
    clust_results['Maximum distance between two samples = eps'].append(None)
    clust_results['The number of samples = min_sample'].append(None)
    clust_results['Metric for Distance between instances = metric'].append(options['metric'])

    clust_results = add_metrics(model, df.values, clust_results)

### BIRCH

In [11]:
# BIRCH grid search: fit one model per n_clusters value and record its
# settings and quality metrics in clust_results.
parameter_grid = dict(
    # Plain Python ints 5..39.
    n_clusters = list(range(5, 40)),
)

options_grid = split_grid(parameter_grid)

for options in options_grid:

    model = Birch(**options).fit(df.values)

    clust_results['Clasterisation method'].append('BIRCH')
    # eps, min_samples and metric are not part of this grid, so those columns get None.
    clust_results['Maximum distance between two samples = eps'].append(None)
    clust_results['The number of samples = min_sample'].append(None)
    clust_results['Metric for Distance between instances = metric'].append(None)

    clust_results = add_metrics(model, df.values, clust_results)

In [12]:
# Turn the accumulated column lists into a table. This rebinds clust_results
# from a dict to a DataFrame, so the grid-search cells above cannot append to
# it again until the dict is re-created.
clust_results = pd.DataFrame(clust_results)
# Persist the table to ../output/metrics/clust_results.csv.
clust_results.to_csv('../output/metrics/{}.csv'.format('clust_results'))
# Notebook display: first rows of the table.
clust_results.head()


Unnamed: 0,Clasterisation method,Maximum distance between two samples = eps,The number of samples = min_sample,Metric for Distance between instances = metric,Minimum cluster size,Estimated number of clusters,Estimated number of noise points,Silhouette Coefficient
0,DBSCAN,0.1,1.0,euclidean,1,90,0,0.89056
1,DBSCAN,0.1,2.0,euclidean,2,69,21,0.822395
2,DBSCAN,0.1,3.0,euclidean,3,39,81,0.476191
3,DBSCAN,0.1,4.0,euclidean,4,33,99,0.401993
4,DBSCAN,0.1,5.0,euclidean,5,12,183,0.106212


The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample; the coefficient for a sample is $(b - a) / \max(a, b)$. To obtain the values for each sample, use `silhouette_samples`. The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

## Rezultatų tikrinimas / vertinimas *(Validation of the results)*

## Rezultatų interpretavimas *(Interpretation of the results)*

## Metrics & Plotting

# Output:

In [32]:
# Fit the final HDBSCAN model; min_cluster_size=6 is hard-coded here
# (NOTE(review): presumably chosen from the grid-search results — confirm).
model = hdbscan.HDBSCAN(gen_min_span_tree=True, min_cluster_size=6).fit(df.values)
labels = model.labels_
# One row per sample, indexed like df, holding the assigned cluster label.
results = pd.DataFrame(labels, columns=['labels'], index = df.index.values)
# Persist the label assignment.
results.to_csv('../output/models/labels.csv')

In [None]:
# import subprocess

# command = f'jupyter nbconvert Exploratory_analysis.ipynb --output Exploratory_analysis.html'
# subprocess.call(command)

from IPython.display import Javascript
from nbconvert import HTMLExporter
from IPython.display import Javascript

def save_notebook():
    """Display a JavaScript snippet that calls ``IPython.notebook.save_notebook()``.

    The snippet is passed to ``display`` restricted to the
    'application/javascript' representation.
    """
    save_command = Javascript("IPython.notebook.save_notebook()")
    display(save_command, include=['application/javascript'])

def output_HTML(read_file, output_file):
    """Export a notebook to an HTML file.

    Args:
        read_file: The source '.ipynb' notebook, passed to ``nbformat.read``.
        output_file: Path of the '.html' file to write (UTF-8 encoded).
    """
    import nbformat
    exporter = HTMLExporter()
    # read_file is '.ipynb', output_file is '.html'
    output_notebook = nbformat.read(read_file, as_version=4)
    output, resources = exporter.from_notebook_node(output_notebook)
    # The context manager closes the file even if the write fails.
    # newline='' keeps line endings exactly as the exporter produced them.
    with open(output_file, 'w', encoding='utf-8', newline='') as html_file:
        html_file.write(output)