# Klasterinė analizė: Pagrindinės sąvokos 
# *(Cluster analysis: Basic concepts)*

Klasterinė analizė priskiriama prie mokymosi be mokytojo *("Unsupervised learning")*, t. y. nėra iš anksto nustatytų klasių.

In [2]:
                                import os
                                import pyodbc 
                                import pandas                         as pd
                                import networkx                       as nx
                                import matplotlib.pyplot              as plt
                                import numpy                          as np
from sklearn                    import cluster
from sklearn                    import mixture
from collections                import defaultdict
from sklearn.metrics.cluster    import normalized_mutual_info_score
from sklearn.metrics.cluster    import adjusted_rand_score

In [3]:
# Comment this out if the data visualisations don't work
%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases removed the old names, so use the legacy name only when
# the installed Matplotlib still ships it.
plt.style.use(
    'seaborn-whitegrid'
    if 'seaborn-whitegrid' in plt.style.available
    else 'seaborn-v0_8-whitegrid'
)
# Dotted dark-grey grid lines with axes borders in the same colour.
plt.rcParams['grid.linestyle'] = ':'
plt.rcParams['grid.color'] = '#474545'
plt.rcParams['axes.edgecolor'] = '#474545'

In [4]:
import os

# The notebook name is hard-coded; realpath() resolves it against the current
# working directory, and its parent directory is kept as the notebook path.
__file__ = 'Clusterization.ipynb'
__path__ = os.path.dirname(os.path.realpath(__file__))

# Same two lines of output as before, emitted with a single print call.
print(
    'File path: %s' % __path__,
    'File name: %s' % __file__,
    sep='\n',
)

File path: C:\Users\banarn\projects\optimisation_shortest_path\notebooks
File name: Clusterization.ipynb


In [5]:
# Load the dataset and use the 'Product' column as the row index.
df = (
    pd.read_csv('../output/data_nm.csv')
    .set_index('Product')
)
# Preview the first three rows.
df.head(3)

Unnamed: 0_level_0,NPC,Path,Width,Lenght,LiftPushSpeed,LiftRegularSpeed,RT1EntranceSpeed,RT1ExitSpeed,RT2EntranceSpeed,RT2ExitSpeed,...,Position_5,BR_BR2,BR_BR3,Color_black,Color_chary,Color_none,Color_oack,Color_white,Parts_1,Parts_2
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BR3-IKW202,-0.18999,1,0.099728,-0.054031,0.097934,-0.220126,0.074873,-0.082581,0.058467,0.149686,...,0,0,1,0,0,0,0,1,1,0
BR3-ITW101-K,-0.023323,0,0.043267,-0.165659,-0.140162,-0.077269,-0.163223,-0.082581,0.058467,0.006828,...,0,0,1,0,0,0,0,1,0,1
BR3-ITW302,-0.106656,0,0.174103,-0.138334,0.002695,-0.077269,-0.020365,0.047853,0.169578,0.092543,...,0,0,1,0,0,0,0,1,0,1


Methods for clustering:
    
* [HDBSCAN](http://hdbscan.readthedocs.io/en/latest/index.html)
* DBSCAN
* BIRCH

## Požymių savybių atranka *(Feature selection)*

Parinkimas atliekamas remiantis **Exploratory analysis** rezultatais.

[Working with dictionaries](https://stackoverflow.com/a/8381589/7347438)

## Artumo matas *(Proximity measure)*


In [None]:
from sklearn.metrics.pairwise   import euclidean_distances
from sklearn.metrics.pairwise   import cosine_distances
from sklearn.metrics.pairwise   import manhattan_distances

In [None]:
# Number of clusters requested from the algorithms that take it as a parameter.
kClusters = 8

# The DataFrame index (product identifiers) seeds the results table.
idx = list(df.index.values)
results = dict(Product=idx)

# Pairwise distances of the DataFrame rows against themselves, evaluated in
# the order euclidean, manhattan, cosine.
euclidean, manhattan, cosine = (
    pairwise_distance(df, df)
    for pairwise_distance in (
        euclidean_distances,
        manhattan_distances,
        cosine_distances,
    )
)

print('Expected clusters qty: %s' % kClusters)
print('Lenght of dataframe: %s' % len(idx))

### Euclidean distances

In [None]:
# distance_matrix = pd.DataFrame(euclidean_distances(df, df), 
#                          index = labels, 
#                          columns = labels)
# distance_matrix.to_csv('output/{}.csv'.format(euclidean_distances.__name__))

### Cosine distance

In [None]:
# distance_matrix = pd.DataFrame(cosine_distances(df, df), 
#                          index = labels, 
#                          columns = labels)
# distance_matrix.to_csv('../output/{}.csv'.format(cosine_distances.__name__))

### Manhattan distances

In [None]:
# distance_matrix = pd.DataFrame(manhattan_distances(df, df), 
#                          index = labels, 
#                          columns = labels)
# distance_matrix.to_csv('../output/{}.csv'.format(manhattan_distances.__name__))

## Klasterizavimo metodai *(Clusterisation algorithms)*


### DBSCAN

In [None]:
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
# `sklearn.datasets.samples_generator` was deprecated in scikit-learn 0.22 and
# removed in 0.24; make_blobs is exported by the public `sklearn.datasets`
# package in both old and new releases.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
import itertools

# Candidate DBSCAN settings: `eps` starts at 0.1 and advances in steps of 0.1
# (stop value 1.1 is exclusive); `min_samples` covers the integers 1..20.
parameter_grid = {
    'eps': np.arange(0.1, 1.1, 0.1),
    'min_samples': np.arange(1, 21, 1),
}

def split_grid(parameter_grid):
    """Expand a grid of candidate values into single-valued combinations.

    Input: {x: [y]}, Output: [{x: y}] -- one dict for every element of the
    Cartesian product of the value lists, in `itertools.product` order.
    """
    names = tuple(parameter_grid.keys())
    combinations = []
    for picked in itertools.product(*parameter_grid.values()):
        # Pair each parameter name with the value picked for this combination.
        combinations.append(list_of_toople_to_dic(zip(names, picked)))
    return combinations


def list_of_toople_to_dic(values):
    """Input: [(x, y)], Output: {x: y}; a later pair overrides an earlier key."""
    mapping = {}
    for key, value in values:
        mapping[key] = value
    return mapping

# Flatten the grid into a list of {parameter: value} dicts, one per combination.
options_grid = split_grid(parameter_grid)

In [None]:
# Compute DBSCAN

def add_metrics(model, values):
    """Summarise a fitted clustering model.

    Parameters
    ----------
    model : fitted estimator exposing ``labels_`` (one label per sample; the
        label -1 is treated as noise and not counted as a cluster).
    values : the samples the model was fitted on, passed on to
        ``metrics.silhouette_score``.

    Returns
    -------
    dict with the keys 'Estimated number of clusters' and
    'Silhouette Coefficient'. The coefficient is -1 whenever it cannot be
    computed for the given labelling.
    """
    labels = model.labels_
    distinct_labels = set(labels)
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(distinct_labels) - (1 if -1 in distinct_labels else 0)

    # silhouette_score raises ValueError unless the number of distinct labels
    # (noise included) lies between 2 and n_samples - 1, so a single cluster
    # or one cluster per sample falls back to the worst score as well.
    if 2 <= len(distinct_labels) <= len(labels) - 1:
        silhouette = metrics.silhouette_score(values, labels)
    else:
        silhouette = -1

    return {
        'Estimated number of clusters': n_clusters_,
        'Silhouette Coefficient': silhouette,
    }

# Column-oriented results: 'Parameters' plus one list per metric name, each
# gaining exactly one entry per parameter combination.
metrics_results = {}

for options in options_grid:
    # Fit DBSCAN with this combination of eps / min_samples on all cores.
    db = DBSCAN(n_jobs=-1, **options).fit(df.values)

    dic_metric = add_metrics(db, df.values)

    metrics_results.setdefault('Parameters', []).append(options)

    # Record this run's value for every metric so each column stays aligned
    # with the 'Parameters' column.
    for metric, value in dic_metric.items():
        metrics_results.setdefault(metric, []).append(value)

        
#     core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
#     core_samples_mask[db.core_sample_indices_] = True
    

#     print('Clusterization model: \n{}\n'.format(db))
# #     print('Lenght of labels: {}'.format(len(labels)))

#     # Number of clusters in labels, ignoring noise if present.
#     n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

#     
# #     print('numer of noise: %s' % labels == -1)
#     print("Silhouette Coefficient: %0.3f"
#           % metrics.silhouette_score(df.values, labels))

In [None]:
# Render the collected results as a table; the keys of metrics_results become the columns.
pd.DataFrame(metrics_results)

The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample; the coefficient for a sample is (b - a) / max(a, b). To obtain the values for each sample, use `silhouette_samples`. The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

In [None]:
# `labels` is only a local name inside add_metrics, so read the labels from
# `db`, the DBSCAN model left over from the last iteration of the grid search.
# NOTE(review): silhouette_score raises ValueError when these labels contain
# fewer than 2 distinct values or one label per sample.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(df.values, db.labels_))

## Klasterizavimo vertinimas *(Evaluation of clustering)*

## Tikrinimas / vertinimas rezultatų *(Validation of the results)*

### Clustering metrics
See the Clustering performance evaluation section of the user guide for further details.
There are two forms of evaluation:
* **supervised**, which uses ground-truth class values for each sample.
* **unsupervised**, which does not and measures the ‘quality’ of the model itself.

## Rezultatų interpretavimas *(Interpretation of the results)*

## K-Means Clustering

K-means is considered by many the gold standard when it comes to clustering due to its simplicity and performance, and it's the first one we'll try out. When you have no idea at all what algorithm to use, K-means is usually the first choice. Bear in mind that K-means might under-perform sometimes due to its concept: spherical clusters that are separable in a way so that the mean value converges towards the cluster center. To simply construct and train a K-means model, use the following lines:

In [None]:
# K-means Clustering Model
kmeans = cluster.KMeans(n_clusters=kClusters, n_init=200)
# kmeans.fit(edgeMat)
# `data` is never defined in this notebook; fit on the feature matrix held in
# `df`, the same input the DBSCAN section uses.
kmeans.fit(df.values)

# `results` is a dict keyed by column name (it starts as {'Product': idx}) and
# has no append(); store the labels as a list under their own key.
results['KMeans'] = list(kmeans.labels_)

#### Agglomerative Clustering

The main idea behind agglomerative clustering is that each node starts in its own cluster, and recursively merges with the pair of clusters that minimally increases a given linkage distance. The main advantage of agglomerative clustering (and hierarchical clustering in general) is that you don’t need to specify the number of clusters. That of course, comes with a price: performance. But, in scikit’s implementation, you can specify the number of clusters to assist the algorithm’s performance. To create and train an agglomerative model use the following code:

In [None]:
# Agglomerative Clustering Model
agglomerative = cluster.AgglomerativeClustering(n_clusters=kClusters, linkage="ward")
# NOTE(review): `edgeMat` is not defined anywhere in the visible notebook --
# confirm which matrix is meant to be clustered here.
agglomerative.fit(edgeMat)

# `results` is a dict keyed by column name (it starts as {'Product': idx}) and
# has no append(); store the labels as a list under their own key.
results['Agglomerative'] = list(agglomerative.labels_)

#### Spectral

The Spectral clustering technique applies clustering to a projection of the normalized Laplacian. When it comes to image clustering, spectral clustering works quite well. See the next few lines of Python for all the magic:

In [None]:
# Spectral Clustering Model
spectral = cluster.SpectralClustering(n_clusters=kClusters, affinity="precomputed", n_init= 200)
# NOTE(review): `edgeMat` is not defined anywhere in the visible notebook;
# with affinity="precomputed" the argument to fit() is treated as an affinity
# matrix -- confirm which matrix is intended.
spectral.fit(edgeMat)

# `results` is a dict keyed by column name (it starts as {'Product': idx}) and
# has no append(); store the labels as a list under their own key.
results['Spectral'] = list(spectral.labels_)

#### Affinity Propagation

Well, this one is a bit different. Unlike the previous algorithms, Affinity Propagation (AP) does not require the number of clusters to be determined before running the algorithm. AP performs really well on several computer vision and biology problems, such as clustering pictures of human faces and identifying regulated transcripts:

In [None]:
# Affinity Propagation Clustering Model
# NOTE(review): `edgeMat` is not defined anywhere in the visible notebook;
# `S` is passed as the similarity matrix -- confirm which matrix is intended.
affinity = cluster.affinity_propagation(S=edgeMat, max_iter=200, damping=0.6)

# The labels are the second element of the returned tuple. `results` is a dict
# keyed by column name (it starts as {'Product': idx}) and has no append(), so
# store them as a list under their own key.
results['AffinityPropagation'] = list(affinity[1])

# Metrics & Plotting