In [241]:
import os
import time

import joblib
import numpy as np
import pandas as pd
# For plotting
import plotly.graph_objects as go
import plotly.io as plt_io
# UMAP
import umap.umap_ as umap
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
# PCA
from sklearn.decomposition import PCA
# TSNE
from sklearn.manifold import TSNE

# Data load and methods

In [242]:
# Root folder of the project. Other cells build paths by plain string
# concatenation onto this value, so it must end with a path separator.
# Set the PAPER03_PROJECT_PATH environment variable to run the notebook on
# another machine; when it is unset the original local path is used.
PROJECT_PATH = os.environ.get(
    "PAPER03_PROJECT_PATH",
    "C:\\Users\\benan\\OneDrive\\07_DOKTORA\\PAPERLAR\\PAPER_NO_03\\github\\",
)

In [243]:
# Load the pickled table stored under <PROJECT_PATH>data\.
# joblib.load unpickles the file, so only point it at files you trust.
x = joblib.load(f"{PROJECT_PATH}data\\flatten_dataframe2.pkl")

In [244]:
# Preview the first five rows of the loaded table.
x.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6134,6135,6136,6137,6138,6139,6140,6141,6142,6143
0,160,100,92,162,102,94,162,103,99,163,...,76,124,91,74,122,86,70,121,85,69
1,158,97,92,161,100,95,162,103,97,163,...,74,127,92,73,123,88,69,122,87,68
2,160,100,92,161,101,93,162,103,99,163,...,76,127,91,75,124,89,70,122,87,68
3,159,100,94,161,102,96,161,103,99,162,...,73,127,91,75,129,92,76,128,91,75
4,160,100,90,161,101,91,162,103,97,164,...,74,125,92,73,123,88,69,121,86,67


### Preprocessing

In [245]:
from sklearn.preprocessing import StandardScaler

# Standardize the data. The transformed result is bound back to `x`,
# replacing the loaded table for all later cells.
scaler = StandardScaler()
x = scaler.fit_transform(x)

### Methods

In [246]:
def plot_2d(ds,labels=0):
    """Show an interactive 2-D scatter plot of an embedding.

    Parameters
    ----------
    ds : array indexed as ``ds[:, k]``
        Column 0 is plotted on the x axis and column 1 on the y axis.
    labels : scalar or sequence, default 0
        Forwarded to the marker ``color`` so points can be coloured per
        label through the 'Rainbow' colorscale.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure(data=go.Scatter(
        x=ds[:, 0],
        y=ds[:, 1],
        mode='markers',
        marker=dict(
            size=10,
            # Previously hard-coded to 3, which ignored the `labels` argument.
            color=labels,
            colorscale='Rainbow',  # one of plotly colorscales
            showscale=True,
            line_width=1,
        ),
    ))
    fig.update_layout(margin=dict(l=100, r=100, b=100, t=100), width=2000, height=1200)
    fig.layout.template = 'plotly_dark'

    fig.show()

In [247]:
def plot_3d(ds,labels=0):
    """Show an interactive 3-D scatter plot of an embedding.

    Columns 0, 1 and 2 of ``ds`` become the x, y and z coordinates and
    ``labels`` is used as the marker color ('Rainbow' colorscale).
    Returns None; the figure is displayed with ``fig.show()``.
    """
    marker_style = {
        'size': 5,
        'color': labels,           # array/list (or scalar) of color values
        'colorscale': 'Rainbow',
        'opacity': 1,
        'line_width': 1,
    }
    points = go.Scatter3d(
        x=ds[:, 0],
        y=ds[:, 1],
        z=ds[:, 2],
        mode='markers',
        marker=marker_style,
    )
    fig = go.Figure(data=[points])

    # Tight layout.
    page_margin = {'l': 50, 'r': 50, 'b': 50, 't': 50}
    fig.update_layout(margin=page_margin, width=1800, height=1000)
    fig.layout.template = 'plotly_dark'

    fig.show()

In [248]:
def get_pca(x,n_components):
    """Fit a PCA with ``n_components`` on ``x`` and return the transformed data."""
    return PCA(n_components).fit_transform(x)

In [249]:
def get_tsne(x,n_components):
    """Embed ``x`` with t-SNE and return the embedding.

    Uses random_state=42, perplexity=40 and 400 optimisation iterations.

    The iteration-count keyword of ``TSNE`` was renamed from ``n_iter`` to
    ``max_iter`` in newer scikit-learn releases, so the new name is tried
    first and the old one is used only when the constructor rejects it.
    """
    common = dict(random_state=42, n_components=n_components, verbose=0, perplexity=40)
    try:
        model = TSNE(max_iter=400, **common)
    except TypeError:
        # Older scikit-learn: `max_iter` is not a known keyword.
        model = TSNE(n_iter=400, **common)
    return model.fit_transform(x)

In [250]:
def get_umap(x,n_components):
    """Fit a UMAP reducer (random_state=42) on ``x`` and return the embedding."""
    return umap.UMAP(random_state=42, n_components=n_components).fit_transform(x)

In [290]:
def score(x,labels,metric='euclidean'):
    """Print clustering-quality scores for a labelling of ``x``.

    Parameters
    ----------
    x : data passed unchanged to the sklearn metric functions.
    labels : one label per sample.
    metric : str, default 'euclidean'
        Distance metric forwarded to ``metrics.silhouette_score``. The
        Calinski-Harabasz and Davies-Bouldin calls take no metric argument.

    Returns
    -------
    None. Results are printed. When fewer than two distinct label values
    are present, a message is printed instead of scores.

    Note: every distinct label value counts as a group, so a DBSCAN noise
    label of -1 is counted (and scored) like any other label.
    """
    n_groups = len(set(labels))
    if n_groups < 2:
        print("Cluster algoritması sadece 1 küme oluşturdu.\nSkor sonucu yok!")
        return

    # Previously 'euclidean' was hard-coded here and `metric` was ignored.
    silhouette = metrics.silhouette_score(x, labels, metric=metric)
    calinski_harabasz = metrics.calinski_harabasz_score(x, labels)
    davies_bouldin = metrics.davies_bouldin_score(x, labels)
    print("silhouette_score: ",silhouette)
    print("calinski_harabasz_score: ",calinski_harabasz)
    print("davies_bouldin_score: ",davies_bouldin)
    print("{} küme tespit edildi...".format(n_groups))

# 2. Experiments

## 2.1 Flatten - PCA Experiments

In [291]:
# 3-component PCA projection of the standardized data.
x_pca = get_pca(x, n_components=3)

### 2.1.1 Flatten - PCA - Kmeans

In [292]:
# K-means with 8 clusters and a fixed seed, fitted on the PCA projection.
kmeans_pca = KMeans(n_clusters=8, random_state=0)
kmeans_pca.fit(x_pca)

In [293]:
# Cluster label assigned to each sample by the fitted K-means model.
kmeans_pca_labels = kmeans_pca.labels_

In [294]:
# Print the quality scores of the PCA + K-means clustering.
score(x=x_pca, labels=kmeans_pca_labels, metric='euclidean')

silhouette_score:  0.25494067721101793
calinski_harabasz_score:  1424.4533852998036
davies_bouldin_score:  1.1684441886469752
8 küme tespit edildi...


### 2.1.2 Flatten - PCA - HAC

In [295]:
# Hierarchical agglomerative clustering (8 clusters) on the PCA projection.
hac_pca = AgglomerativeClustering(n_clusters=8)
hac_pca.fit(x_pca)

In [296]:
# Cluster label assigned to each sample by the fitted agglomerative model.
hac_pca_labels = hac_pca.labels_

In [297]:
# Print the quality scores of the PCA + agglomerative clustering.
score(x=x_pca, labels=hac_pca_labels, metric='euclidean')

silhouette_score:  0.19955813741681852
calinski_harabasz_score:  1101.1345165192954
davies_bouldin_score:  1.2689457880972101
8 küme tespit edildi...


### 2.1.3 Flatten - PCA - DBScan

In [298]:
# DBSCAN (eps=3, min_samples=20) fitted on the PCA projection.
db_pca = DBSCAN(eps=3, min_samples=20)
db_pca.fit(x_pca)

# Distinct labels found (shown as the cell output).
set(db_pca.labels_)

{-1}

In [299]:
# Label assigned to each sample by the fitted DBSCAN model.
db_pca_labels = db_pca.labels_

In [300]:
# Number of distinct label values produced by DBSCAN on the PCA projection.
len(set(db_pca_labels))

1

In [301]:
# Print the quality scores of the PCA + DBSCAN clustering
# (score() prints a message instead when fewer than two labels exist).
score(x=x_pca, labels=db_pca_labels, metric='euclidean')

Cluster algoritması sadece 1 küme oluşturdu.
Skor sonucu yok!


In [302]:
# Distinct label values produced by DBSCAN on the PCA projection.
set(db_pca_labels)

{-1}

## 2.2 Flatten - t-SNE Experiments

In [303]:
# Reduce to 30 principal components first, then embed those in 3-D with t-SNE.
# The 30-component projection gets its own name so that `x_pca` (the
# 3-component projection used in section 2.1) is not overwritten; re-running
# the 2.1 cells after this one then still uses the same data.
x_pca_30 = get_pca(x, 30)
x_tsne = get_tsne(x_pca_30, 3)

### 2.2.1 Flatten - tSNE - Kmeans

In [304]:
# K-means (8 clusters, fixed seed) on the t-SNE embedding, then print scores.
kmeans_tsne = KMeans(n_clusters=8, random_state=0)
kmeans_tsne.fit(x_tsne)
score(x_tsne, kmeans_tsne.labels_, metric='euclidean')

silhouette_score:  0.29650494
calinski_harabasz_score:  1753.9312081769463
davies_bouldin_score:  1.1315415705873442
8 küme tespit edildi...


### 2.2.2 Flatten - tSNE - HCA

In [265]:
# Agglomerative clustering (8 clusters) on the t-SNE embedding, then print scores.
hca_tsne = AgglomerativeClustering(n_clusters=8)
hca_tsne.fit(x_tsne)
score(x_tsne, hca_tsne.labels_, metric='euclidean')

silhouette_score:  0.23879927
calinski_harabasz_score:  1373.2943077894931
davies_bouldin_score:  1.2010943348619787


### 2.2.3 Flatten - tSNE - DBScan

In [266]:
# DBSCAN (eps=3, min_samples=20) on the t-SNE embedding, then print scores.
# The former mid-cell `set(db_tsne.labels_)` was removed: it was not the last
# expression of the cell, so its value was neither displayed nor used.
db_tsne = DBSCAN(eps=3, min_samples=20).fit(x_tsne)
score(x_tsne, db_tsne.labels_, metric='euclidean')

silhouette_score:  0.015596356
calinski_harabasz_score:  11.511601994102834
davies_bouldin_score:  1.1488046362917697


In [267]:
# Distinct label values produced by DBSCAN on the t-SNE embedding.
set(db_tsne.labels_)

{-1, 0}

## 2.3 Flatten - UMAP Experiments

In [268]:
# 3-D UMAP embedding of the standardized data.
x_umap = get_umap(x, n_components=3)

### 2.3.1 Flatten -UMAP - Kmeans

In [269]:
# K-means (8 clusters, fixed seed) on the UMAP embedding, then print scores.
kmeans_umap = KMeans(n_clusters=8, random_state=0)
kmeans_umap.fit(x_umap)
score(x_umap, kmeans_umap.labels_, metric='euclidean')

silhouette_score:  0.37247205
calinski_harabasz_score:  2536.023861399815
davies_bouldin_score:  0.8262436024944823


### 2.3.2 Flatten - UMAP - HCA

In [270]:
# Agglomerative clustering (8 clusters) on the UMAP embedding, then print scores.
hca_umap = AgglomerativeClustering(n_clusters=8)
hca_umap.fit(x_umap)
score(x_umap, hca_umap.labels_, metric='euclidean')

silhouette_score:  0.33939797
calinski_harabasz_score:  2286.351239590818
davies_bouldin_score:  0.8371726250589678


### 2.3.3 Flatten - UMAP - DBScan

In [271]:
# DBSCAN (eps=0.5, min_samples=5) fitted on the UMAP embedding.
db_umap = DBSCAN(eps=0.5, min_samples=5)
db_umap.fit(x_umap)

# Distinct labels found (shown as the cell output).
set(db_umap.labels_)

{-1, 0, 1, 2, 3, 4, 5}

In [272]:
# Print the quality scores of the UMAP + DBSCAN clustering.
score(x=x_umap, labels=db_umap.labels_, metric='euclidean')

silhouette_score:  -0.08753778
calinski_harabasz_score:  151.7411086393165
davies_bouldin_score:  0.6912332795245648


### Not 
Deneyler sırasında yaptığım kısıtlı gözlemlere dayanarak, DBScan kümeleme yönteminin problemimize uygun olmadığı sonucuna ulaştım. Hiperparametreleri değiştirsem de aşağıdaki görüntüden kurtulamadım. Sanırım her bir veri noktasının diğerlerine olan mesafesi birbirine yakın olduğu için noktaların neredeyse hepsini tek bir kümeye atıyor.

In [273]:
# 3-D view of the UMAP embedding coloured by the DBSCAN labels.
plot_3d(x_umap, labels=db_umap.labels_)

# 3. Best Results Plots

En iyi sonuçlara UMAP ile ulaştık. Aşağıda, her ikisi de yüksek skor veren UMAP-Kmeans ve UMAP-HCA deneylerinin grafikleri yer alıyor.

In [274]:
# 3-D view of the UMAP embedding coloured by the agglomerative-clustering labels.
plot_3d(x_umap, labels=hca_umap.labels_)

In [275]:
# 3-D view of the UMAP embedding coloured by the K-means labels.
plot_3d(x_umap, labels=kmeans_umap.labels_)

# 4. Dump to folder

In [None]:
import os
import shutil

# Which clustering result to export. `cluster_set` is used to name the output
# folder and `labels` holds the per-sample cluster labels; keep the two in sync.
cluster_set = "hca_umap"  # can be changed
labels = hca_umap.labels_ # can be changed

In [311]:
"""
cluster_folder altına cluster_set'e atanan string isminde bir folder açıyor
ve clusterlara göre fotoğrafları ayrı klasörlere atıyor.

Cluster_folder klasörü yoksa kendisi oluşturuyor.

gitignore dosyasına cluster_folder/* koydum ki her seferinde frameleri github'a upload etmesin.
"""

# Copy every frame image into a folder named after its cluster label:
#   <PROJECT_PATH>cluster_folder\<cluster_set>_cluster_folder\cluster_<label>\<frame>
# `cluster_set` and `labels` come from the configuration cell above; they are
# no longer re-hard-coded here, so changing them there takes effect.

cluster_folder = "cluster_folder\\" + cluster_set + '_cluster_folder'
# Absolute output root. The copy target below uses this same root, so the
# files land in the folders created here regardless of the working directory.
cluster_root = PROJECT_PATH + cluster_folder

dataset_dir = PROJECT_PATH + "frames"

# One sub-folder per distinct label; makedirs also creates cluster_root itself.
os.makedirs(cluster_root, exist_ok=True)
for label in set(labels):
    os.makedirs(cluster_root + "\\cluster_{}".format(label), exist_ok=True)

terms = os.listdir(dataset_dir)
frame_index = 0  # position of the current frame within `labels`

# NOTE(review): labels[frame_index] assumes os.listdir yields the frames in the
# same order as the rows of the clustered data - confirm against how the
# flattened table was built.
for term in terms:
    if not term.startswith('.'):
        print(term)
        class_file = os.path.join(dataset_dir, term)
        frames = os.listdir(class_file)
        for frame in frames:
            img_name = os.path.join(class_file, frame)
            group = labels[frame_index]
            folder = "cluster_{}".format(group)
            target = cluster_root + "\\" + folder + "\\" + frame
            shutil.copyfile(img_name, target)
            frame_index += 1

006_acik_butce
007_acik_ekonomi
008_acik_eksiltme
010_acik_kredi
012_acik_maliyet
013_acik_oturum
