In [36]:
import os
import hdbscan
import debacl
import fastcluster
import sklearn.cluster
import scipy.cluster
import sklearn.datasets
import sklearn.metrics
import sklearn.metrics.cluster
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Seaborn setup for the notebook: plotting context, default palette, color codes.
sns.set_context('poster')
sns.set_palette('Paired', 10)
sns.set_color_codes()

class color:
    """ANSI escape sequences used to decorate text printed by the notebook.

    Concatenate a constant before the text and ``END`` after it, e.g.
    ``color.BOLD + "title" + color.END``.
    """

    # Foreground colours.
    PURPLE = '\x1b[95m'
    CYAN = '\x1b[96m'
    DARKCYAN = '\x1b[36m'
    BLUE = '\x1b[94m'
    GREEN = '\x1b[92m'
    YELLOW = '\x1b[93m'
    RED = '\x1b[91m'

    # Text attributes.
    BOLD = '\x1b[1m'
    UNDERLINE = '\x1b[4m'

    # Resets every colour/attribute set above.
    END = '\x1b[0m'

In [6]:
def load_file(filename, className):
    """Read a CSV from ``<cwd>/../data`` and split it into features and labels.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside the ``../data`` directory (resolved
        relative to the current working directory).
    className : str
        Name of the column that holds the class label.

    Returns
    -------
    dict or None
        ``None`` (after printing a message) when the file is missing, empty
        or cannot be parsed. Otherwise a dict with these keys:

        - ``'Filename'``: the ``filename`` argument.
        - ``'DataOnly'``: the frame without the ``className`` column.
        - ``'Data'``: the full frame as read.
        - ``'LabelsOnly'``: labels integer-encoded by ``LabelEncoder``.
        - ``'LabelsEncoded'``: the encoder's ``classes_`` (original label
          values, indexed by their integer code).
        - ``'LabelsOnlyEncoded'``: the raw, un-encoded label column.

        Note that, despite the names, ``'LabelsOnly'`` is the encoded version
        and ``'LabelsOnlyEncoded'`` is the raw one.
    """
    data_dir = os.path.join(os.getcwd(), '..', 'data')

    try:
        frame = pd.read_csv(os.path.join(data_dir, filename))
        features = frame.drop(className, axis='columns')
        raw_labels = frame[className]

        # Fit first so that classes_ is populated before it is read below.
        label_encoder = LabelEncoder()
        encoded_labels = label_encoder.fit_transform(raw_labels)

        return {
            'Filename': filename,
            'DataOnly': features,
            'Data': frame,
            'LabelsOnly': encoded_labels,
            'LabelsEncoded': label_encoder.classes_,
            'LabelsOnlyEncoded': raw_labels,
        }
    except FileNotFoundError:
        print(f"Arquivo {filename} não encontrado no diretório {data_dir}")
    except pd.errors.EmptyDataError:
        print(f"Arquivo {filename} está vazio")
    except pd.errors.ParserError:
        print(f"Erro ao analisar o arquivo {filename}")

    # Reached only when one of the handled read errors occurred.
    return None

In [10]:
# Quick check of the loader on the Iris CSV: print the file name and the
# class names recovered by the label encoder.
data = load_file("iris.csv", "class")
for key in ("Filename", "LabelsEncoded"):
    print(data[key])


iris.csv
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [50]:
def benchmark_algorithm(dataset, cluster_function_name, cluster_function, function_args, function_kwds, sample_size=2):
    """Run one clustering callable on a loaded dataset and score the result.

    Parameters
    ----------
    dataset : dict
        Mapping as produced by ``load_file``. The keys read here are
        ``"DataOnly"`` (passed to the clustering callable), ``"LabelsOnly"``
        (integer-encoded true labels) and ``"LabelsEncoded"`` (class names
        indexed by encoded label).
    cluster_function_name : str
        Display name, copied into the result.
    cluster_function : callable
        Called as ``cluster_function(dataset["DataOnly"], *function_args,
        **function_kwds)``. The returned object must expose ``labels_``.
    function_args : tuple
        Extra positional arguments for ``cluster_function``.
    function_kwds : dict
        Extra keyword arguments for ``cluster_function``.
    sample_size : int, optional
        Not used by this function; kept so existing callers that pass it
        keep working.

    Returns
    -------
    dict
        Keys: ``'ClusterFunctionName'``, ``'Cluster'`` (object returned by
        ``cluster_function``), ``'TimeTaken'`` (seconds), ``'ClusterLabels'``
        (object array of per-sample labels, see below), ``'AccuracyScore'``,
        ``'PurityScore'``, ``'AdjustedRandScore'``, ``'JaccardScore'`` and
        ``'ClassificationReport'`` (text).

    Notes
    -----
    ``'ClusterLabels'`` renames, for every true class, the cluster that holds
    most of that class's samples to the class name. Clusters that are not the
    dominant cluster of any class keep their raw id.

    NOTE(review): ``AccuracyScore``, ``JaccardScore`` and
    ``ClassificationReport`` compare the *raw* cluster ids with the encoded
    class ids, so they are only meaningful when the ids happen to line up.
    ``PurityScore`` and ``AdjustedRandScore`` do not depend on how clusters
    are numbered.
    """
    true_labels = dataset["LabelsOnly"]
    class_names = dataset["LabelsEncoded"]

    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time.
    start_time = time.perf_counter()
    cluster = cluster_function(dataset["DataOnly"], *function_args, **function_kwds)
    time_taken = time.perf_counter() - start_time

    predicted = np.asarray(cluster.labels_)

    # Rows follow the sorted true classes, columns follow the sorted cluster ids.
    contingency = sklearn.metrics.cluster.contingency_matrix(true_labels, predicted)
    cluster_ids = np.unique(predicted)

    # Rename each class's dominant cluster to the class name. The contingency
    # column index is translated back to the raw cluster id (ids need not be
    # 0..k-1, e.g. DBSCAN uses -1 for noise) and the class name is taken from
    # the row index. If two classes share a dominant cluster, the first wins.
    cluster_labels = predicted.astype('object')
    claimed = set()
    for class_index, class_row in enumerate(contingency):
        cluster_id = cluster_ids[np.argmax(class_row)]
        if cluster_id in claimed:
            continue
        claimed.add(cluster_id)
        cluster_labels[predicted == cluster_id] = class_names[class_index]

    # Purity: every cluster is credited with its largest class.
    purity_score = np.sum(np.amax(contingency, axis=0)) / np.sum(contingency)
    jaccard_score = sklearn.metrics.jaccard_score(true_labels, predicted, average='micro')
    adjusted_rand_score = sklearn.metrics.cluster.adjusted_rand_score(true_labels, predicted)
    accuracy_score = sklearn.metrics.accuracy_score(true_labels, predicted)

    # Pin the reported labels to the class ids so that target_names always
    # matches in length, even when the clustering yields other ids.
    classification_report = sklearn.metrics.classification_report(
        true_labels,
        predicted,
        labels=np.arange(len(class_names)),
        target_names=class_names,
        zero_division=0,
    )

    return {
        'ClusterFunctionName': cluster_function_name,
        'Cluster': cluster,
        'TimeTaken': time_taken,
        'ClusterLabels': cluster_labels,
        'AccuracyScore': accuracy_score,
        'PurityScore': purity_score,
        'AdjustedRandScore': adjusted_rand_score,
        'JaccardScore': jaccard_score,
        'ClassificationReport': classification_report,
    }


In [61]:
# Datasets to benchmark: display name -> CSV file name passed to load_file.
dataList = {
        #'Iris': "iris.csv",
        'Rice Cammeo Osmancik': "Rice_Cammeo_Osmancik.arff.csv"
        }

# Fixed seed so the randomly initialised algorithms give the same result on
# every Restart & Run All.
RANDOM_STATE = 42

for dataKey in dataList:
    print(color.BOLD + dataKey + " Dataset" + color.END + "\n")

    data = load_file(dataList[dataKey], "class")
    if data is None:
        # load_file has already printed the reason; move on to the next dataset.
        continue

    # Ask the k-based algorithms for as many clusters as there are classes.
    n_classes = len(data["LabelsEncoded"])

    k_means = sklearn.cluster.KMeans(n_classes, n_init='auto', random_state=RANDOM_STATE)
    dbscan = sklearn.cluster.DBSCAN(eps=1.25)
    agglomerative = sklearn.cluster.AgglomerativeClustering(n_classes)
    spectral = sklearn.cluster.SpectralClustering(n_classes, random_state=RANDOM_STATE)

    algorithms = [
        ("K-means", k_means),
        ("DBScan", dbscan),
        ("Agglomerative Clustering", agglomerative),
        ("Spectral Clustering", spectral),
        # Disabled in the original cell:
        # ("Affinity Propagation", sklearn.cluster.AffinityPropagation()),
    ]

    clusterResult = [
        benchmark_algorithm(data, algorithm_name, estimator.fit, (), {})
        for algorithm_name, estimator in algorithms
    ]

    for dataResult in clusterResult:
        print(color.UNDERLINE + "Cluster Algorithm: " + dataResult['ClusterFunctionName'] + color.END)
        print("Time Taken (seg):", dataResult['TimeTaken'])
        print("Accuracy Score:", dataResult['AccuracyScore'])
        print("Purity Score:", dataResult['PurityScore'])
        print("Adjusted Rand Index:", dataResult['AdjustedRandScore'])
        print("Jaccard Score:", dataResult['JaccardScore'])
        print(dataResult['ClassificationReport'])


[1mRice Cammeo Osmancik Dataset[0m

[4mCluster Algorithm: K-means[0m
Time Taken (seg): 0.008510828018188477
Accuracy Score: 0.9149606299212598
Purity Score: 0.9149606299212598
Adjusted Rand Index: 0.6885026205971801
Jaccard Score: 0.8432510885341074
              precision    recall  f1-score   support

      Cammeo       0.91      0.89      0.90      1630
    Osmancik       0.92      0.94      0.93      2180

    accuracy                           0.91      3810
   macro avg       0.91      0.91      0.91      3810
weighted avg       0.91      0.91      0.91      3810

[4mCluster Algorithm: DBScan[0m
Time Taken (seg): 0.31294941902160645
Accuracy Score: 0.42782152230971127
Purity Score: 0.5721784776902887
Adjusted Rand Index: 0.0
Jaccard Score: 0.27212020033388984
              precision    recall  f1-score   support

      Cammeo       0.43      1.00      0.60      1630
    Osmancik       0.00      0.00      0.00      2180

    accuracy                           0.43      3810


In [59]:
# NOTE(review): this cell does not run in its current form.
#   * `dataset_sizes` is not defined in any cell shown in this notebook; the
#     recorded output of this cell is a NameError for that name.
#   * Every call below passes the clustering callable as the second positional
#     argument, where benchmark_algorithm (defined earlier in this notebook)
#     expects `cluster_function_name`, so the arguments are shifted by one.
#   * benchmark_algorithm reads `.labels_` from the object returned by the
#     callable; confirm that the scipy / fastcluster / debacl callables return
#     such an object before re-enabling them -- TODO verify.
# Presumably carried over from a different benchmark_algorithm signature;
# either adapt the calls to the current one or remove the cell.
k_means = sklearn.cluster.KMeans(10)
k_means_data = benchmark_algorithm(dataset_sizes, k_means.fit, (), {})

dbscan = sklearn.cluster.DBSCAN(eps=1.25)
dbscan_data = benchmark_algorithm(dataset_sizes, dbscan.fit, (), {})

scipy_k_means_data = benchmark_algorithm(dataset_sizes,
                                         scipy.cluster.vq.kmeans, (10,), {})

scipy_single_data = benchmark_algorithm(dataset_sizes,
                                        scipy.cluster.hierarchy.single, (), {})

fastclust_data = benchmark_algorithm(dataset_sizes,
                                     fastcluster.linkage_vector, (), {})

hdbscan_ = hdbscan.HDBSCAN()
hdbscan_data = benchmark_algorithm(dataset_sizes, hdbscan_.fit, (), {})

debacl_data = benchmark_algorithm(dataset_sizes,
                                  debacl.geom_tree.geomTree, (5, 5), {'verbose':False})

agglomerative = sklearn.cluster.AgglomerativeClustering(10)
agg_data = benchmark_algorithm(dataset_sizes,
                               agglomerative.fit, (), {}, sample_size=4)

spectral = sklearn.cluster.SpectralClustering(10)
spectral_data = benchmark_algorithm(dataset_sizes,
                                    spectral.fit, (), {}, sample_size=6)

affinity_prop = sklearn.cluster.AffinityPropagation()
ap_data = benchmark_algorithm(dataset_sizes,
                              affinity_prop.fit, (), {}, sample_size=3)

NameError: name 'dataset_sizes' is not defined