<a href="https://colab.research.google.com/github/carolineoliveira994/100-days-of-code-python/blob/main/evaluation_external.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Avaliação de Agrupamento de Dados

* Utilização do algoritmo K-means com diferentes parâmetros.
* Bases de dados sintéticas.
* Comparações das diferentes execuções usando índice de avaliação externo.

In [None]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors

from sklearn.cluster import KMeans
from sklearn.datasets import make_moons
from sklearn.datasets import make_blobs

from sklearn.metrics import adjusted_rand_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Global seaborn theme: dark grid, 'talk' context and a 14x8 default figure size.
sns.set(style='darkgrid', context='talk', rc={'figure.figsize': (14, 8)})

In [None]:
# Synthetic ground truth: 1000 two-dimensional points drawn around 7 centers.
X, y = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=7,
    random_state=0,
)

# Keep the features and the true blob id side by side in one frame.
df = pd.DataFrame(X, columns=['x', 'y']).assign(label=y)

df.head()

In [None]:
# Ground-truth view: every point colored by its generated blob label.
_ = sns.scatterplot(
    data=df,
    x='x',
    y='y',
    hue='label',
    palette='rainbow',
    legend=False,
)

In [None]:
def train_and_plot(data, k=3, init='k-means++', n_init=10, max_iter=300,
                   algorithm='auto', verbose=0, random_state=133):
  """Fit K-means on the feature columns of ``data`` and plot the result.

  The scatter plot shows the points colored by assigned cluster, with the
  fitted centroids drawn on top as black dots.

  Args:
    data: DataFrame with feature columns 'x' and 'y'. An optional 'label'
      column (ground truth) is excluded from training. The frame is not
      modified.
    k: number of clusters passed to ``KMeans`` as ``n_clusters``.
    init: initialization method forwarded to ``KMeans``.
    n_init: number of initializations forwarded to ``KMeans``.
    max_iter: maximum iterations per run forwarded to ``KMeans``.
    algorithm: K-means variant. 'auto' means "use the library default" and
      is not forwarded; any other value is passed to ``KMeans`` unchanged.
    verbose: verbosity level forwarded to ``KMeans``.
    random_state: seed forwarded to ``KMeans``.

  Returns:
    Tuple ``(model, labels)``: the fitted ``KMeans`` instance and its
    ``labels_`` array.
  """
  # Train on the features only. drop() returns a new frame, so the caller's
  # data is untouched; errors='ignore' lets unlabeled frames through.
  features = data.drop(columns='label', errors='ignore')

  kmeans_kwargs = dict(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter,
                       random_state=random_state, verbose=verbose)
  # algorithm='auto' was deprecated in scikit-learn 1.1 and is rejected by
  # later releases. Leaving the argument out selects the library default on
  # every version, which is what 'auto' meant.
  if algorithm != 'auto':
    kmeans_kwargs['algorithm'] = algorithm

  model = KMeans(**kmeans_kwargs)
  model.fit(features)

  # Color each point by the cluster it was assigned to.
  clustered = features.assign(cluster_labels=model.labels_)
  _ = sns.scatterplot(x='x', y='y', data=clustered, hue='cluster_labels', palette='rainbow', legend=False)

  # Overlay the centroids; columns 0 and 1 follow the feature column order.
  centroids = model.cluster_centers_
  _ = plt.scatter(centroids[:, 0], centroids[:, 1], color='black', s=100)

  return model, model.labels_

In [None]:
# Baseline run: default k-means++ initialization with k=7.
_, clusters_labels = train_and_plot(data=df, k=7)

#### Adjusted Rand Index

In [None]:
# Same call as the previous code cell; presumably repeated so that the plot
# and the labels scored below sit under this section.
_, clusters_labels = train_and_plot(data=df, k=7)

In [None]:
# External validation: compare the cluster assignment against the true labels.
print(
    'Adjusted Rand Index',
    round(adjusted_rand_score(labels_true=y, labels_pred=clusters_labels), 4),
)

Com inicialização aleatória, em que os grupos resultantes são menos similares ao ground truth.

In [None]:
# Random centroid initialization instead of k-means++, still with k=7.
_, clusters_labels = train_and_plot(
    data=df,
    k=7,
    init='random',
    random_state=100,
)

In [None]:
# Score the randomly initialized run against the true labels.
print(
    'Adjusted Rand Index',
    round(adjusted_rand_score(labels_true=y, labels_pred=clusters_labels), 4),
)

In [None]:
# Random initialization again, this time with k=6 and a different seed.
_, clusters_labels = train_and_plot(
    data=df,
    k=6,
    init='random',
    random_state=400,
)

In [None]:
# Score the k=6 random-initialization run against the true labels.
print(
    'Adjusted Rand Index',
    round(adjusted_rand_score(labels_true=y, labels_pred=clusters_labels), 4),
)

Com número de grupos incorreto, onde a quantidade de grupos resultantes é menor do que a quantidade de grupos real.

In [None]:
# Too few clusters: k=3.
_, clusters_labels = train_and_plot(data=df, k=3)

In [None]:
# Score the under-clustered (k=3) run against the true labels.
print(
    'Adjusted Rand Index',
    round(adjusted_rand_score(labels_true=y, labels_pred=clusters_labels), 4),
)

Com número de grupos incorreto, onde a quantidade de grupos resultantes é maior do que a quantidade de grupos real.

In [None]:
# Too many clusters: k=30.
_, clusters_labels = train_and_plot(data=df, k=30)

In [None]:
# Score the over-clustered (k=30) run against the true labels.
print(
    'Adjusted Rand Index',
    round(adjusted_rand_score(labels_true=y, labels_pred=clusters_labels), 4),
)

#### Testes de plot com o índice
Pode ser muito útil ver um gráfico do Adjusted Rand Index variando de acordo com o número de clusters.

In [None]:
%%time

def do_ajusted_random_index_and_plot(data, max_clusters):
    """Plot the Adjusted Rand Index of K-means for k = 1..max_clusters.

    One ``KMeans`` model (fixed ``random_state=33``) is fitted per value of k
    on the feature columns of ``data``. Each clustering is scored against the
    frame's own 'label' column, and the scores are drawn as a line plot.

    Args:
        data: DataFrame holding the feature columns plus a 'label' column
            with the ground-truth group of every row. Not modified.
        max_clusters: largest number of clusters to evaluate (inclusive).

    Raises:
        KeyError: if ``data`` has no 'label' column.
    """
    min_clusters = 1
    clusters_range = range(min_clusters, max_clusters + 1)

    # Read the ground truth from the frame itself rather than from a
    # notebook-level variable. That keeps the score tied to the data being
    # clustered even when other cells rebind globals.
    true_labels = data['label']
    features = data.drop(columns='label')

    aris = []
    for k in clusters_range:
        model = KMeans(n_clusters=k, random_state=33)
        model.fit(features)

        # ARI = Adjusted Rand Index
        aris.append(adjusted_rand_score(true_labels, model.labels_))

    _ = plt.plot(clusters_range, aris, '-o')
    _ = plt.title('Ajusted Random Index x Número de Clusters')
    _ = plt.xlabel('Número de clusters')
    _ = plt.ylabel('Ajusted Random Index')
    _ = plt.xticks(clusters_range)
    plt.show()

In [None]:
# Sweep k from 1 to 10.
do_ajusted_random_index_and_plot(data=df, max_clusters=10)

In [None]:
# Sweep k up to the integer part of sqrt(number of rows).
do_ajusted_random_index_and_plot(
    data=df,
    max_clusters=int(np.sqrt(df.shape[0])),
)

### Execução com base de dados de 1M de registros

In [None]:
# Same blob generator at scale: one million 2-D points around 7 centers.
X, y = make_blobs(
    n_samples=1_000_000,
    n_features=2,
    centers=7,
    random_state=33,
)

# This frame holds only the features; the labels stay in `y`.
df = pd.DataFrame(data=X, columns=['x', 'y'])

df.head()

In [None]:
# Time a single K-means fit on the large dataset.
# 'lloyd' is the current name of the classic algorithm formerly selected with
# algorithm='full'. The old name was deprecated in scikit-learn 1.1 and is
# rejected by later releases. 'lloyd' requires scikit-learn >= 1.1.
# perf_counter() is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
model = KMeans(n_clusters=7, algorithm='lloyd', random_state=33)
model.fit(df)
print(time.perf_counter() - start)

In [None]:
# Score the large-scale clustering against the generated labels.
print(
    'Ajusted Random Index:',
    round(adjusted_rand_score(labels_true=y, labels_pred=model.labels_), 4),
)

Resultados da avaliação em cenários nos quais o K-means tem limitações, como grupos alongados por uma transformação linear dos dados.

In [None]:
# 2x2 matrix applied to every point below to distort the generated blobs.
transformation = [[0.3, -0.3], [-1.2, 0.2]]

X, y = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=7,
    random_state=170,
)

# Apply the linear map to the points and attach the true labels.
X_transformed = np.dot(X, transformation)
df_transformed = pd.DataFrame(X_transformed, columns=['x', 'y'])
df_transformed['label'] = y

df_transformed.head()

In [None]:
# Ground-truth view of the transformed points, colored by label.
_ = sns.scatterplot(
    data=df_transformed,
    x='x',
    y='y',
    hue='label',
    palette='rainbow',
    legend=False,
)

In [None]:
# K-means with k=7 on the transformed data.
_, clusters_labels = train_and_plot(data=df_transformed, k=7)

In [None]:
# Score the clustering of the transformed data against the true labels.
print(
    'Adjusted Rand Index',
    round(adjusted_rand_score(labels_true=y, labels_pred=clusters_labels), 4),
)

### Avaliando em bases de dados com formatos diferentes de grupos

In [None]:
# Two-moons dataset: 300 points with a small amount of noise.
X, y = make_moons(n_samples=300, noise=0.05, random_state=33)

df_moon = pd.DataFrame(X, columns=['x', 'y']).assign(label=y)

df_moon.head()

In [None]:
# Ground-truth view of the moons data, colored by label.
_ = sns.scatterplot(
    data=df_moon,
    x='x',
    y='y',
    hue='label',
    palette='rainbow',
    legend=False,
)

In [None]:
# K-means with k=2 on the moons data.
_, clusters_labels = train_and_plot(data=df_moon, k=2)

In [None]:
# Score the moons clustering against the true labels.
print(
    'Adjusted Rand Index',
    round(adjusted_rand_score(labels_true=y, labels_pred=clusters_labels), 4),
)