<a href="https://colab.research.google.com/github/carolineoliveira994/clustering/blob/main/KMeans_mini_batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors



from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.datasets import make_moons
from sklearn.datasets import make_blobs

%matplotlib inline

# Notebook-wide plotting defaults: seaborn 'darkgrid' style, 'talk' context,
# and a 14x8 default figure size for every figure created below.
sns.set(style='darkgrid', context='talk', rc={'figure.figsize': (14, 8)})


In [None]:
# Synthetic dataset: 1000 two-dimensional points generated around 7 centres.
X, y = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=7,
    random_state=33,
)

# Coordinates plus the label returned by make_blobs, in a single frame.
df = pd.DataFrame(X, columns=['x', 'y']).assign(label=y)

df.head()

In [None]:
# Show the generated points, coloured by the label they were generated with.
_ = sns.scatterplot(
    data=df,
    x='x',
    y='y',
    hue='label',
    palette='rainbow',
    legend=False,
)

In [None]:
def train_and_plot(data, k=3, method='default', plot=True, verbose=0, random_state=133):
  """Fit a k-means model on ``data`` and optionally plot the resulting clusters.

  Parameters
  ----------
  data : pandas.DataFrame
      Input frame. A ``label`` column, if present, is dropped before fitting.
      The caller's frame is never modified. When ``plot`` is true the frame
      must contain ``x`` and ``y`` columns, which are used as plot axes.
  k : int, default 3
      Number of clusters, forwarded as ``n_clusters``.
  method : str, default 'default'
      ``'minibatch'`` selects ``MiniBatchKMeans``; any other value selects
      ``KMeans``.
  plot : bool, default True
      When true, draw the points coloured by assigned cluster and overlay the
      centroids as black markers. When false, nothing is drawn.
  verbose : int, default 0
      Forwarded to the estimator.
  random_state : int, default 133
      Forwarded to the estimator.

  Returns
  -------
  The fitted estimator (``KMeans`` or ``MiniBatchKMeans``).
  """
  # drop() returns a new frame, so the caller's data stays untouched;
  # errors='ignore' lets unlabeled frames through as well.
  features = data.drop(columns='label', errors='ignore')

  estimator_cls = MiniBatchKMeans if method == 'minibatch' else KMeans
  model = estimator_cls(n_clusters=k, random_state=random_state, verbose=verbose)
  model.fit(features)

  if plot:
    clustered = features.assign(cluster_labels=model.labels_)
    _ = sns.scatterplot(x='x', y='y', data=clustered, hue='cluster_labels', palette='rainbow', legend=False)

    # The centroids are an overlay on the scatter plot, so they are drawn only
    # when plotting was requested (previously plot=False still produced them).
    centroids = model.cluster_centers_
    _ = plt.scatter(centroids[:, 0], centroids[:, 1], color='black', s=100)

  return model


In [None]:
# Time the train_and_plot call. perf_counter() is a monotonic, high-resolution
# clock meant for measuring intervals, and the elapsed time is captured right
# after the call so the prints below are not part of the measurement.
start = time.perf_counter()
clustering = train_and_plot(df, k=7)
elapsed = time.perf_counter() - start
print('Inércia:', clustering.inertia_)
print('Tempo (s):', elapsed)


Mini Batch K-means

In [None]:
# Time the mini-batch run. perf_counter() is a monotonic, high-resolution
# clock meant for measuring intervals, and the elapsed time is captured right
# after the call so the prints below are not part of the measurement.
start = time.perf_counter()
clustering = train_and_plot(df, k=7, method='minibatch')
elapsed = time.perf_counter() - start
print('Inércia:', clustering.inertia_)
print('Tempo (s):', elapsed)

In [None]:
# 2x2 matrix applied to the generated coordinates.
transformation = [[0.3, -0.3], [-1.2, 0.2]]

X, y = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=7,
    random_state=170,
)

# Transformed coordinates plus the label returned by make_blobs.
df_transformed = pd.DataFrame(X.dot(transformation), columns=['x', 'y']).assign(label=y)

df_transformed.head()

In [None]:
# Show the transformed points, coloured by the label they were generated with.
_ = sns.scatterplot(
    data=df_transformed,
    x='x',
    y='y',
    hue='label',
    palette='rainbow',
    legend=False,
)

In [None]:
# Time the k-means run on the transformed data. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals, and the elapsed time is
# captured right after the call so the prints below are not measured.
start = time.perf_counter()
clustering = train_and_plot(df_transformed, k=7)
elapsed = time.perf_counter() - start
print('K-means')
print('Inércia:', clustering.inertia_)
print('Tempo (s):', elapsed)

In [None]:
# Time the mini-batch run on the transformed data. perf_counter() is a
# monotonic, high-resolution clock meant for measuring intervals, and the
# elapsed time is captured right after the call so the prints are not measured.
start = time.perf_counter()
clustering = train_and_plot(df_transformed, k=7, method='minibatch')
elapsed = time.perf_counter() - start
print('Mini Batch k-means')
print('Inércia:', clustering.inertia_)
print('Tempo (s):', elapsed)

In [None]:
# Same generator settings as the first dataset, scaled up to one million points.
X, y = make_blobs(
    n_samples=1_000_000,
    n_features=2,
    centers=7,
    random_state=33,
)

# Coordinates plus the label returned by make_blobs, in a single frame.
df = pd.DataFrame(X, columns=['x', 'y']).assign(label=y)

df.head()

In [None]:
# Time the k-means run with plot=False. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals, and the elapsed time is
# captured right after the call so the prints below are not measured.
start = time.perf_counter()
clustering = train_and_plot(df, k=7, plot=False)
elapsed = time.perf_counter() - start
print('K-means')
print('Inércia:', clustering.inertia_)
print('Tempo (s):', elapsed)

In [None]:
# Time the mini-batch run with plot=False. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals, and the elapsed time is
# captured right after the call so the prints below are not measured.
start = time.perf_counter()
clustering = train_and_plot(df, k=7, method='minibatch', plot=False)
elapsed = time.perf_counter() - start
print('Mini Batch k-means')
print('Inércia:', clustering.inertia_)
print('Tempo (s):', elapsed)

In [None]:
# Same generator settings as the first dataset, scaled up to ten million points.
X, y = make_blobs(
    n_samples=10_000_000,
    n_features=2,
    centers=7,
    random_state=33,
)

# Coordinates plus the label returned by make_blobs, in a single frame.
df = pd.DataFrame(X, columns=['x', 'y']).assign(label=y)

df.head()

In [None]:
# Time the mini-batch run with plot=False. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals, and the elapsed time is
# captured right after the call so the prints below are not measured.
start = time.perf_counter()
clustering = train_and_plot(df, k=7, method='minibatch', plot=False)
elapsed = time.perf_counter() - start
print('Mini Batch k-means')
print('Inércia:', clustering.inertia_)
print('Tempo (s):', elapsed)