In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans
import io
import requests
import plotly as py
import plotly.graph_objs as go

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Download the Mall Customers CSV from the course repository and parse it.
# Local alternative: df = pd.read_csv('Mall_Customers.csv')
url = 'https://raw.githubusercontent.com/casasmgb/curso-ml/main/03-section/Mall_Customers.csv'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of handing an error page to the CSV parser.
response.raise_for_status()
data = response.content
df = pd.read_csv(io.StringIO(data.decode('utf-8')))

df.head()

In [None]:
# Print a concise summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics of the frame's columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Gráfica basada en la edad y la puntuación de gasto

In [None]:
# Scatter plot of age ('edad') against spending score ('puntuacion_gasto').
plt.figure(num=1, figsize=(15, 7))
plt.scatter(x='edad', y='puntuacion_gasto', data=df, s=100)
plt.title('Dispersión de Edad v/s Puntuacion de Gasto', fontsize=20)
plt.xlabel('Edad')
plt.ylabel('Puntuacion de Gasto')
plt.show()

### Identificación del valor óptimo de K

In [None]:
# Elbow method on (age, spending score): fit KMeans for k = 1..14 and
# record the inertia of every fit.
x1 = df[['edad', 'puntuacion_gasto']].values
inertia = []
for n in range(1, 15):
    # fit() returns the estimator itself, so the call can be chained.
    algorithm = KMeans(
        n_clusters=n,
        init='k-means++',
        n_init=10,
        max_iter=300,
        tol=0.0001,
        random_state=111,
        algorithm='elkan',
    ).fit(x1)
    inertia.append(algorithm.inertia_)

In [None]:
# Elbow curve: inertia as a function of the number of clusters.
# The x-axis is derived from len(inertia) so it always matches the number of
# fits actually performed, instead of repeating the loop bounds by hand.
k_values = np.arange(1, len(inertia) + 1)
plt.figure(1, figsize=(15, 6))
plt.plot(k_values, inertia, 'o')             # one marker per fitted k
plt.plot(k_values, inertia, '-', alpha=0.5)  # faint connecting line
plt.xlabel('Numero de clusters')
plt.ylabel('Inertia')
plt.show()

### Aplicamos KMeans para un valor de K=4

In [None]:
# Final model for (age, spending score) with K = 4.
algorithm = KMeans(
    n_clusters=4,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='elkan',
).fit(x1)  # fit() returns the fitted estimator
labels1 = algorithm.labels_
centroids1 = algorithm.cluster_centers_

In [None]:
# Build a grid covering the data range (padded by 1 on every side) and
# predict the cluster of each grid point.
h = 0.02  # grid step
x_min = x1[:, 0].min() - 1
x_max = x1[:, 0].max() + 1
y_min = x1[:, 1].min() - 1
y_max = x1[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)
# Flatten both coordinate grids into an (n_points, 2) array for predict().
pred = algorithm.predict(np.column_stack((xx.ravel(), yy.ravel())))

In [None]:
# Cluster regions of the K=4 model with the customers and centroids on top.
plt.figure(num=1, figsize=(15, 7))
plt.clf()
pred = pred.reshape(xx.shape)  # flat predictions -> grid shape for imshow
plt.imshow(
    pred,
    interpolation='nearest',
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Pastel2,
    aspect='auto',
    origin='lower',
)

# Customers coloured by cluster label, then the centroids in translucent red.
plt.scatter(x='edad', y='puntuacion_gasto', data=df, c=labels1, s=100)
plt.scatter(x=centroids1[:, 0], y=centroids1[:, 1], s=300, c='red', alpha=0.5)
plt.xlabel('Edad')
plt.ylabel('Puntuacion de Gasto')
plt.show()


### Gráfica basada en Ingresos Anuales y la Puntuación de Gasto

In [None]:

# Elbow method on (annual income, spending score): fit KMeans for
# k = 1..10 and record the inertia of every fit.
x2 = df[['ingreso_anual', 'puntuacion_gasto']].values
inertia = []
for n in range(1, 11):
    # fit() returns the estimator itself, so the call can be chained.
    algorithm = KMeans(
        n_clusters=n,
        init='k-means++',
        n_init=10,
        max_iter=300,
        tol=0.0001,
        random_state=111,
        algorithm='elkan',
    ).fit(x2)
    inertia.append(algorithm.inertia_)

In [None]:
# Elbow curve: inertia as a function of the number of clusters.
# The x-axis is derived from len(inertia) so it always matches the number of
# fits actually performed, instead of repeating the loop bounds by hand.
k_values = np.arange(1, len(inertia) + 1)
plt.figure(1, figsize=(15, 6))
plt.plot(k_values, inertia, 'o')             # one marker per fitted k
plt.plot(k_values, inertia, '-', alpha=0.5)  # faint connecting line
plt.xlabel('Numero de clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:

# Final model for (annual income, spending score) with K = 5.
algorithm = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='elkan',
).fit(x2)  # fit() returns the fitted estimator
labels2 = algorithm.labels_
centroids2 = algorithm.cluster_centers_


In [None]:
# Build a grid covering the data range (padded by 1 on every side) and
# predict the cluster of each grid point.
h = 0.02  # grid step
x_min = x2[:, 0].min() - 1
x_max = x2[:, 0].max() + 1
y_min = x2[:, 1].min() - 1
y_max = x2[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)
# Flatten both coordinate grids into an (n_points, 2) array for predict().
Z2 = algorithm.predict(np.column_stack((xx.ravel(), yy.ravel())))

In [None]:
# Cluster regions of the K=5 model with the customers and centroids on top.
plt.figure(num=1, figsize=(15, 7))
plt.clf()
Z2 = Z2.reshape(xx.shape)  # flat predictions -> grid shape for imshow
plt.imshow(
    Z2,
    interpolation='nearest',
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Pastel2,
    aspect='auto',
    origin='lower',
)

# Customers coloured by cluster label, then the centroids in translucent red.
plt.scatter(x='ingreso_anual', y='puntuacion_gasto', data=df, c=labels2, s=100)
plt.scatter(x=centroids2[:, 0], y=centroids2[:, 1], s=300, c='red', alpha=0.5)
plt.xlabel('Ingreso Anual')
plt.ylabel('Puntuacion de gasto')
plt.show()

### Gráfica basada en Edad, Ingresos Anuales y la Puntuación de Gasto

In [None]:
# Elbow method on (age, annual income, spending score): fit KMeans for
# k = 1..10 and record the inertia of every fit.
X3 = df[['edad', 'ingreso_anual', 'puntuacion_gasto']].values
inertia = []
for n in range(1, 11):
    # fit() returns the estimator itself, so the call can be chained.
    algorithm = KMeans(
        n_clusters=n,
        init='k-means++',
        n_init=10,
        max_iter=300,
        tol=0.0001,
        random_state=111,
        algorithm='elkan',
    ).fit(X3)
    inertia.append(algorithm.inertia_)

In [None]:
# Elbow curve: inertia as a function of the number of clusters.
# The x-axis is derived from len(inertia) so it always matches the number of
# fits actually performed, instead of repeating the loop bounds by hand.
k_values = np.arange(1, len(inertia) + 1)
plt.figure(1, figsize=(15, 6))
plt.plot(k_values, inertia, 'o')             # one marker per fitted k
plt.plot(k_values, inertia, '-', alpha=0.5)  # faint connecting line
plt.xlabel('Numero de clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Final model on (age, annual income, spending score) with K = 6.
algorithm = KMeans(n_clusters=6, init='k-means++', n_init=10, max_iter=300,
                   tol=0.0001, random_state=111, algorithm='elkan')
algorithm.fit(X3)
labels3 = algorithm.labels_
centroids3 = algorithm.cluster_centers_

# fit_predict() would refit the whole model only to return the same labels
# (random_state is fixed), so reuse the labels of the fit above.
y_kmeans = labels3
# Assign the array positionally. Wrapping it in a DataFrame makes pandas align
# on the index, which yields NaN when df does not have a default RangeIndex.
df['cluster'] = y_kmeans
df

In [None]:
# Interactive 3-D scatter of the customers, coloured by their cluster label.
trace1 = go.Scatter3d(
    x=df['edad'],
    y=df['puntuacion_gasto'],
    z=df['ingreso_anual'],
    mode='markers',
    marker={
        'color': df['cluster'],
        'size': 10,
        # The marker outline uses the same per-cluster colouring as the fill.
        'line': {'color': df['cluster'], 'width': 12},
        'opacity': 0.8,
    },
)
data = [trace1]

# Axis titles follow the x/y/z assignment of the trace above.
layout = go.Layout(
    title='Edad, Ingresos Anuales y la Puntuacion de Gasto',
    scene={
        'xaxis': {'title': 'Edad'},
        'yaxis': {'title': 'Puntuacion de Gasto'},
        'zaxis': {'title': 'Ingreso Anual'},
    },
)

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)