# K Means Algo

In [1]:
#!pip install nbformat

In [2]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score, davies_bouldin_score
#import nbformat
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import joblib


In [None]:
# Load the housing dataset straight from the course repository and display it.
df = pd.read_csv(
    'https://raw.githubusercontent.com/4GeeksAcademy/k-means-project-tutorial/main/housing.csv'
)
df

In [None]:
# Keep only the three columns that are used as clustering features.
feature_columns = ['Latitude', 'Longitude', 'MedInc']
X = df[feature_columns]
X.head()


In [5]:
# Split the features 80/20 with a fixed seed.
# Each part is copied explicitly because later cells add a 'cluster' column to it:
# working on independent frames keeps those assignments from raising pandas'
# SettingWithCopyWarning and from depending on view/copy semantics of the split.
X_train, X_test = (part.copy() for part in tts(X, test_size=0.2, random_state=42))

### Paso 2: Construye un K-Means 
Clasifica los datos en 6 clusters utilizando, para ello, el modelo K-Means. 

In [6]:
# Fit a 6-cluster K-Means on the training features and label every training row.
# random_state is pinned to the seed used everywhere else in this notebook: K-Means
# initialises its centroids randomly, so without it the cluster ids (and every plot
# and count derived from them) change on each run.
model = KMeans(n_clusters=6, random_state=42)
model.fit(X_train)
y_train = model.predict(X_train)

A continuación, almacena el cluster al que pertenece cada casa como una columna nueva del dataset. Podrías llamarla cluster. 

In [None]:
# Store the K-Means label of each training row in a new 'cluster' column.
X_train['cluster'] = y_train
# Show how many rows were assigned to each cluster.
X_train['cluster'].value_counts()

Para introducirla a tu conjunto de datos quizá tengas que categorizarla. Observa qué formato y valores tiene y actúa en consecuencia. Grafícala en un diagrama de puntos y describe lo que ves.

In [None]:
# Inspect the dtype of the new column to decide whether it has to be recoded.
X_train['cluster'].dtype

In [None]:
# Readable names for the six numeric K-Means ids (0 -> 'Cluster A', ..., 5 -> 'Cluster F').
cluster_names = ['Cluster A', 'Cluster B', 'Cluster C',
                 'Cluster D', 'Cluster E', 'Cluster F']
clusters = dict(zip(range(6), cluster_names))

# Swap the numeric ids in the column for their names, then recount the rows per cluster.
named_column = X_train['cluster'].map(clusters)
X_train['cluster'] = named_column
X_train['cluster'].value_counts()

In [None]:
# One scatter panel per pair of features, coloured by cluster.
fig, axis = plt.subplots(1, 3, figsize = (15, 5))

feature_pairs = [("Latitude", "Longitude"), ("Latitude", "MedInc"), ("Longitude", "MedInc")]
for panel, (x_col, y_col) in zip(axis, feature_pairs):
    sns.scatterplot(ax = panel, data = X_train, x = x_col, y = y_col, hue = 'cluster', palette = "deep")

plt.tight_layout()
plt.show()

In [None]:
# Now in 3D: the same clusters over the three features, marker size given by |MedInc|.
camera = {
    "up": {"x": 1, "y": 3.5, "z": 0},
    "eye": {"x": 2, "y": 0, "z": 0},
}
marker_sizes = X_train["MedInc"].abs()

fig = px.scatter_3d(X_train, x = "Latitude", y = "Longitude", z = "MedInc", color = "cluster",
                    size = marker_sizes, width = 1000, height = 500)
fig.update_layout(scene_camera = camera)
fig.show()

In [None]:
# Check whether the 6 clusters requested by the exercise are actually a good choice.
# The 'cluster' label column is dropped so K-Means only ever sees the three features.
X_train_noclus = X_train.drop('cluster', axis=1)

# First with the Elbow method.
# The estimator is seeded like the Silhouette and Davies-Bouldin cells below, so the
# elbow curve is reproducible and comparable with those two diagnostics.
elbow_viz = KElbowVisualizer(KMeans(random_state=42), k=(1,20))
elbow_viz.fit(X_train_noclus)
elbow_viz.show()

In [None]:
# Now with the Silhouette method: score one seeded K-Means fit for every k from 2 to 20.
k_values = range(2, 21)
silhouette_scores = [
    silhouette_score(
        X_train_noclus,
        KMeans(n_clusters=n_k,random_state=42).fit_predict(X_train_noclus),
    )
    for n_k in k_values
]
silhouette_scores

In [None]:
# Silhouette score as a function of the number of clusters.
plt.plot(k_values, silhouette_scores)
plt.gca().set(
    xlabel='Número de clústeres (k)',
    ylabel='Silhouette Score',
    title='Método Silhouette',
)
plt.show()

In [15]:
# Now with Davies-Bouldin: one seeded K-Means fit per k in the same 2..20 range.
k_values = range(2, 21)
davies_scores = [
    davies_bouldin_score(
        X_train_noclus,
        KMeans(n_clusters=n_k, random_state=42).fit_predict(X_train_noclus),
    )
    for n_k in k_values
]

In [None]:
# Davies-Bouldin index as a function of the number of clusters.
plt.plot(k_values, davies_scores)
plt.gca().set(
    title='Índice Davies-Bouldin para Diferentes Números de Clústeres',
    xlabel='Número de Clústeres',
    ylabel='Índice Davies-Bouldin',
)
plt.show()

We can assume that the optimal number of clusters for classifying our dataset is 11.

In [17]:
# Refit K-Means with the 11 clusters selected above.
# new_x_train must be an independent copy: later cells add 'cluster' and 'source'
# columns to it, and a plain alias would add them to X_train_noclus as well, so
# re-running the Elbow / Silhouette / Davies-Bouldin cells (or this one) would feed
# those non-feature columns to K-Means.
new_x_train = X_train_noclus.copy()
new_model = KMeans(n_clusters=11, random_state=42)
new_model.fit(new_x_train)
new_y_train = new_model.predict(new_x_train)

In [None]:
# Attach the 11-cluster labels to the training frame.
new_x_train['cluster'] = new_y_train

# Report how many distinct labels the fitted model produced, then the rows per cluster.
n_found = len(set(new_model.labels_))
print("Número de clústeres encontrados:", n_found)
new_x_train['cluster'].value_counts()

In [None]:
# Readable names for the eleven numeric ids (0 -> 'Cluster A', ..., 10 -> 'Cluster K').
new_cluster_names = ['Cluster A', 'Cluster B', 'Cluster C', 'Cluster D', 'Cluster E', 'Cluster F',
                     'Cluster G', 'Cluster H', 'Cluster I', 'Cluster J', 'Cluster K']
new_clusters = dict(enumerate(new_cluster_names))

# Swap the numeric ids in the column for their names, then recount the rows per cluster.
named_column = new_x_train['cluster'].map(new_clusters)
new_x_train['cluster'] = named_column
new_x_train['cluster'].value_counts()

In [None]:
# One scatter panel per pair of features, coloured by the 11-cluster labels.
fig, axis = plt.subplots(1, 3, figsize = (15, 5))

feature_pairs = [("Latitude", "Longitude"), ("Latitude", "MedInc"), ("Longitude", "MedInc")]
for panel, (x_col, y_col) in zip(axis, feature_pairs):
    sns.scatterplot(ax = panel, data = new_x_train, x = x_col, y = y_col, hue = 'cluster', palette = "deep")

plt.tight_layout()
plt.show()

In [None]:
# Now in 3D: the 11 clusters over the three features, marker size given by |MedInc|.
camera = {
    "up": {"x": 1, "y": 3.5, "z": 0},
    "eye": {"x": 2, "y": 0, "z": 0},
}
marker_sizes = new_x_train["MedInc"].abs()

fig = px.scatter_3d(new_x_train, x = "Latitude", y = "Longitude", z = "MedInc", color = "cluster",
                    size = marker_sizes, width = 1000, height = 500)
fig.update_layout(scene_camera = camera)
fig.show()

### Paso 3: Predice con el conjunto de test
Ahora utiliza el modelo entrenado con el conjunto test y añade los puntos al gráfico anterior para confirmar que la predicción es satisfactoria o no.

In [None]:
# Predict the cluster of every test row with the 11-cluster model.
# Only the three columns the model was fitted on are passed to predict(), so this cell
# still works when it is re-run after 'cluster' / 'source' have been added to X_test.
cluster_features = ['Latitude', 'Longitude', 'MedInc']
y_test = list(new_model.predict(X_test[cluster_features]))

# assign() returns a new frame, which avoids setting a column on a frame derived from
# the train/test split (the situation that raises SettingWithCopyWarning).
X_test = X_test.assign(cluster=y_test)
X_test.head()

In [None]:
# Replace the numeric cluster ids of the test rows with the names defined in new_clusters.
X_test['cluster'] = X_test['cluster'].map(new_clusters)
# Display the resulting column.
X_test['cluster']

In [None]:
# Tag every row with the split it came from, then stack both splits into a single frame.
new_x_train['source'] = 'X_train'
X_test['source'] = 'X_test'

split_frames = [new_x_train, X_test]
consolidated_data = pd.concat(split_frames, axis=0)
consolidated_data

In [None]:
# Build one colour per cluster name and bind them in a dict that is passed to every panel.
palette = sns.color_palette("deep", n_colors=len(new_clusters))
cluster_palette = {name: colour for name, colour in zip(new_clusters.values(), palette)}

fig, axis = plt.subplots(1, 3, figsize = (15, 5))

# This time the 'source' column drives the marker style (train vs. test rows).
feature_pairs = [("Latitude", "Longitude"), ("Latitude", "MedInc"), ("Longitude", "MedInc")]
for panel, (x_col, y_col) in zip(axis, feature_pairs):
    sns.scatterplot(ax = panel, data = consolidated_data, x = x_col, y = y_col,
                    hue = 'cluster', style = 'source', palette = cluster_palette)

plt.tight_layout()
plt.show()

In [None]:
# Now in 3D; this time the 'source' column drives the marker symbol.
camera = {
    "up": {"x": 1, "y": 3.5, "z": 0},
    "eye": {"x": 2, "y": 0, "z": 0},
}
marker_sizes = consolidated_data["MedInc"].abs()

fig = px.scatter_3d(consolidated_data, x = "Latitude", y = "Longitude", z = "MedInc",
                    color = "cluster", symbol='source', size = marker_sizes,
                    width = 1000, height = 500)
fig.update_layout(scene_camera = camera)
fig.show()

### Paso 4: Entrena un modelo de clasificación supervisada
Ahora que el K-Means nos ha devuelto una categorización (agrupación) de los puntos para los conjuntos de entrenamiento y prueba, estudia qué modelo podría ser más útil y entrénalo. Obtén las estadísticas y describe lo que ves.
Este flujo es muy común cuando contamos con datos no etiquetados: utilizar un modelo de aprendizaje no supervisado para etiquetarlos de forma automática y, a continuación, un modelo de aprendizaje supervisado.


In [32]:
# XGBoost is the boosting algorithm chosen for the supervised step.
# Three configurations are created so their effectiveness can be compared.
model1 = XGBClassifier(
    max_depth=10,
    min_child_weight=20,
    learning_rate=0.01,
    random_state=42,
)
model2 = XGBClassifier(
    max_depth=20,
    min_child_weight=10,
    learning_rate=0.02,
    random_state=42,
)
model3 = XGBClassifier(
    max_depth=30,
    min_child_weight=5,
    learning_rate=0.025,
    random_state=42,
)




In [43]:
# This was needed because XGBClassifier could not be used with the categorical labels:
# invert the name map and turn the cluster names back into their numeric ids.
inverse_clusters = dict(zip(new_clusters.values(), new_clusters.keys()))
numeric_labels = consolidated_data['cluster'].map(inverse_clusters)
consolidated_data['cluster'] = numeric_labels

In [45]:
# Target: the numeric cluster id. Features: every column except the target and the split tag.
y = consolidated_data['cluster']
X = consolidated_data.drop(columns=['cluster', 'source'])

In [48]:
# Split the consolidated features and labels 80/20 (fixed seed) for the supervised models.
X_train_1, X_test_1, y_train_1, y_test_1 = tts(X, y, test_size=0.2, random_state=42)

In [None]:
# Train the three candidate classifiers on the same training split.
for classifier in (model1, model2, model3):
    classifier.fit(X_train_1,y_train_1)

# Display the last fitted model as the cell output.
model3

In [50]:
# Predict on the training split and on the held-out split with each of the three models.
y_train_pred1, y_test_pred1 = (model1.predict(split) for split in (X_train_1, X_test_1))
y_train_pred2, y_test_pred2 = (model2.predict(split) for split in (X_train_1, X_test_1))
y_train_pred3, y_test_pred3 = (model3.predict(split) for split in (X_train_1, X_test_1))

In [None]:
# Accuracy of each model on the held-out split.
acc_model1 = accuracy_score(y_test_1, y_test_pred1)
acc_model2 = accuracy_score(y_test_1, y_test_pred2)
acc_model3 = accuracy_score(y_test_1, y_test_pred3)

print(f'Accuracy_score for Model1: {acc_model1}')
print(f'Accuracy_score for Model2: {acc_model2}')
print(f'Accuracy_score for Model3: {acc_model3}')

There is no significant difference in accuracy between the three boosting models evaluated, so we will save the one of intermediate complexity (Model 2).

In [None]:
# Persist the final 11-cluster K-Means model and the selected classifier (Model 2).
# The notebook only imports the `joblib` module, so the bare name `dump` was undefined
# and this cell raised NameError; the call now goes through joblib.dump.
# Passing the path (instead of an open(..., "wb") handle) lets joblib open and close
# the file itself, so no file handle is left unclosed.
# NOTE(review): the absolute /workspaces/... paths are kept as they were; the target
# directory must already exist on the machine running the notebook.
joblib.dump(new_model, "/workspaces/K-means-Project/models/k-means_new_model_KMeans(n_clusters=11, random_state=42).sav")
joblib.dump(model2, "/workspaces/K-means-Project/models/boosting_XGBClassifier(max_depth=20, min_child_weight=10, learning_rate=0.02, random_state=42).sav")