In [5]:
# -*- coding: utf-8 -*-
"""
K-Means con scikit-learn para Mall_Customers.csv
- Carga y limpieza
- (Opcional) eliminación de variables poco útiles
- Selección de k (elbow + silhouette)
- Ajuste de KMeans y exportación de centros en escala original
"""

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# =============== 1) Configuración ===============
DATA_PATH = Path("/Users/victorianaomivilchismontalvo/Documents/Analitica/Mall_Customers.csv")
RANDOM_STATE = 42
DROP_GENDER = True
# Justificación: En el análisis previo, "Gender" no mostró correlaciones fuertes ni aportó
# mejoras evidentes al clustering numérico; por simplicidad del k-means (distancias euclidianas),
# usamos sólo variables numéricas continuas: Age, Annual_Income_k, Spending_Score.

# =============== 2) Utilidades ===============
def normalizar_columnas(df: pd.DataFrame) -> pd.DataFrame:
    cols = (
        df.columns.astype(str)
        .str.strip()
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"[^0-9A-Za-z_]+", "", regex=True)
    )
    df.columns = cols
    return df

def cargar_mall_customers(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"No se encontró el archivo: {path}")
    df = pd.read_csv(path)
    df = normalizar_columnas(df)
    # Renombrar columnas típicas
    rename_map = {}
    for c in df.columns:
        cl = c.lower()
        if cl in ["annual_income_k", "annual_income_ks", "annual_income_k"]:
            rename_map[c] = "Annual_Income_k"
        if cl in ["spending_score_110", "spending_score_1100", "spending_score"]:
            rename_map[c] = "Spending_Score"
        if cl == "customerid": rename_map[c] = "CustomerID"
        if cl == "gender":     rename_map[c] = "Gender"
        if cl == "age":        rename_map[c] = "Age"
    if rename_map:
        df = df.rename(columns=rename_map)

    # Eliminar identificadores
    for idcol in ["CustomerID", "Unnamed_0", "Unnamed_0_1", "id", "ID"]:
        if idcol in df.columns:
            df = df.drop(columns=[idcol])

    # Codificar Gender si existe
    if "Gender" in df.columns:
        df["Gender"] = (
            df["Gender"]
            .astype(str).str.strip().str.lower()
            .map({"male": 1, "m": 1, "female": 0, "f": 0})
        )
    return df

# =============== 3) Carga y selección de variables ===============
df = cargar_mall_customers(DATA_PATH)

# Variables candidatas
features = ["Age", "Annual_Income_k", "Spending_Score"]
if not DROP_GENDER and "Gender" in df.columns:
    features = ["Gender"] + features  # si decides incluir género

X = df[features].dropna().copy()

# Escalado (k-means se basa en distancias; escalamos para no sesgar por rangos distintos)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.values)

# =============== 4) Determinar k (Elbow + Silhouette) ===============
ks = range(2, 11)  # probamos de 2 a 10 clusters
inertias = []
silhouettes = []

for k in ks:
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
    labels = km.fit_predict(X_scaled)
    inertias.append(km.inertia_)
    sil = silhouette_score(X_scaled, labels)
    silhouettes.append(sil)

# Regla simple: elegir el k con mayor silhouette (puedes ajustar a criterio/“codo”)
k_opt = ks[int(np.argmax(silhouettes))]
print(f"→ k óptimo por silhouette: {k_opt}")
print("Inertia por k:", dict(zip(ks, np.round(inertias, 2))))
print("Silhouette por k:", dict(zip(ks, np.round(silhouettes, 3))))

# =============== 5) Ajustar K-Means final y exportar centros ===============
kmeans = KMeans(n_clusters=k_opt, random_state=RANDOM_STATE, n_init=10)
labels = kmeans.fit_predict(X_scaled)

# Centros en escala estandarizada:
centers_scaled = kmeans.cluster_centers_
# Convertir centros a escala original (muy importante para interpretarlos):
centers_original = scaler.inverse_transform(centers_scaled)
centers_df = pd.DataFrame(centers_original, columns=features)
centers_df.index.name = "cluster"

# Conteo de elementos por cluster
counts = pd.Series(labels).value_counts().sort_index()
counts.name = "n"

# Adjuntar conteos a centros
resumen_centros = centers_df.copy()
resumen_centros.insert(0, "n", counts.values)

print("\n=== RESUMEN DE CENTROS (escala original) ===")
print(resumen_centros.round(2))

# =============== 6) Guardar resultados ===============
# Asignaciones por fila (opcional, útil para análisis/visualización)
asignaciones = X.copy()
asignaciones["cluster"] = labels
asignaciones.to_csv("kmeans_labels.csv", index=False, encoding="utf-8")

# Centros
resumen_centros.round(4).to_csv("kmeans_centers.csv", index=True, encoding="utf-8")

print("\nArchivos guardados:")
print(" - kmeans_centers.csv  (centros en escala original)")
print(" - kmeans_labels.csv   (asignación de cada fila a un cluster)")


→ k óptimo por silhouette: 6
Inertia por k: {2: np.float64(389.39), 3: np.float64(295.21), 4: np.float64(205.23), 5: np.float64(168.25), 6: np.float64(133.87), 7: np.float64(117.01), 8: np.float64(103.87), 9: np.float64(93.09), 10: np.float64(82.39)}
Silhouette por k: {2: np.float64(0.335), 3: np.float64(0.358), 4: np.float64(0.404), 5: np.float64(0.417), 6: np.float64(0.428), 7: np.float64(0.417), 8: np.float64(0.408), 9: np.float64(0.418), 10: np.float64(0.407)}

=== RESUMEN DE CENTROS (escala original) ===
          n    Age  Annual_Income_k  Spending_Score
cluster                                            
0        45  56.33            54.27           49.07
1        39  26.79            57.10           48.13
2        33  41.94            88.94           16.97
3        39  32.69            86.54           82.13
4        23  25.00            25.26           77.61
5        21  45.52            26.29           19.38


OSError: [Errno 30] Read-only file system: 'kmeans_labels.csv'

In [6]:
# -*- coding: utf-8 -*-
from pathlib import Path
import tempfile
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ================== 1) Rutas ==================
DATA_PATH = Path("/Users/victorianaomivilchismontalvo/Documents/Analitica/Mall_Customers.csv")
RANDOM_STATE = 42
DROP_GENDER = True

# Elegir una carpeta escribible para los outputs
def carpeta_escribible(candidatos):
    for p in candidatos:
        try:
            p.mkdir(parents=True, exist_ok=True)
            t = p/".perm_test"
            t.write_text("ok")
            t.unlink()
            return p
        except Exception:
            continue
    raise OSError("No encontré carpeta escribible")

OUTPUT_DIR = carpeta_escribible([
    Path("/Users/victorianaomivilchismontalvo/Documents/Analitica"),
    Path("/Users/victorianaomivilchismontalvo/Downloads"),
    Path(tempfile.gettempdir()),
])
print(f"📂 Guardaré resultados en: {OUTPUT_DIR}")

# ================== 2) Carga/Limpieza ==================
def normalizar_columnas(df):
    df.columns = (df.columns.astype(str).str.strip()
                  .str.replace(r"\s+","_",regex=True)
                  .str.replace(r"[^0-9A-Za-z_]+","",regex=True))
    return df

def cargar_mall_customers(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"No se encontró el archivo: {path}")
    df = pd.read_csv(path)
    df = normalizar_columnas(df)
    # Renombres típicos
    rename = {}
    for c in df.columns:
        lc = c.lower()
        if lc in ["annual_income_k","annualincome","annual_income_ks"]:
            rename[c] = "Annual_Income_k"
        if "spending" in lc:
            rename[c] = "Spending_Score"
        if lc == "customerid": rename[c] = "CustomerID"
        if lc == "gender":     rename[c] = "Gender"
        if lc == "age":        rename[c] = "Age"
    df = df.rename(columns=rename)
    # Quitar IDs
    for col in ["CustomerID","Unnamed_0","id","ID"]:
        if col in df.columns:
            df = df.drop(columns=col)
    # Codificar género
    if "Gender" in df.columns:
        df["Gender"] = (df["Gender"].astype(str).str.strip().str.lower()
                        .map({"male":1,"m":1,"female":0,"f":0}))
    return df

df = cargar_mall_customers(DATA_PATH)

# ================== 3) Variables y escalado ==================
features = ["Age","Annual_Income_k","Spending_Score"]
if (not DROP_GENDER) and ("Gender" in df.columns):
    features = ["Gender"] + features

X = df[features].dropna().copy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.values)

# ================== 4) Elegir k (Elbow + Silhouette) ==================
ks = range(2, 11)
inertias, silhouettes = [], []
for k in ks:
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
    labels_tmp = km.fit_predict(X_scaled)
    inertias.append(km.inertia_)
    silhouettes.append(silhouette_score(X_scaled, labels_tmp))

k_opt = ks[int(np.argmax(silhouettes))]
print(f"✅ k óptimo por silhouette: {k_opt}")
print("Inertia:", dict(zip(ks, np.round(inertias,2))))
print("Silhouette:", dict(zip(ks, np.round(silhouettes,3))))

# ================== 5) K-Means final ==================
kmeans = KMeans(n_clusters=k_opt, random_state=RANDOM_STATE, n_init=10)
labels = kmeans.fit_predict(X_scaled)

# Centros en escala original
centers_original = scaler.inverse_transform(kmeans.cluster_centers_)
centers_df = pd.DataFrame(centers_original, columns=features)
centers_df.index.name = "Cluster"

# Tamaños por cluster
counts = pd.Series(labels).value_counts().sort_index()
counts.name = "n"
centers_df.insert(0, "n", counts.values)

print("\n=== CENTROS (escala original) ===")
print(centers_df.round(2))

# ================== 6) Guardar en OUTPUT_DIR ==================
(OUTPUT_DIR / "kmeans_labels.csv").write_text(
    X.assign(cluster=labels).to_csv(index=False), encoding="utf-8"
)
centers_df.round(4).to_csv(OUTPUT_DIR / "kmeans_centers.csv", index=True, encoding="utf-8")

print("\n✅ Archivos guardados en:")
print(f" - {OUTPUT_DIR/'kmeans_centers.csv'}")
print(f" - {OUTPUT_DIR/'kmeans_labels.csv'}")


📂 Guardaré resultados en: /Users/victorianaomivilchismontalvo/Documents/Analitica
✅ k óptimo por silhouette: 6
Inertia: {2: np.float64(389.39), 3: np.float64(295.21), 4: np.float64(205.23), 5: np.float64(168.25), 6: np.float64(133.87), 7: np.float64(117.01), 8: np.float64(103.87), 9: np.float64(93.09), 10: np.float64(82.39)}
Silhouette: {2: np.float64(0.335), 3: np.float64(0.358), 4: np.float64(0.404), 5: np.float64(0.417), 6: np.float64(0.428), 7: np.float64(0.417), 8: np.float64(0.408), 9: np.float64(0.418), 10: np.float64(0.407)}

=== CENTROS (escala original) ===
          n    Age  Annual_Income_k  Spending_Score
Cluster                                            
0        45  56.33            54.27           49.07
1        39  26.79            57.10           48.13
2        33  41.94            88.94           16.97
3        39  32.69            86.54           82.13
4        23  25.00            25.26           77.61
5        21  45.52            26.29           19.38

✅ Archivo