In [2]:
# -*- coding: utf-8 -*-


from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# =======================
# 1) Configuración
# =======================
# 📍 Ruta completa del archivo CSV (ajustada para tu Mac)
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# =======================
# 1) Configuración
# =======================
# 📍 Ruta completa del archivo CSV (ajustada para tu Mac)
DATA_PATH = Path("/Users/victorianaomivilchismontalvo/Documents/Analitica/Mall_Customers.csv")

# 🔹 Guardar resultados en la misma carpeta donde ejecutes el script
OUTPUT_DIR = Path(".")

print(f"Intentando cargar archivo desde: {DATA_PATH}")
if not DATA_PATH.exists():
    raise FileNotFoundError(f"No se encontró el archivo: {DATA_PATH}")

# Guarda resultados en la misma carpeta donde ejecutes el script
OUTPUT_DIR = Path(".")

# =======================
# 2) Funciones auxiliares
# =======================
def normalizar_columnas(df: pd.DataFrame) -> pd.DataFrame:
    cols = (
        df.columns.astype(str)
        .str.strip()
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"[^0-9A-Za-z_]+", "", regex=True)
    )
    df.columns = cols
    return df

def kmeans_simple(X: np.ndarray, k: int = 4, max_iter: int = 200, tol: float = 1e-4, random_state: int = 42):
    rng = np.random.default_rng(random_state)
    idx = rng.choice(len(X), size=k, replace=False)
    centroids = X[idx].copy()
    for _ in range(max_iter):
        dists = np.sqrt(((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2))
        labels = dists.argmin(axis=1)
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        for j in range(k):
            if np.isnan(new_centroids[j]).any():
                new_centroids[j] = X[rng.integers(0, len(X))]
        if np.sqrt(((centroids - new_centroids) ** 2).sum()) < tol:
            break
        centroids = new_centroids
    return labels, centroids

def guardar_boxplot(serie, titulo, ruta):
    plt.figure(figsize=(6,4))
    plt.boxplot(serie.dropna(), vert=True, labels=[serie.name])
    plt.title(titulo)
    plt.ylabel(serie.name)
    plt.tight_layout()
    plt.savefig(ruta, dpi=150)
    plt.close()

def guardar_histograma(serie, titulo, ruta, bins=20):
    plt.figure(figsize=(6,4))
    plt.hist(serie.dropna(), bins=bins)
    plt.title(titulo)
    plt.xlabel(serie.name)
    plt.ylabel("Frecuencia")
    plt.tight_layout()
    plt.savefig(ruta, dpi=150)
    plt.close()

def guardar_heatmap_corr(corr, ruta, titulo="Mapa de calor de correlaciones"):
    plt.figure(figsize=(6,5))
    im = plt.imshow(corr, interpolation="nearest")
    plt.title(titulo)
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=45, ha="right")
    plt.yticks(range(len(corr.index)), corr.index)
    plt.tight_layout()
    plt.savefig(ruta, dpi=150)
    plt.close()

# =======================
# 3) Carga y limpieza de datos
# =======================
if not DATA_PATH.exists():
    raise FileNotFoundError(f"No se encontró el archivo: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
df = normalizar_columnas(df)

# Renombrar columnas típicas
rename_map = {}
for c in df.columns:
    cl = c.lower()
    if cl in ["annual_income_k", "annual_income_ks", "annual_income_k"]:
        rename_map[c] = "Annual_Income_k"
    if cl in ["spending_score_110", "spending_score_1100", "spending_score"]:
        rename_map[c] = "Spending_Score"
    if cl == "customerid":
        rename_map[c] = "CustomerID"
    if cl == "gender":
        rename_map[c] = "Gender"
    if cl == "age":
        rename_map[c] = "Age"

if rename_map:
    df = df.rename(columns=rename_map)

# Convertir género a numérico (opcional)
if "Gender" in df.columns:
    df["Gender"] = (
        df["Gender"]
        .astype(str)
        .str.strip()
        .str.lower()
        .map({"male": 1, "m": 1, "female": 0, "f": 0})
    )

# Eliminar identificadores
for idcol in ["CustomerID", "Unnamed_0", "Unnamed_0_1", "id", "ID"]:
    if idcol in df.columns:
        df = df.drop(columns=[idcol])

# =======================
# 4) Análisis descriptivo
# =======================
num_cols = [c for c in df.columns if df[c].dtype.kind in "if"]
if not num_cols:
    raise ValueError("No se detectaron columnas numéricas para analizar.")

df[num_cols].describe().T.to_csv("resumen_descriptivo.csv")

for col in num_cols:
    guardar_boxplot(df[col], f"Diagrama de caja y bigotes: {col}", f"boxplot_{col}.png")
    guardar_histograma(df[col], f"Histograma: {col}", f"hist_{col}.png")

corr = df[num_cols].corr()
corr.to_csv("correlaciones.csv")
guardar_heatmap_corr(corr, "heatmap_correlaciones.png")

# =======================
# 5) Outliers y rangos
# =======================
outlier_data = {}
for col in num_cols:
    s = df[col].dropna()
    if len(s) < 5:
        continue
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    mask = (s < low) | (s > high)
    outlier_data[col] = {
        "conteo": int(mask.sum()),
        "porcentaje": round(100 * mask.sum() / len(s), 2),
        "min": float(s.min()),
        "q1": float(q1),
        "q3": float(q3),
        "max": float(s.max()),
    }
pd.DataFrame(outlier_data).T.to_csv("outliers_iqr.csv")

rangos = []
for col in num_cols:
    s = df[col].dropna()
    if len(s) == 0:
        continue
    rangos.append((col, float(s.min()), float(s.max()), float(s.max() - s.min())))
pd.DataFrame(rangos, columns=["Variable", "Mínimo", "Máximo", "Rango"]).to_csv("rangos.csv", index=False)

# =======================
# 6) Clustering simple
# =======================
if {"Annual_Income_k", "Spending_Score"}.issubset(df.columns):
    sub = df[["Annual_Income_k", "Spending_Score"]].dropna().to_numpy()
    if len(sub) >= 10:
        labels, centers = kmeans_simple(sub, k=4, max_iter=300)
        clusters = pd.DataFrame(sub, columns=["Annual_Income_k", "Spending_Score"])
        clusters["cluster"] = labels
        resumen = clusters.groupby("cluster").agg(
            n=("Annual_Income_k", "count"),
            ingreso_prom=("Annual_Income_k", "mean"),
            score_prom=("Spending_Score", "mean"),
        )
        resumen.to_csv("clusters_resumen.csv")

        plt.figure(figsize=(6,5))
        for j in sorted(np.unique(labels)):
            pts = clusters[clusters["cluster"] == j]
            plt.scatter(pts["Annual_Income_k"], pts["Spending_Score"], s=15, alpha=0.7, label=f"Cluster {j}")
        plt.scatter(centers[:,0], centers[:,1], s=120, marker="X", label="Centroides")
        plt.xlabel("Annual_Income_k")
        plt.ylabel("Spending_Score")
        plt.title("Agrupación K-Means (k=4)")
        plt.legend()
        plt.tight_layout()
        plt.savefig("clusters_scatter.png", dpi=150)
        plt.close()

print("\n✅ Análisis completado. Todos los archivos se guardaron en la misma carpeta del script.")

Intentando cargar archivo desde: /Users/victorianaomivilchismontalvo/Documents/Analitica/Mall_Customers.csv


OSError: [Errno 30] Read-only file system: 'resumen_descriptivo.csv'

In [None]:
# -*- coding: utf-8 -*-
"""
Análisis Mall_Customers con salida en carpeta escribible (macOS).
Guarda CSVs y PNGs en una carpeta que SÍ tenga permisos.
"""

from pathlib import Path
import os, tempfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --------- 1) Rutas ----------
DATA_PATH = Path("/Users/victorianaomivilchismontalvo/Documents/Analitica/Mall_Customers.csv")

# Candidatos de salida (en orden de preferencia)
candidatos = [
    Path("/Users/victorianaomivilchismontalvo/Documents/Analitica"),
    Path("/Users/victorianaomivilchismontalvo/Downloads"),
    Path(tempfile.gettempdir()),
]

def primera_carpeta_escribible(paths):
    for p in paths:
        try:
            p.mkdir(parents=True, exist_ok=True)
            test = p / ".perm_test"
            with open(test, "w") as f:
                f.write("ok")
            test.unlink()
            return p
        except Exception:
            continue
    raise OSError("No encontré una carpeta escribible. Revisa permisos.")

OUTPUT_DIR = primera_carpeta_escribible(candidatos)
print(f"📂 Guardaré resultados en: {OUTPUT_DIR}")

# --------- 2) Funciones ----------
def normalizar_columnas(df: pd.DataFrame) -> pd.DataFrame:
    cols = (
        df.columns.astype(str)
        .str.strip()
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"[^0-9A-Za-z_]+", "", regex=True)
    )
    df.columns = cols
    return df

def kmeans_simple(X: np.ndarray, k: int = 4, max_iter: int = 200, tol: float = 1e-4, random_state: int = 42):
    rng = np.random.default_rng(random_state)
    idx = rng.choice(len(X), size=k, replace=False)
    centroids = X[idx].copy()
    for _ in range(max_iter):
        dists = np.sqrt(((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2))
        labels = dists.argmin(axis=1)
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        for j in range(k):
            if np.isnan(new_centroids[j]).any():
                new_centroids[j] = X[rng.integers(0, len(X))]
        if np.sqrt(((centroids - new_centroids) ** 2).sum()) < tol:
            break
        centroids = new_centroids
    return labels, centroids

def guardar_boxplot(serie, titulo, ruta):
    plt.figure(figsize=(6,4))
    plt.boxplot(serie.dropna(), vert=True, labels=[serie.name])
    plt.title(titulo)
    plt.ylabel(serie.name)
    plt.tight_layout()
    plt.savefig(ruta, dpi=150)
    plt.close()

def guardar_histograma(serie, titulo, ruta, bins=20):
    plt.figure(figsize=(6,4))
    plt.hist(serie.dropna(), bins=bins)
    plt.title(titulo)
    plt.xlabel(serie.name)
    plt.ylabel("Frecuencia")
    plt.tight_layout()
    plt.savefig(ruta, dpi=150)
    plt.close()

def guardar_heatmap_corr(corr, ruta, titulo="Mapa de calor de correlaciones"):
    plt.figure(figsize=(6,5))
    im = plt.imshow(corr, interpolation="nearest")
    plt.title(titulo)
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=45, ha="right")
    plt.yticks(range(len(corr.index)), corr.index)
    plt.tight_layout()
    plt.savefig(ruta, dpi=150)
    plt.close()

# --------- 3) Carga y limpieza ----------
print(f"Intentando cargar archivo desde: {DATA_PATH}")
if not DATA_PATH.exists():
    raise FileNotFoundError(f"No se encontró el archivo: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
df = normalizar_columnas(df)

# Renombrar columnas típicas
rename_map = {}
for c in df.columns:
    cl = c.lower()
    if cl in ["annual_income_k", "annual_income_ks", "annual_income_k"]:
        rename_map[c] = "Annual_Income_k"
    if cl in ["spending_score_110", "spending_score_1100", "spending_score"]:
        rename_map[c] = "Spending_Score"
    if cl == "customerid": rename_map[c] = "CustomerID"
    if cl == "gender":     rename_map[c] = "Gender"
    if cl == "age":        rename_map[c] = "Age"
if rename_map:
    df = df.rename(columns=rename_map)

# Género a numérico (opcional)
if "Gender" in df.columns:
    df["Gender"] = (
        df["Gender"].astype(str).str.strip().str.lower()
        .map({"male": 1, "m": 1, "female": 0, "f": 0})
    )

# Quitar IDs si están
for idcol in ["CustomerID","Unnamed_0","Unnamed_0_1","id","ID"]:
    if idcol in df.columns:
        df = df.drop(columns=[idcol])

# --------- 4) Análisis ----------
num_cols = [c for c in df.columns if df[c].dtype.kind in "if"]
if not num_cols:
    raise ValueError("No se detectaron columnas numéricas para analizar.")

(df[num_cols].describe().T).to_csv(OUTPUT_DIR / "resumen_descriptivo.csv")

for col in num_cols:
    guardar_boxplot(df[col], f"Diagrama de caja y bigotes: {col}", OUTPUT_DIR / f"boxplot_{col}.png")
    guardar_histograma(df[col], f"Histograma: {col}", OUTPUT_DIR / f"hist_{col}.png")

corr = df[num_cols].corr()
corr.to_csv(OUTPUT_DIR / "correlaciones.csv")
guardar_heatmap_corr(corr, OUTPUT_DIR / "heatmap_correlaciones.png")

# Outliers IQR
outlier_data = {}
for col in num_cols:
    s = df[col].dropna()
    if len(s) < 5: 
        continue
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    low, high = q1 - 1.5*iqr, q3 + 1.5*iqr
    mask = (s < low) | (s > high)
    outlier_data[col] = {
        "conteo": int(mask.sum()),
        "porcentaje": round(100*mask.sum()/len(s), 2),
        "min": float(s.min()), "q1": float(q1), "q3": float(q3), "max": float(s.max())
    }
pd.DataFrame(outlier_data).T.to_csv(OUTPUT_DIR / "outliers_iqr.csv")

# Rangos
rangos = []
for col in num_cols:
    s = df[col].dropna()
    if len(s) == 0: 
        continue
    rangos.append((col, float(s.min()), float(s.max()), float(s.max()-s.min())))
pd.DataFrame(rangos, columns=["Variable","Mínimo","Máximo","Rango"]).to_csv(OUTPUT_DIR / "rangos.csv", index=False)

# Clustering (si aplica)
if {"Annual_Income_k","Spending_Score"}.issubset(df.columns):
    sub = df[["Annual_Income_k","Spending_Score"]].dropna().to_numpy()
    if len(sub) >= 10:
        labels, centers = kmeans_simple(sub, k=4, max_iter=300)
        clusters = pd.DataFrame(sub, columns=["Annual_Income_k","Spending_Score"])
        clusters["cluster"] = labels
        clusters.groupby("cluster").agg(
            n=("Annual_Income_k","count"),
            ingreso_prom=("Annual_Income_k","mean"),
            score_prom=("Spending_Score","mean")
        ).to_csv(OUTPUT_DIR / "clusters_resumen.csv")

        plt.figure(figsize=(6,5))
        for j in sorted(np.unique(labels)):
            pts = clusters[clusters["cluster"]==j]
            plt.scatter(pts["Annual_Income_k"], pts["Spending_Score"], s=15, alpha=0.7, label=f"Cluster {j}")
        plt.scatter(centers[:,0], centers[:,1], s=120, marker="X", label="Centroides")
        plt.xlabel("Annual_Income_k"); plt.ylabel("Spending_Score"); plt.title("Agrupación K-Means (k=4)")
        plt.legend(); plt.tight_layout()
        plt.savefig(OUTPUT_DIR / "clusters_scatter.png", dpi=150)
        plt.close()




📂 Guardaré resultados en: /Users/victorianaomivilchismontalvo/Documents/Analitica
Intentando cargar archivo desde: /Users/victorianaomivilchismontalvo/Documents/Analitica/Mall_Customers.csv


  plt.boxplot(serie.dropna(), vert=True, labels=[serie.name])
  plt.boxplot(serie.dropna(), vert=True, labels=[serie.name])
  plt.boxplot(serie.dropna(), vert=True, labels=[serie.name])
  plt.boxplot(serie.dropna(), vert=True, labels=[serie.name])


✅ ¡Listo! Revisa la carpeta indicada arriba para ver CSVs y gráficos.
