<a href="https://colab.research.google.com/github/abxda/COLMEX-ML/blob/main/Semana_07_COLMEX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings

# Silence the FutureWarning whose message mentions 'force_all_finite' when it
# is raised from a module whose name starts with "sklearn".
warnings.filterwarnings("ignore",
                        category=FutureWarning,  # the built-in FutureWarning class
                        module="sklearn.*",
                        message=".*'force_all_finite'.*")

In [None]:
import geopandas as gpd
import duckdb
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from shapely.wkb import loads as wkb_loads
import urllib.request
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import ipywidgets as widgets
from IPython.display import display

In [None]:

# Download the DuckDB file and read the whole `hex_final` table into pandas.
url = "https://github.com/abxda/COLMEX-ML/raw/refs/heads/main/data/hex_final.duckdb"
urllib.request.urlretrieve(url, "hex_final.duckdb")
con = duckdb.connect("hex_final.duckdb")
try:
    hex_final_df = con.execute("SELECT * FROM hex_final").fetchdf()
finally:
    # Close the connection right away: the table is already materialised in
    # pandas, and an open handle would still be attached to the file when this
    # cell is re-run and the download overwrites it.
    con.close()

# Replace missing dwelling counts with 0 so the ratios below are well defined.
cols_to_fill = ['ViviendasInternet', 'ViviendasConsola', 'ViviendasStreaming', 'TotalViviendas']
hex_final_df[cols_to_fill] = hex_final_df[cols_to_fill].fillna(0.0)

# Decode the WKB geometry column into shapely objects (None stays None).
hex_final_df["geometry"] = hex_final_df["geometry"].apply(lambda x: wkb_loads(bytes(x)) if x is not None else None)
hex_final = gpd.GeoDataFrame(hex_final_df, geometry="geometry", crs="EPSG:4326")

# Share of dwellings with each service; rows with no dwellings get 0.0
# instead of a division-by-zero result.
has_dwellings = hex_final['TotalViviendas'] > 0
for count_col, share_col in [
    ('ViviendasInternet', 'ProporcionVInternet'),
    ('ViviendasConsola', 'ProporcionVConsola'),
    ('ViviendasStreaming', 'ProporcionVStreaming'),
]:
    hex_final[share_col] = np.where(
        has_dwellings,
        hex_final[count_col] / hex_final['TotalViviendas'],
        0.0
    )

In [None]:
# Build the clustering feature matrix from the three share columns and
# standardise each column with StandardScaler.
X = hex_final.loc[
    :, ['ProporcionVInternet', 'ProporcionVConsola', 'ProporcionVStreaming']
].to_numpy()
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
from sklearn.cluster import KMeans

def apply_kmeans(gdf, X_scaled, n_clusters):
    """Cluster with K-Means and tag every row with a stratum colour.

    Args:
        gdf: Frame holding the three ``ProporcionV*`` columns; it is copied,
            never modified.
        X_scaled: Feature matrix passed to ``KMeans.fit_predict``, one row per
            row of ``gdf``.
        n_clusters: Number of clusters to fit.

    Returns:
        A copy of ``gdf`` with two extra columns: ``kmeans_cluster`` (the
        fitted label) and ``kmeans_strata`` ('green' for the two clusters with
        the highest average share, 'yellow' for the third, 'red' for the rest).
    """
    share_columns = ['ProporcionVInternet', 'ProporcionVConsola', 'ProporcionVStreaming']
    top_colours = ('green', 'green', 'yellow')

    result = gdf.copy()
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    result['kmeans_cluster'] = model.fit_predict(X_scaled)

    # One score per cluster: the mean of its three per-column means.
    scores = result.groupby('kmeans_cluster')[share_columns].mean().mean(axis=1)
    ranking = scores.sort_values(ascending=False).index.tolist()

    colour_of = {
        label: top_colours[position] if position < len(top_colours) else 'red'
        for position, label in enumerate(ranking)
    }
    result['kmeans_strata'] = result['kmeans_cluster'].map(colour_of)

    return result


In [None]:
from sklearn.cluster import DBSCAN
def apply_dbscan(gdf, X_scaled, eps, min_samples):
    """Cluster with DBSCAN and tag every row with a stratum colour.

    Args:
        gdf: Frame holding the three ``ProporcionV*`` columns; it is copied,
            never modified.
        X_scaled: Feature matrix passed to ``DBSCAN.fit_predict``, one row per
            row of ``gdf``.
        eps: Forwarded to ``DBSCAN``.
        min_samples: Forwarded to ``DBSCAN``.

    Returns:
        A copy of ``gdf`` with ``dbscan_cluster`` (label, -1 for noise) and
        ``dbscan_strata``. Clusters are ranked by average share: the top two
        are 'green', the third 'yellow', the rest 'red'. Noise rows, and every
        row when no cluster was found, are 'red'.
    """
    share_columns = ['ProporcionVInternet', 'ProporcionVConsola', 'ProporcionVStreaming']

    result = gdf.copy()
    result['dbscan_cluster'] = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_scaled)

    clustered = result[result['dbscan_cluster'] != -1]
    if clustered.empty:
        # Nothing but noise: the whole map falls into the lowest stratum.
        result['dbscan_strata'] = 'red'
        return result

    scores = clustered.groupby('dbscan_cluster')[share_columns].mean().mean(axis=1)
    ranking = scores.sort_values(ascending=False).index.tolist()

    colour_of = {}
    for position, label in enumerate(ranking):
        if position < 2:
            colour_of[label] = 'green'
        elif position == 2:
            colour_of[label] = 'yellow'
        else:
            colour_of[label] = 'red'

    # Noise (-1) has no entry in the mapping, so it comes back as NaN -> 'red'.
    mapped = result['dbscan_cluster'].map(colour_of)
    result['dbscan_strata'] = mapped.fillna('red')

    return result

In [None]:
import hdbscan

def apply_hdbscan(gdf, X_scaled, min_cluster_size, min_samples):
    """Cluster with HDBSCAN and tag every row with a stratum colour.

    Args:
        gdf: Frame holding the three ``ProporcionV*`` columns; it is copied,
            never modified.
        X_scaled: Feature matrix passed to ``HDBSCAN.fit_predict``, one row
            per row of ``gdf``.
        min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.
        min_samples: Forwarded to ``hdbscan.HDBSCAN``.

    Returns:
        A copy of ``gdf`` with ``hdbscan_cluster`` (label, -1 for noise) and
        ``hdbscan_strata``. Clusters are ranked by average share: the top two
        are 'green', the third 'yellow', the rest 'red'. Noise rows, and every
        row when no cluster was found, are 'red'.
    """
    share_columns = ['ProporcionVInternet', 'ProporcionVConsola', 'ProporcionVStreaming']

    result = gdf.copy()
    model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_method='eom',
    )
    result['hdbscan_cluster'] = model.fit_predict(X_scaled)

    non_noise = result[result['hdbscan_cluster'] != -1]
    if non_noise.empty:
        result['hdbscan_strata'] = 'red'
    else:
        scores = non_noise.groupby('hdbscan_cluster')[share_columns].mean().mean(axis=1)
        ranking = scores.sort_values(ascending=False).index.tolist()

        # Colours by rank; zip stops at the number of clusters actually found.
        palette = ['green', 'green', 'yellow'] + ['red'] * len(ranking)
        colour_of = dict(zip(ranking, palette))

        # Noise (-1) is absent from the mapping and ends up 'red' via fillna.
        result['hdbscan_strata'] = result['hdbscan_cluster'].map(colour_of).fillna('red')

    return result

In [None]:
from sklearn.mixture import GaussianMixture

def apply_gmm(gdf, X_scaled, n_components, covariance_type):
    """Cluster with a Gaussian mixture and tag every row with a stratum colour.

    Args:
        gdf: Frame holding the three ``ProporcionV*`` columns; it is copied,
            never modified.
        X_scaled: Feature matrix passed to ``GaussianMixture.fit_predict``,
            one row per row of ``gdf``.
        n_components: Forwarded to ``GaussianMixture``.
        covariance_type: Forwarded to ``GaussianMixture``.

    Returns:
        A copy of ``gdf`` with ``gmm_cluster`` (component label) and
        ``gmm_strata`` ('green' for the two components with the highest
        average share, 'yellow' for the third, 'red' for the rest).
    """
    share_columns = ['ProporcionVInternet', 'ProporcionVConsola', 'ProporcionVStreaming']

    result = gdf.copy()
    mixture = GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=42,
    )
    result['gmm_cluster'] = mixture.fit_predict(X_scaled)

    per_cluster = result.groupby('gmm_cluster')[share_columns].mean()
    ranking = per_cluster.mean(axis=1).sort_values(ascending=False).index.tolist()

    colour_of = {}
    for position, label in enumerate(ranking):
        if position < 2:
            colour_of[label] = 'green'
        elif position == 2:
            colour_of[label] = 'yellow'
        else:
            colour_of[label] = 'red'

    result['gmm_strata'] = result['gmm_cluster'].map(colour_of)

    return result

In [None]:
from sklearn.cluster import SpectralClustering

def apply_spectral(gdf, X_scaled, n_clusters, gamma):
    """Cluster with spectral clustering (RBF affinity) and tag rows by stratum.

    Args:
        gdf: Frame holding the three ``ProporcionV*`` columns; it is copied,
            never modified.
        X_scaled: Feature matrix passed to ``SpectralClustering.fit_predict``,
            one row per row of ``gdf``.
        n_clusters: Forwarded to ``SpectralClustering``.
        gamma: Forwarded to ``SpectralClustering``.

    Returns:
        A copy of ``gdf`` with ``spectral_cluster`` (label) and
        ``spectral_strata`` ('green' for the two clusters with the highest
        average share, 'yellow' for the third, 'red' for the rest).
    """
    def colour_for(position):
        # Position 0 is the cluster with the highest average share.
        if position < 2:
            return 'green'
        if position == 2:
            return 'yellow'
        return 'red'

    share_columns = ['ProporcionVInternet', 'ProporcionVConsola', 'ProporcionVStreaming']

    result = gdf.copy()
    model = SpectralClustering(
        n_clusters=n_clusters,
        gamma=gamma,
        affinity='rbf',
        random_state=42,
    )
    result['spectral_cluster'] = model.fit_predict(X_scaled)

    scores = result.groupby('spectral_cluster')[share_columns].mean().mean(axis=1)
    ranking = scores.sort_values(ascending=False).index.tolist()

    colour_of = {label: colour_for(position) for position, label in enumerate(ranking)}
    result['spectral_strata'] = result['spectral_cluster'].map(colour_of)

    return result



In [None]:
from matplotlib.pyplot import close

# Every clustering method to compare: the function that applies it and the
# five parameter sets ("variations") drawn side by side for that method.
METHOD_CONFIG = {
    'K-Means': {
        'function': apply_kmeans,
        # n_clusters = 3, 4, 5, 6, 7
        'params': [{'n_clusters': k} for k in range(3, 8)]
    },
    'DBSCAN': {
        'function': apply_dbscan,
        'params': [
            {'eps': 0.01, 'min_samples': 10},
            {'eps': 0.04, 'min_samples': 10},
            # NOTE(review): identical to the previous entry, so two panels show
            # the same map. Possibly a different eps was intended; confirm.
            {'eps': 0.04, 'min_samples': 10},
            {'eps': 0.03, 'min_samples': 10},
            {'eps': 0.02, 'min_samples': 10}
        ]
    },
    'HDBSCAN': {
        'function': apply_hdbscan,
        # min_cluster_size = 50, 60, 70, 80, 90
        'params': [
            {'min_cluster_size': size, 'min_samples': None}
            for size in range(50, 100, 10)
        ]
    },
    'GMM': {
        'function': apply_gmm,
        'params': [
            {'n_components': 4, 'covariance_type': 'full'},
            {'n_components': 4, 'covariance_type': 'diag'},
            {'n_components': 5, 'covariance_type': 'full'},
            {'n_components': 6, 'covariance_type': 'diag'},
            {'n_components': 6, 'covariance_type': 'full'}
        ]
    },
    'Spectral': {
        'function': apply_spectral,
        'params': [
            {'n_clusters': 5, 'gamma': 5.0},
            {'n_clusters': 6, 'gamma': 0.5},
            {'n_clusters': 7, 'gamma': 5.0},
            {'n_clusters': 5, 'gamma': 1.0},
            {'n_clusters': 5, 'gamma': 2.0}
        ]
    }
}


# Función para generar todas las visualizaciones
def generate_all_visualizations():
    """Draw one row of stratum maps per method, one panel per parameter set.

    For each entry of ``METHOD_CONFIG`` the method's function is applied to
    ``hex_final`` / ``X_scaled`` once per parameter set, and the resulting
    ``<method>_strata`` column is plotted as a categorical map. Each figure is
    shown and then closed.

    Returns:
        None. The figures are displayed as a side effect.
    """
    # The stratum labels are colour names. Passing this fixed list as
    # `categories` pins each label to the colour of the same name even when a
    # result contains only one or two of the three strata. Without it the
    # colours are assigned by position among the labels present, so e.g. a map
    # with only 'green' and 'red' would draw 'red' in yellow.
    strata_order = ['green', 'red', 'yellow']
    cmap = ListedColormap(strata_order)

    for method_name, config in METHOD_CONFIG.items():
        variations = config['params']
        if not variations:
            continue  # nothing to draw for this method

        # One panel per variation; squeeze=False keeps `axes` two-dimensional
        # even when there is a single panel.
        fig, axes = plt.subplots(
            1, len(variations),
            figsize=(5 * len(variations), 5),
            squeeze=False,
        )
        fig.suptitle(f"Método: {method_name}", y=1.05, fontsize=16)

        # 'K-Means' -> 'kmeans_strata', 'GMM' -> 'gmm_strata', ...
        column_name = f"{method_name.lower().replace('-', '')}_strata"

        for i, (ax, params) in enumerate(zip(axes[0], variations)):
            temp_gdf = config['function'](hex_final, X_scaled, **params)

            temp_gdf.plot(
                column=column_name,
                ax=ax,
                categorical=True,
                categories=strata_order,
                cmap=cmap,
                legend=True,
                legend_kwds={
                    'title': "\n".join([f"{k}={v}" for k, v in params.items()]),
                    'loc': 'lower right'
                }
            )
            ax.set_title(f"Variación {i+1}", fontsize=12)
            ax.set_axis_off()

        plt.tight_layout()
        plt.show()
        plt.close('all')  # free the memory held by the figures

# Generate and display the figures for every method in METHOD_CONFIG
generate_all_visualizations()


In [None]:
# -----------------------------------------------------------------
# FINAL EXPORT CELL (RUN ONLY AFTER CHOOSING A METHOD)
# -----------------------------------------------------------------
# Replace these two values with the chosen method and the parameters of the
# chosen variation.
# Method options: 'K-Means', 'DBSCAN', 'HDBSCAN', 'GMM', 'Spectral'
# Example for GMM: {'n_components': 5, 'covariance_type': 'full'}
METODO_ELEGIDO = 'K-Means'
PARAMETROS_ELEGIDOS = {'n_clusters': 7}

# Method name -> function that applies it
method_functions = dict([
    ('K-Means', apply_kmeans),
    ('DBSCAN', apply_dbscan),
    ('HDBSCAN', apply_hdbscan),
    ('GMM', apply_gmm),
    ('Spectral', apply_spectral),
])

# Apply the selected method; hex_final gains the cluster and strata columns.
apply_selected = method_functions[METODO_ELEGIDO]
hex_final = apply_selected(hex_final, X_scaled, **PARAMETROS_ELEGIDOS)

# Write the result to a GeoPackage named after the method.
hex_final.to_file(
    f"hex_final_strata_{METODO_ELEGIDO}.gpkg",
    driver="GPKG",
    layer="hex_data_strata",
)
print(f"¡Mapa exportado con éxito usando {METODO_ELEGIDO}!")
print("Parámetros utilizados:", PARAMETROS_ELEGIDOS)


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Score K-Means for several values of k with the silhouette coefficient
# (higher is better) and the Davies-Bouldin index (lower is better).
resultados_kmeans = []
for k in range(5, 11):  # try k = 5, 6, 7, 8, 9, 10
    # Same random_state and n_init as apply_kmeans, so the scores describe the
    # same clusterings that are drawn on the maps and exported.
    modelo = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = modelo.fit_predict(X_scaled)
    sil = silhouette_score(X_scaled, labels)  # needs at least 2 clusters; k >= 5 here
    dbi = davies_bouldin_score(X_scaled, labels)
    resultados_kmeans.append((k, sil, dbi))

# Sort by silhouette, best first
resultados_kmeans.sort(key=lambda tup: tup[1], reverse=True)
for k, sil, dbi in resultados_kmeans:
    print(f"K={k}: silhouette={sil:.3f}, DBI={dbi:.3f}")

In [None]:
from sklearn.cluster import DBSCAN

# Grid search over DBSCAN's eps and min_samples. Both metrics are computed on
# the clustered points only; rows labelled as noise (-1) are left out.
resultados_dbscan = []
for eps in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07]:
    for min_samples in [5, 7, 8, 9, 10, 15]:
        modelo = DBSCAN(eps=eps, min_samples=min_samples)
        labels = modelo.fit_predict(X_scaled)

        core_mask = (labels != -1)
        n_clusters = len(set(labels[core_mask]))  # clusters found, noise excluded
        if n_clusters < 2:
            # Both metrics need at least two clusters.
            continue

        sil = silhouette_score(X_scaled[core_mask], labels[core_mask])
        dbi = davies_bouldin_score(X_scaled[core_mask], labels[core_mask])
        resultados_dbscan.append(((eps, min_samples), sil, dbi))

# Best silhouette first; show the top five
resultados_dbscan = sorted(resultados_dbscan, key=lambda tup: tup[1], reverse=True)
for (eps, min_samples), sil, dbi in resultados_dbscan[:5]:
    print(f"eps={eps}, min_samples={min_samples}: silhouette={sil:.3f}, DBI={dbi:.3f}")

In [None]:
import hdbscan
# Sweep HDBSCAN's min_cluster_size (min_samples is left as None). Both metrics
# are computed on the clustered points only; noise rows (-1) are left out.
resultados_hdb = []
for mcs in [50, 60, 70, 80, 90]:
    clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs, min_samples=None)
    labels = clusterer.fit_predict(X_scaled)

    core_mask = (labels != -1)
    n_clusters = len(set(labels[core_mask]))
    if n_clusters < 2:
        # Both metrics need at least two clusters.
        continue

    sil = silhouette_score(X_scaled[core_mask], labels[core_mask])
    dbi = davies_bouldin_score(X_scaled[core_mask], labels[core_mask])
    resultados_hdb.append((mcs, sil, dbi))

# Best silhouette first; show the top five
resultados_hdb = sorted(resultados_hdb, key=lambda tup: tup[1], reverse=True)
for mcs, sil, dbi in resultados_hdb[:5]:
    print(f"min_cluster_size={mcs}: silhouette={sil:.3f}, DBI={dbi:.3f}")

In [None]:
from sklearn.mixture import GaussianMixture

# Score Gaussian mixtures for several component counts and covariance types.
resultados_gmm = []
for n in range(4, 7):
    for covar in ["full", "diag"]:
        # Same random_state as apply_gmm, so the scores describe the same
        # mixtures that are drawn on the maps and exported.
        modelo = GaussianMixture(n_components=n, covariance_type=covar, random_state=42)
        labels = modelo.fit_predict(X_scaled)
        # fit_predict labels every row with one of the n components, so there
        # is no noise to filter out before scoring.
        sil = silhouette_score(X_scaled, labels)
        dbi = davies_bouldin_score(X_scaled, labels)
        resultados_gmm.append(((n, covar), sil, dbi))

# Best silhouette first; show the top five
resultados_gmm.sort(key=lambda tup: tup[1], reverse=True)
for (n, covar), sil, dbi in resultados_gmm[:5]:
    print(f"n_comp={n}, covar={covar}: silhouette={sil:.3f}, DBI={dbi:.3f}")

In [None]:
# Search grid for spectral clustering: number of clusters x RBF gamma
n_clusters_range = range(2, 10)
gamma_values = [0.1, 0.5, 1, 2, 5, 10]
resultados_spectral = []

for k in n_clusters_range:
    for gamma in gamma_values:
        modelo = SpectralClustering(
            n_clusters=k,
            affinity='rbf',
            gamma=gamma,
            random_state=42,
            assign_labels='kmeans',
        )
        labels = modelo.fit_predict(X_scaled)

        # Skip degenerate results where every point landed in one cluster;
        # the metrics are undefined there.
        if len(set(labels)) <= 1:
            continue

        sil = silhouette_score(X_scaled, labels)
        dbi = davies_bouldin_score(X_scaled, labels)
        resultados_spectral.append(((k, gamma), sil, dbi))

# Highest silhouette first
resultados_spectral = sorted(resultados_spectral, key=lambda tup: tup[1], reverse=True)

# Show the five best configurations
print("Mejores configuraciones SpectralClustering:")
for (k, gamma), sil, dbi in resultados_spectral[:5]:
    print(f"n_clusters={k}, gamma={gamma}: silhouette={sil:.3f}, DBI={dbi:.3f}")


In [None]:
color_map

In [None]:
sorted_clusters

In [None]:
    gdf = hex_final.copy()
    kmeans = KMeans(n_clusters=7, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X_scaled)
    gdf['kmeans_cluster'] = clusters

    cluster_means = gdf.groupby('kmeans_cluster')[[
        'ProporcionVInternet', 'ProporcionVConsola', 'ProporcionVStreaming'
    ]].mean().mean(axis=1)
    sorted_clusters = cluster_means.sort_values(ascending=False).index.tolist()

    color_map = {}
    for idx, cluster in enumerate(sorted_clusters):
        if idx < 2:
            color_map[cluster] = 'green'
        elif idx == 2:
            color_map[cluster] = 'yellow'
        else:
            color_map[cluster] = 'red'

    gdf['kmeans_strata'] = gdf['kmeans_cluster'].map(color_map)

    gdf