<a href="https://colab.research.google.com/github/abxda/COLMEX-ML/blob/main/Semana_10_CENSO_DENUE_COLMEX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
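# Google Drive share links for the three files downloaded in the following cells:
# - trained_model.joblib
#https://drive.google.com/file/d/1WcTuLEkICVg2A9j98EZWuSgjUhO8AIRI/view?usp=sharing
# - denue_total_est_per_ocu_final_5.duckdb
#https://drive.google.com/file/d/1gYWtwKDpJK0_uBOJvvwkXK8Hy8w8mL0p/view?usp=sharing
# - datos_censo_nacional_s9.duckdb
#https://drive.google.com/file/d/1LVPNaUxto31HE-UQIO1-4l6XPER9Ry5l/view?usp=sharing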

In [1]:
!gdown --id 1WcTuLEkICVg2A9j98EZWuSgjUhO8AIRI

Downloading...
From: https://drive.google.com/uc?id=1WcTuLEkICVg2A9j98EZWuSgjUhO8AIRI
To: /content/trained_model.joblib
100% 214k/214k [00:00<00:00, 86.2MB/s]


In [4]:
!gdown --id 1gYWtwKDpJK0_uBOJvvwkXK8Hy8w8mL0p

Downloading...
From (original): https://drive.google.com/uc?id=1gYWtwKDpJK0_uBOJvvwkXK8Hy8w8mL0p
From (redirected): https://drive.google.com/uc?id=1gYWtwKDpJK0_uBOJvvwkXK8Hy8w8mL0p&confirm=t&uuid=daa2efa0-0d2c-44c6-aebc-659501a97ccd
To: /content/denue_total_est_per_ocu_final_5.duckdb
100% 3.11G/3.11G [00:37<00:00, 82.2MB/s]


In [5]:
!gdown --id 1LVPNaUxto31HE-UQIO1-4l6XPER9Ry5l

Downloading...
From (original): https://drive.google.com/uc?id=1LVPNaUxto31HE-UQIO1-4l6XPER9Ry5l
From (redirected): https://drive.google.com/uc?id=1LVPNaUxto31HE-UQIO1-4l6XPER9Ry5l&confirm=t&uuid=2adc88e3-778e-4305-ac4a-853a615c9771
To: /content/datos_censo_nacional_s9.duckdb
100% 1.03G/1.03G [00:13<00:00, 75.5MB/s]


In [3]:
import duckdb
import pandas as pd
import geopandas as gpd
from shapely import wkb
import joblib
from typing import Dict


In [22]:
# Two-digit activity codes (DENUE column `codigo_act_2c`) that get one feature per buffer.
_ACT_CODES = (
    '46', '51', '54', '11', '22', '52', '71', '43', '31', '61',
    '23', '55', '93', '53', '81', '33', '48', '32', '56', '49',
    '62', '21', '72',
)

# Buffer radii keyed by feature suffix. The radii are expressed in degrees because the
# point is built from lon/lat; 0.000898 and 0.00449 degrees approximate 100 m and 500 m
# of latitude (the east-west extent in metres shrinks with latitude).
_BUFFERS_GRADOS = {'100': 0.000898, '500': 0.00449}

# Census variables summed inside each buffer (those used when the model was trained).
_VARIABLES_CENSO = (
    'POBTOT', 'POBFEM', 'POBMAS', 'P_0A2', 'P_0A2_F', 'P_0A2_M',
    'P_3YMAS', 'P_3YMAS_F', 'P_3YMAS_M', 'P_5YMAS', 'P_5YMAS_F',
    'P_5YMAS_M', 'P_12YMAS', 'P_12YMAS_F', 'P_12YMAS_M', 'P_15YMAS',
    'P_15YMAS_F', 'P_15YMAS_M', 'P_18YMAS', 'P_18YMAS_F', 'P_18YMAS_M',
    'P_3A5', 'P_3A5_F', 'P_3A5_M', 'P_6A11', 'P_6A11_F', 'P_6A11_M',
    'P_8A14', 'P_8A14_F', 'P_8A14_M', 'P_12A14', 'P_12A14_F',
    'P_12A14_M', 'P_15A17', 'P_15A17_F', 'P_15A17_M', 'P_18A24',
    'P_18A24_F', 'P_18A24_M', 'P_15A49_F', 'P_60YMAS', 'P_60YMAS_F',
    'P_60YMAS_M', 'REL_H_M', 'POB0_14', 'POB15_64', 'POB65_MAS',
    'PROM_HNV', 'PNACENT', 'PNACENT_F', 'PNACENT_M', 'PNACOE', 'PNACOE_F',
    'PNACOE_M', 'PRES2015', 'PRES2015_F', 'PRES2015_M', 'PRESOE15',
    'PRESOE15_F', 'PRESOE15_M', 'P3YM_HLI', 'P3YM_HLI_F', 'P3YM_HLI_M',
    'P3HLINHE', 'P3HLINHE_F', 'P3HLINHE_M', 'P3HLI_HE', 'P3HLI_HE_F',
    'P3HLI_HE_M', 'P5_HLI', 'P5_HLI_NHE', 'P5_HLI_HE', 'PHOG_IND',
    'POB_AFRO', 'POB_AFRO_F', 'POB_AFRO_M', 'PCON_DISC', 'PCDISC_MOT',
    'PCDISC_VIS', 'PCDISC_LENG', 'PCDISC_AUD', 'PCDISC_MOT2', 'PCDISC_MEN',
    'PCON_LIMI', 'PCLIM_CSB', 'PCLIM_VIS', 'PCLIM_HACO', 'PCLIM_OAUD',
    'PCLIM_MOT2', 'PCLIM_RE_CO', 'PCLIM_PMEN', 'PSIND_LIM', 'P3A5_NOA',
    'P3A5_NOA_F', 'P3A5_NOA_M', 'P6A11_NOA', 'P6A11_NOAF', 'P6A11_NOAM',
    'P12A14NOA', 'P12A14NOAF', 'P12A14NOAM', 'P15A17A', 'P15A17A_F',
    'P15A17A_M', 'P18A24A', 'P18A24A_F', 'P18A24A_M', 'P8A14AN',
    'P8A14AN_F', 'P8A14AN_M', 'P15YM_AN', 'P15YM_AN_F', 'P15YM_AN_M',
    'P15YM_SE', 'P15YM_SE_F', 'P15YM_SE_M', 'P15PRI_IN', 'P15PRI_INF',
    'P15PRI_INM', 'P15PRI_CO', 'P15PRI_COF', 'P15PRI_COM', 'P15SEC_IN',
    'P15SEC_INF', 'P15SEC_INM', 'P15SEC_CO', 'P15SEC_COF', 'P15SEC_COM',
    'P18YM_PB', 'P18YM_PB_F', 'P18YM_PB_M', 'GRAPROES', 'GRAPROES_F',
    'GRAPROES_M', 'PEA', 'PEA_F', 'PEA_M', 'PE_INAC', 'PE_INAC_F',
    'PE_INAC_M', 'POCUPADA', 'POCUPADA_F', 'POCUPADA_M', 'PDESOCUP',
    'PDESOCUP_F', 'PDESOCUP_M', 'PSINDER', 'PDER_SS', 'PDER_IMSS',
    'PDER_ISTE', 'PDER_ISTEE', 'PAFIL_PDOM', 'PDER_SEGP', 'PDER_IMSSB',
    'PAFIL_IPRIV', 'PAFIL_OTRAI', 'P12YM_SOLT', 'P12YM_CASA', 'P12YM_SEPA',
    'PCATOLICA', 'PRO_CRIEVA', 'POTRAS_REL', 'PSIN_RELIG', 'TOTHOG',
    'HOGJEF_F', 'HOGJEF_M', 'POBHOG', 'PHOGJEF_F', 'PHOGJEF_M', 'VIVTOT',
    'TVIVHAB', 'TVIVPAR', 'VIVPAR_HAB', 'VIVPARH_CV', 'TVIVPARHAB',
    'VIVPAR_DES', 'VIVPAR_UT', 'OCUPVIVPAR', 'PROM_OCUP', 'PRO_OCUP_C',
    'VPH_PISODT', 'VPH_PISOTI', 'VPH_1DOR', 'VPH_2YMASD', 'VPH_1CUART',
    'VPH_2CUART', 'VPH_3YMASC', 'VPH_C_ELEC', 'VPH_S_ELEC', 'VPH_AGUADV',
    'VPH_AEASP', 'VPH_AGUAFV', 'VPH_TINACO', 'VPH_CISTER', 'VPH_EXCSA',
    'VPH_LETR', 'VPH_DRENAJ', 'VPH_NODREN', 'VPH_C_SERV', 'VPH_NDEAED',
    'VPH_DSADMA', 'VPH_NDACMM', 'VPH_SNBIEN', 'VPH_REFRI', 'VPH_LAVAD',
    'VPH_HMICRO', 'VPH_AUTOM', 'VPH_MOTO', 'VPH_BICI', 'VPH_RADIO', 'VPH_TV',
    'VPH_PC', 'VPH_TELEF', 'VPH_CEL', 'VPH_INTER', 'VPH_STVP', 'VPH_SPMVPI',
    'VPH_CVJ', 'VPH_SINRTV', 'VPH_SINLTC', 'VPH_SINCINT', 'VPH_SINTIC',
)


def cargar_modelo(ruta_modelo: str):
    """Load the trained model stored at ``ruta_modelo``.

    NOTE: joblib files are pickles; only load model files from a trusted source.
    """
    return joblib.load(ruta_modelo)


def _sql_literal(texto) -> str:
    """Return ``texto`` as a single-quoted SQL string literal (embedded quotes doubled)."""
    return "'" + str(texto).replace("'", "''") + "'"


def _agregar_denue(conn, sufijo: str, features: dict) -> None:
    """Fill the DENUE features (per-activity and total staff) for buffer ``sufijo``."""
    por_actividad = conn.execute(f"""
        SELECT d.codigo_act_2c, SUM(d.est_per_ocu) AS total
        FROM denue_db.denue_est_per_ocu d
        JOIN nuevo_punto n ON ST_Intersects(d.geometry, n.buffer_{sufijo})
        GROUP BY d.codigo_act_2c
    """).fetchdf()
    for code, total in zip(por_actividad['codigo_act_2c'], por_actividad['total']):
        # Activity codes outside the known list are ignored.
        if code in _ACT_CODES:
            features[f'act_{code}_{sufijo}'] = total

    # Total estimated staff across every establishment in the buffer.
    features[f'epo_{sufijo}'] = conn.execute(f"""
        SELECT COALESCE(SUM(d.est_per_ocu), 0)
        FROM denue_db.denue_est_per_ocu d
        JOIN nuevo_punto n ON ST_Intersects(d.geometry, n.buffer_{sufijo})
    """).fetchone()[0]


def _agregar_censo(conn, sufijo: str, features: dict) -> None:
    """Fill the ``censo_<VAR>_<sufijo>`` features with per-buffer sums of each census variable."""
    columnas = ', '.join(
        f'COALESCE(SUM({var}), 0) AS {var}_{sufijo}' for var in _VARIABLES_CENSO
    )
    # NOTE(review): the census table is read from `denue_db`, not from `censo_db`.
    fila = conn.execute(f"""
        SELECT {columnas}
        FROM denue_db.censo_geo_int_centroid c
        JOIN nuevo_punto n ON ST_Intersects(c.geometry, n.buffer_{sufijo})
    """).fetchone()
    for var, valor in zip(_VARIABLES_CENSO, fila):
        features[f'censo_{var}_{sufijo}'] = valor


def extraer_caracteristicas(lat: float, lon: float, denue_db_path: str, censo_db_path: str,
                            modelo=None, ruta_modelo: str = 'trained_model.joblib') -> pd.DataFrame:
    """Build the single-row feature frame the model expects for a point.

    Two buffers are drawn around the point and, for each one, DENUE staff
    estimates (total and per 2-digit activity code) and census variables are
    summed over the records whose geometry intersects the buffer.

    Args:
        lat: Latitude of the point in decimal degrees, within [-90, 90].
        lon: Longitude of the point in decimal degrees, within [-180, 180].
        denue_db_path: Path to the DuckDB file attached as ``denue_db``.
        censo_db_path: Path to the DuckDB file attached as ``censo_db``.
            NOTE(review): it is attached but no query references it; the
            census table is read from ``denue_db`` -- confirm this is intended.
        modelo: Already-loaded model exposing ``feature_names_in_``. When
            omitted, the model is loaded from ``ruta_modelo``.
        ruta_modelo: Model file used only when ``modelo`` is None.

    Returns:
        A one-row DataFrame whose columns are exactly ``modelo.feature_names_in_``,
        in that order; features that were not computed are filled with 0.

    Raises:
        ValueError: If ``lat``/``lon`` are not finite numbers inside the valid
            geographic ranges (this also catches swapped coordinates for most
            of Mexico, where longitude is below -90).
    """
    lat, lon = float(lat), float(lon)
    # The chained comparisons are False for NaN and infinities as well.
    if not (-90.0 <= lat <= 90.0 and -180.0 <= lon <= 180.0):
        raise ValueError(
            f"Invalid coordinates lat={lat}, lon={lon}: expected lat in [-90, 90] "
            f"and lon in [-180, 180]"
        )

    if modelo is None:
        modelo = cargar_modelo(ruta_modelo)
    expected_features = list(modelo.feature_names_in_)

    # Defaults so every buffer/activity combination exists even with no matches.
    features = {f'epo_{sufijo}': 0 for sufijo in _BUFFERS_GRADOS}
    for sufijo in _BUFFERS_GRADOS:
        features.update({f'act_{code}_{sufijo}': 0 for code in _ACT_CODES})

    conn = duckdb.connect(':memory:')
    try:
        conn.execute("INSTALL spatial; LOAD spatial;")

        # Point geometry plus one buffer column per radius.
        punto = f"ST_GeomFromText('POINT({lon} {lat})')"
        columnas_buffer = ',\n'.join(
            f"ST_Buffer({punto}, {radio}) AS buffer_{sufijo}"
            for sufijo, radio in _BUFFERS_GRADOS.items()
        )
        conn.execute(f"""
            CREATE TEMP TABLE nuevo_punto AS
            SELECT
                {punto} AS geometry,
                {columnas_buffer}
        """)

        # Attach the source databases (paths are quoted to survive embedded quotes).
        conn.execute(f"ATTACH {_sql_literal(denue_db_path)} AS denue_db")
        conn.execute(f"ATTACH {_sql_literal(censo_db_path)} AS censo_db")

        for sufijo in _BUFFERS_GRADOS:
            _agregar_denue(conn, sufijo, features)
        for sufijo in _BUFFERS_GRADOS:
            _agregar_censo(conn, sufijo, features)
    finally:
        # Always release the connection, even when a query fails.
        conn.close()

    # Keep exactly the columns the model was trained on, in its order; anything
    # the model expects that was not computed here becomes 0.
    return pd.DataFrame([features]).reindex(columns=expected_features, fill_value=0)


def predecir_probabilidad(lat: float, lon: float, ruta_modelo: str,
                          denue_db_path: str, censo_db_path: str) -> Dict[int, float]:
    """Predict the probability that the point is an OXXO (1) or a grocery store (0).

    Args:
        lat: Latitude of the point in decimal degrees.
        lon: Longitude of the point in decimal degrees.
        ruta_modelo: Path to the trained model file.
        denue_db_path: Path to the DENUE DuckDB file.
        censo_db_path: Path to the census DuckDB file.

    Returns:
        ``{1: p_oxxo, 0: p_store}`` taken from the first row of ``predict_proba``.
    """
    modelo = cargar_modelo(ruta_modelo)
    # Reuse the loaded model so the feature order comes from the same file that predicts.
    features = extraer_caracteristicas(lat, lon, denue_db_path, censo_db_path, modelo=modelo)
    proba = modelo.predict_proba(features)[0]
    return {1: proba[1], 0: proba[0]}

In [29]:
# First query point as (latitude, longitude) in decimal degrees.
lat, lon = 19.4326, -99.1332

In [30]:
# Locations of the DuckDB files downloaded earlier in the notebook.
denue_db_path = '/content/denue_total_est_per_ocu_final_5.duckdb'
censo_db_path = '/content/datos_censo_nacional_s9.duckdb'

# Class probabilities for the current (lat, lon) point.
probabilidades = predecir_probabilidad(
    lat, lon, 'trained_model.joblib', denue_db_path, censo_db_path
)
print(f"Probabilidad OXXO (1): {probabilidades[1]:.2f}")
print(f"Probabilidad Tienda (0): {probabilidades[0]:.2f}")

Probabilidad OXXO (1): 0.90
Probabilidad Tienda (0): 0.10


In [31]:
# Build the feature row for the current point on its own so it can be inspected.
caracteristicas = extraer_caracteristicas(lat, lon, denue_db_path, censo_db_path)

In [32]:
# Display the single-row feature frame.
caracteristicas

Unnamed: 0,epo_100,epo_500,act_51_100,act_54_100,act_11_100,act_22_100,act_52_100,act_71_100,act_43_100,act_31_100,...,censo_VPH_TELEF_500,censo_VPH_CEL_500,censo_VPH_INTER_500,censo_VPH_STVP_500,censo_VPH_SPMVPI_500,censo_VPH_CVJ_500,censo_VPH_SINRTV_500,censo_VPH_SINLTC_500,censo_VPH_SINCINT_500,censo_VPH_SINTIC_500
0,184.0,33546.0,2.5,0,0,0,7.5,2.5,2.5,0,...,330,494,374,117,144,60,3,5,129,0


In [33]:
# Second query point as (latitude, longitude) in decimal degrees.
lat, lon = 21.893526864761085, -102.2406108654755

In [34]:
# Class probabilities for the second point, using the same model and databases.
probabilidades = predecir_probabilidad(
    lat, lon, 'trained_model.joblib', denue_db_path, censo_db_path
)
print(f"Probabilidad OXXO (1): {probabilidades[1]:.2f}")
print(f"Probabilidad Tienda (0): {probabilidades[0]:.2f}")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Probabilidad OXXO (1): 0.25
Probabilidad Tienda (0): 0.75
