# 03 - Buildings EDA (Microsoft & Google) - MongoDB

Este notebook realiza el EDA sobre las colecciones `microsoft_buildings`, `google_buildings` y `pdet_municipalities` en MongoDB.
- Nota: usa centroides de cada edificio para conteos por municipio.
- Ruta de guardado: `results/figures/deliverable_3/`


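- Por defecto el notebook carga una muestra aleatoria (`SAMPLE_MODE=True`) para pruebas rápidas; en ese modo los conteos por municipio reflejan solo la muestra y no el total de edificios. Usa `SAMPLE_MODE=False` para obtener resultados definitivos.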

In [1]:
%pip install --quiet geopandas pandas pymongo shapely folium plotly pyarrow

import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
from pymongo import MongoClient
import plotly.express as px
import folium

# Output folder for every figure/table exported by this notebook.
os.makedirs('results/figures/deliverable_3', exist_ok=True)

# MongoDB connection settings. Environment variables take precedence so the
# notebook can target another server (or a URI that carries credentials)
# without editing the code; the fallbacks are the original local values.
MONGO_URI = os.environ.get('MONGO_URI', 'mongodb://localhost:27017')
DB_NAME = os.environ.get('MONGO_DB_NAME', 'pdet_solar_analysis')

# SAMPLE_MODE / SAMPLE_SIZE are forwarded to the collection loader as its
# sample_mode / sample_size arguments for the two building collections.
SAMPLE_MODE = True
SAMPLE_SIZE = 50000


Note: you may need to restart the kernel to use updated packages.


In [2]:
def get_mongo_client(uri=None):
    """Build a ``MongoClient`` for ``uri``.

    A falsy ``uri`` (``None`` or an empty string) falls back to the
    module-level ``MONGO_URI`` setting.
    """
    target_uri = uri if uri else MONGO_URI
    return MongoClient(target_uri)

def load_collection_as_gdf_from_mongo(client, db_name, coll_name, geom_field='geometry', sample_mode=True, sample_size=10000):
    """Load a MongoDB collection into a GeoDataFrame.

    Parameters
    ----------
    client
        Connected client; the collection is reached as
        ``client[db_name][coll_name]``.
    db_name, coll_name : str
        Database and collection names.
    geom_field : str
        Document field that holds the geometry. ``dict`` values are converted
        with ``shapely.geometry.shape``; any other value is kept untouched.
        When this field is absent, a field named ``'geom'`` is tried instead.
    sample_mode : bool
        When true and the collection holds more than ``sample_size``
        documents, only a random ``$sample`` of ``sample_size`` is loaded.
    sample_size : int
        Size of the random sample used in sample mode.

    Returns
    -------
    geopandas.GeoDataFrame
        * empty frame (no columns) when no document was fetched;
        * frame with a ``'geometry'`` column tagged EPSG:4326 when a geometry
          field was found;
        * plain frame without active geometry otherwise.
    """
    coll = client[db_name][coll_name]
    total = coll.count_documents({})
    print(f'Colección {coll_name}: {total} documentos')

    if sample_mode and total > sample_size:
        print(f'Cargando muestra aleatoria de {sample_size} registros...')
        # When the requested sample is a large share of the collection,
        # $sample falls back to a random sort that is subject to the server's
        # sort memory limit; allowDiskUse lets that sort spill to disk.
        pipeline = [{'$sample': {'size': sample_size}}]
        docs = list(coll.aggregate(pipeline, allowDiskUse=True))
    else:
        docs = list(coll.find({}))

    if not docs:
        print('Colección vacía o no accesible')
        return gpd.GeoDataFrame()

    df = pd.DataFrame(docs)

    # Pick the first geometry-bearing field that is present: the requested
    # one, then the 'geom' fallback. (No WKT field is handled here.)
    source_field = next(
        (field for field in (geom_field, 'geom') if field in df.columns),
        None,
    )
    if source_field is None:
        return gpd.GeoDataFrame(df)

    df['geometry'] = df[source_field].apply(
        lambda g: shape(g) if isinstance(g, dict) else g
    )
    return gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')


In [3]:
# Open the MongoDB connection and keep a handle to the working database
client = get_mongo_client()
db = client[DB_NAME]

# Building footprints follow SAMPLE_MODE / SAMPLE_SIZE; the PDET
# municipalities are requested with sampling switched off.
ms_gdf = load_collection_as_gdf_from_mongo(
    client,
    DB_NAME,
    'microsoft_buildings',
    sample_mode=SAMPLE_MODE,
    sample_size=SAMPLE_SIZE,
)
ggl_gdf = load_collection_as_gdf_from_mongo(
    client,
    DB_NAME,
    'google_buildings',
    sample_mode=SAMPLE_MODE,
    sample_size=SAMPLE_SIZE,
)
pdet_gdf = load_collection_as_gdf_from_mongo(
    client,
    DB_NAME,
    'pdet_municipalities',
    sample_mode=False,
)

print('Cargado: ms=', len(ms_gdf), 'ggl=', len(ggl_gdf), 'pdet=', len(pdet_gdf))

ServerSelectionTimeoutError: localhost:27017: [WinError 10061] No connection could be made because the target machine actively refused it (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 691b4f543a3e6899a65e2295, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [WinError 10061] No connection could be made because the target machine actively refused it (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

## Resumen rápido y limpieza mínima

In [None]:
def dataset_summary(gdf, name='dataset'):
    """Print a quick overview of ``gdf`` under the heading ``name``.

    Reports the row count and the first 20 column names. When a
    ``'geometry'`` column is present, also reports how many rows have
    ``is_valid`` equal to False. A blank separator is printed at the end.
    """
    print(f'--- {name} ---')
    print(f'registros: {len(gdf)}')

    shown_columns = list(gdf.columns)[:20]
    print(f'columnas: {shown_columns}')

    has_geometry = 'geometry' in gdf.columns
    if has_geometry:
        invalid_mask = ~gdf.is_valid
        print(f'geometrías inválidas: {int(invalid_mask.sum())}')

    print('\n')

# Quick overview of each of the three loaded layers
dataset_summary(ms_gdf, name='Microsoft')
dataset_summary(ggl_gdf, name='Google')
dataset_summary(pdet_gdf, name='PDET')

## Conteo por municipio PDET usando centroides (cada edificio -> 1 punto)
Usamos centroides para asignar cada edificio a un único municipio PDET y evitar duplicidad cuando un polígono toca límites.

In [None]:
def _building_centroids(buildings):
    """Return a copy of ``buildings`` with every geometry replaced by its centroid.

    The input frame is not modified. A frame without a 'geometry' column is
    returned as is (same object).
    """
    if 'geometry' not in buildings.columns:
        return buildings
    centroids = buildings.copy()
    centroids['geometry'] = centroids.geometry.centroid
    return centroids


def _as_wgs84(layer, label):
    """Tag ``layer`` as EPSG:4326; on failure report it and return ``layer`` unchanged."""
    try:
        return layer.set_crs(epsg=4326, allow_override=True)
    except Exception as exc:
        # Best effort, as before, but the failure is now visible instead of
        # being swallowed silently.
        print(f'Advertencia: no se pudo asignar EPSG:4326 a {label}: {exc}')
        return layer


def _count_per_municipality(points, municipalities, count_name):
    """Join ``points`` to ``municipalities`` and count points per 'municipio'.

    Returns ``(joined, counts)``: the joined frame and a two-column frame with
    'municipio' and ``count_name``.
    """
    joined = gpd.sjoin(
        points,
        municipalities[['municipio', 'geometry']],
        how='inner',
        predicate='intersects',
    )
    # A centroid lying exactly on a shared border intersects both neighbours.
    # Keep a single match per building so each one is counted once; this
    # relies on the building index being unique.
    joined = joined[~joined.index.duplicated(keep='first')]
    counts = joined.groupby('municipio').size().reset_index(name=count_name)
    return joined, counts


# One point per building (the original building frames stay untouched)
ms_centroids = _as_wgs84(_building_centroids(ms_gdf), 'Microsoft')
ggl_centroids = _as_wgs84(_building_centroids(ggl_gdf), 'Google')
pdet_gdf = _as_wgs84(pdet_gdf, 'PDET')

ms_join, ms_count = _count_per_municipality(ms_centroids, pdet_gdf, 'ms_count')
ggl_join, ggl_count = _count_per_municipality(ggl_centroids, pdet_gdf, 'ggl_count')

# Outer merge keeps municipalities present in only one source; the missing
# side becomes 0 and the counts are cast back to integers (the NaNs created
# by the merge would otherwise leave them as floats).
coverage = (
    ms_count.merge(ggl_count, on='municipio', how='outer')
    .fillna(0)
    .astype({'ms_count': int, 'ggl_count': int})
    .sort_values('municipio')
)
coverage.head()

## Comparación y exportación

In [None]:
# Side-by-side table: number of loaded buildings per source
comparison = pd.DataFrame(
    [
        {'dataset': 'Microsoft', 'num_buildings': len(ms_gdf)},
        {'dataset': 'Google', 'num_buildings': len(ggl_gdf)},
    ]
)

# Ensure the export folder exists, then write both tables as CSV
os.makedirs('results/figures/deliverable_3', exist_ok=True)
coverage.to_csv(
    'results/figures/deliverable_3/coverage_pdet_centroid.csv',
    index=False,
)
comparison.to_csv(
    'results/figures/deliverable_3/comparison_table.csv',
    index=False,
)
print('✅ Resultados guardados en results/figures/deliverable_3/')

## Visualizaciones 


In [None]:
# Export an interactive area histogram for every source that carries an
# 'area_m2' column; sources without it are skipped.
for source_label, file_prefix, buildings in (
    ('Microsoft', 'ms', ms_gdf),
    ('Google', 'ggl', ggl_gdf),
):
    if 'area_m2' not in buildings.columns:
        continue
    fig = px.histogram(
        buildings,
        x='area_m2',
        nbins=100,
        title=f'{source_label}: Distribución de Áreas',
    )
    fig.write_html(f'results/figures/deliverable_3/{file_prefix}_area_histogram.html')

print('Si deseas generar heatmaps o mapas interactivos, ejecuta las celdas específicas a continuación.')


## Histogramas  

In [None]:
# This cell is plain Python: pasted notebook JSON here would raise NameError
# on `null` and never draw anything.
# matplotlib is imported here because the notebook's import cell does not
# include it.
import matplotlib.pyplot as plt


def _plot_area_histogram(buildings, title):
    """Draw a static histogram of ``buildings['area_m2']`` (missing values dropped)."""
    fig, ax = plt.subplots(figsize=(10, 5))
    buildings['area_m2'].dropna().plot(kind='hist', bins=80, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Área (m²)')
    ax.set_ylabel('Frecuencia')
    ax.grid(True)
    plt.show()


# Area histograms, only for the sources that carry an 'area_m2' column
if 'area_m2' in ms_gdf.columns:
    _plot_area_histogram(ms_gdf, 'Distribución Áreas – Microsoft Buildings')

if 'area_m2' in ggl_gdf.columns:
    _plot_area_histogram(ggl_gdf, 'Distribución Áreas – Google Buildings')


## Barras comparativas por municipio

In [None]:
# This cell is plain Python: pasted notebook JSON here would raise NameError
# on `null` and never build the chart.
# Grouped bars: Microsoft vs Google building counts per PDET municipality.
fig = px.bar(
    coverage,
    x='municipio',
    y=['ms_count', 'ggl_count'],
    barmode='group',
    title='Comparación Edificaciones por Municipio PDET (centroides)',
    labels={'value': 'Número de Edificios', 'municipio': 'Municipio'},
    height=600,
)
# Order municipalities by their combined bar height, largest first
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.write_html('results/figures/deliverable_3/pdet_comparative_barplot.html')
fig.show()


## Heatmap Microsoft

In [None]:
# This cell is plain Python: pasted notebook JSON here would raise NameError
# on `null` and never build the map.
from folium.plugins import HeatMap

m_ms = folium.Map(location=[4.5709, -74.2973], zoom_start=6)

# Read the coordinates straight from the geometry column instead of iterating
# rows, skipping missing/empty geometries (they have no usable coordinates).
ms_points = ms_centroids.geometry
ms_points = ms_points[~(ms_points.isna() | ms_points.is_empty)]
ms_heat = [[lat, lon] for lat, lon in zip(ms_points.y, ms_points.x)]

HeatMap(ms_heat, radius=5).add_to(m_ms)
m_ms.save('results/figures/deliverable_3/ms_heatmap.html')
m_ms


## Heatmap Google

In [None]:
# This cell is plain Python: pasted notebook JSON here would raise NameError
# on `null` and never build the map.
# HeatMap is imported here so the cell does not depend on another cell having
# imported it first.
from folium.plugins import HeatMap

m_gg = folium.Map(location=[4.5709, -74.2973], zoom_start=6)

# Read the coordinates straight from the geometry column instead of iterating
# rows, skipping missing/empty geometries (they have no usable coordinates).
gg_points = ggl_centroids.geometry
gg_points = gg_points[~(gg_points.isna() | gg_points.is_empty)]
gg_heat = [[lat, lon] for lat, lon in zip(gg_points.y, gg_points.x)]

HeatMap(gg_heat, radius=5).add_to(m_gg)
m_gg.save('results/figures/deliverable_3/google_heatmap.html')
m_gg


## Mapa comparativo Microsoft vs Google

In [None]:
# This cell is plain Python: pasted notebook JSON here would raise NameError
# on `null` and never build the map.
MAP_SAMPLE_SIZE = 4000   # maximum number of markers drawn per source
MAP_SAMPLE_SEED = 42     # fixed seed so re-running draws the same markers

m_compare = folium.Map(location=[4.5709, -74.2973], zoom_start=6)

# Microsoft centroids in blue, Google centroids in red
for centroids, colour in ((ms_centroids, 'blue'), (ggl_centroids, 'red')):
    subset = centroids.sample(
        min(MAP_SAMPLE_SIZE, len(centroids)),
        random_state=MAP_SAMPLE_SEED,
    )
    for point in subset.geometry:
        # Missing or empty geometries have no coordinates to plot
        if point is None or point.is_empty:
            continue
        folium.CircleMarker(
            location=[point.y, point.x],
            radius=1,
            color=colour,
            fill=True,
            fill_opacity=0.5,
        ).add_to(m_compare)

m_compare.save('results/figures/deliverable_3/compare_ms_google_map.html')
m_compare


## Choropleth de cobertura PDET

In [None]:
# This cell is plain Python: pasted notebook JSON here would raise NameError
# on `null` and never build the map.
# Attach the per-municipality counts to the PDET polygons. Municipalities
# without any matched building get 0; only the count columns are filled so
# unrelated attributes (and the geometry) are never overwritten with 0.
count_columns = ['ms_count', 'ggl_count']
pdet_cov = pdet_gdf.merge(coverage, on='municipio', how='left')
pdet_cov[count_columns] = pdet_cov[count_columns].fillna(0)

fig = px.choropleth(
    pdet_cov,
    geojson=pdet_cov.geometry.__geo_interface__,
    # Pass the index as strings so it compares like-for-like with the
    # GeoJSON feature ids.
    locations=pdet_cov.index.astype(str),
    color='ms_count',
    color_continuous_scale='Viridis',
    title='Cobertura Microsoft por Municipio PDET (centroides)',
)

# Zoom to the plotted municipalities and hide the base map
fig.update_geos(fitbounds='locations', visible=False)
fig.write_html('results/figures/deliverable_3/choropleth_microsoft.html')
fig.show()
