# DENUE information

@roman

6 July, 2024

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Polygon, MultiPolygon, Point
import h3
from tqdm import tqdm
from dotenv import load_dotenv
from scipy.spatial import KDTree
# dbscan
from sklearn.cluster import DBSCAN
from INEGIpy import DENUE, MarcoGeoestadistico

In [None]:
# Let pandas print up to 100 columns before truncating the display
pd.set_option('display.max_columns', 100)

# Read the local .env file into the process environment
load_dotenv()

# INEGI clients: DENUE is built with the token stored under API_INEGI,
# the geostatistical-framework client is instantiated without arguments
denue_api = DENUE(token=os.environ.get('API_INEGI'))
inegi_api = MarcoGeoestadistico()


In [None]:
# Notebook-wide parameters.

# Workforce-size labels, listed from smallest to largest. get_denue_units
# uses this list as the category order of the ordered `estrato_category`
# column, so the order here is what makes `>=` / `<=` filters meaningful.
ESTRATO_CATEGORIES = [
    '0 a 5 personas',
    '6 a 10 personas',
    '11 a 30 personas',
    '31 a 50 personas',
    '51 a 100 personas',
    '101 a 250 personas',
    '251 y más personas',
]

# Columns kept (in this order) by get_denue_units after cleaning.
IMPORTANT_VARS_DENUE = [
    'CLEE',
    'Nombre',
    'Razon_social',
    'Clase_actividad',
    'estrato_category',
    'CLASE_ACTIVIDAD_ID',
    'SUBRAMA_ACTIVIDAD_ID',
    'AreaGeo',
    'state_id',
    'Fecha_Alta',
    'geometry',
]

---
# Data

## Properties

In [None]:
# read the properties table from the interim data folder (path is relative to
# the notebook's working directory); it is read with geopandas, so it is
# expected to carry a geometry column
gdf_properties = gpd.read_parquet("../../data/interim/cleaned_data_s6.parquet")
# print column names, dtypes and non-null counts
gdf_properties.info()

## Mexico

In [None]:
# get mexico shapes; plot_denues draws this table as the white/black basemap
# NOTE(review): presumably one row per state (entidad) — confirm with the
# shape displayed below
gdf_mexico = inegi_api.Entidades()
gdf_mexico.shape

---
# DENUE's Data

In [None]:
# search denue's data
def get_denue_units(list_of_names, activity_id, max_units=100_000):
    """Search DENUE by keyword and return a cleaned table of fixed units.

    One ``BuscarAreaAct`` request is sent per keyword with
    ``clave_area='00'``, ``estrato='0'`` and records ``0..max_units``
    (NOTE(review): presumably "whole country" and "all sizes" — confirm
    against the DENUE API docs). The responses are stacked and then:

    * restricted to rows whose ``CLASE_ACTIVIDAD_ID`` is in ``activity_id``,
    * de-duplicated on ``CLEE`` (first occurrence wins),
    * restricted to ``Tipo == 'Fijo'``,
    * given the derived columns ``estrato_category`` (ordered categorical
      built from ``Estrato``), ``state_id`` (first two characters of
      ``AreaGeo``), a stripped/upper-cased ``Razon_social`` and a
      ``Fecha_Alta`` parsed as ``%Y-%m`` (unparseable values become NaT),
    * reduced to the columns in ``IMPORTANT_VARS_DENUE``.

    Parameters
    ----------
    list_of_names : iterable of str
        Keywords to search, one request each.
    activity_id : list-like of str
        Accepted ``CLASE_ACTIVIDAD_ID`` values.
    max_units : int, default 100_000
        Value passed as ``registro_final`` in every request.

    Returns
    -------
    The cleaned table with a fresh 0..n-1 index.
    """
    # one request per keyword, with a progress bar
    responses = [
        denue_api.BuscarAreaAct(
            nombre=keyword,
            clave_area='00',
            registro_inicial=0,
            registro_final=max_units,
            estrato='0',
        )
        for keyword in tqdm(list_of_names)
    ]
    stacked = pd.concat(responses, axis=0)

    # keep the requested activity classes, one row per CLEE
    in_requested_classes = stacked['CLASE_ACTIVIDAD_ID'].isin(activity_id)
    units = stacked.loc[in_requested_classes].drop_duplicates('CLEE', keep='first')

    # we always want fixed units
    units = units.query("Tipo == 'Fijo'")

    # derived / normalised columns
    units = units.assign(
        estrato_category=lambda d: pd.Categorical(
            d['Estrato'], categories=ESTRATO_CATEGORIES, ordered=True
        ),
        Razon_social=lambda d: d['Razon_social'].str.strip().str.upper(),
        state_id=lambda d: d['AreaGeo'].str[:2],
        Fecha_Alta=lambda d: pd.to_datetime(
            d['Fecha_Alta'], format='%Y-%m', errors='coerce'
        ),
    )

    # keep only the columns of interest
    return units.loc[:, IMPORTANT_VARS_DENUE].reset_index(drop=True)


def plot_denues(gdf, title='DENUE'):
    """Draw the units in `gdf` as small markers over the `gdf_mexico` basemap.

    Parameters
    ----------
    gdf : geo table to plot on top of the basemap.
    title : str
        Prefix of the figure title (rendered as "<title> units").
    """
    _, axis = plt.subplots(figsize=(8, 8))

    # basemap first, units on top
    gdf_mexico.plot(ax=axis, color='white', edgecolor='black')
    gdf.plot(ax=axis, color='C0', markersize=1)

    axis.set_title(f'{title} units')

    # hide the coordinate ticks on both axes
    axis.set_xticks([])
    axis.set_yticks([])

    plt.show()


def denue_eda(gdf, title='DENUE'):
    """Print and plot a quick exploratory summary of a DENUE units table.

    Output, in order: the ten most frequent `Razon_social` values (as
    shares), the share of units per `estrato_category`, the map drawn by
    `plot_denues`, the cumulative number of units by `Fecha_Alta`, and a bar
    chart with the share of units per `state_id`.

    Parameters
    ----------
    gdf : table holding the columns named above.
    title : str
        Label used in the plot titles.

    Returns
    -------
    None
    """
    # 1) most frequent legal names
    print(f"\n {'='*10}Razon social {'='*10}")
    razon_social_share = gdf['Razon_social'].value_counts(normalize=True)
    print(razon_social_share.head(10))

    # 2) size distribution, in category order
    print(f"\n {'='*10}Estrato {'='*10}")
    estrato_share = gdf['estrato_category'].value_counts(normalize=True)
    print(estrato_share.sort_index())

    # 3) map
    plot_denues(gdf, title)

    # 4) cumulative units over registration date
    fig, ax = plt.subplots(figsize=(8, 4))
    units_over_time = gdf.groupby('Fecha_Alta').size().cumsum()
    units_over_time.plot(ax=ax, drawstyle='steps-post')
    ax.set_title(f'{title} units time series')
    plt.xticks(rotation=90)
    ax.set_xlabel('Date')
    ax.set_ylabel('Units')
    # anchor the y axis at zero, leave the top automatic
    ax.set_ylim(0, None)
    plt.show()

    # 5) share of units per state
    fig, ax = plt.subplots(figsize=(8, 4))
    state_share = gdf['state_id'].value_counts(normalize=True).sort_index()
    state_share.plot(kind='bar', ax=ax)
    ax.set_title(f'{title} units by state')
    plt.xticks(rotation=90)
    ax.set_xlabel('State')
    ax.set_ylabel('Percentage')
    # render the y ticks as percentages
    ax.yaxis.set_major_formatter(lambda x, _: f'{x:.0%}')
    plt.show()
    return None

    


## Supermarkets

### Fetch

In [None]:
# get supermarkets: keyword(s) sent to the DENUE search
list_names = [
    'supermercado'
]

# activity classes kept after the search
# (named `activity_ids`, like in every other fetch cell of this notebook)
activity_ids = [
    '462111',  # super comercio al por menor
]

# fetch
gdf_supermarkets = get_denue_units(list_names, activity_ids)

# see the data: table size and five random rows
print(gdf_supermarkets.shape)
gdf_supermarkets.sample(5)

In [None]:
# legal names (Razon_social, already stripped and upper-cased by
# get_denue_units) of the supermarket operators we keep
important_supermarkets = [
    'NUEVA WAL MART DE MEXICO S DE RL DE CV',
    'TIENDAS SORIANA SA DE CV',
    'TIENDAS CHEDRAUI SA DE CV',
    'CASA LEY SAPI DE CV',
    'QAR SUPERMERCADOS SAPI DE CV',
    'CORPORACION SANCHEZ SA DE CV',
    'COMERCIAL CITY FRESKO S DE RL DE CV',
    'OPERADORA FUTURAMA SA DE CV',
    'SUPERMERCADOS INTERNACIONALES HEB SA DE CV',
    'SUPERISSSTE',
    'INSTITUTO MEXICANO DEL SEGURO SOCIAL',
    'SUPER SAN FRANCISCO DE ASIS SA DE CV',
    'COSTCO DE MEXICO SA DE CV',
]

# keep the listed operators, and only units of at least '11 a 30 personas'
gdf_supermarkets = (
    gdf_supermarkets
    .loc[lambda d: d['Razon_social'].isin(important_supermarkets)]
    .loc[lambda d: d['estrato_category'].ge('11 a 30 personas')]
    .reset_index(drop=True)
)

# size after filtering
print(gdf_supermarkets.shape)

### EDA

In [None]:
# eda: run the shared report (printed shares + plots) on the filtered supermarkets
denue_eda(gdf_supermarkets, 'Supermarkets')

## Hospitals

In [None]:
# hospitals: search keyword(s) and accepted activity classes
list_names = ['hospitales']
activity_ids = [
    '622111',  # private hospitals
    '622112',  # public hospitals
]

# query DENUE and clean the result
gdf_hospitals = get_denue_units(list_names, activity_ids)

# table size and five random rows
print(gdf_hospitals.shape)
gdf_hospitals.sample(5)

In [None]:
# keep only hospitals in the largest size category
# ('251 y más personas' is the last entry of ESTRATO_CATEGORIES)
gdf_hospitals = (
    gdf_hospitals
    .loc[lambda d: d['estrato_category'].ge('251 y más personas')]
    .reset_index(drop=True)
)

# size after filtering
print(gdf_hospitals.shape)

### EDA

In [None]:
# eda: run the shared report (printed shares + plots) on the filtered hospitals
denue_eda(gdf_hospitals, 'Hospitals')

## Metro

This category needs more manual ("artisanal") cleaning than the others.

### Fetch

In [None]:
# metro: search keyword(s) and accepted activity class
list_names = ['metro']
activity_ids = [
    '485114',  # metro
]

# query DENUE and clean the result
gdf_metro = get_denue_units(list_names, activity_ids)

# table size and five random rows
print(gdf_metro.shape)
gdf_metro.sample(5)

### Filtering

In [None]:
# see cve_ents: count metro units by the first two characters of AreaGeo
# (the same prefix that get_denue_units stores as state_id)
gdf_metro['AreaGeo'].str[:2].value_counts()

In [None]:
# discard units whose state code (first two characters of AreaGeo) is 11 or 14
# NOTE(review): presumably Guanajuato and Jalisco — confirm why they are excluded
mask = gdf_metro['AreaGeo'].str.slice(0, 2).isin(['11', '14'])
gdf_metro = gdf_metro.loc[~mask].reset_index(drop=True)
gdf_metro.shape

In [None]:
# explore: interactive map of the metro units (name + location only)
gdf_metro.loc[:, ['Nombre', 'geometry']].explore(tiles='CartoDB positron')

^^^
Several metro units share (almost) the same location, so we need to cluster them into single stations.

### Clean

In [None]:
# Project to EPSG:6372, store the centroid coordinates, then go back to
# EPSG:4326. Despite their names, `longitude`/`latitude` hold the projected
# x/y read while the table is in EPSG:6372 — the DBSCAN cell below treats
# its `eps` on these columns as metres.
gdf_metro = (
    gdf_metro
    .to_crs('EPSG:6372')
    .assign(
        longitude=lambda g: g.centroid.x,
        latitude=lambda g: g.centroid.y,
    )
    .to_crs('EPSG:4326')
)

In [None]:
# DBSCAN settings: neighbourhood radius (meters, since the coordinates are
# the projected ones computed above) and minimum points to form a cluster
eps = 250
min_samples = 2

# fit on the projected coordinates and keep the label of every unit
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
gdf_metro['cluster'] = dbscan.fit(gdf_metro[['latitude', 'longitude']]).labels_

In [None]:
# see clusters: number of units per DBSCAN label
# (-1 is the label DBSCAN gives to points that belong to no cluster)
gdf_metro['cluster'].value_counts()

In [None]:
# Collapse every DBSCAN cluster into one station placed at the cluster centre.

# units labelled -1 belong to no cluster: keep them as they are
gdf_metro_unique_clusters = gdf_metro.query("cluster == -1").copy()

# clustered units: average the projected coordinates over ALL members of each
# cluster *before* keeping one representative row. Averaging after
# drop_duplicates would see a single row per cluster and simply return that
# row's own coordinates.
gdf_metro_multiple_clusters = gdf_metro.query("cluster != -1").copy()
gdf_metro_multiple_clusters[['longitude', 'latitude']] = (
    gdf_metro_multiple_clusters
    .groupby('cluster')[['longitude', 'latitude']]
    .transform('mean')
)
gdf_metro_multiple_clusters = (
    gdf_metro_multiple_clusters.drop_duplicates('cluster', keep='first')
)

# join
gdf_metro2 = pd.concat([gdf_metro_unique_clusters, gdf_metro_multiple_clusters], axis=0)

# set new geometry: `longitude`/`latitude` are EPSG:6372 coordinates, so the
# points are built in that CRS. The table is rebuilt from a plain DataFrame so
# the result does not depend on how geopandas reconciles the old EPSG:4326
# geometry column with an explicit `crs=` argument.
gdf_metro2 = gpd.GeoDataFrame(
    pd.DataFrame(gdf_metro2.drop(columns='geometry')),
    geometry=gpd.points_from_xy(gdf_metro2['longitude'], gdf_metro2['latitude']),
    crs='EPSG:6372',
)

# change crs back to the one used by the other categories
gdf_metro2 = gdf_metro2.to_crs('EPSG:4326')

# number of stations after collapsing the clusters
gdf_metro2.shape


In [None]:
# explore: interactive map of the de-duplicated stations, to check the clustering visually
gdf_metro2.loc[:, ['Nombre', 'geometry']].explore(tiles='CartoDB positron')

### EDA

In [None]:
# eda: run the shared report (printed shares + plots) on the de-duplicated metro stations
denue_eda(gdf_metro2, 'Metro')

## Schools

In [None]:
# schools: several keywords, since institutions are named in different ways
list_names = ['universidad', 'escuela', 'colegio', 'instituto', 'campus']

# accepted activity classes
# NOTE(review): in SCIAN both codes sit under 6113 (higher education), so the
# second one is public higher education, not generic public schools — confirm
activity_ids = [
    '611311',  # private universities
    '611312',  # public universities
]

# query DENUE and clean the result
gdf_schools = get_denue_units(list_names, activity_ids)

# table size and five random rows
print(gdf_schools.shape)
gdf_schools.sample(5)

In [None]:
# keep big institutions (largest size category) that have both a legal name
# and a commercial name
gdf_schools = (
    gdf_schools
    .loc[lambda d: d['estrato_category'].ge('251 y más personas')]
    .loc[lambda d: d['Razon_social'].ne('') & d['Nombre'].ne('')]
    .reset_index(drop=True)
)

# size after filtering and five random rows
print(gdf_schools.shape)
gdf_schools.sample(5)

### EDA

In [None]:
# count CLASE_ACTIVIDAD_ID: how many of the kept schools fall in each activity class
gdf_schools['CLASE_ACTIVIDAD_ID'].value_counts()

In [None]:
# general eda: run the shared report (printed shares + plots) on the filtered schools
denue_eda(gdf_schools, 'Schools')

## Restaurants

In [None]:
# restaurants: search keywords and accepted activity classes
list_names = ['restaurante', 'cafeteria', 'sushi', 'comida']
activity_ids = [
    '722511',  # restaurants
    '722512',  # seafood restaurants
]

# query DENUE and clean the result
gdf_restaurants = get_denue_units(list_names, activity_ids)

# table size
print(gdf_restaurants.shape)

In [None]:
# build the filter step by step: named units of medium size
mask = gdf_restaurants['Nombre'].ne('')  # known name
mask &= gdf_restaurants['Razon_social'].ne('')  # known razon social
# between '6 a 10 personas' and '51 a 100 personas', both included
mask &= gdf_restaurants['estrato_category'].ge('6 a 10 personas')
mask &= gdf_restaurants['estrato_category'].le('51 a 100 personas')

gdf_restaurants = gdf_restaurants.loc[mask].reset_index(drop=True)

# size after filtering and five random rows
print(gdf_restaurants.shape)
gdf_restaurants.sample(5)

### EDA

In [None]:
# general eda: run the shared report (printed shares + plots) on the filtered restaurants
denue_eda(gdf_restaurants, 'Restaurants')

## Concat All

In [None]:
# append all into one table, tagging every row with its source category.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source frame keeps its own index, so the same labels would repeat across
# categories in the combined (and saved) table.
gdf_denue = pd.concat([
    gdf_supermarkets.assign(category='supermarkets'),
    gdf_hospitals.assign(category='hospitals'),
    gdf_metro2.assign(category='metro'),
    gdf_schools.assign(category='schools'),
    gdf_restaurants.assign(category='restaurants')
], axis=0, ignore_index=True)

# delete longitude, latitude and cluster: helper columns that only the metro
# table carries
gdf_denue = gdf_denue.drop(columns=['longitude', 'latitude', 'cluster'])

# see: table size and five random rows
print(gdf_denue.shape)
gdf_denue.sample(5)

In [None]:
# see crs of the combined table (the "Count" section reprojects it to
# EPSG:6372 before measuring distances)
gdf_denue.crs

In [None]:
# units per state, stacked by category
(
    gdf_denue
    .groupby(['state_id', 'category'])
    .size()
    .unstack()
    .plot(kind='bar', stacked=True, figsize=(8, 4))
)

In [None]:
# plot every kept unit, all categories together, over the Mexico basemap
plot_denues(gdf_denue, 'All DENUE')

In [None]:
# count unique clee; compare with the row count printed above to spot units
# that ended up in more than one category
gdf_denue['CLEE'].nunique()

## Save

In [None]:
# output folder for miscellaneous data; created if it does not exist yet
dir_save = '../../data/misc'
os.makedirs(dir_save, exist_ok=True)

# write the combined DENUE table as parquet
gdf_denue.to_parquet(dir_save + '/denue_data.parquet')

---
# Count Of Units

In [None]:
# NOTE(review): `dsfas` is not defined anywhere in the visible notebook, so
# this cell raises NameError. Presumably a deliberate stop so that "Run All"
# halts before the counting section — confirm, and consider replacing it with
# an explicit, self-explanatory raise.
dsfas

## Count

In [None]:
# define function to count the points within r km of each property
def count_denues(gdf_props, gdf_denues, r=1):
    """Count, for every row of `gdf_props`, the rows of `gdf_denues` within `r` km.

    Both tables must expose `longitude` and `latitude` columns. The radius is
    multiplied by 1,000 before the search, so the coordinates are expected to
    be in meters (the notebook fills them from EPSG:6372 centroids).

    Parameters
    ----------
    gdf_props : table with the query points.
    gdf_denues : table with the points being counted.
    r : number, default 1
        Search radius in kilometers.

    Returns
    -------
    One count per row of `gdf_props`, in the same order.
    """
    coordinate_columns = ['longitude', 'latitude']

    # index the units once, then query every property against the tree
    tree = KDTree(data=gdf_denues[coordinate_columns])
    radius_in_meters = r * 1_000

    return tree.query_ball_point(
        gdf_props[coordinate_columns],
        r=radius_in_meters,
        workers=-1,  # use all available cores
        return_length=True,  # only the number of neighbours, not their indices
    )

In [None]:
# Reproject both tables to EPSG:6372 and store the centroid coordinates in
# the columns that count_denues reads. Despite their names, `longitude` and
# `latitude` hold the projected x/y of that CRS.
gdf_properties = (
    gdf_properties
    .to_crs('EPSG:6372')
    .assign(
        longitude=lambda g: g.centroid.x,
        latitude=lambda g: g.centroid.y,
    )
)

gdf_denue = (
    gdf_denue
    .to_crs('EPSG:6372')
    .assign(
        longitude=lambda g: g.centroid.x,
        latitude=lambda g: g.centroid.y,
    )
)

In [None]:
# search radius, in km, used for every category
dict_activities_search = {
    'supermarkets': 1,
    'hospitals': 5,
    'metro': 1,
    'schools': 1,
    'restaurants': 1,
}

# one new column per category: number of units of that category within the
# category's radius of each property
for category, radius in tqdm(dict_activities_search.items()):
    gdf_properties[f'count_{category}'] = count_denues(
        gdf_props=gdf_properties,
        gdf_denues=gdf_denue.loc[gdf_denue['category'].eq(category)],
        r=radius,
    )

## EDA

In [None]:
# summary statistics of every count_* column created above
vars2describe = gdf_properties.filter(regex='^count_').columns
gdf_properties[vars2describe].describe()

In [None]:
# share of properties with at least one unit nearby, one column per category
(gdf_properties[vars2describe] > 0).mean().to_frame().T

---
# Sandbox