# Procesamiento de la base de datos del DANE

In [0]:
# List the paths of the entries stored under the isochrone shapefile folder in DBFS.
[entrada.path for entrada in dbutils.fs.ls("/FileStore/tables/shp/isocronas/")]

In [0]:
from pyspark.sql import SparkSession
from pyspark import StorageLevel
from sedona.spark import *
from sedona.utils import SedonaKryoRegistrator, KryoSerializer

from sedona.register.geo_registrator import SedonaRegistrator

# Spark + Sedona configuration for geographic data.
# The session uses Kryo serialization with Sedona's Kryo registrator.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Sedona App")
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    # Spark requires this setting to stay below 2048m; the former "150000m"
    # is rejected when a fresh session is built, so use the largest valid value.
    .config("spark.kryoserializer.buffer.max", "2047m")
    .getOrCreate()
)
# Disabled alternatives kept for reference:
#   .config("sedona.global.charset", "utf8")   (set later through the SparkContext)
#   SedonaRegistrator.registerAll()

# Register Sedona's types and functions on the session.
SedonaContext.create(spark)

In [0]:
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.utils.adapter import Adapter

sc = spark.sparkContext
sc.setSystemProperty("sedona.global.charset", "utf8")


def _load_layer(ruta):
    """Read the shapefile folder at ``ruta`` into a spatial RDD and apply the
    EPSG:4326 -> EPSG:4326 CRS transform this notebook runs on every layer
    (geographic coordinates)."""
    capa = ShapefileReader.readToGeometryRDD(sc, ruta)
    capa.CRSTransform('epsg:4326', 'epsg:4326')
    return capa


# 2018 population census (DANE).
censo = _load_layer("dbfs:/FileStore/tables/DANE/censo2018/")
# Isochrones.
Isocrona = _load_layer("dbfs:/FileStore/tables/shp/isocronas/")
# Public space: parks, squares and small plazas.
ep = _load_layer("dbfs:/FileStore/tables/EP/")

# Effective public space as defined by the public-space observatory: the sum of
# the areas of parks, squares, small plazas, green zones, real-estate heritage
# lots, roadways, sidewalks, medians and bike lanes.
epf_oep = _load_layer("dbfs:/FileStore/tables/EPF/")

In [0]:
from sedona.sql.st_functions import ST_Centroid


def _to_centroids(capa):
    """Replace every geometry of ``capa`` with its centroid.

    Returns the intermediate DataFrame, the centroid DataFrame and the
    resulting spatial RDD (with the EPSG:4326 transform re-applied).
    """
    original_df = Adapter.toDf(capa, spark)
    puntos_df = original_df.withColumn("geometry", ST_Centroid(original_df["geometry"]))
    puntos = Adapter.toSpatialRdd(puntos_df, "geometry")
    puntos.CRSTransform('epsg:4326', 'epsg:4326')
    return original_df, puntos_df, puntos


# Polygons are collapsed to centroid points to avoid memory overflows on the cluster.
censo_df, censopoint_df, censo = _to_centroids(censo)
# The effective-public-space layer must be centroids as well.
epf_oep_df, epf_point_df, epf = _to_centroids(epf_oep)

In [0]:
#Isocrona.rawSpatialRDD.map(lambda x: x.getUserData())
#Isocrona.map(lambda x: x[0].geom.centroid).collect()

In [0]:
from sedona.utils.adapter import Adapter

# DataFrame views of every layer; only their column names are used below.
censo_df = Adapter.toDf(censo, spark)
Isocrona_df = Adapter.toDf(Isocrona, spark)
ep_df = Adapter.toDf(ep, spark)
epf_df = Adapter.toDf(epf, spark)

# Attribute names of each layer, without the geometry column.
c = list(censo_df.columns)
c.remove('geometry')
i = list(Isocrona_df.columns)
i.remove('geometry')

# Column layout of the census join: geometry, isochrone attributes, census attributes.
atributos_join = ['geom'] + i + c

# Column layout of the join against public space (parks, squares, plazas).
e = list(ep_df.columns)
e.remove('geometry')
atributosEP_join = ['geom'] + i + e

# Column layout of the join against total public space.
epd_fields = list(epf_df.columns)
epd_fields.remove('geometry')
atributosEPF_join = ['geom'] + i + epd_fields

In [0]:
# GridType was previously reachable here only through the wildcard
# `from sedona.spark import *`; import it explicitly so this cell does not
# depend on that side effect.
from sedona.core.enums import GridType
from sedona.core.spatialOperator import JoinQuery

# Spatial partitioning: the census layer is analyzed and split with a KDB-tree
# into 4 partitions; every other layer reuses the census partitioner so the
# joins below operate on identically partitioned RDDs.
censo.analyze()
censo.spatialPartitioning(GridType.KDBTREE, 4)
Isocrona.spatialPartitioning(censo.getPartitioner())
ep.spatialPartitioning(censo.getPartitioner())
epf.spatialPartitioning(censo.getPartitioner())

In [0]:
from sedona.core.enums import GridType
from sedona.core.enums import IndexType
from sedona.core.spatialOperator import JoinQuery

using_index = True

# Flat spatial join of each layer against the isochrones; the last argument
# (True) is passed as the boundary-intersection flag.
result_censo, result_ep, result_ept = (
    JoinQuery.SpatialJoinQueryFlat(capa_objetos, Isocrona, using_index, True)
    for capa_objetos in (censo, ep, epf)
)

In [0]:
#result.sample(withReplacement=False, fraction=0.01, seed=3).map(lambda x: x[1]).collect()

In [0]:
import geopandas as gpd


def _pairs_to_gdf(pares, columnas):
    """Collect a join result on the driver and wrap it in a GeoDataFrame.

    Each joined pair becomes one row: the geometry of the pair's second
    element, followed by the tab-separated user data of the first element and
    then of the second one. ``columnas`` names those fields; 'geom' is the
    geometry column and the CRS is set to EPSG:4326.
    """
    filas = pares.map(
        lambda par: [par[1].geom, *par[0].userData.split("\t"), *par[1].userData.split("\t")]
    ).collect()
    return gpd.GeoDataFrame(filas, columns=columnas, geometry="geom", crs="epsg:4326")


# Census joined with isochrones.
valores_censo = _pairs_to_gdf(result_censo, atributos_join)
# Public space (parks, squares, plazas) joined with isochrones.
valores_ep = _pairs_to_gdf(result_ep, atributosEP_join)
# Total public space joined with isochrones.
valores_ept = _pairs_to_gdf(result_ept, atributosEPF_join)

In [0]:
# Interactive map of the total-public-space rows for layer 'E1' and the 5-minute isochrone.
valores_ept[valores_ept['layer'].eq('E1') & valores_ept['Tiempo'].eq('5 min')].explore()
#.map(lambda x: [x[1].geom, *x[0].userData.split("\t"), *x[1].userData.split("\t")])

In [0]:
# Columns kept from the census/isochrone join, in output order.
columnas_censo2018 = [
    # isochrone attributes
    'group_inde', 'Tiempo', 'layer', 'Linea',
    # census identifiers and geography
    'COD_DANE_A', 'ZU_CDIVI', 'SETU_CCDGO', 'SETU_CCNCT', 'SECU_CCDGO', 'SECU_CCNCT',
    'MANZ_CCDGO', 'AG_CCDGO', 'DATO_ANM', 'AREA', 'LATITUD', 'LONGITUD', 'DENSIDAD',
    'CTNENCUEST',
    # TP9_* fields
    'TP9_1_USO', 'TP9_2_USO', 'TP9_3_USO', 'TP9_4_USO',
    'TP9_2_1_MI', 'TP9_2_2_MI', 'TP9_2_3_MI', 'TP9_2_4_MI', 'TP9_2_9_MI',
    'TP9_3_1_NO', 'TP9_3_2_NO', 'TP9_3_3_NO', 'TP9_3_4_NO', 'TP9_3_5_NO',
    'TP9_3_6_NO', 'TP9_3_7_NO', 'TP9_3_8_NO', 'TP9_3_9_NO', 'TP9_3_10_N', 'TP9_3_99_N',
    # dwellings and households
    'TVIVIENDA', 'TP14_1_TIP', 'TP14_2_TIP', 'TP14_3_TIP', 'TP16_HOG',
    # TP19_EE_* fields
    'TP19_EE_E1', 'TP19_EE_E2', 'TP19_EE_E3', 'TP19_EE_E4', 'TP19_EE_E5', 'TP19_EE_E6',
    'TP19_EE_E9',
    # people
    'TP27_PERSO', 'PERSONAS_L', 'PERSONAS_S', 'TP32_1_SEX', 'TP32_2_SEX',
    'CD_LC_CM', 'NMB_LC_CM',
    # TP34_*_EDA fields
    'TP34_1_EDA', 'TP34_2_EDA', 'TP34_3_EDA', 'TP34_4_EDA', 'TP34_5_EDA',
    'TP34_6_EDA', 'TP34_7_EDA', 'TP34_8_EDA', 'TP34_9_EDA',
    'geom',
]
censo2018 = valores_censo[columnas_censo2018]

In [0]:
# Count columns come out of the join as text; cast them to float.
columnas_float = [
    'TP32_1_SEX', 'TP32_2_SEX', 'TP27_PERSO', 'TP16_HOG',
    'TP34_1_EDA', 'TP34_2_EDA', 'TP34_3_EDA', 'TP34_4_EDA', 'TP34_5_EDA',
    'TP34_6_EDA', 'TP34_7_EDA', 'TP34_8_EDA', 'TP34_9_EDA',
    'TP19_EE_E1', 'TP19_EE_E2', 'TP19_EE_E3', 'TP19_EE_E4', 'TP19_EE_E5',
    'TP19_EE_E6', 'TP19_EE_E9',
]
censo2018 = censo2018.astype(dict.fromkeys(columnas_float, 'float'))

In [0]:
# Cast the "AREA M2" column of both public-space frames to float.
for capa_areas in (valores_ep, valores_ept):
    capa_areas["AREA M2"] = capa_areas["AREA M2"].astype('float')

In [0]:

import numpy as np
import pandas as pd


def _sum_by_isochrone(datos, claves, columna, prefijo):
    """Sum ``columna`` per ``claves`` + 'Tiempo' and spread 'Tiempo' into columns.

    Args:
        datos: Frame holding ``claves``, 'Tiempo' and ``columna``.
        claves: List of grouping columns (besides 'Tiempo').
        columna: Numeric column to add up.
        prefijo: Prefix of the output columns.

    Returns:
        A plain DataFrame with ``claves`` as columns and the 5/10/15/20 min
        sums renamed to ``<prefijo>_<N>Min``.
    """
    ancho = datos.groupby(by=claves + ['Tiempo']).agg({columna: 'sum'}).unstack()
    # to_records() flattens the (columna, Tiempo) column pairs into
    # tuple-looking strings, which are the keys the rename below matches.
    plano = pd.DataFrame(ancho.to_records())
    return plano.rename(columns={
        f"('{columna}', '{minutos} min')": f"{prefijo}_{minutos}Min"
        for minutos in (5, 10, 15, 20)
    })


# EPR: real public space (parks, squares, small plazas) area per line/station/isochrone.
valores_ep["AREA M2"] = valores_ep["AREA M2"].fillna(value=0)
AreasXIso = _sum_by_isochrone(valores_ep, ['Linea', 'layer'], 'AREA M2', 'Area_EPR')

# EPT: total public space area per line/station/isochrone.
valores_ept["AREA M2"] = valores_ept["AREA M2"].fillna(value=0)
epfAreasXIso = _sum_by_isochrone(valores_ept, ['Linea', 'layer'], 'AREA M2', 'Area_EPT')

# People per station/isochrone.
censo2018["TP27_PERSO"] = censo2018["TP27_PERSO"].fillna(value=0)
personasXIso = _sum_by_isochrone(censo2018, ['layer'], 'TP27_PERSO', 'personas')

# NOTE(review): both merges key on 'layer' only, which assumes a station code
# is unique across lines; if it is not, rows get cross-matched — confirm.
EP_process = AreasXIso.merge(personasXIso, how='inner', on='layer')
EP_process = EP_process.merge(epfAreasXIso, how='inner', on='layer')

# Square metres per person for each indicator and isochrone band.
# NOTE(review): a band with 0 people yields inf here, which fillna() below does
# not touch — decide how those stations should be reported.
for indicador in ('EPR', 'EPT'):
    for minutos in (20, 15, 10, 5):
        EP_process[f'{indicador}_{minutos}Min'] = (
            EP_process[f'Area_{indicador}_{minutos}Min'] / EP_process[f'personas_{minutos}Min']
        )

EP_process = EP_process.rename(columns={'Linea_x': 'Linea', 'layer': 'Estacion'})
# fillna() returns a new frame; the original call dropped that result, so the
# NaNs were never replaced. Keep the filled frame.
EP_process = EP_process.fillna(value=0)

# Totals across the four isochrone bands.
EP_process['EPR'] = EP_process[['EPR_10Min', 'EPR_15Min', 'EPR_20Min', 'EPR_5Min']].sum(axis=1)  # real public space per person
EP_process['EPT'] = EP_process[['EPT_10Min', 'EPT_15Min', 'EPT_20Min', 'EPT_5Min']].sum(axis=1)  # total public space per person
EP_process['AREA_EPT'] = EP_process[['Area_EPT_10Min', 'Area_EPT_15Min', 'Area_EPT_20Min', 'Area_EPT_5Min']].sum(axis=1)  # total public space area
EP_process['AREA_EPR'] = EP_process[['Area_EPR_10Min', 'Area_EPR_15Min', 'Area_EPR_20Min', 'Area_EPR_5Min']].sum(axis=1)  # real public space area
EP_process['PersonasT'] = EP_process[['personas_10Min', 'personas_15Min', 'personas_20Min', 'personas_5Min']].sum(axis=1)  # total people

# The second merge duplicated 'Linea'; drop the right-hand copy.
EP_process = EP_process.drop(columns=['Linea_y'])

In [0]:
# Display the computed EP_process table.
EP_process

In [0]:
# Alias DataFrame.iteritems to DataFrame.items before handing the pandas frame
# to Spark (presumably a shim for Spark versions that still call iteritems —
# confirm against the cluster's pandas/Spark versions).
setattr(pd.DataFrame, "iteritems", pd.DataFrame.items)

# Expose the indicators to SQL cells as a temporary view.
EP_calculo = spark.createDataFrame(data=EP_process)
EP_calculo.createOrReplaceTempView("espacio_publico_efectivo")

In [0]:
%sql
-- Persist the espacio_publico_efectivo temp view as the table ep_efectivo.
CREATE OR REPLACE TABLE ep_efectivo AS
SELECT *
FROM espacio_publico_efectivo;

In [0]:
%sql
-- Preview the persisted table.
SELECT *
FROM ep_efectivo

# Dashboard: caracterización poblacional

In [0]:
# Census columns published to the population dashboard, in output order.
columnas_poblacion = [
    "Tiempo", "layer", "Linea",
    "TP32_1_SEX", "TP32_2_SEX", "TP27_PERSO", "TP16_HOG",
    "TP34_1_EDA", "TP34_2_EDA", "TP34_3_EDA", "TP34_4_EDA", "TP34_5_EDA",
    "TP34_6_EDA", "TP34_7_EDA", "TP34_8_EDA", "TP34_9_EDA",
    "TP19_EE_E1", "TP19_EE_E2", "TP19_EE_E3", "TP19_EE_E4", "TP19_EE_E5", "TP19_EE_E6",
    "LATITUD", "LONGITUD", "TP19_EE_E9", "geom",
]

# Census field code -> dashboard column name.
nombres_dashboard = {
    "TP32_1_SEX": 'Hombres',
    "TP32_2_SEX": 'Mujeres',
    "TP27_PERSO": 'personas',
    "TP16_HOG": 'hogares',
    **{f"TP34_{n}_EDA": f"Edad{n}" for n in range(1, 10)},
    **{f"TP19_EE_E{n}": f"est{n}" for n in range(1, 7)},
    "TP19_EE_E9": 'SE',
}

Population = censo2018[columnas_poblacion].rename(columns=nombres_dashboard)

# Expose the frame to SQL cells as a temporary view.
desc_Population_pdf = spark.createDataFrame(data=Population)
desc_Population_pdf.createOrReplaceTempView("descripcion_poblacion")

In [0]:
%sql
-- Persist the descripcion_poblacion temp view as the table descripc_poblacion.
CREATE OR REPLACE TABLE descripc_poblacion AS
SELECT *
FROM descripcion_poblacion;

In [0]:
%sql
-- Preview the persisted table.
SELECT *
FROM descripc_poblacion

In [0]:
# Pull the persisted population table back to the driver as a pandas DataFrame.
consulta_poblacion = "select * from descripc_poblacion"
pbl = spark.sql(consulta_poblacion).toPandas()

In [0]:
# pbl is a pandas DataFrame (it comes from toPandas()), which has no
# .display() method; use the notebook's display() function instead.
display(pbl)

In [0]:
# Class-index string -> people-count range label. Each range starts one above
# the previous upper bound.
_ETIQUETAS_PERSONAS = {
    '[0]': '0 a 127 Personas',
    '[1]': '128 a 304 Personas',
    '[2]': '305 a 758 Personas',
    '[3]': '759 a 1859 Personas',
    # Was '1859 a 6299 Personas', which overlapped the previous range at 1859.
    '[4]': '1860 a 6299 Personas',
}


def clasificaciones(v):
    """Map a class-index string to its people-count range label.

    Args:
        v: Class index formatted as '[0]' ... '[4]'.

    Returns:
        The matching '<min> a <max> Personas' label, or None when ``v`` is not
        one of the five known class strings.
    """
    if not isinstance(v, str):
        # Non-string values never matched a label in the comparison chain.
        return None
    return _ETIQUETAS_PERSONAS.get(v)

In [0]:
# Leftover inspection cell: "Rangos personas" is not a column of the table that
# pbl is loaded from (the classification cell below creates 'rangos_personas'),
# so indexing it unconditionally raised KeyError on a fresh run. Show the
# column only when it is actually present.
columna_rangos = "Rangos personas"
pbl[columna_rangos].astype('str') if columna_rangos in pbl.columns else None

In [0]:
import mapclassify

# Natural-breaks classifier with 5 classes fitted on the people counts.
bins = mapclassify.NaturalBreaks(y=pbl["personas"], k=5)

# Class of every row, stored as text (the labels below are keyed on that text).
rangos_texto = pbl["personas"].apply(bins).astype('string')
pbl['rangos_personas'] = rangos_texto

# Human-readable range label for each class.
pbl['Categorias_Personas'] = (
    rangos_texto.astype('str').apply(clasificaciones).astype('string')
)

In [0]:
# pbl is a pandas DataFrame, which has no .display() method; use the
# notebook's display() function to render it with the new range columns.
display(pbl)

In [0]:
# Republish pbl (now carrying the range columns) under the same temp-view name.
desc_Population_pdf = spark.createDataFrame(data=pbl)
desc_Population_pdf.createOrReplaceTempView("descripcion_poblacion")

In [0]:
%sql
-- Replace the persisted table with the contents of the refreshed temp view.
REPLACE TABLE descripc_poblacion AS
SELECT *
FROM descripcion_poblacion;

In [0]:
%sql
-- Preview the replaced table.
SELECT *
FROM descripc_poblacion;