# Procesamiento de la base de datos del DANE

In [0]:
# Show the path of every entry stored in the DANE 2018 census folder on DBFS.
census_entries = dbutils.fs.ls("/FileStore/tables/DANE/censo2018/")
[entry.path for entry in census_entries]

In [0]:
from pyspark.sql import SparkSession
from pyspark import StorageLevel
from sedona.spark import *
from sedona.utils import SedonaKryoRegistrator, KryoSerializer

from sedona.register.geo_registrator import SedonaRegistrator

# Spark and Sedona configuration for geographic data.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Sedona App")
    # Serialize with Kryo, using Sedona's Kryo registrator.
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    # Spark requires this setting to stay below 2048m; the previous value
    # ("150000m") is outside the accepted range.
    .config("spark.kryoserializer.buffer.max", "2047m")
    .getOrCreate()
)

# Enable Sedona on the session (replaces the older SedonaRegistrator.registerAll()).
SedonaContext.create(spark)


In [0]:
# 2018 population census and the layers it is crossed with.
from sedona.utils.adapter import Adapter
from sedona.core.formatMapper.shapefileParser import ShapefileReader

sc = spark.sparkContext
# Sedona system property selecting the charset used for the shapefiles.
sc.setSystemProperty("sedona.global.charset", "utf8")


def _read_shapefile(dbfs_dir):
    """Read the shapefile stored under ``dbfs_dir`` into a Sedona geometry RDD."""
    return ShapefileReader.readToGeometryRDD(sc, dbfs_dir)


# DANE 2018 census.
censo = _read_shapefile("dbfs:/FileStore/tables/DANE/censo2018/")
# Isochrones.
Isocrona = _read_shapefile("dbfs:/FileStore/tables/shp/isocronas/")
# Public space.
ep = _read_shapefile("dbfs:/FileStore/tables/EP/")

# Assign the projection system; source and target are both EPSG:4326
# (geographic projection).
censo.CRSTransform('epsg:4326', 'epsg:4326')
Isocrona.CRSTransform('epsg:4326', 'epsg:4326')
ep.CRSTransform('epsg:4326', 'epsg:4326')

In [0]:
from sedona.sql.st_functions import ST_Centroid

# To avoid overflowing the cluster memory, every census polygon is replaced
# by its centroid point.
censo_df = Adapter.toDf(censo, spark)
centroides = ST_Centroid(censo_df.geometry)
censopoint_df = censo_df.withColumn("geometry", centroides)

# Rebuild the spatial RDD from the centroid DataFrame and repeat the
# EPSG:4326 -> EPSG:4326 CRS step applied to the other layers.
censo = Adapter.toSpatialRdd(censopoint_df, "geometry")
censo.CRSTransform('epsg:4326', 'epsg:4326')

In [0]:
#Isocrona.rawSpatialRDD.map(lambda x: x.getUserData())
#Isocrona.map(lambda x: x[0].geom.centroid).collect()

In [0]:
from sedona.utils.adapter import Adapter


def _attribute_names(df):
    """Return the column names of ``df`` with the 'geometry' column removed."""
    names = [field.name for field in df.schema.fields]
    names.remove('geometry')
    return names


censo_df = Adapter.toDf(censo, spark)
Isocrona_df = Adapter.toDf(Isocrona, spark)
ep_df = Adapter.toDf(ep, spark)

c = _attribute_names(censo_df)
i = _attribute_names(Isocrona_df)
e = _attribute_names(ep_df)

# Column layout of the census x isochrone join: geometry, isochrone
# attributes, then census attributes.
atributos_join = ['geom', *i, *c]
# Column layout of the public-space x isochrone join: geometry, isochrone
# attributes, then public-space attributes.
atributosEP_join = ['geom', *i, *e]

In [0]:
# GridType is imported here explicitly so this cell does not depend on the
# star import at the top of the notebook or on a later cell.
from sedona.core.enums import GridType
from sedona.core.spatialOperator import JoinQuery

# Number of partitions requested for the KDB-tree grid.
NUM_PARTITIONS = 4

# Build the spatial partitions: the census layer defines the grid and the
# other two layers reuse its partitioner.
censo.analyze()
censo.spatialPartitioning(GridType.KDBTREE, NUM_PARTITIONS)
census_partitioner = censo.getPartitioner()
Isocrona.spatialPartitioning(census_partitioner)
ep.spatialPartitioning(census_partitioner)

In [0]:
from sedona.core.enums import GridType, IndexType
from sedona.core.spatialOperator import JoinQuery

using_index = True
# Fourth positional argument of SpatialJoinQueryFlat.
# NOTE(review): presumably Sedona's "consider boundary intersection" flag - confirm.
boundary_flag = True


def _join_with_isochrones(layer):
    """Flat spatial join of ``layer`` against the isochrone RDD."""
    return JoinQuery.SpatialJoinQueryFlat(layer, Isocrona, using_index, boundary_flag)


result_censo = _join_with_isochrones(censo)
result_ep = _join_with_isochrones(ep)

In [0]:
#result.sample(withReplacement=False, fraction=0.01, seed=3).map(lambda x: x[1]).collect()

In [0]:
import geopandas as gpd


def _pair_to_row(pair):
    """Flatten one join pair into a single row.

    The row holds the geometry of the second element, then the tab-separated
    attributes of the first element, then those of the second element.
    """
    first, second = pair[0], pair[1]
    return [second.geom, *first.userData.split("\t"), *second.userData.split("\t")]


def _to_geodataframe(join_result, column_names):
    """Collect ``join_result`` on the driver and wrap it in an EPSG:4326 GeoDataFrame."""
    rows = join_result.map(_pair_to_row).collect()
    return gpd.GeoDataFrame(rows, columns=column_names, geometry="geom", crs="epsg:4326")


valores_censo = _to_geodataframe(result_censo, atributos_join)
valores_ep = _to_geodataframe(result_ep, atributosEP_join)

In [0]:
# Interactive map of the public-space rows with layer '8' and a 10-minute
# isochrone (attribute values are strings).
is_layer_8 = valores_ep['layer'] == '8'
is_10_min = valores_ep['Tiempo'] == '10 min'
valores_ep[is_layer_8 & is_10_min].explore()

In [0]:
# Census columns kept for the analysis (isochrone attributes first, then the
# DANE census attributes and the geometry).
COLUMNAS_CENSO = [
    'group_inde', 'Tiempo', 'layer', 'Linea',
    'COD_DANE_A', 'ZU_CDIVI', 'SETU_CCDGO', 'SETU_CCNCT', 'SECU_CCDGO', 'SECU_CCNCT',
    'MANZ_CCDGO', 'AG_CCDGO', 'DATO_ANM', 'AREA', 'LATITUD', 'LONGITUD', 'DENSIDAD',
    'CTNENCUEST',
    'TP9_1_USO', 'TP9_2_USO', 'TP9_3_USO', 'TP9_4_USO',
    'TP9_2_1_MI', 'TP9_2_2_MI', 'TP9_2_3_MI', 'TP9_2_4_MI', 'TP9_2_9_MI',
    'TP9_3_1_NO', 'TP9_3_2_NO', 'TP9_3_3_NO', 'TP9_3_4_NO', 'TP9_3_5_NO',
    'TP9_3_6_NO', 'TP9_3_7_NO', 'TP9_3_8_NO', 'TP9_3_9_NO', 'TP9_3_10_N', 'TP9_3_99_N',
    'TVIVIENDA', 'TP14_1_TIP', 'TP14_2_TIP', 'TP14_3_TIP', 'TP16_HOG',
    'TP19_EE_E1', 'TP19_EE_E2', 'TP19_EE_E3', 'TP19_EE_E4', 'TP19_EE_E5',
    'TP19_EE_E6', 'TP19_EE_E9',
    'TP27_PERSO', 'PERSONAS_L', 'PERSONAS_S', 'TP32_1_SEX', 'TP32_2_SEX',
    'CD_LC_CM', 'NMB_LC_CM', 'geom',
]

# .copy() makes censo2018 an independent frame, so the column assignments
# done on it afterwards do not raise pandas' SettingWithCopyWarning.
censo2018 = valores_censo[COLUMNAS_CENSO].copy()


In [0]:
# The joined attributes are strings; cast the population count and the
# public-space area to float so they can be aggregated.
for frame, column in ((censo2018, "TP27_PERSO"), (valores_ep, "AREA M2")):
    frame[column] = frame[column].astype('float')

In [0]:

import numpy as np
import pandas as pd

# Column suffix used for each isochrone travel time found in 'Tiempo'.
SUFIJO_POR_TIEMPO = {'10 min': '10Min', '15 min': '15Min', '20 min': '20Min', '5 min': '5Min'}


def _sum_by_isochrone(frame, keys, value_column, prefix):
    """Sum ``value_column`` per ``keys``, with one output column per 'Tiempo' value.

    Returns a flat DataFrame holding the ``keys`` columns followed by one
    ``<prefix>_<suffix>`` column for each travel time listed in
    SUFIJO_POR_TIEMPO that is present in ``frame``.
    """
    totals = frame.groupby(by=keys + ['Tiempo'])[value_column].sum().unstack('Tiempo')
    totals = totals.rename(
        columns={tiempo: prefix + '_' + sufijo for tiempo, sufijo in SUFIJO_POR_TIEMPO.items()}
    )
    return totals.rename_axis(columns=None).reset_index()


# Public-space area per line, station ('layer') and isochrone.
valores_ep["AREA M2"] = valores_ep["AREA M2"].fillna(value=0)
AreasXIso = _sum_by_isochrone(valores_ep, ['Linea', 'layer'], 'AREA M2', 'EP')

# Population per station ('layer') and isochrone.
censo2018["TP27_PERSO"] = censo2018["TP27_PERSO"].fillna(value=0)
personasXIso = _sum_by_isochrone(censo2018, ['layer'], 'TP27_PERSO', 'personas')

EP_process = AreasXIso.merge(personasXIso, how='inner', on='layer')

# Public-space area per person for each isochrone.
for sufijo in ('20Min', '15Min', '10Min', '5Min'):
    EP_process['EPF_' + sufijo] = EP_process['EP_' + sufijo] / EP_process['personas_' + sufijo]

# 'Linea_x' would only exist if both merged frames carried a 'Linea' column;
# personasXIso does not, so that entry is a harmless no-op.
EP_process = EP_process.rename(columns={'Linea_x': 'Linea', 'layer': 'Estacion'})

# fillna returns a new frame, so the result must be assigned back (the
# original call discarded it). A zero population makes the ratio +/-inf,
# which fillna would not touch, so those cells are mapped to 0 as well.
EP_process = EP_process.replace([np.inf, -np.inf], np.nan).fillna(value=0)

# Total public space.
EP_process['EPT'] = EP_process[['EP_10Min', 'EP_15Min', 'EP_20Min', 'EP_5Min']].sum(axis=1)
# Total effective public space.
# NOTE(review): this adds the four per-isochrone ratios; confirm that is the
# intended definition rather than EPT / PersonalT.
EP_process['EPF'] = EP_process[['EPF_10Min', 'EPF_15Min', 'EPF_20Min', 'EPF_5Min']].sum(axis=1)
# Total persons.
EP_process['PersonalT'] = EP_process[
    ['personas_10Min', 'personas_15Min', 'personas_20Min', 'personas_5Min']
].sum(axis=1)

In [0]:
# Alias DataFrame.iteritems to DataFrame.items before handing the frame to Spark.
# NOTE(review): presumably a shim for PySpark versions that still call
# DataFrame.iteritems, which newer pandas no longer provides - confirm.
pd.DataFrame.iteritems = pd.DataFrame.items

# Convert the pandas result to a Spark DataFrame and expose it to SQL cells.
VISTA_TEMPORAL = "espacio_publico_efectivo"
EP_calculo = spark.createDataFrame(EP_process)
EP_calculo.createOrReplaceTempView(VISTA_TEMPORAL)

In [0]:
%sql
-- Persist the espacio_publico_efectivo temp view as the ep_efectivo table.
CREATE OR REPLACE TABLE ep_efectivo AS
SELECT *
FROM espacio_publico_efectivo;