# Notebook explanation

This notebook performs the following steps:

1. **Data load**:
   - Two tables are loaded with Spark: `Silver_Orders` (orders) and `Bronze_Geodata` (geodata).

2. **Geodata processing**:
   - From the geodata table, the relevant columns are selected and renamed:
     - `IDENTIFICACION` is renamed to `district`.
     - `NOMBRE` is renamed to `neighborhood`.
     - The `geometry` column is kept unchanged.

3. **Visualization**:
   - The resulting DataFrame `geodata` is displayed to explore the processed geographic data.

In [0]:
%run ../Transversal/config

In [0]:
%run ../Transversal/utils

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m




In [0]:
from pyspark.sql.functions import col


# Silver_Orders and Bronze_Geodata are not defined in this notebook;
# presumably they come from the %run'd config notebook -- confirm.
orders = spark.table(Silver_Orders)

# Keep only the columns used downstream, renamed to the keys the orders
# aggregate is later joined on (district, neighborhood).
geodata = spark.table(Bronze_Geodata).select(
    col("IDENTIFICACION").alias("district"),
    col("NOMBRE").alias("neighborhood"),
    col("geometry"),
)



In [0]:
# Preview the selected geodata columns (district, neighborhood, geometry).
display(geodata)

district,neighborhood,geometry
COMUNA 1,POPULAR,AQMAAAABAAAABRUAAP4gjeIS41LATwjkBqk3GUDyF9loFONSwLF7UJamNxlA28TbExfjUsA9c7iWpjcZQJmZmjgY41LAUGKdb3k3GUAgnlhdGeNSwLJpE7VZNxlA1YxrqxnjUsBdo5ejPTcZQNdzUw8Z41LAr3iLPxo3GUBl3RM= (truncated)
COMUNA 2,SANTA CRUZ,AQMAAAABAAAAmggAAGYi2nSm41LAr6QzFYQ9GUCg5gT+puNSwBDKokCMPRlA9qDLdafjUsAkDeFikz0ZQIahOMGn41LAR5EC4Zc9GUB+GRawqONSwCz6CL2cPRlAH5nsRKnjUsCBa0hxiz0ZQHcbWrqp41LAlNATzH09GUC+Zb0= (truncated)
COMUNA 3,MANRIQUE,AQMAAAABAAAAIRwAAA18cLG44lLAsEnTTjIlGUCZ2VUNueJSwKbuvj8zJRlAvTmUMrniUsD2VwhSMyUZQAxs90654lLA0/TxXzMlGUBs9hd/ueJSwEp57aMzJRlAAXwPrbniUsBvvK0tNCUZQCi29sK54lLAxsU7uTQlGUBb1OA= (truncated)
COMUNA 4,ARANJUEZ,AQMAAAABAAAA9gMAAMTwg5j841LAGHwxgUAqGUDMJHCd/ONSwJ5FJY1AKhlA4PjUNP3jUsD+7WP8QSoZQFjz8ND+41LAI5uxk0oqGUCFIhtt/+NSwEMN8YJOKhlAoTAT+ADkUsAX0Yt2WCoZQFx3p5QC5FLAWma522IqGUBWp4E= (truncated)
COMUNA 5,CASTILLA,AQMAAAABAAAAUAUAAIZm/cUz5FLAuOju8xxAGUCI4Tn1NORSwPOGGh8hQBlAMdplMzbkUsBS4kWWKUAZQBfbU9A25FLAAQiKhzNAGUCBFElCN+RSwP08ksI/QBlAIlvqmDfkUsBDPEhqSEAZQMx4kAY45FLABRedA1FAGUCrDGM= (truncated)
COMUNA 6,DOCE DE OCTUBRE,AQMAAAABAAAAYw8AAOdLNd7a5FLAUSp1IWQ+GUDG9DYo2+RSwEVu7BJmPhlAi7WYRNvkUsDzLgDBZz4ZQKstXmvc5FLAIQAwHno+GUDhfyCi3ORSwNeaA9Z9PhlANWAy0dzkUsB9g+jogT4ZQG7eaP7c5FLApUMy+4U+GUBIIEU= (truncated)
COMUNA 7,ROBLEDO,AQMAAAABAAAA7SMAAOYqj8HK5VLAfBAT4wYvGUCGWcbHyuVSwJyU358HLxlA+Z+Bz8rlUsDCiomKCC8ZQLu03urK5VLADX7exwsvGUDrin/+yuVSwOP7gRsOLxlArta7/8rlUsB4JgBBDi8ZQNrdAAHL5VLACdB2Zw4vGUBfsbM= (truncated)
COMUNA 8,VILLA HERMOSA,AQMAAAABAAAApyEAALJNRck/41LA75vzC6oMGUDKwg8VQONSwJHHWTCrDBlABcvgVkDjUsAqlN3orQwZQOOMoZpA41LA0I9wgrAMGUBc1GjkQONSwMq90UKyDBlAiIQzMkHjUsABlWUpswwZQNg5Bc5B41LAXxi7QbMMGUC5GO0= (truncated)
COMUNA 9,BUENOS AIRES,AQMAAAABAAAAyCQAAPpZ/oGF41LA7nQ/n+76GEDPXP2uheNSwE2Y+sTy+hhAHNrS54XjUsAFGwpT9voYQImdByOG41LA+oZDlfn6GEA4/vdihuNSwO4Jp4v8+hhAOZ9HpYbjUsAjuQqC//oYQGxAl+eG41LAO95eeAL7GECmoYc= (truncated)
COMUNA 10,LA CANDELARIA,AQMAAAABAAAAfgIAAHg/N98c5FLA5U6K6UYOGUDDhGlKI+RSwFH4G6NWDhlAXdUPUijkUsBtXGL1Yg4ZQO4AtO8v5FLAP+GWrHYOGUAh/+yZN+RSwAxi3IOKDhlAx0oA5TjkUsCzjnBkjQ4ZQHZeP7Y/5FLAAKEUZp4OGUCYZb8= (truncated)


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import to_timestamp, row_number, sum as _sum, dense_rank


def customers_location(df_orders, geodata, table_gold):
    """Rank district/neighborhood pairs by products ordered and save them with centroid coordinates.

    Steps:
      1. Sum ``quantity_products`` per (``district``, ``neighborhood``).
      2. Assign a ``ranking`` with ``dense_rank`` over the totals, highest
         total first (ties share a rank, no gaps).
      3. Left-join the geometry from ``geodata`` on both keys.
      4. Convert to GeoPandas, compute each geometry's centroid and expose it
         as ``longitude`` / ``latitude`` columns.
      5. Overwrite the Delta table ``table_gold`` with the result.

    Args:
        df_orders: Spark DataFrame containing at least the columns
            ``district``, ``neighborhood`` and ``quantity_products``.
        geodata: Spark DataFrame containing ``district``, ``neighborhood``
            and ``geometry``.
        table_gold: Name of the table handed to ``saveAsTable``.

    Returns:
        The Spark DataFrame that was written: the joined data without the
        ``geometry`` and ``centroid`` columns, ordered by ``ranking``.

    NOTE(review): ``GeoDataFrame.to_crs`` reprojects only the active geometry
    column, so the ``centroid`` column read for longitude/latitude keeps the
    coordinates it was computed in. Whether that is intended depends on the
    coordinates ``spark_to_geopandas`` returns -- confirm before changing
    the CRS handling.
    """
    # Total products per (district, neighborhood).
    totals_by_area = (
        df_orders
        .select("district", "neighborhood", "quantity_products")
        .groupBy("district", "neighborhood")
        .agg(_sum("quantity_products").alias("total_products"))
    )

    # Single global window (no partitionBy): the ranking spans all areas.
    by_total_desc = Window.orderBy(col("total_products").desc())
    ranked_areas = totals_by_area.withColumn(
        "ranking", dense_rank().over(by_total_desc)
    )

    # Left join: areas without a matching geodata row are kept.
    ranked_with_geometry = ranked_areas.join(
        geodata, on=["district", "neighborhood"], how="left"
    )

    # spark_to_geopandas is not defined in this notebook (presumably provided
    # by the %run'd utils notebook -- confirm).
    areas_gdf = spark_to_geopandas(ranked_with_geometry).set_crs(
        "EPSG:3116", allow_override=True
    )
    areas_gdf["centroid"] = areas_gdf["geometry"].centroid
    areas_gdf = areas_gdf.to_crs("EPSG:4326")
    areas_gdf["longitude"] = areas_gdf["centroid"].x
    areas_gdf["latitude"] = areas_gdf["centroid"].y

    # Drop the geometry-typed columns before handing the frame back to Spark.
    attributes_pdf = areas_gdf.drop(columns=["geometry", "centroid"])
    orders_enriched = spark.createDataFrame(attributes_pdf).orderBy(
        col("ranking").asc()
    )

    (
        orders_enriched.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(table_gold)
    )

    return orders_enriched

In [0]:
# Build the ranked district/neighborhood table with centroid coordinates and
# overwrite the table named by Gold_Location_Customers.
customers_location(
    df_orders=orders,
    geodata=geodata,
    table_gold=Gold_Location_Customers
)

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

## Reload the source tables and rebuild the customer-location gold table

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import to_timestamp, row_number, sum as _sum, dense_rank


# Reload both source tables so this cell does not rely on frames left over
# from earlier cells, printing one row of each as a quick sanity check.
orders = spark.table(Silver_Orders)
print(orders.head())

geodata_raw = spark.table(Bronze_Geodata)
print(geodata_raw.head())

# Same projection as the loading cell: keep the join keys and the geometry.
geodata = geodata_raw.select(
    col('IDENTIFICACION').alias('district'),
    col('NOMBRE').alias('neighborhood'),
    'geometry'
)
print(geodata.head())

# Run the pipeline through customers_location instead of repeating its body
# inline: the function performs the aggregation, ranking, geometry join,
# centroid extraction and the Delta overwrite of Gold_Location_Customers.
# Only `orders_enriched` is left in the notebook namespace; the intermediate
# frames now live inside the function.
orders_enriched = customers_location(
    df_orders=orders,
    geodata=geodata,
    table_gold=Gold_Location_Customers,
)