In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "2",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
847,application_1761923966900_0859,pyspark,idle,Link,Link,,
848,application_1761923966900_0860,pyspark,idle,Link,Link,,
849,application_1761923966900_0861,pyspark,idle,Link,Link,,
850,application_1761923966900_0862,pyspark,idle,Link,Link,,
851,application_1761923966900_0863,pyspark,idle,Link,Link,,
856,application_1761923966900_0868,pyspark,idle,Link,Link,,
857,application_1761923966900_0869,pyspark,idle,Link,Link,,


In [2]:
from sedona.spark import SedonaContext
from sedona.register.geo_registrator import SedonaRegistrator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import *
from sedona.sql.types import GeometryType
from sedona.sql import ST_Point, ST_Distance
from pyspark.sql.functions import min as sql_min
import time

start_time = time.time()
# -------------------------------------------------
# 1. Spark + Sedona setup
# -------------------------------------------------
spark = SparkSession.builder \
    .appName("Query Nearest Police Station") \
    .getOrCreate()

sedona = SedonaContext.create(spark)
SedonaRegistrator.registerAll(spark)

# -------------------------------------------------
# 2. Load crimes
# -------------------------------------------------
crimes_schema = StructType([
    StructField("dr_no", StringType()),
    StructField("date_rptd", StringType()),
    StructField("date_occ", StringType()),
    StructField("time_occ", StringType()),
    StructField("area", StringType()),
    StructField("area_name", StringType()),
    StructField("rpt_dist_no", StringType()),
    StructField("part_1_2", IntegerType()),
    StructField("crm_cd", StringType()),
    StructField("crm_cd_desc", StringType()),
    StructField("mocodes", StringType()),
    StructField("vict_age", StringType()),
    StructField("vict_sex", StringType()),
    StructField("vict_descent", StringType()),
    StructField("premis_cd", StringType()),
    StructField("premis_desc", StringType()),
    StructField("weapon_used_cd", StringType()),
    StructField("weapon_desc", StringType()),
    StructField("status", StringType()),
    StructField("status_desc", StringType()),
    StructField("crm_cd_1", StringType()),
    StructField("crm_cd_2", StringType()),
    StructField("crm_cd_3", StringType()),
    StructField("crm_cd_4", StringType()),
    StructField("location", StringType()),
    StructField("cross_street", StringType()),
    StructField("lat", FloatType()),
    StructField("lon", FloatType()),
])

crimes_2010_2019_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=False, schema=crimes_schema
)

crimes_2020_2025_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=False, schema=crimes_schema
)

# Combine datasets
crimes_total_df = crimes_2010_2019_df.union(crimes_2020_2025_df)

# Filter records with coordinates
crimes_points = crimes_total_df \
    .filter(col("lat").isNotNull() & col("lon").isNotNull()) \
    .withColumn("crime_geom", ST_Point(col("lon"), col("lat")))


police_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Police_Stations.csv",
    header=True, inferSchema=True
).select(
    col("DIVISION").alias("division"),
    col("X").alias("lon"),
    col("Y").alias("lat")
)

# Convert to geometry
police_points = police_df.withColumn(
    "police_geom",
    ST_Point(col("lon"), col("lat"))
)


joined = crimes_points.crossJoin(police_points) \
    .withColumn("distance", ST_Distance(col("crime_geom"), col("police_geom"))) \
    .select(
        col("dr_no"),
        col("division"),
        col("distance")
    )

j = joined.alias("j")

# 1. Minimum distance per crime
min_dist = j.groupBy(col("j.dr_no")).agg(
    sql_min("j.distance").alias("min_distance")
).alias("m")

# 2. Join χωρίς ambiguous dr_no
nearest = j.join(
    min_dist,
    (col("j.dr_no") == col("m.dr_no")) &
    (col("j.distance") == col("m.min_distance")),
    "inner"
).select(
    col("j.dr_no").alias("dr_no"),
    col("j.division").alias("division"),
    col("j.distance").alias("distance_to_station")
)

# -------------------------------------------------
# 6. Compute crime_count + avg_distance per division
# -------------------------------------------------
from pyspark.sql.functions import avg, count

division_stats = nearest.groupBy("division").agg(
    count("dr_no").alias("crime_count"),
    avg("distance_to_station").alias("avg_distance")
).orderBy(col("crime_count").desc())

division_stats.show(truncate=False)
division_stats.explain("formatted")

end_time = time.time()
print("Execution time for Query 4 (2 core, 2 executors, 4GB memory ): {:.4f} sec".format(end_time - start_time))

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
858,application_1761923966900_0870,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+-----------+--------------------+
|division        |crime_count|avg_distance        |
+----------------+-----------+--------------------+
|HOLLYWOOD       |214157     |0.020436213557045942|
|VAN NUYS        |212495     |0.028654072511946835|
|WILSHIRE        |199273     |0.026311034884955457|
|SOUTHWEST       |187418     |0.0215807431211962  |
|OLYMPIC         |181990     |0.017306293320735557|
|NORTH HOLLYWOOD |171974     |0.026116102217477895|
|77TH STREET     |168030     |0.016587006268602506|
|PACIFIC         |158587     |0.03752453485777577 |
|CENTRAL         |155553     |0.009875953835326358|
|SOUTHEAST       |153569     |0.0241625608528956  |
|RAMPART         |150615     |0.014739396233047736|
|TOPANGA         |150239     |0.032446654666430604|
|WEST VALLEY     |132004     |0.02899430194557513 |
|HARBOR          |130520     |3.0007647359015697  |
|FOOTHILL        |122935     |0.041266265564586554|
|WEST LOS ANGELES|121644     |0.029818699889031575|
|HOLLENBECK 