### 2 Executors x 4 cores/8GB

In [None]:
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from pyspark.sql.functions import col, lit, min as spark_min, count, avg, first, expr
import time
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType

# Start the Spark session, applying the executor layout for this run
# (2 executors, 4 cores and 8g of memory each) plus a wider plan-debug limit.
session_builder = SparkSession.builder.appName("PoliceStations")
for option_name, option_value in (
    ("spark.executor.instances", "2"),
    ("spark.executor.cores", "4"),
    ("spark.executor.memory", "8g"),
    ("spark.sql.debug.maxToStringFields", 1000),
):
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

# Register Sedona on the session so the ST_* SQL functions used below resolve.
SedonaRegistrator.registerAll(spark)

# Explicit schema for the crime CSV files: every column is read as a
# nullable string, in the order listed here.
crimeSchema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "DR_NO",
        "Date Rptd",
        "DATE OCC",
        "TIME OCC",
        "AREA",
        "AREA NAME",
        "Rpt Dist No",
        "Part 1-2",
        "Crm Cd",
        "Crm Cd Desc",
        "Mocodes",
        "Vict Age",
        "Vict Sex",
        "Vict Descent",
        "Premis Cd",
        "Premis Desc",
        "Weapon Used Cd",
        "Weapon Desc",
        "Status",
        "Status Desc",
        "Crm Cd 1",
        "Crm Cd 2",
        "Crm Cd 3",
        "Crm Cd 4",
        "LOCATION",
        "Cross Street",
        "LAT",
        "LON",
    )
])

# Start the wall-clock timer; the elapsed time is reported at the end of the script.
start_time = time.time()

# S3 locations of the input CSV files.
CrimeDataFile1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
CrimeDataFile2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
PoliceStationsDataFile = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv"

# Read each crime extract with the explicit schema. The second read must use
# CrimeDataFile2: reading CrimeDataFile1 twice would duplicate every
# 2010-2019 record and silently omit the 2020-to-present data.
crimes1_df = spark.read.csv(CrimeDataFile1, header=True, schema=crimeSchema)
crimes2_df = spark.read.csv(CrimeDataFile2, header=True, schema=crimeSchema)

# union() matches columns by position, which is safe here because both
# DataFrames were read with the same schema.
crimes_df = crimes1_df.union(crimes2_df)

# The police stations file is read with schema inference.
stations_df = spark.read.csv(PoliceStationsDataFile, header=True, inferSchema=True)

# LAT/LON are read as strings, so cast them to double explicitly instead of
# relying on implicit string coercion in the comparison and in ST_Point.
crime_lat = col("LAT").cast("double")
crime_lon = col("LON").cast("double")

# Drop records without usable coordinates before building geometries:
#  - missing or non-numeric values (the cast yields NULL),
#  - the (0, 0) placeholder, i.e. rows where both LAT and LON are zero.
crimes_df = crimes_df.filter(
    crime_lat.isNotNull()
    & crime_lon.isNotNull()
    & ((crime_lat != 0) | (crime_lon != 0))
)

# Convert lon/lat into Sedona point geometries (x = longitude, y = latitude).
crimes_df = crimes_df.withColumn(
    "crime_point", expr("ST_Point(CAST(LON AS DOUBLE), CAST(LAT AS DOUBLE))")
)
stations_df = stations_df.withColumn("station_point", expr("ST_Point(x, y)"))



# Cartesian join: distance from every crime to every police station.
# NOTE(review): ST_Distance on lon/lat points returns a planar distance in
# degrees; 111.32 looks like a km-per-degree factor, so the result is only an
# approximation in kilometres. Confirm whether a spherical distance is wanted.
distances_df = crimes_df.crossJoin(stations_df).withColumn(
    "distance", expr("ST_Distance(crime_point, station_point) * 111.32")
)

# For each crime (DR_NO) pick the nearest station in a single aggregation.
# min() over struct(distance, DIVISION) orders by distance first and uses
# DIVISION only as a deterministic tie-break, so exactly one station is kept
# per DR_NO. This replaces a min-then-join-back on floating-point equality,
# which evaluated the cross join twice and produced duplicate rows on ties.
# Rows with a NULL distance are removed first because a NULL field would
# otherwise sort ahead of every real distance inside the struct.
min_distances_df = (
    distances_df
    .filter(col("distance").isNotNull())
    .groupBy("DR_NO")
    .agg(spark_min(expr("struct(distance, DIVISION)")).alias("nearest"))
    .select(
        col("DR_NO"),
        col("nearest.distance").alias("min_distance"),
        col("nearest.DIVISION").alias("DIVISION"),
    )
)

# Keep only the nearest station's division and its distance for each crime.
nearest_stations_df = min_distances_df.select(
    col("DIVISION").alias("Division"),
    col("min_distance").alias("Distance"),
)

# Aggregate per division: average distance to the station and total number
# of crimes assigned to it.
result_df = nearest_stations_df.groupBy("Division").agg(
    avg("Distance").alias("avg_distance"),
    count("*").alias("total_crimes"),
)

# Sort by crime count, busiest division first. The sorted result is cached
# because it is used by two actions below (count() and show()); without the
# cache each action would re-run the entire upstream pipeline.
result_df = result_df.orderBy(col("total_crimes").desc()).cache()

# Results: count() materialises the cache, show() then prints every row
# from it without truncating column values.
result_df.show(result_df.count(), truncate=False)

# Report the wall-clock time elapsed since start_time was taken.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"\n\nTime taken: {elapsed_time:.2f} seconds")