In [75]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
849,application_1761923966900_0861,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
847,application_1761923966900_0859,pyspark,idle,Link,Link,,
848,application_1761923966900_0860,pyspark,idle,Link,Link,,
849,application_1761923966900_0861,pyspark,idle,Link,Link,,✔


#Query 1 
Να ταξινομηθούν, σε φθίνουσα σειρά, οι ηλικιακές ομάδες των θυμάτων σε περιστατικά που περιλαμβάνουν οποιαδήποτε μορφή “βαριάς σωματικής βλάβης”.   

Θεωρείστε τις εξής ηλικιακές ομάδες:


• Παιδιά: < 18

• Νεαροί ενήλικοι: 18 – 24

• Ενήλικοι: 25 – 64

• Ηλικιωμένοι: >64


Dataframes Implementetion

In [81]:
# Implementation with DataFrame API
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col
from pyspark.sql import functions as F
import time 

start_time=time.time()
spark = SparkSession \
    .builder \
    .appName("Query 1 implementation w Dataframes") \
    .getOrCreate()

crimes_schema = StructType([
    StructField("dr_no", StringType()),
    StructField("date_rptd", StringType()),
    StructField("date_occ", StringType()),
    StructField("time_occ", StringType()),
    StructField("area", StringType()),
    StructField("area_name", StringType()),
    StructField("rpt_dist_no", StringType()),
    StructField("part_1_2", IntegerType()),
    StructField("crm_cd", StringType()),
    StructField("crm_cd_desc", StringType()),
    StructField("mocodes", StringType()),
    StructField("vict_age", StringType()),
    StructField("vict_sex", StringType()),
    StructField("vict_descent", StringType()),
    StructField("premis_cd", StringType()),
    StructField("premis_desc", StringType()),
    StructField("weapon_used_cd", StringType()),
    StructField("weapon_desc", StringType()),
    StructField("status", StringType()),
    StructField("status_desc", StringType()),
    StructField("crm_cd_1",StringType()),
    StructField("crm_cd_2",StringType()),
    StructField("crm_cd_3",StringType()),
    StructField("crm_cd_4",StringType()),
    StructField("location", StringType()),
    StructField("cross_street", StringType()),
    StructField("lat", FloatType()),
    StructField("lon", FloatType()),
])


crimes_2010_2019_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv", \
    header=False, \
    schema=crimes_schema)

crimes_2020_2025_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
                                     header=False, \
                                     schema=crimes_schema)


#Union both datasets to have all the data available for the query
crimes_total_df = crimes_2010_2019_df.union(crimes_2020_2025_df)

#Βρίσκουμε όλα τα εγκλήματα που έχουν προκαλέσει βαριά σωματική βλάβη 
assault_df = crimes_total_df.filter(col("crm_cd_desc").like("%AGGRAVATED ASSAULT%"))

#μετατρεπουμε την μεταβλητη απο τύπο Sting σε τύπο int για να εχουμε συγκριση
test_df = assault_df.withColumn("vict_age_int", col("vict_age").cast("integer"))
#φτιαχνουμε το ζητούμενο dataset το οποίο περιέχει την ηλικιακή ομάδα του θύυματος και έπειτα το πληθος των περιστατικών
age_groups_counts = (
    test_df
    .withColumn(
        "age_group",
        F.when((col("vict_age_int") >= 0) & (col("vict_age_int") < 18), "Children")
         .when((col("vict_age_int") >= 18) & (col("vict_age_int") < 25), "Youth")
         .when((col("vict_age_int") >= 25) & (col("vict_age_int") < 65), "Adults")
         .when(col("vict_age_int") >= 65, "Seniors")
         .otherwise("Unknown")
    )
    .groupBy("age_group")
    .count()
    .orderBy(col("count").desc())
)

age_groups_counts.show()
end_time=time.time()
print("Dataframe API Execution time for Query 1: {:.4f} sec".format(end_time - start_time))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+------+
|age_group| count|
+---------+------+
|   Adults|121660|
|    Youth| 33758|
| Children| 16009|
|  Seniors|  6011|
|  Unknown|     5|
+---------+------+

Dataframe API Execution time for Query 1: 33.6270 sec

In [77]:
# Implementation with RDD API
from pyspark.sql import SparkSession
import time

start_time=time.time()

sc = SparkSession \
    .builder \
    .appName("Query 1 implementation w RDD") \
    .getOrCreate() \
    .sparkContext
    
# Load and process data

crimes_2010_2019 = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv") \
                .map(lambda x: (x.split(","))) # Split lines into a list of elements -> delimiter: ","

crimes_2020_2025 = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv") \
                .map(lambda x: (x.split(","))) # Split lines into a list of elements -> delimiter: ","


crimes=crimes_2010_2019.union(crimes_2020_2025)

#Ψαχνουμε μόνο τα εγκλήματα που έχουν σχέση με βαριά σωματική βλάβη -> δηλαδή ψάωνουμε aggravated assault 
assault_rdd = crimes.filter(
    lambda row: "AGGRAVATED ASSAULT" in row[9].upper()
)

#μετατρεπουμε τις ηλικιες απο στρινγκ σε ιντ και αποθηκευουμε σε καινουριο rdd μονο τις τιμες των ηλικιων για ολα τα στοιχεια που υπαρχουν στο assault_rdd 
#μετα απο debugging το προβλημα ηταν οτι τα στοιχεια ηταν φωλιασμενα μεσα σε "", οπότε κανουμε replace με απλά για τα strings
ages_rdd = assault_rdd.map(lambda row: int(row[11].replace('"', '')))
print(ages_rdd.take(5))

children_rdd = ages_rdd.filter(lambda age:  age< 18)
youths_rdd = ages_rdd.filter(lambda age:  18 <= age <= 24)
adults_rdd = ages_rdd.filter(lambda age: 25 <= age < 64)
seniors_rdd = ages_rdd.filter(lambda age: age >= 64)

#ftiaxnoyme mia lista me tuples (age group, incidents of Aggravated assault)
counts_list = [
    ("Παιδιά", children_rdd.count()),
    ("Νεαροί", youths_rdd.count()),
    ("Ενήλικοι", adults_rdd.count()),
    ("Ηλικιωμένοι", seniors_rdd.count())
]

# Δημιουργούμε RDD από τη λίστα
counts_rdd = sc.parallelize(counts_list)

sorted_counts_rdd = counts_rdd.sortBy(lambda x: x[1], ascending=False)

print(sorted_counts_rdd.collect())
print("Children:", children_rdd.count())
print("Youth:", youths_rdd.count())
print("Adults:", adults_rdd.count())
print("Seniors:", seniors_rdd.count())
end_time = time.time()
print("RDD API Execution time for Query 1: {:.4f} sec".format(end_time - start_time))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[18, 23, 36, 0, 32]
[('????????', 21477), ('??????', 5892), ('??????', 2794), ('???????????', 510)]
Children: 2794
Youth: 5892
Adults: 21477
Seniors: 510
RDD API Execution time for Query 1: 39.5206 sec

In [78]:
---------+------+
|   Adults|121660|
|    Youth| 33758|
| Children| 16009|
|  Seniors|  6011|
|  Unknown|     5|
+---------+------+

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
invalid syntax (<stdin>, line 1)
  File "<stdin>", line 1
    ---------+------+
                     ^
SyntaxError: invalid syntax



In [79]:
#Implementation with Dataframes using UDFs
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col, udf
import time

start_time = time.time()

spark = SparkSession \
    .builder \
    .appName("Query 1 implementation w Dataframes using UDF") \
    .getOrCreate()

crimes_schema = StructType([
    StructField("dr_no", StringType()),
    StructField("date_rptd", StringType()),
    StructField("date_occ", StringType()),
    StructField("time_occ", StringType()),
    StructField("area", StringType()),
    StructField("area_name", StringType()),
    StructField("rpt_dist_no", StringType()),
    StructField("part_1_2", IntegerType()),
    StructField("crm_cd", StringType()),
    StructField("crm_cd_desc", StringType()),
    StructField("mocodes", StringType()),
    StructField("vict_age", StringType()),
    StructField("vict_sex", StringType()),
    StructField("vict_descent", StringType()),
    StructField("premis_cd", StringType()),
    StructField("premis_desc", StringType()),
    StructField("weapon_used_cd", StringType()),
    StructField("weapon_desc", StringType()),
    StructField("status", StringType()),
    StructField("status_desc", StringType()),
    StructField("crm_cd_1",StringType()),
    StructField("crm_cd_2",StringType()),
    StructField("crm_cd_3",StringType()),
    StructField("crm_cd_4",StringType()),
    StructField("location", StringType()),
    StructField("cross_street", StringType()),
    StructField("lat", FloatType()),
    StructField("lon", FloatType()),
])


crimes_2010_2019_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv", \
    header=False, \
    schema=crimes_schema)

crimes_2020_2025_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
                                     header=False, \
                                     schema=crimes_schema)

#Union both datasets to have all the data available for the query
crimes_total_df = crimes_2010_2019_df.union(crimes_2020_2025_df)

#Βρίσκουμε όλα τα εγκλήματα που έχουν προκαλέσει βαριά σωματική βλάβη 
assault_df = crimes_total_df.filter(col("crm_cd_desc").like("%AGGRAVATED ASSAULT%"))

#μετατρεπουμε την μεταβλητη απο τύπο Sting σε τύπο int για να εχουμε συγκριση
test_df = assault_df.withColumn("vict_age_int", col("vict_age").cast("integer"))

###  WITH UDF  ###
def age_group(age):
    if age<18:
        return "Children"
    elif age < 25 :
        return "Youths"
    elif age < 65: 
        return "Adults"
    else :
        return "Seniors"

# Register the UDF
classify_victim_age_udf = udf(age_group, StringType())
age_groups_df = test_df \
    .withColumn("age_group", classify_victim_age_udf(col("vict_age_int"))).groupBy("age_group") \
    .count() \
    .orderBy(col("count").desc())


### WITHOUT UDF ###
# employees_yearly_df = employees_df \
#     .withColumn("yearly", (14*col("salary")+col("bonus"))).select("name", "yearly")
####################

age_groups_df.show()
end_time=time.time()
print("Dataframe using UDF API Execution time for Query 1: {:.4f} sec".format(end_time - start_time))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+------+
|age_group| count|
+---------+------+
|   Adults|121660|
|   Youths| 33758|
| Children| 16014|
|  Seniors|  6011|
+---------+------+

Dataframe using UDF API Execution time for Query 1: 6.1317 sec

In [80]:
---------+------+
|   Adults|121660|
|    Youth| 33758|
| Children| 16009|
|  Seniors|  6011|
|  Unknown|     5|

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
invalid syntax (<stdin>, line 1)
  File "<stdin>", line 1
    ---------+------+
                     ^
SyntaxError: invalid syntax

