### Query 1 using the DataFrame API

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, sum as spark_sum
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
import time


# Create (or reuse) the SparkSession for this query, requesting 4 executor instances
spark = (
    SparkSession.builder
    .appName("Query 1 - Dataframe")
    .config("spark.executor.instances", 4)
    .getOrCreate()
)

# Explicit schema for the crime-data CSVs: every column is read as a nullable
# string, in the order listed below; numeric columns are cast later where needed.
crimeSchema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "DR_NO",
        "Date Rptd",
        "DATE OCC",
        "TIME OCC",
        "AREA",
        "AREA NAME",
        "Rpt Dist No",
        "Part 1-2",
        "Crm Cd",
        "Crm Cd Desc",
        "Mocodes",
        "Vict Age",
        "Vict Sex",
        "Vict Descent",
        "Premis Cd",
        "Premis Desc",
        "Weapon Used Cd",
        "Weapon Desc",
        "Status",
        "Status Desc",
        "Crm Cd 1",
        "Crm Cd 2",
        "Crm Cd 3",
        "Crm Cd 4",
        "LOCATION",
        "Cross Street",
        "LAT",
        "LON",
    )
])

# Begin the wall-clock measurement that is reported at the end of the script
start_time = time.time()

# S3 locations of the two crime-data CSV exports
# (2010-2019 and 2020-present, per their file names)
file1 = (
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/"
    "Crime_Data_from_2010_to_2019_20241101.csv"
)
file2 = (
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/"
    "Crime_Data_from_2020_to_Present_20241101.csv"
)

# Function to process and aggregate a single file
def process_file(file_path):
    """Count aggravated-assault records per victim age group for one CSV file.

    Args:
        file_path: Location of a crime-data CSV with a header row; it is
            read using the module-level ``crimeSchema``.

    Returns:
        A DataFrame with one row per "Age Group" value and a "Count" column
        holding the number of matching records in that group.
    """
    # "Vict Age" is read as a string, so comparisons go through an int cast.
    victim_age = col("Vict Age").cast("int")

    has_valid_age = col("Vict Age").isNotNull() & (victim_age >= 0)
    is_aggravated_assault = col("Crm Cd Desc").contains("AGGRAVATED ASSAULT")

    records = spark.read.csv(file_path, header=True, schema=crimeSchema)
    assaults = (
        records
        .filter(has_valid_age & is_aggravated_assault)
        .withColumn("Vict Age", victim_age)
    )

    # From here on "Vict Age" is an integer column.
    age = col("Vict Age")
    age_group = (
        when(age < 18, "Children")
        .when(age.between(18, 24), "Young Adults")
        .when(age.between(25, 64), "Adults")
        .when(age > 64, "Elderly")
    )

    return (
        assaults
        .withColumn("Age Group", age_group)
        .groupBy("Age Group")
        .agg(count("*").alias("Count"))
    )

# Aggregate each input file on its own
result1 = process_file(file1)
result2 = process_file(file2)

# Stack the per-file counts, re-total them per age group, largest total first
combined_results = (
    result1.union(result2)
    .groupBy("Age Group")
    .agg(spark_sum("Count").alias("Total Count"))
    .orderBy("Total Count", ascending=False)
)

# Display the final table
combined_results.show()

# Report the wall-clock time elapsed since start_time was taken
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


