### Query 1 using RDD API


In [None]:
from pyspark.sql import SparkSession
import time

# Create the SparkSession (requesting 4 executor instances)
spark = (
    SparkSession.builder
    .appName("Query 1 - RDD ")
    .config("spark.executor.instances", 4)
    .getOrCreate()
)

# The RDD API is reached through the session's SparkContext
sc = spark.sparkContext

# Wall-clock reference point for the timing printed at the end of the script
start_time = time.time()

# Input CSV files on S3: one covers 2010-2019, the other 2020 to present
file1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
file2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"

# Helpers and entry point for computing per-file victim age-group counts
def _age_group(age):
    """Return the age-group label for an integer victim age.

    Under 18 is "Children", 18-24 is "Young Adults", 25-64 is "Adults",
    and anything above 64 is "Elderly".
    """
    if age < 18:
        return "Children"
    if age <= 24:
        return "Young Adults"
    if age <= 64:
        return "Adults"
    return "Elderly"


def process_file_rdd(file_path):
    """Count aggravated-assault victims per age group for one CSV file.

    The file is read with ``spark.read.csv(..., header=True)`` and then
    processed through the RDD API. A row is counted only when its
    "Vict Age" value is a string of decimal digits and its "Crm Cd Desc"
    value contains "AGGRAVATED ASSAULT".

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        An RDD of ``(age_group, count)`` pairs, one per age group that
        occurs in the file.

    Raises:
        ValueError: If the file has no "Vict Age" or "Crm Cd Desc" column.
    """
    data = spark.read.csv(file_path, header=True)

    # Resolve column positions from the header so the row lambdas can use
    # plain integer indexing.
    victim_age_idx = data.columns.index("Vict Age")
    crime_desc_idx = data.columns.index("Crm Cd Desc")

    def is_counted(row):
        """Return True for aggravated-assault rows with a parseable age."""
        age = row[victim_age_idx]
        desc = row[crime_desc_idx]
        # Both fields may be None for missing CSV values; guarding `desc`
        # avoids a TypeError from the `in` test below.
        if age is None or desc is None:
            return False
        # isdecimal() (rather than isdigit()) accepts only characters that
        # int() can parse. Signed values such as "-1" fail the check and
        # are dropped, exactly as before.
        return age.isdecimal() and "AGGRAVATED ASSAULT" in desc

    return (
        data.rdd
        .filter(is_counted)
        .map(lambda row: (_age_group(int(row[victim_age_idx])), 1))
        .reduceByKey(lambda a, b: a + b)
    )

# Compute the per-file age-group counts separately
result_rdd1 = process_file_rdd(file1)
result_rdd2 = process_file_rdd(file2)

# Merge the two partial results: concatenate the (group, count) pairs and
# sum the counts that share the same age group
all_partial_counts = result_rdd1.union(result_rdd2)
combined_rdd = all_partial_counts.reduceByKey(lambda a, b: a + b)

# Order the age groups from the largest count to the smallest
sorted_results = combined_rdd.sortBy(keyfunc=lambda x: x[1], ascending=False)

# Bring the (small) final result to the driver and print one line per group
collected_results = sorted_results.collect()
for age_group, count in collected_results:
    print(f"{age_group}: {count}")

# Report wall-clock time elapsed since start_time was recorded
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")
