### Query 2 with Parquet and DataFrame

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, year, lit, when
from pyspark.sql.window import Window
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql import functions as F
import time


# Create the SparkSession used by this cell, or reuse the one that is
# already active.
spark = (
    SparkSession.builder
    .appName("Convert to Parquet")
    .getOrCreate()
)

# Explicit schema for the crime CSV files: every column is declared as a
# nullable string. The order of the names below is the column order of the
# schema, so it must match the column order of the CSV files.
crimeSchema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "DR_NO",
        "Date Rptd",
        "DATE OCC",
        "TIME OCC",
        "AREA",
        "AREA NAME",
        "Rpt Dist No",
        "Part 1-2",
        "Crm Cd",
        "Crm Cd Desc",
        "Mocodes",
        "Vict Age",
        "Vict Sex",
        "Vict Descent",
        "Premis Cd",
        "Premis Desc",
        "Weapon Used Cd",
        "Weapon Desc",
        "Status",
        "Status Desc",
        "Crm Cd 1",
        "Crm Cd 2",
        "Crm Cd 3",
        "Crm Cd 4",
        "LOCATION",
        "Cross Street",
        "LAT",
        "LON",
    )
])

# Locations of the two source CSV files and of the Parquet output.
file1 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
file2 = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv"
parquet_file_path = "s3://groups-bucket-dblab-905418150721/group2/completeCrimeParquet"

# Load each CSV with its header row treated as a header and with the
# explicit all-string schema applied.
df_csv1 = spark.read.option("header", True).schema(crimeSchema).csv(file1)
df_csv2 = spark.read.option("header", True).schema(crimeSchema).csv(file2)

# Stack the two files (union is positional; both share the same schema) and
# store the combined data as Parquet, replacing any previous output.
df_csv = df_csv1.union(df_csv2)
df_csv.write.parquet(parquet_file_path, mode="overwrite")

# Query 2: for every year, find the 3 areas ("AREA NAME") with the highest
# percentage of closed cases, and report how long the query took.
#
# The timer starts before the Parquet read and stops after `show`, which is
# the only action below, so it covers reading and the whole computation.
start_time = time.time()

df = spark.read.parquet(parquet_file_path)

# `DATE OCC` is stored as a string; parse it with the pattern
# 'MM/dd/yyyy hh:mm:ss a' so that the year can be extracted.
df = df.withColumn("DATE OCC", F.to_timestamp(F.col("DATE OCC"), "MM/dd/yyyy hh:mm:ss a"))

# Add the year in which each case occurred.
cases_with_year_df = df.withColumn("Year", F.year(F.col("DATE OCC")))

# Keep only rows for which a year could be determined.
cases_with_year_df = cases_with_year_df.filter(F.col("Year").isNotNull())

# Per (Year, AREA NAME): count all cases and the closed ones. A case counts as
# closed when its status description is neither "UNK" nor "Invest Cont".
# Open cases are deliberately NOT filtered out of the input: they have to stay
# in Total_Cases, otherwise every area would trivially be 100% closed.
percentages_df = cases_with_year_df.groupBy("Year", "AREA NAME").agg(
    F.count("*").alias("Total_Cases"),
    F.sum(
        F.when((F.col("Status Desc") != "UNK") & (F.col("Status Desc") != "Invest Cont"), 1).otherwise(0)
    ).alias("Closed_Cases")
).withColumn(
    "Closed_Percentage", (F.col("Closed_Cases") / F.col("Total_Cases")) * 100
)

# Rank the areas inside each year by closed percentage, best first.
# "AREA NAME" is a secondary sort key so that areas with an equal percentage
# always receive the same rank; without it row_number() breaks ties
# arbitrarily and the top 3 could differ between runs.
window_spec = Window.partitionBy("Year").orderBy(
    F.col("Closed_Percentage").desc(), F.col("AREA NAME").asc()
)
ranked_df = percentages_df.withColumn("Rank", F.row_number().over(window_spec))

# Keep the 3 best-ranked areas of every year.
top_3_departments_df = ranked_df.filter(F.col("Rank") <= 3)

# Present the result chronologically, best area first within each year.
sorted_result_df = top_3_departments_df.orderBy("Year", "Rank")

# Print up to 60 rows without truncating long values.
sorted_result_df.select("Year", "AREA NAME", "Closed_Percentage", "Rank").show(60, truncate=False)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")