# Περιεχόμενα

## Setup

In [16]:
# Files to be used

# Paths for csv
fcrime = "..\Data\Crime_Data_from_2010_to_2019_20241101.csv"
fstations = "..\Data\LA_Police_Stations.csv"
fincome = "..\Data\LA_income_2015.csv"
fcodes = "..\Data\RE_codes.csv"

# Paths for .parquet
fcrime_parq = "..\Data\CrimeData.parquet"
fstations_parq = "..\Data\PoliceStations.parquet"

# Paths for GeoJSON
fgeo = "../Data/2010_Census_Blocks.geojson"
fgeofields = "../Data/2010_Census_Blocks_fields.csv"

In [20]:
# Imports

from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType, DateType
from pyspark.sql.functions import year, when, count, sum, col, row_number, to_timestamp, regexp_replace
from pyspark.sql.window import Window
import time

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("DF/SQL API") \
    .getOrCreate()

#### Schemas

In [4]:
# Crimes table

crimes_schema = StructType([
    StructField("DR_NO", StringType()),
    StructField("DateRptd", DateType()),
    StructField("DATEOCC", DateType()),
    StructField("TIMEOCC", StringType()),
    StructField("AREA", StringType()),
    StructField("AREANAME", StringType()),
    StructField("RptDistNo", StringType()),
    StructField("Part", IntegerType()),
    StructField("CrmCd", StringType()),
    StructField("CrmCdDesc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("VictAge", StringType()),
    StructField("VictSex", StringType()),
    StructField("VictDescent", StringType()),
    StructField("PremisCd", StringType()),
    StructField("PremisDesc", StringType()),
    StructField("WeaponUsedCd", StringType()),
    StructField("WeaponDesc", StringType()),
    StructField("Status", StringType()),
    StructField("CrmCd1", StringType()),
    StructField("CrmCd2", StringType()),
    StructField("CrmCd3", StringType()),
    StructField("CrmCd4", StringType()),
    StructField("LOCATION", StringType()),
    StructField("CrossStreet", StringType()),
    StructField("LAT", FloatType()),
    StructField("LON", FloatType()),
])

crimes_df = spark.read.csv(fcrime, header=True, schema=crimes_schema, dateFormat='MM/dd/yyyy hh:mm:ss a')

In [5]:
# Stations table

stations_schema = StructType([
    StructField("X", FloatType()),
    StructField("Y", FloatType()),
    StructField("FID", IntegerType()),
    StructField("DIVISION", StringType()),
    StructField("LOCATION", StringType()),
    StructField("PREC", IntegerType()),
])

stations_df = spark.read.csv(fstations, header=True, schema=stations_schema, dateFormat='MM/dd/yyyy hh:mm:ss a')

## Query 1

### Να υλοποιηθεί το __Query 1__ χρησιμοποιώντας τα DataFrame και RDD APIs. Να εκτελέσετε και τις δύο υλοποιήσεις με 4 Spark executors. Υπάρχει διαφορά στην επίδοση μεταξύ των δύο APIs; Αιτιολογήσετε την απάντησή σας.

#### DataFrame API

In [6]:
# Create the spark session
spark = spark.newSession() \
    .builder \
    .appName("Query 1") \
    .config('spark.executor.instances','4') \
    .getOrCreate()

In [7]:
# Start timer
time_start = time.time()

In [8]:
# Categorize into age groups
categorized_df = crimes_df.withColumn('age_group',
                    when(col('VictAge').cast('int') < 18, 'Children')
                    .when(
                        ((col('VictAge').cast('int') >= 18) & (col('VictAge').cast('int') <= 24)), 'Young Adults'
                        )
                    .when(
                        ((col('VictAge').cast('int') >= 25) & (col('VictAge').cast('int') <= 64)), 'Adults'
                        )
                    .when(col('VictAge').cast('int') > 64, 'Elderly')
                    )

In [9]:
# Filter for 'AGGRAVATED ASSAULT'
assault_df = categorized_df.filter(
    col('CrmCdDesc').contains('AGGRAVATED ASSAULT')
    ) \
    .groupby('age_group') \
    .agg(count('*').alias('victim_count')) \
    .orderBy(col('victim_count').desc())

In [10]:
# Show results
assault_df.show()

time_end = time.time()
time_df = time_end - time_start

+------------+------------+
|   age_group|victim_count|
+------------+------------+
|      Adults|       72610|
|Young Adults|       23472|
|    Children|       10724|
|     Elderly|        3099|
+------------+------------+



#### RDD API

Δυστυχώς στα δεδομένα μας έχουμε μερικές περιπτώσεις όπου υπάρχει υποδιαστολή εντός quote marks (""), δημιουργώντας έτσι θέμα στο parse.
Για αυτό θα χρησιμοποιήσουμε μια βιβλιοθήκη της Python για την ανάγνωση των csv αρχείων και θα την περάσουμε μέσω mapping σε κάθε δεδομένο:

In [11]:
import csv
from datetime import datetime

def parse_line(line):
    reader = csv.reader([line])
    fields = next(reader)
    return {
        'DR_NO': fields[0],
        'DateRptd': fields[1],
        'DATEOCC': fields[2],
        'TIMEOCC': fields[3],
        'AREA': fields[4],
        'AREANAME': fields[5],
        'RptDistNo': fields[6],
        'Part': fields[7],
        'CrmCd': fields[8],
        'CrmCdDesc': fields[9],
        'Mocodes': fields[10],
        'VictAge': int(fields[11]),
        'VictSex': fields[12],
        'VictDescent': fields[13],
        'PremisCd': fields[14],
        'PremisDesc': fields[15],
        'WeaponUsedCd': fields[16],
        'WeaponDesc': fields[17],
        'Status': fields[18],
        'CrmCd1': fields[19],
        'CrmCd2': fields[20],
        'CrmCd3': fields[21],
        'CrmCd4': fields[22],
        'LOCATION': fields[23],
        'CrossStreet': fields[24],
        'LAT': fields[25],
        'LON': fields[26]
    }


In [12]:
# Read the .csv as Text
rdd = spark.sparkContext.textFile(fcrime)

# Remove the header
header = rdd.first()
crimes_rdd = rdd.filter(lambda line: line != header).map(parse_line)

In [13]:
# Start timer
time_start = time.time()

In [14]:
# Map function to categorize age groups
def categorize_age(crime):
    age = crime['VictAge']
    if age < 18:
        return 'Children'
    elif age >= 18 and age <= 24:
        return 'Young Adults'
    elif age >= 25 and age <= 64:
        return 'Adults'
    else:
        return 'Elderly'
    
categorized_rdd = crimes_rdd.filter(lambda x: 'AGGRAVATED ASSAULT' in x['CrmCdDesc']) \
                    .map(lambda x: (categorize_age(x), 1)) \
                    .reduceByKey(lambda a, b: a + b) \
                    .sortBy(lambda x: -x[1])

categorized_rdd.collect()
time_end = time.time()
time_rdd = time_end - time_start

### Παρατηρήσεις:

Η διαφορά στην απόδοση μεταξύ των 2 μεθόδων δεν είναι τόσο εμφανής όσο αναμέναμε.
Τα DataFrames αξιοποιούν το optimization και προσφέρουν γενικά μεγαλύτερη ταχύτητα, αν και στη συγκεκριμένη περίπτωση η διαφορά είναι μικρή.
Αντιθέτως, τα RDD προσφέρουν μεγαλύτερη ευελιξία, αφού είναι low level, στην επεξεργασία των δεδομένων. 

Πιθανόν το μέγεθος των δεδομένων να μην είναι αρκετά μεγάλο ώστε να αρχίσει να φαίνεται μια ουσιαστική διαφορά.

In [15]:
print(f"""Time taken for DataFrame API: {time_df}.
Time taken for RDD API: {time_rdd}
""")

Time taken for DataFrame API: 8.369946956634521.
Time taken for RDD API: 83.3004641532898



## Query 2

### α) Να υλοποιηθεί το __Query 2__ χρησιμοποιώντας τα DataFrame και SQL APIs. Να αναφέρετε και να συγκρίνετε τους χρόνους εκτέλεσης μεταξύ των δύο υλοποιήσεων.

#### DataFrame API

In [16]:
# Create the spark session

from pyspark.sql import SparkSession

spark = spark.newSession() \
    .builder \
    .appName("Query 2") \
    .getOrCreate()

In [17]:
crimes_df = spark.read.csv(fcrime, header=True, schema=crimes_schema, dateFormat='MM/dd/yyyy hh:mm:ss a')
stations_df = spark.read.csv(fstations, header=True, schema=stations_schema, dateFormat='MM/dd/yyyy hh:mm:ss a')

In [18]:
# Start timer
time_start = time.time()

In [19]:
# Create table yearly_precincts

yearly_precincts_df = crimes_df.join(
    stations_df,
    crimes_df.AREA.cast("int") == stations_df.FID
).groupBy(
    year(crimes_df.DateRptd).alias("year"),
    stations_df.DIVISION.alias("precinct")
).agg(
    sum(when(col('Status') != "IC", 1).otherwise(0)).alias("closed_cases"),
    count("*").alias("total_cases"),
    (
        sum(when(col('Status') != "IC", 1).otherwise(0)) * 100.0 / count("*")
    ).alias("closed_case_rate")
)

In [20]:
# Create table ranked precincts

windowSpec = Window.partitionBy('year').orderBy(col('closed_case_rate').desc())

ranked_precincts_df = yearly_precincts_df.withColumn(
    'ranking', row_number().over(windowSpec)
).select(
    col('year'),
    col('precinct'),
    col('closed_case_rate'),
    col('ranking')
)

In [21]:
# Create table rsults

results_df = ranked_precincts_df.filter(
    col('ranking') <= 3
).select(
    'year',
    'precinct',
    'closed_case_rate',
    'ranking'
).orderBy('year','ranking')

In [22]:
# Final results:

results_df.show()
time_end = time.time()
df_time = time_end - time_start
print(f'Time taken since creation of DF spark session to completion: {df_time:.2f} seconds')

+----+----------+------------------+-------+
|year|  precinct|  closed_case_rate|ranking|
+----+----------+------------------+-------+
|2010| SOUTHEAST|32.947355855318136|      1|
|2010|DEVONSHIRE|31.962706191728426|      2|
|2010| SOUTHWEST| 29.63203463203463|      3|
|2011|DEVONSHIRE|35.212167689161554|      1|
|2011| SOUTHEAST|32.511779630300836|      2|
|2011| SOUTHWEST| 28.65220520201501|      3|
|2012|DEVONSHIRE|34.414818310523835|      1|
|2012| SOUTHEAST|  32.9464181029429|      2|
|2012| SOUTHWEST|29.815133276010318|      3|
|2013|DEVONSHIRE| 33.52812271731191|      1|
|2013| SOUTHEAST| 32.08287360549222|      2|
|2013| SOUTHWEST|29.164224592662055|      3|
|2014|HOLLENBECK| 31.80567315834039|      1|
|2014|  WILSHIRE|31.311989956057754|      2|
|2014|  FOOTHILL|31.162790697674417|      3|
|2015|HOLLENBECK|32.641346981727736|      1|
|2015|  WILSHIRE|30.275974025974026|      2|
|2015|  FOOTHILL|30.179460678380156|      3|
|2016|HOLLENBECK|31.880755720117726|      1|
|2016|  WI

#### SQL API

In [23]:
# Start timer
time_start = time.time()

In [24]:
# Crimes Table

crimes_schema_sql = StructType([
    StructField("DR_NO", StringType()),
    StructField("DateRptd", StringType()),
    StructField("DATEOCC", StringType()),
    StructField("TIMEOCC", StringType()),
    StructField("AREA", StringType()),
    StructField("AREANAME", StringType()),
    StructField("RptDistNo", StringType()),
    StructField("Part", IntegerType()),
    StructField("CrmCd", StringType()),
    StructField("CrmCdDesc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("VictAge", StringType()),
    StructField("VictSex", StringType()),
    StructField("VictDescent", StringType()),
    StructField("PremisCd", StringType()),
    StructField("PremisDesc", StringType()),
    StructField("WeaponUsedCd", StringType()),
    StructField("WeaponDesc", StringType()),
    StructField("Status", StringType()),
    StructField("CrmCd1", StringType()),
    StructField("CrmCd2", StringType()),
    StructField("CrmCd3", StringType()),
    StructField("CrmCd4", StringType()),
    StructField("LOCATION", StringType()),
    StructField("CrossStreet", StringType()),
    StructField("LAT", FloatType()),
    StructField("LON", FloatType()),
])

crimes_df = spark.read.format('csv') \
    .options(header='true', dateFormat='MM/dd/yyyy hh:mm:ss a') \
    .schema(crimes_schema_sql) \
    .load(fcrime)

crimes_df = crimes_df.withColumn("DateRptd", to_timestamp("DateRptd", "MM/dd/yyyy hh:mm:ss a")) \
                     .withColumn("DATEOCC", to_timestamp("DATEOCC", "MM/dd/yyyy hh:mm:ss a"))

crimes_df.createOrReplaceTempView("crimes")

In [25]:
# Stations Table

stations_schema_sql = StructType([
    StructField("X", FloatType()),
    StructField("Y", FloatType()),
    StructField("FID", IntegerType()),
    StructField("DIVISION", StringType()),
    StructField("LOCATION", StringType()),
    StructField("PREC", IntegerType()),
])

stations_df = spark.read.format('csv') \
    .options(header='true') \
    .schema(stations_schema_sql) \
    .load(fstations)

stations_df.createOrReplaceTempView("stations")

In [26]:
query2_sql = """WITH YearlyPrecinctStats AS ( 
    SELECT 
        YEAR(c.DateRptd) AS year,
        s.DIVISION AS precinct,
        COUNT(*) AS total_cases,
        SUM(CASE WHEN c.Status != 'IC' THEN 1 ELSE 0 END) AS closed_cases,
        SUM(CASE WHEN c.Status != 'IC' THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS closed_case_rate
    FROM crimes c
    JOIN stations s
        ON CAST(c.AREA as INTEGER) = s.FID
    GROUP BY YEAR(c.DateRptd), s.DIVISION
    ),
    rankedPrecincts AS (
        SELECT
            year,
            precinct,
            closed_case_rate,
            ROW_NUMBER() OVER (PARTITION BY year ORDER BY closed_case_rate DESC) AS ranking
        FROM YearlyPrecinctStats
    )
    SELECT
        year,
        precinct,
        closed_case_rate,
        ranking
    FROM RankedPrecincts
    WHERE ranking <= 3
    ORDER BY year, ranking;"""

__Explanation:__ 

YearlyPrecinctStats
- We need to group our data by year and department
- Keep a count of all cases and a count of closed cases
- Create the rate as a percentage

RankedPrecincts
- From YearlyPrecinctStats keep: year, precinct and closed_case_rate
- We will need to create the ranking based on the closed cases rate of each department
- For each year assign a ranking (starting at 1) in descending order

Notes:

The symbol '#' is not supported in SQL so it was changed to ranking

In [27]:
spark.sql(query2_sql).show()
time_end = time.time()
sql_time = time_end - time_start
print(f"Time taken since creation of spark session to query completion: {sql_time:.2f} seconds")

+----+----------+-----------------+-------+
|year|  precinct| closed_case_rate|ranking|
+----+----------+-----------------+-------+
|2010| SOUTHEAST|32.94735585531813|      1|
|2010|DEVONSHIRE|31.96270619172842|      2|
|2010| SOUTHWEST|29.63203463203463|      3|
|2011|DEVONSHIRE|35.21216768916155|      1|
|2011| SOUTHEAST|32.51177963030083|      2|
|2011| SOUTHWEST|28.65220520201501|      3|
|2012|DEVONSHIRE|34.41481831052383|      1|
|2012| SOUTHEAST|32.94641810294290|      2|
|2012| SOUTHWEST|29.81513327601032|      3|
|2013|DEVONSHIRE|33.52812271731191|      1|
|2013| SOUTHEAST|32.08287360549222|      2|
|2013| SOUTHWEST|29.16422459266206|      3|
|2014|HOLLENBECK|31.80567315834039|      1|
|2014|  WILSHIRE|31.31198995605775|      2|
|2014|  FOOTHILL|31.16279069767442|      3|
|2015|HOLLENBECK|32.64134698172773|      1|
|2015|  WILSHIRE|30.27597402597403|      2|
|2015|  FOOTHILL|30.17946067838016|      3|
|2016|HOLLENBECK|31.88075572011773|      1|
|2016|  WILSHIRE|31.547987616099

#### Συμπεράσματα

In [28]:
print(f'''Time taken for DataFrame API: {df_time:.2f}
Time taken for SQL API: {sql_time:.2f}''')

Time taken for DataFrame API: 4.93
Time taken for SQL API: 4.55


Παρατηρούμε ότι οι χρόνοι εκτέλεσης δεν παρουσιάζουν μεγάλες αποκλίσεις.
Παραδόξως, οι χρόνοι εκτέλεσης για το SQL API είναι συνήθως σταθεροί για όσες φορές τρέξουμε τον κώδικα, ενώ στο DataFrame API παρατηρούμε μεγάλη απόκλιση μεταξύ των τιμών για πολλαπλές εκτελέσεις του κώδικα.

### β) Να γράψετε κώδικα Spark που μετατρέπει το κυρίως data set σε parquet file format και αποθηκεύει ένα μοναδικό .parquet αρχείο στο S3 bucket της ομάδας σας. Επιλέξτε μία από τις δύο υλοποιήσεις του υποερωτήματος α) (DataFrame ή SQL) και συγκρίνετε τους χρόνους εκτέλεσης της εφαρμογής σας όταν τα δεδομένα εισάγονται σαν .csv και σαν .parquet.

In [6]:
# Create the spark session

from pyspark.sql import SparkSession

spark = spark.newSession() \
    .builder \
    .config("spark.hadoop.fs.hdfs.impl.disable.cache", "true") \
    .config("spark.hadoop.io.native.lib.available", "false") \
    .appName("Query 2 Part 2") \
    .getOrCreate()

In [7]:
crimes_df = spark.read.csv(fcrime,header=True, inferSchema=True)
crimes_df.write.mode('overwrite').parquet(fcrime_parq)

stations_df = spark.read.csv(fstations,header=True, inferSchema=True)
stations_df.write.mode('overwrite').parquet(fstations_parq)

Θα χρησιμοποιήσουμε το DataFrame:

In [8]:
time_start = time.time()

In [9]:
crimes_df = spark.read.parquet(fcrime_parq)
stations_df = spark.read.parquet(fstations_parq)

In [10]:
crimes_df = crimes_df.withColumn('Date Rptd', to_timestamp('Date Rptd', 'MM/dd/yyyy hh:mm:ss a'))

In [11]:
# Create table yearly_precincts

yearly_precincts_df = crimes_df.join(
    stations_df,
    col('AREA ').cast("int") == stations_df.FID
).groupBy(
    year(col('Date Rptd')).alias("year"),
    stations_df.DIVISION.alias("precinct")
).agg(
    sum(when(col('Status') != "IC", 1).otherwise(0)).alias("closed_cases"),
    count("*").alias("total_cases"),
    (
        sum(when(col('Status') != "IC", 1).otherwise(0)) * 100.0 / count("*")
    ).alias("closed_case_rate")
)

In [12]:
# Create table ranked precincts

windowSpec = Window.partitionBy('year').orderBy(col('closed_case_rate').desc())

ranked_precincts_df = yearly_precincts_df.withColumn(
    'ranking', row_number().over(windowSpec)
).select(
    col('year'),
    col('precinct'),
    col('closed_case_rate'),
    col('ranking')
)

In [13]:
# Create table results

results_df = ranked_precincts_df.filter(
    col('ranking') <= 3
).select(
    'year',
    'precinct',
    'closed_case_rate',
    'ranking'
).orderBy('year','ranking')

In [14]:
# Final results:

results_df.show()
time_end = time.time()
df_time_parq = time_end - time_start
print(f'Time taken for DF with parquet to completion: {df_time_parq:.2f} seconds')

+----+----------+------------------+-------+
|year|  precinct|  closed_case_rate|ranking|
+----+----------+------------------+-------+
|2010| SOUTHEAST|32.947355855318136|      1|
|2010|DEVONSHIRE|31.962706191728426|      2|
|2010| SOUTHWEST| 29.63203463203463|      3|
|2011|DEVONSHIRE|35.212167689161554|      1|
|2011| SOUTHEAST|32.511779630300836|      2|
|2011| SOUTHWEST| 28.65220520201501|      3|
|2012|DEVONSHIRE|34.414818310523835|      1|
|2012| SOUTHEAST|  32.9464181029429|      2|
|2012| SOUTHWEST|29.815133276010318|      3|
|2013|DEVONSHIRE| 33.52812271731191|      1|
|2013| SOUTHEAST| 32.08287360549222|      2|
|2013| SOUTHWEST|29.164224592662055|      3|
|2014|HOLLENBECK| 31.80567315834039|      1|
|2014|  WILSHIRE|31.311989956057754|      2|
|2014|  FOOTHILL|31.162790697674417|      3|
|2015|HOLLENBECK|32.641346981727736|      1|
|2015|  WILSHIRE|30.275974025974026|      2|
|2015|  FOOTHILL|30.179460678380156|      3|
|2016|HOLLENBECK|31.880755720117726|      1|
|2016|  WI

Χρησιμοποιώντας τα parquet δεδομένα, παρατηρούμε σταθερά μια αρκετά μεγάλη βελτίωση στην ταχύτητα.

## Query 3

Να υλοποιηθεί το Query 3 χρησιμοποιώντας DataFrame ή SQL API. Χρησιμοποιήστε τις μεθόδους hint & explain για να βρείτε ποιες στρατηγικές join χρησιμοποιεί ο catalyst optimizer.
Πειραματιστείτε αναγκάζοντας το Spark να χρησιμοποιήσει διαφορετικές στρατηγικές (μεταξύ
των BROADCAST, MERGE, SHUFFLE_HASH, SHUFFLE_REPLICATE_NL) και σχολιάστε τα αποτελέσματα
που παρατηρείτε. Ποιά (ή ποιές) από τις διαθέσιμες στρατηγικές join του Spark είναι καταλληλότερη(ες) και γιατί;

In [17]:
from sedona.spark import *

sedona = SedonaContext.create(spark)

blocks_df = sedona.read.format('geojson') \
    .option('multiLine','true').load(fgeo) \
    .selectExpr('explode(features) as features') \
    .select('features.*')

flattened_df = blocks_df.select(
    [col(f'properties.{col_name}').alias(col_name) for col_name in \
    blocks_df.schema['properties'].dataType.fieldNames()] + ['geometry']) \
    .drop('properties').drop('type')

In [18]:
income_schema = StructType([
    StructField('ZipCode', IntegerType()),
    StructField('Community', StringType()),
    StructField('Income', StringType())
])

In [21]:
income_df = spark.read.csv(fincome, header=True, schema=income_schema)

# Remove the $ character
income_df = income_df.withColumn(
    'Income', 
    regexp_replace(col('Income'), r"[$,]", "").cast('float')
)

In [22]:
la_flattened_df = flattened_df.filter(col('CITY') == 'Los Angeles')
crimes_df = crimes_df.filter(
    (col('LAT') != 0) & (col('LON') != 0)
)

result_df = crimes_df.withColumn('geom', ST_Point(col('LON'), col('LAT')))

In [23]:
income_per_area_df = income_df.join(la_flattened_df, col('ZipCode') == la_flattened_df['ZCTA10'])

# Table for Query 4
q4 = result_df.join(
    income_per_area_df,
    ST_Within(result_df['geom'], la_flattened_df['geometry'])
)

result_df = result_df.join(
    income_per_area_df,
    ST_Within(result_df['geom'], la_flattened_df['geometry']),
    how='inner'
).groupBy('COMM').agg(
    (sum('Income') / sum('POP_2010')).alias('IncomePerPerson'),
    (count('*') / sum('POP_2010')).alias('CrimesPerPerson')
).orderBy(col('IncomePerPerson').asc())

In [24]:
result_df.show()

+------------------+------------------+--------------------+
|              COMM|   IncomePerPerson|     CrimesPerPerson|
+------------------+------------------+--------------------+
|          Westlake| 47.06267163013844|0.001818971610681...|
|     Panorama City| 59.95984645959698|0.001604741028536...|
| Little Bangladesh|  77.4462882372383|0.001979373403452...|
|     Baldwin Hills| 78.78636074000164|0.002120942479621...|
|   Wilshire Center| 81.12746576217563|0.002112811398485...|
|   University Park|  84.2864598957781|0.003648016379738...|
|         Thai Town| 92.49907318881583|0.001792791417556...|
|Wholesale District| 95.40555901521596|0.004404426023303084|
|     Vermont Vista| 95.87913381880738|0.003186652379442434|
|        South Park| 96.21397332574081|0.003385019532433...|
|        Pico-Union|101.48672177250172|0.003285527032422...|
|    Toluca Terrace|102.67364930834147|0.002117026109988...|
|        West Adams| 104.8323250501383|0.003550872219509559|
|          Van Nuys|105.

In [25]:
result_df = crimes_df.withColumn('geom', ST_Point(col('LON'), col('LAT')))

income_per_area_df = income_df.join(
    la_flattened_df.hint('broadcast'), 
    col('ZipCode') == la_flattened_df['ZCTA10'],
    how='inner'
)

result_df = result_df.join(
    income_per_area_df.hint('broadcast'),
    ST_Within(result_df['geom'], la_flattened_df['geometry']),
    how='inner'
).groupBy('COMM').agg(
    (sum('Income') / sum('POP_2010')).alias('IncomePerPerson'),
    (count('*') / sum('POP_2010')).alias('CrimesPerPerson')
).orderBy(col('IncomePerPerson').asc())

result_df.explain(True)

result_df.show()

== Parsed Logical Plan ==
'Sort ['IncomePerPerson ASC NULLS FIRST], true
+- Aggregate [COMM#545], [COMM#545, (sum(Income#678) / cast(sum(POP_2010#554L) as double)) AS IncomePerPerson#1362, (cast(count(1) as double) / cast(sum(POP_2010#554L) as double)) AS CrimesPerPerson#1365]
   +- Join Inner,  **org.apache.spark.sql.sedona_sql.expressions.ST_Within**
      :- Project [DR_NO#236, Date Rptd#304, DATE OCC#238, TIME OCC#239, AREA #240, AREA NAME#241, Rpt Dist No#242, Part 1-2#243, Crm Cd#244, Crm Cd Desc#245, Mocodes#246, Vict Age#247, Vict Sex#248, Vict Descent#249, Premis Cd#250, Premis Desc#251, Weapon Used Cd#252, Weapon Desc#253, Status#254, Status Desc#255, Crm Cd 1#256, Crm Cd 2#257, Crm Cd 3#258, Crm Cd 4#259, ... 5 more fields]
      :  +- Filter (NOT (LAT#262 = cast(0 as double)) AND NOT (LON#263 = cast(0 as double)))
      :     +- Project [DR_NO#236, to_timestamp(Date Rptd#237, Some(MM/dd/yyyy hh:mm:ss a), TimestampType, Some(Europe/Athens), false) AS Date Rptd#304, DATE OCC#

In [None]:
result_df_merge = crimes_df.withColumn('geom', ST_Point(col('LON'), col('LAT')))

result_df_merge = result_df_merge.join(
    income_per_area_df.hint('merge'),
    ST_Within(result_df_merge['geom'], la_flattened_df['geometry']),
    how='inner'
).groupBy('COMM').agg(
    (sum('Income') / sum('POP_2010')).alias('IncomePerPerson'),
    (count('*') / sum('POP_2010')).alias('CrimesPerPerson')
).orderBy(col('IncomePerPerson').asc())

#result_df_merge.show()

result_df_merge.explain(True)

In [None]:
result_df = crimes_df.withColumn('geom', ST_Point(col('LON'), col('LAT')))

income_per_area_df = income_df.join(
    la_flattened_df.hint('shuffle_hash'), 
    col('ZipCode') == la_flattened_df['ZCTA10'],
    how='inner'
)

result_df = result_df.join(
    income_per_area_df.hint('shuffle_hush'),
    ST_Within(result_df['geom'], la_flattened_df['geometry']),
    how='inner'
).groupBy('COMM').agg(
    (sum('Income') / sum('POP_2010')).alias('IncomePerPerson'),
    (count('*') / sum('POP_2010')).alias('CrimesPerPerson')
).orderBy(col('IncomePerPerson').asc())

result_df.explain(True)

#result_df.show()

In [None]:
result_df = crimes_df.withColumn('geom', ST_Point(col('LON'), col('LAT')))

income_per_area_df = income_df.join(
    la_flattened_df.hint('shuffle_replicate_nl'), 
    col('ZipCode') == la_flattened_df['ZCTA10'],
    how='inner'
)

result_df = result_df.join(
    income_per_area_df.hint('shuffle_replicate_nl'),
    ST_Within(result_df['geom'], la_flattened_df['geometry']),
    how='inner'
).groupBy('COMM').agg(
    (sum('Income') / sum('POP_2010')).alias('IncomePerPerson'),
    (count('*') / sum('POP_2010')).alias('CrimesPerPerson')
).orderBy(col('IncomePerPerson').asc())

result_df.explain(True)

#result_df.show()

## Query 4

Να υλοποιηθεί το Query 4 χρησιμοποιώντας το DataFrame ή SQL API. Να εκτελέσετε την υλοποίησή σας εφαρμόζοντας κλιμάκωση στο σύνολο των υπολογιστικών πόρων που θα χρησιμοποιήσετε: Συγκεκριμένα, καλείστε να εκτελέστε την υλοποίησή σας σε 2 executors με τα ακόλουθα configurations:

- 1 core/2 GB memory
- 2 cores/4GB memory
- 4 cores/8GB memory

Σχολιάστε τα αποτελέσματα.

## 1 core / 2 GB

In [26]:
# Create the spark session

from pyspark.sql import SparkSession

spark = spark.newSession() \
    .builder \
    .appName("Query 4 Part 1") \
    .config('spark.executor.instances','2') \
    .config('spark.executor.cores', '1') \
    .config('spark.executor.memory', '2g') \
    .getOrCreate()

In [27]:
code_df = spark.read.csv(fcodes, header=True, inferSchema=True)

In [28]:
highest_income_df = result_df.orderBy(col('IncomePerPerson').desc()).limit(3) # Using the table in Query 3
lowest_income_df = result_df.orderBy('IncomePerPerson').limit(3)

In [29]:
crimes_2015_df = q4.withColumn('DATE OCC', to_timestamp('DATE OCC','MM/dd/yyyy hh:mm:ss a'))
crimes_2015_df = crimes_2015_df.filter(year(col('DATE OCC')) == 2015)

high_income_crimes = crimes_2015_df.join(
    highest_income_df,
    crimes_2015_df['COMM'] == highest_income_df['COMM']
)

low_income_crimes = crimes_2015_df.join(
    lowest_income_df,
    crimes_2015_df['COMM'] == lowest_income_df['COMM']
)

high_income_race = high_income_crimes.join(
    code_df,
    high_income_crimes['Vict Descent'] == code_df['Vict Descent']
)

low_income_race = low_income_crimes.join(
    code_df,
    low_income_crimes['Vict Descent'] == code_df['Vict Descent']
)

In [30]:
results_high_df = high_income_race.groupBy('Vict Descent Full').count().alias('Count').orderBy(col('Count').desc())

results_high_df.show()

+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|               White|  150|
|               Other|   29|
|Hispanic/Latin/Me...|   23|
|               Black|   16|
|         Other Asian|    8|
|             Unknown|    5|
+--------------------+-----+



In [31]:
results_low_df = low_income_race.groupBy('Vict Descent Full').count().alias('Count').orderBy(col('Count').desc())

results_low_df.show()

+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|Hispanic/Latin/Me...| 4117|
|               Other| 1028|
|               White|  924|
|               Black|  809|
|         Other Asian|  289|
|             Unknown|  254|
|              Korean|  196|
|            Filipino|   23|
|American Indian/A...|    4|
|             Chinese|    4|
|    Pacific Islander|    3|
|           Guamanian|    3|
|            Japanese|    1|
+--------------------+-----+



## 2 cores / 4 GB

In [32]:
# Create the spark session

from pyspark.sql import SparkSession

spark = spark.newSession() \
    .builder \
    .appName("Query 4 Part 2") \
    .config('spark.executor.instances','2') \
    .config('spark.executor.cores', '2') \
    .config('spark.executor.memory', '4g') \
    .getOrCreate()

In [33]:
code_df = spark.read.csv(fcodes, header=True, inferSchema=True)

In [34]:
highest_income_df = result_df.orderBy(col('IncomePerPerson').desc()).limit(3) # Using the table in Query 3
lowest_income_df = result_df.orderBy('IncomePerPerson').limit(3)

In [35]:
crimes_2015_df = q4.withColumn('DATE OCC', to_timestamp('DATE OCC','MM/dd/yyyy hh:mm:ss a'))
crimes_2015_df = crimes_2015_df.filter(year(col('DATE OCC')) == 2015)

high_income_crimes = crimes_2015_df.join(
    highest_income_df,
    crimes_2015_df['COMM'] == highest_income_df['COMM']
)

low_income_crimes = crimes_2015_df.join(
    lowest_income_df,
    crimes_2015_df['COMM'] == lowest_income_df['COMM']
)

high_income_race = high_income_crimes.join(
    code_df,
    high_income_crimes['Vict Descent'] == code_df['Vict Descent']
)

low_income_race = low_income_crimes.join(
    code_df,
    low_income_crimes['Vict Descent'] == code_df['Vict Descent']
)

In [36]:
results_high_df = high_income_race.groupBy('Vict Descent Full').count().alias('Count').orderBy(col('Count').desc())

results_high_df.show()

+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|               White|  150|
|               Other|   29|
|Hispanic/Latin/Me...|   23|
|               Black|   16|
|         Other Asian|    8|
|             Unknown|    5|
+--------------------+-----+



In [37]:
results_low_df = low_income_race.groupBy('Vict Descent Full').count().alias('Count').orderBy(col('Count').desc())

results_low_df.show()

+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|Hispanic/Latin/Me...| 4117|
|               Other| 1028|
|               White|  924|
|               Black|  809|
|         Other Asian|  289|
|             Unknown|  254|
|              Korean|  196|
|            Filipino|   23|
|American Indian/A...|    4|
|             Chinese|    4|
|           Guamanian|    3|
|    Pacific Islander|    3|
|            Japanese|    1|
+--------------------+-----+



## 4 cores / 8 GB

In [38]:
# Create the spark session

from pyspark.sql import SparkSession

spark = spark.newSession() \
    .builder \
    .appName("Query 4 Part 3") \
    .config('spark.executor.instances','2') \
    .config('spark.executor.cores', '4') \
    .config('spark.executor.memory', '8g') \
    .getOrCreate()

In [39]:
code_df = spark.read.csv(fcodes, header=True, inferSchema=True)

In [40]:
highest_income_df = result_df.orderBy(col('IncomePerPerson').desc()).limit(3) # Using the table in Query 3
lowest_income_df = result_df.orderBy('IncomePerPerson').limit(3)

In [41]:
crimes_2015_df = q4.withColumn('DATE OCC', to_timestamp('DATE OCC','MM/dd/yyyy hh:mm:ss a'))
crimes_2015_df = crimes_2015_df.filter(year(col('DATE OCC')) == 2015)

high_income_crimes = crimes_2015_df.join(
    highest_income_df,
    crimes_2015_df['COMM'] == highest_income_df['COMM']
)

low_income_crimes = crimes_2015_df.join(
    lowest_income_df,
    crimes_2015_df['COMM'] == lowest_income_df['COMM']
)

high_income_race = high_income_crimes.join(
    code_df,
    high_income_crimes['Vict Descent'] == code_df['Vict Descent']
)

low_income_race = low_income_crimes.join(
    code_df,
    low_income_crimes['Vict Descent'] == code_df['Vict Descent']
)

In [42]:
results_high_df = high_income_race.groupBy('Vict Descent Full').count().alias('Count').orderBy(col('Count').desc())

results_high_df.show()

+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|               White|  150|
|               Other|   29|
|Hispanic/Latin/Me...|   23|
|               Black|   16|
|         Other Asian|    8|
|             Unknown|    5|
+--------------------+-----+



In [43]:
results_low_df = low_income_race.groupBy('Vict Descent Full').count().alias('Count').orderBy(col('Count').desc())

results_low_df.show()

+--------------------+-----+
|   Vict Descent Full|count|
+--------------------+-----+
|Hispanic/Latin/Me...| 4117|
|               Other| 1028|
|               White|  924|
|               Black|  809|
|         Other Asian|  289|
|             Unknown|  254|
|              Korean|  196|
|            Filipino|   23|
|             Chinese|    4|
|American Indian/A...|    4|
|           Guamanian|    3|
|    Pacific Islander|    3|
|            Japanese|    1|
+--------------------+-----+



## Query 5