In [1]:
# imports
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType
import pandas as pd

# local
from env import db_url

# Exercises
This exercises use the cases, dept, and source tables from the 311_data on the Codeup MySQL server.



In [2]:
# Generate spark object to initialize a local Spark JVM process
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/26 14:50:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## 1.
Read the case, department, and source data into their own spark dataframes.

In [20]:
# Readin in cases table from SQL

def get_sql(database, table, user, password, host):
    """
    This function acquires data from a SQL database and caches it locally.

    :param user: The username for accessing the MySQL database
    :param password: The password is unique per user saved in env
    :param host: The host parameter is the address of the server where the database is hosted
    :return: The function `get_sql` is returning a pandas DataFrame containing information on single family residential properties
    """
    # name of cached csv
    filename = table + ".csv"
    # if cached data exist
    if os.path.isfile(filename):
        # read data from cached csv
        df = pd.read_csv(filename)
        # Print size
        print(f"Total rows: {df.shape[0]}")
        print(f"Total columns: {df.shape[1]}")
    # wrangle from sql db if not cached
    else:
        # read sql query into df
        # 261 is single family residential id
        df = pd.read_sql(
            f"""select * 
            FROM {table}""",
            f"mysql+pymysql://{user}:{password}@{host}/{database}",
        )
        # cache data locally
        df.to_csv(filename + ".csv", index=False)
        # print total rows and columns
        print(f"Total rows: {df.shape[0]}")
        print(f"Total columns: {df.shape[1]}")        
        return df


```python

from env import user, password, host

cases = get_sql('311_data', 'cases', user, password, host)
dept = get_sql('311_data', 'dept', user, password, host)
source = get_sql('311_data', 'source', user, password, host)

In [31]:
# Read in the three .csv
case = pd.read_csv('/sources_csv/cases.csv')
dept = pd.read_csv('/sources_csv/dept.csv')
source = pd.read_csv('/sources_csv/source.csv')

In [32]:
# Convert to spark
case = spark.createDataFrame(case)
case.show(5)

23/10/26 15:15:36 WARN TaskSetManager: Stage 0 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
23/10/26 15:15:41 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 0 (TID 0): Attempting to kill Python Worker
                                                                                

+----------+----------------+----------------+------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|       NO| -998.5087616|        YES|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11| 1/5/18 8:30|       NO| -2.012604167|        YES|     Storm Water|Removal Of Obstru...|4.322222222|     Closed| svcCRMSS|2215  GOL

In [33]:
# Convert to spark
dept = spark.createDataFrame(dept)
dept.show(5)

+--------------------+--------------------+----------------------+-------------------+
|       dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+--------------------+--------------------+----------------------+-------------------+
|     311 Call Center|    Customer Service|      Customer Service|                YES|
|               Brush|Solid Waste Manag...|           Solid Waste|                YES|
|     Clean and Green|Parks and Recreation|    Parks & Recreation|                YES|
|Clean and Green N...|Parks and Recreation|    Parks & Recreation|                YES|
|    Code Enforcement|Code Enforcement ...|  DSD/Code Enforcement|                YES|
+--------------------+--------------------+----------------------+-------------------+
only showing top 5 rows



In [34]:
# Convert to spark
source = spark.createDataFrame(source)
source.show(5)

+-----+---------+----------------+
|index|source_id| source_username|
+-----+---------+----------------+
|    0|   100137|Merlene Blodgett|
|    1|   103582|     Carmen Cura|
|    2|   106463| Richard Sanchez|
|    3|   119403|  Betty De Hoyos|
|    4|   119555|  Socorro Quiara|
+-----+---------+----------------+
only showing top 5 rows



## 2. Let's see how writing to the local disk works in spark:



- Write the code necessary to store the source data in both csv and json format, store these as sources_csv and sources_json


In [35]:
source.write.json("sources_json", mode="overwrite")
source.write.format("csv").mode("overwrite").option("header", True).save("sources_csv")

                                                                                

- Inspect your folder structure. What do you notice?


In [None]:
# both split into 7 parts with a success file and crc files for everything

## 3. Inspect the data in your dataframes. Are the data types appropriate? Write the code necessary to cast the values to the appropriate types.

In [36]:
case.schema

StructType([StructField('case_id', LongType(), True), StructField('case_opened_date', StringType(), True), StructField('case_closed_date', StringType(), True), StructField('SLA_due_date', StringType(), True), StructField('case_late', StringType(), True), StructField('num_days_late', DoubleType(), True), StructField('case_closed', StringType(), True), StructField('dept_division', StringType(), True), StructField('service_request_type', StringType(), True), StructField('SLA_days', DoubleType(), True), StructField('case_status', StringType(), True), StructField('source_id', StringType(), True), StructField('request_address', StringType(), True), StructField('council_district', LongType(), True)])

In [37]:
case.show(3)

23/10/26 15:20:25 WARN TaskSetManager: Stage 6 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
[Stage 6:>                                                          (0 + 1) / 1]

+----------+----------------+----------------+------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|       NO| -998.5087616|        YES|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11| 1/5/18 8:30|       NO| -2.012604167|        YES|     Storm Water|Removal Of Obstru...|4.322222222|     Closed| svcCRMSS|2215  GOL

23/10/26 15:20:29 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 6 (TID 20): Attempting to kill Python Worker
                                                                                

In [38]:
# case_opened_date  str-date
# case_closed_date  str-date
# SLA_due_date      str-date
# case_late         str-bool
# case_closed       str-bool
# council_district  num-str max('SLA_due_date')
case = (
    case.withColumn("case_closed", expr('case_closed=="YES"'))
    .withColumn("case_late", expr('case_late=="YES"'))
    .withColumn("council_district", col("council_district").cast("string"))
    .withColumn("case_opened_date", to_timestamp("case_opened_date", "M/d/yy H:mm"))
    .withColumn("case_closed_date", to_timestamp("case_closed_date", "M/d/yy H:mm"))
    .withColumn("SLA_due_date", to_timestamp("SLA_due_date", "M/d/yy H:mm"))
)
case.show(3)

23/10/26 15:20:30 WARN TaskSetManager: Stage 7 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
[Stage 7:>                                                          (0 + 1) / 1]

+----------+-------------------+-------------------+-------------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|   case_opened_date|   case_closed_date|       SLA_due_date|case_late|num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+-------------------+-------------------+-------------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|2018-01-01 00:42:00|2018-01-01 12:29:00|2020-09-26 00:42:00|    false| -998.5087616|       true|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|2018-01-01 00:46:00|2018-01-03 08:11:00|2018-01-05 08:30:00|    false| -2.012604167|       true|     Storm Water

23/10/26 15:20:34 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 7 (TID 21): Attempting to kill Python Worker
                                                                                

In [39]:
dept.schema

StructType([StructField('dept_division', StringType(), True), StructField('dept_name', StringType(), True), StructField('standardized_dept_name', StringType(), True), StructField('dept_subject_to_SLA', StringType(), True)])

In [40]:
dept.show(3)

+---------------+--------------------+----------------------+-------------------+
|  dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+---------------+--------------------+----------------------+-------------------+
|311 Call Center|    Customer Service|      Customer Service|                YES|
|          Brush|Solid Waste Manag...|           Solid Waste|                YES|
|Clean and Green|Parks and Recreation|    Parks & Recreation|                YES|
+---------------+--------------------+----------------------+-------------------+
only showing top 3 rows



In [41]:
# dept_subject_to_SLA   str-bool
dept = dept.withColumn("dept_subject_to_SLA", expr('dept_subject_to_SLA=="YES"'))
dept.show(3)

+---------------+--------------------+----------------------+-------------------+
|  dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+---------------+--------------------+----------------------+-------------------+
|311 Call Center|    Customer Service|      Customer Service|               true|
|          Brush|Solid Waste Manag...|           Solid Waste|               true|
|Clean and Green|Parks and Recreation|    Parks & Recreation|               true|
+---------------+--------------------+----------------------+-------------------+
only showing top 3 rows



In [42]:
source.schema

StructType([StructField('index', LongType(), True), StructField('source_id', StringType(), True), StructField('source_username', StringType(), True)])

In [43]:
source.show(3)

+-----+---------+----------------+
|index|source_id| source_username|
+-----+---------+----------------+
|    0|   100137|Merlene Blodgett|
|    1|   103582|     Carmen Cura|
|    2|   106463| Richard Sanchez|
+-----+---------+----------------+
only showing top 3 rows



## How old is the latest (in terms of days past SLA) currently open issue?

In [44]:
# 3 with same recently open date
case.select("SLA_days").where(expr("! case_closed")).sort(
    col("case_opened_date").desc()
).show(3)

23/10/26 15:20:36 WARN TaskSetManager: Stage 11 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+--------+
|SLA_days|
+--------+
|    14.0|
|    14.0|
|     9.0|
+--------+
only showing top 3 rows



## How long has the oldest (in terms of days since opened) currently opened issue been open?

In [45]:
# case.select(datediff(current_timestamp(), 'case_opened_date').alias('case_age')).where(expr('! case_closed')).sort(col('case_opened_date')).show(1)
max_date = case.select(max("SLA_due_date")).first()[0]
case.select(datediff(lit(max_date), "case_opened_date").alias("case_age")).where(
    expr("!case_closed")
).show(1)

23/10/26 15:20:39 WARN TaskSetManager: Stage 12 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
23/10/26 15:20:43 WARN TaskSetManager: Stage 15 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
[Stage 15:>                                                         (0 + 1) / 1]

+--------+
|case_age|
+--------+
|    1606|
+--------+
only showing top 1 row



23/10/26 15:20:47 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 15 (TID 42): Attempting to kill Python Worker
                                                                                

## How many Stray Animal cases are there?

In [46]:
case.where(expr('service_request_type=="Stray Animal"')).count()

23/10/26 15:20:47 WARN TaskSetManager: Stage 16 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

26760

## How many service requests that are assigned to the Field Operations department (dept_division) are not classified as "Officer Standby" request type (service_request_type)?

In [47]:
# case.where(expr('dept_division=="Field Operations"')).where(expr('service_request_type!="Officer Standby"')).count()
case.where(
    expr(
        'dept_division=="Field Operations" and service_request_type!="Officer Standby"'
    )
).count()

23/10/26 15:21:30 WARN TaskSetManager: Stage 19 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

113902

## Convert the council_district column to a string column.

In [48]:
# done earlier
case = case.withColumn("council_district", col("council_district").cast("string"))
case

DataFrame[case_id: bigint, case_opened_date: timestamp, case_closed_date: timestamp, SLA_due_date: timestamp, case_late: boolean, num_days_late: double, case_closed: boolean, dept_division: string, service_request_type: string, SLA_days: double, case_status: string, source_id: string, request_address: string, council_district: string]

## Extract the year from the case_closed_date column.

In [49]:
case = case.withColumn("year_closed", year("case_closed_date"))
case.show(3)

23/10/26 15:21:36 WARN TaskSetManager: Stage 22 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
[Stage 22:>                                                         (0 + 1) / 1]

+----------+-------------------+-------------------+-------------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+-----------+
|   case_id|   case_opened_date|   case_closed_date|       SLA_due_date|case_late|num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|year_closed|
+----------+-------------------+-------------------+-------------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+-----------+
|1014127332|2018-01-01 00:42:00|2018-01-01 12:29:00|2020-09-26 00:42:00|    false| -998.5087616|       true|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|       2018|
|1014127333|2018-01-01 00:46:00|2018-01-03 08:11:00|2018-01-05 08:30:00|    

23/10/26 15:21:40 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 22 (TID 61): Attempting to kill Python Worker
                                                                                

## Convert num_days_late from days to hours in new columns num_hours_late.

In [50]:
case = case.withColumn("num_hours_late", expr("num_days_late*24"))
case.show(3)

23/10/26 15:21:40 WARN TaskSetManager: Stage 23 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
[Stage 23:>                                                         (0 + 1) / 1]

+----------+-------------------+-------------------+-------------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+-----------+-------------------+
|   case_id|   case_opened_date|   case_closed_date|       SLA_due_date|case_late|num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|year_closed|     num_hours_late|
+----------+-------------------+-------------------+-------------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+-----------+-------------------+
|1014127332|2018-01-01 00:42:00|2018-01-01 12:29:00|2020-09-26 00:42:00|    false| -998.5087616|       true|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|       2018|-23964.2102783999

23/10/26 15:21:44 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 23 (TID 62): Attempting to kill Python Worker
                                                                                

## Join the case data with the source and department data.

In [51]:
case.show(3)

23/10/26 15:21:45 WARN TaskSetManager: Stage 24 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
[Stage 24:>                                                         (0 + 1) / 1]

+----------+-------------------+-------------------+-------------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+-----------+-------------------+
|   case_id|   case_opened_date|   case_closed_date|       SLA_due_date|case_late|num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|year_closed|     num_hours_late|
+----------+-------------------+-------------------+-------------------+---------+-------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+-----------+-------------------+
|1014127332|2018-01-01 00:42:00|2018-01-01 12:29:00|2020-09-26 00:42:00|    false| -998.5087616|       true|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|       2018|-23964.2102783999

23/10/26 15:21:49 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 24 (TID 63): Attempting to kill Python Worker
                                                                                

In [52]:
dept.show(3)

+---------------+--------------------+----------------------+-------------------+
|  dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+---------------+--------------------+----------------------+-------------------+
|311 Call Center|    Customer Service|      Customer Service|               true|
|          Brush|Solid Waste Manag...|           Solid Waste|               true|
|Clean and Green|Parks and Recreation|    Parks & Recreation|               true|
+---------------+--------------------+----------------------+-------------------+
only showing top 3 rows



In [53]:
source.show(3)

+-----+---------+----------------+
|index|source_id| source_username|
+-----+---------+----------------+
|    0|   100137|Merlene Blodgett|
|    1|   103582|     Carmen Cura|
|    2|   106463| Richard Sanchez|
+-----+---------+----------------+
only showing top 3 rows



In [54]:
df = (
    case
    # left join on dept_division
    .join(dept, "dept_division", "left")
    .join(source, "source_id", "left")
    # drop all the columns except for standardized name, as it has much fewer unique values
    .drop(dept.dept_division)
    .drop(dept.dept_name)
    .drop(case.dept_division)
    .drop(source.source_id)
    .drop(case.source_id)
    .withColumnRenamed("standardized_dept_name", "department")
    # convert last col to a boolean
    .withColumn("dept_subject_to_SLA", col("dept_subject_to_SLA") == "YES")
)

df.show(3)

23/10/26 15:21:50 WARN TaskSetManager: Stage 27 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
23/10/26 15:21:54 WARN PythonRunner: Detected deadlock while completing task 6.0 in stage 27 (TID 72): Attempting to kill Python Worker
23/10/26 15:21:54 WARN PythonRunner: Detected deadlock while completing task 2.0 in stage 27 (TID 68): Attempting to kill Python Worker
23/10/26 15:21:54 WARN PythonRunner: Detected deadlock while completing task 7.0 in stage 27 (TID 73): Attempting to kill Python Worker
23/10/26 15:21:54 WARN PythonRunner: Detected deadlock while completing task 5.0 in stage 27 (TID 71): Attempting to kill Python Worker
23/10/26 15:21:54 WARN PythonRunner: Detected deadlock while completing task 1.0 in stage 27 (TID 67): Attempting to kill Python Worker
23/10/26 15:21:54 WARN PythonRunner: Detected deadlock while completing task 3.0 in stage 27 (TID 69): Attempting to kill Python Worker
23/10/26 15:21:54 WARN PythonRunner: Dete

+----------+-------------------+-------------------+-------------------+---------+-------------+-----------+--------------------+-----------+-----------+--------------------+----------------+-----------+-------------------+--------------------+-------------------+-----+---------------+
|   case_id|   case_opened_date|   case_closed_date|       SLA_due_date|case_late|num_days_late|case_closed|service_request_type|   SLA_days|case_status|     request_address|council_district|year_closed|     num_hours_late|          department|dept_subject_to_SLA|index|source_username|
+----------+-------------------+-------------------+-------------------+---------+-------------+-----------+--------------------+-----------+-----------+--------------------+----------------+-----------+-------------------+--------------------+-------------------+-----+---------------+
|1014127332|2018-01-01 00:42:00|2018-01-01 12:29:00|2020-09-26 00:42:00|    false| -998.5087616|       true|        Stray Animal|      999.

## Are there any cases that do not have a request source?

In [55]:
# df.where(col('request_address').isNull()).count()
df.select(
    [
        count(
            when(
                col("request_address").contains("None")
                | col("request_address").contains("NULL")
                | (col("request_address") == "")
                | col("request_address").isNull()
                | isnan("request_address"),
                "request_address",
            )
        ).alias("null_count")
    ]
).show()

23/10/26 15:21:56 WARN TaskSetManager: Stage 36 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.

+----------+
|null_count|
+----------+
|        31|
+----------+



                                                                                

## What are the top 10 service request types in terms of number of requests?

In [56]:
df.groupBy("service_request_type").agg(
    count("service_request_type").alias("num_requests")
).sort(col("num_requests").desc()).show(10)

23/10/26 15:22:01 WARN TaskSetManager: Stage 48 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
[Stage 56:>                                                         (0 + 4) / 4]

+--------------------+------------+
|service_request_type|num_requests|
+--------------------+------------+
|           No Pickup|       89210|
|Overgrown Yard/Trash|       66403|
|        Bandit Signs|       32968|
|        Damaged Cart|       31163|
|Front Or Side Yar...|       28920|
|        Stray Animal|       27361|
|Aggressive Animal...|       25492|
|Cart Exchange Req...|       22608|
|Junk Vehicle On P...|       21649|
|     Pot Hole Repair|       20827|
+--------------------+------------+
only showing top 10 rows



                                                                                

## What are the top 10 service request types in terms of average days late?


In [57]:
df.groupBy("service_request_type").agg(
    avg("num_days_late").alias("avg_days_late")
).sort(col("avg_days_late").desc()).show(10)

23/10/26 15:22:07 WARN TaskSetManager: Stage 60 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
[Stage 68:>                                                         (0 + 4) / 4]

+--------------------+------------------+
|service_request_type|     avg_days_late|
+--------------------+------------------+
|Request for Resea...|               NaN|
|CCO_Request for R...|               NaN|
|  Zoning: Junk Yards| 175.9563621042095|
|Labeling for Used...|162.43032902285717|
|Record Keeping of...| 153.9972403942857|
|Signage Requied f...|151.63868055333333|
|Storage of Used M...|     142.112556415|
|Zoning: Recycle Yard|  135.928516124798|
|Donation Containe...|131.75610506358706|
|License Requied U...|128.79828704142858|
+--------------------+------------------+
only showing top 10 rows



                                                                                

## Does number of days late depend on department?


In [58]:
df.groupBy("department").agg(avg("num_days_late").alias("avg_days_late")).sort(
    col("avg_days_late").desc()
).show()

23/10/26 15:22:12 WARN TaskSetManager: Stage 72 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+--------------------+-------------------+
|          department|      avg_days_late|
+--------------------+-------------------+
|        City Council|                NaN|
|    Customer Service|   59.7370914963008|
|         Solid Waste|-2.2000575136721796|
|        Metro Health| -4.911766979607001|
|  Parks & Recreation| -5.251521960055137|
|Trans & Cap Impro...|-20.612837354052626|
|DSD/Code Enforcement|-38.369388926144914|
|Animal Care Services|-226.51783940550334|
+--------------------+-------------------+



## How do number of days late depend on department and request type?


In [59]:
df.groupBy("department", "service_request_type").agg(
    avg("num_days_late").alias("avg_days_late")
).sort(col("avg_days_late").desc()).show()

23/10/26 15:22:16 WARN TaskSetManager: Stage 84 contains a task of very large size (11384 KiB). The maximum recommended task size is 1000 KiB.
[Stage 92:>                                                         (0 + 4) / 4]

+--------------------+--------------------+------------------+
|          department|service_request_type|     avg_days_late|
+--------------------+--------------------+------------------+
|        City Council|Request for Resea...|               NaN|
|        City Council|CCO_Request for R...|               NaN|
|DSD/Code Enforcement|  Zoning: Junk Yards| 175.9563621042095|
|DSD/Code Enforcement|Labeling for Used...|162.43032902285717|
|DSD/Code Enforcement|Record Keeping of...| 153.9972403942857|
|DSD/Code Enforcement|Signage Requied f...|151.63868055333333|
|DSD/Code Enforcement|Storage of Used M...|     142.112556415|
|DSD/Code Enforcement|Zoning: Recycle Yard|  135.928516124798|
|DSD/Code Enforcement|Donation Containe...|131.75610506358706|
|DSD/Code Enforcement|License Requied U...|128.79828704142858|
|Trans & Cap Impro...|Traffic Signal Gr...|101.79846062200002|
|    Customer Service|           Complaint| 72.87050230311694|
|DSD/Code Enforcement|             Vendors|   66.548098

                                                                                

## You might have noticed that the latest date in the dataset is fairly far off from the present day. To account for this, replace any occurances of the current time with the maximum date from the dataset.

In [63]:
max_date = df.select(max("SLA_due_date")).first()[0]
max_date

ConnectionRefusedError: [Errno 61] Connection refused

23/10/26 18:04:55 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1616987 ms exceeds timeout 120000 ms
23/10/26 18:04:55 WARN SparkContext: Killing executors is not supported by current scheduler.
23/10/26 18:04:55 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$