In [1]:
#establishing environment 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType


#reuse the active spark session if there is one, otherwise start a new one
spark = (
    SparkSession.builder
    .getOrCreate()
)

These exercises use the `case.csv`, `dept.csv`, and `source.csv` files from the San Antonio 311 call dataset.

### 1. Read the case, department, and source data into their own spark dataframes.

In [2]:
#load the case file: the header row supplies column names and spark infers the dtypes
case = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .option("inferSchema", True)
    .csv("case.csv")
)

#load the dept file with the same reader options
dept = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .option("inferSchema", True)
    .csv("dept.csv")
)

#load the source file with an explicit all-string schema instead of inferring dtypes
schema = (
    StructType()
    .add("source_id", StringType())
    .add("source_username", StringType())
)

source = spark.read.schema(schema).option("header", True).csv("source.csv")

In [3]:
#preview the first two case records, one field per line
case.show(n=2, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 SLA_due_date         | 1/5/18 8:30          
 case_late            | NO                   
 num_days_late        | -2.0126041

In [4]:
#preview the first two dept records, one field per line
dept.show(n=2, vertical=True)

-RECORD 0--------------------------------------
 dept_division          | 311 Call Center      
 dept_name              | Customer Service     
 standardized_dept_name | Customer Service     
 dept_subject_to_SLA    | YES                  
-RECORD 1--------------------------------------
 dept_division          | Brush                
 dept_name              | Solid Waste Manag... 
 standardized_dept_name | Solid Waste          
 dept_subject_to_SLA    | YES                  
only showing top 2 rows



In [5]:
#preview the first two source records as a table
source.show(n=2)

+---------+----------------+
|source_id| source_username|
+---------+----------------+
|   100137|Merlene Blodgett|
|   103582|     Carmen Cura|
+---------+----------------+
only showing top 2 rows



### 2. Let's see how writing to the local disk works in spark:

#### Write the code necessary to store the source data in both csv and json format. Store these as `sources_csv` and `sources_json`.

In [6]:
#write source data to csv in the sources_csv folder, replacing any earlier output
#header=True keeps the column names in the output; without it the csv holds only
#the data rows and the names are lost when the files are read back
source.write.csv("sources_csv", mode="overwrite", header=True)

In [8]:
#save the source dataframe as json in the sources_json folder, replacing any earlier output
source.write.mode("overwrite").json("sources_json")

#### Inspect your folder structure. What do you notice?

- The csv and json outputs were each written to their own folder (`sources_csv` and `sources_json`) rather than to a single file.
- There are 2 files in each folder: a main file containing the data written from the DataFrame, and a separate success marker file.

### 3. Inspect the data in your dataframes. Are the data types appropriate? Write the code necessary to cast the values to the appropriate types.

In [9]:
#list each case column with the dtype spark inferred when the file was read
case.dtypes

[('case_id', 'int'),
 ('case_opened_date', 'string'),
 ('case_closed_date', 'string'),
 ('SLA_due_date', 'string'),
 ('case_late', 'string'),
 ('num_days_late', 'double'),
 ('case_closed', 'string'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'int')]

In [10]:
#list each dept column with the dtype spark inferred when the file was read
dept.dtypes

[('dept_division', 'string'),
 ('dept_name', 'string'),
 ('standardized_dept_name', 'string'),
 ('dept_subject_to_SLA', 'string')]

In [11]:
#list each source column with its dtype (set by the explicit schema used on read)
source.dtypes

[('source_id', 'string'), ('source_username', 'string')]

In [13]:
#case data
#turn the YES/NO flag columns into booleans (true where the value is "YES")
for flag_col in ("case_closed", "case_late"):
    case = case.withColumn(flag_col, col(flag_col) == "YES")

#store council_district as a string rather than an int
case = case.withColumn('council_district', col('council_district').cast('string'))

#convert case_opened_date, case_closed_date, & SLA_due_date from strings to timestamps
#pattern the raw date strings follow, e.g. "1/1/18 0:42"
fmt = 'M/d/yy H:mm'

#overwrite each date column with its parsed timestamp
for date_col in ('case_opened_date', 'case_closed_date', 'SLA_due_date'):
    case = case.withColumn(date_col, to_timestamp(date_col, fmt))

In [14]:
#confirm the case columns now have the intended dtypes after the conversions
case.dtypes

[('case_id', 'int'),
 ('case_opened_date', 'timestamp'),
 ('case_closed_date', 'timestamp'),
 ('SLA_due_date', 'timestamp'),
 ('case_late', 'boolean'),
 ('num_days_late', 'double'),
 ('case_closed', 'boolean'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'string')]

In [16]:
#spot-check one converted case record, one field per line
case.show(n=1, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 SLA_due_date         | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
only showing top 1 row



In [17]:
#dept data
#turn the YES/NO dept_subject_to_SLA flag into a boolean (true where the value is "YES")
dept = dept.withColumn(
    "dept_subject_to_SLA",
    col("dept_subject_to_SLA") == "YES",
)

In [18]:
#confirm dept_subject_to_SLA is now a boolean and the other dept columns are unchanged
dept.dtypes

[('dept_division', 'string'),
 ('dept_name', 'string'),
 ('standardized_dept_name', 'string'),
 ('dept_subject_to_SLA', 'boolean')]

In [20]:
#spot-check one converted dept record, one field per line
dept.show(n=1, vertical=True)

-RECORD 0----------------------------------
 dept_division          | 311 Call Center  
 dept_name              | Customer Service 
 standardized_dept_name | Customer Service 
 dept_subject_to_SLA    | true             
only showing top 1 row



***
#### 1. How old is the latest (in terms of days past SLA) currently open issue? How long has the oldest (in terms of days since opened) currently open issue been open?

#### 2. How many Stray Animal cases are there?

#### 3. How many service requests that are assigned to the Field Operations department (`dept_division`) are not classified as "Officer Standby" request type (`service_request_type`)?

#### 4. Convert the `council_district` column to a string column.

#### 5. Extract the year from the `case_closed_date` column.

#### 6. Convert `num_days_late` from days to hours in a new column, `num_hours_late`.

#### 7. Join the case data with the source and department data.

#### 8. Are there any cases that do not have a request source?

#### 9. What are the top 10 service request types in terms of number of requests?

#### 10. What are the top 10 service request types in terms of average days late?

#### 11. Does the number of days late depend on department?

#### 12. How does the number of days late depend on department and request type?