In [1]:
# Start the pyspark session

from pyspark.sql import SparkSession
# NOTE(review): the wildcard import puts every public pyspark.sql.functions
# name (col, expr, lower, trim, to_timestamp, ...) into the notebook namespace.
# Names such as `sum`, `min`, `max` and `round` can then shadow the Python
# builtins, so prefer explicit imports once the full set of used names is known.
from pyspark.sql.functions import *

# Reuse the active SparkSession if there is one; otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

## Data Acquisition
These exercises should go in a notebook or script named wrangle. Add, commit, and push your changes.

These exercises use the `case.csv`, `dept.csv`, and `source.csv` files from the San Antonio 311 call dataset.

1. Read the case, department, and source data into their own spark dataframes.
2. Let's see **how writing to the local disk works in spark**:
    - Write the code necessary to store the source data in both csv and json format, store these as sources_csv and sources_json
    - Inspect your folder structure. What do you notice?
3. Inspect the data in your dataframes. Are the data types appropriate? Write the code necessary to cast the values to the appropriate types.

### 1. Read the case, department, and source data into their own spark dataframes.

In [2]:
!ls

case.csv                             source.csv
dept.csv                             [34mspark-warehouse[m[m
lesson_with_notes_data_wrangle.ipynb wrangle.ipynb
lesson_with_notes_spark.ipynb


In [34]:
# Load case.csv into a Spark DataFrame: comma-separated, first row holds the
# column names, and column types are inferred from the data.

df_case = (spark.read
           .option("sep", ",")
           .option("header", True)
           .option("inferSchema", True)
           .csv("case.csv")
          )
df_case

DataFrame[case_id: int, case_opened_date: string, case_closed_date: string, SLA_due_date: string, case_late: string, num_days_late: double, case_closed: string, dept_division: string, service_request_type: string, SLA_days: double, case_status: string, source_id: string, request_address: string, council_district: int]

In [43]:
# Read source.csv
# (Same result as spark.read.csv(...), written with the generic
# format/option/load reader API.)

df_source = (spark.read.format("csv")
             .option("sep", ",")
             .option("inferSchema", True)
             .option("header", True)
             .load("source.csv")
            )

df_source

DataFrame[source_id: string, source_username: string]

In [44]:
# Load dept.csv into a Spark DataFrame: comma-separated, first row holds the
# column names, and column types are inferred from the data.

df_dept = (spark.read
           .option("sep", ",")
           .option("header", True)
           .option("inferSchema", True)
           .csv("dept.csv")
          )
df_dept

DataFrame[dept_division: string, dept_name: string, standardized_dept_name: string, dept_subject_to_SLA: string]

### 2. How writing to the local disk works in Spark

In [6]:
# Persist df_source to the local disk in two formats, CSV and JSON.
# "overwrite" mode replaces any output left over from a previous run.

df_source.write.format("csv").mode("overwrite").save("source_csv")
df_source.write.format("json").mode("overwrite").save("source_json")

In [7]:
!ls

case.csv                             [34msource_csv[m[m
dept.csv                             [34msource_json[m[m
lesson_with_notes_data_wrangle.ipynb [34mspark-warehouse[m[m
lesson_with_notes_spark.ipynb        wrangle.ipynb
source.csv


**Takeaways**
1. Spark creates two folders, `source_csv` and `source_json`, instead of single `.csv` and `.json` files.
2. Inside each folder there are two kinds of files: `_SUCCESS` and `part-*****` files.

### 3.  Inspect the data in your dataframes. Are the data types appropriate? Write the code necessary to cast the values to the appropriate types.

In [13]:
# Check the column types Spark inferred for df_case, then preview the first
# two records with one field per line.

df_case.printSchema()
df_case.show(n=2, vertical=True)

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  


In [37]:
# Represent council_district as a zero-padded, three-character string
# (e.g. 5 -> "005").

padded_district = format_string("%03d", col("council_district"))
df = df_case.withColumn("council_district", padded_district)

df.show(n=1, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 005                  
only showing top 1 row



In [38]:
# Convert the YES/NO flags in case_late to boolean using expr() method.
# The raw flags are upper-case (the preview above shows 'NO' / 'YES') and the
# string comparison is case-sensitive, so the literal must be 'YES'; comparing
# against 'Yes' evaluates to false for every non-null row.

df = df.withColumn(
                    "case_late",
                    expr("case_late == 'YES'")
                )

# Convert the YES/NO flags in case_closed to boolean using expr() method

df = df.withColumn(
                    "case_closed",
                    expr("case_closed == 'YES'")
                )

df.show(1, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 005                  
only showing top 1 row



In [39]:
# Parse the three date columns from strings into timestamps.

# Pattern of the raw values, e.g. "1/1/18 0:42"
fmt = "M/d/yy H:mm"

# The same to_timestamp conversion applies to every date column
for date_column in ("case_opened_date", "case_closed_date", "SLA_due_date"):
    df = df.withColumn(date_column, to_timestamp(date_column, fmt))

df.show(n=1, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 SLA_due_date         | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 005                  
only showing top 1 row



In [40]:
# Normalize request_address: trim the spaces at both ends and lower-case it

address_clean = lower(trim(col("request_address")))
df = df.withColumn("request_address", address_clean)

df.show(n=1, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 SLA_due_date         | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  el paso st,... 
 council_district     | 005                  
only showing top 1 row



In [41]:
# Store the converted frame back under the df_case name, then confirm the
# resulting schema and preview one record

df_case = df
df_case.printSchema()
df_case.show(n=1, vertical=True)

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: timestamp (nullable = true)
 |-- case_closed_date: timestamp (nullable = true)
 |-- SLA_due_date: timestamp (nullable = true)
 |-- case_late: boolean (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: boolean (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: string (nullable = false)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 SLA_due_date         | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true       

In [45]:
# Check df_dept's column types and preview its first record

df_dept.printSchema()
df_dept.show(n=1, vertical=True)

root
 |-- dept_division: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: string (nullable = true)

-RECORD 0----------------------------------
 dept_division          | 311 Call Center  
 dept_name              | Customer Service 
 standardized_dept_name | Customer Service 
 dept_subject_to_SLA    | YES              
only showing top 1 row



In [47]:
# Point df at df_dept (an alias; no data is copied) and count how often each
# dept_subject_to_SLA value occurs

df = df_dept
sla_flag_counts = df.groupBy("dept_subject_to_SLA").count()
sla_flag_counts.show()

+-------------------+-----+
|dept_subject_to_SLA|count|
+-------------------+-----+
|                YES|   31|
|                 NO|    8|
+-------------------+-----+



In [48]:
# Turn the YES/NO strings in dept_subject_to_SLA into booleans

is_subject_to_sla = col("dept_subject_to_SLA") == "YES"
df = df.withColumn("dept_subject_to_SLA", is_subject_to_sla)

df.show(n=1, vertical=True)

-RECORD 0----------------------------------
 dept_division          | 311 Call Center  
 dept_name              | Customer Service 
 standardized_dept_name | Customer Service 
 dept_subject_to_SLA    | true             
only showing top 1 row



In [49]:
# Store the converted frame back under the df_dept name and confirm the schema

df_dept = df
df_dept.printSchema()

root
 |-- dept_division: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: boolean (nullable = true)



In [50]:
# Check df_source's column types and preview its first record

df_source.printSchema()
df_source.show(n=1, vertical=True)

root
 |-- source_id: string (nullable = true)
 |-- source_username: string (nullable = true)

-RECORD 0---------------------------
 source_id       | 100137           
 source_username | Merlene Blodgett 
only showing top 1 row



### 1. How old is the latest (in terms of days past SLA) currently open issue? How long has the oldest (in terms of days since opened) currently opened issue been open?