# Chapter 4. Spark SQL and DataFrames - Introduction to Built-in Data Sources

# Setup

NameError: name 'spark' is not defined

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
from pyspark import SparkContext

sc = SparkContext('local', 'Ch4')

In [2]:
from pyspark.sql import SparkSession

# create a SparkSession
spark = (SparkSession
    .builder
    .appName("ch4 example")
    .getOrCreate())

In [3]:
spark.version

'2.4.5'

# Imports

In [5]:
import os

# Load flight data

This data is from kaggle

There is more data, and the names are different.

some of the definitions may be different too

In [6]:
data_dir = os.path.expanduser('~/dev/data/airline-kaggle')
fn_flights = os.path.join(data_dir, 'flights.csv')
fn_airlines = os.path.join(data_dir, 'airlines.csv')
fn_airports = os.path.join(data_dir, 'airports.csv')

In [32]:
fn_airlines

'/Users/bartev/dev/data/airline-kaggle/airlines.csv'

In [7]:
def lcase_cols(df):
    """return a new DataFrame with all columns lower cased"""
    return df.toDF(*[c.lower() for c in df.columns])

In [8]:
airlines = (spark
 .read
 .format('csv')
 .option('inferSchema', True)
 .option('header', True)
 .csv(fn_airlines)
)
airlines = lcase_cols(airlines)

In [9]:
airlines.show()

+---------+--------------------+
|iata_code|             airline|
+---------+--------------------+
|       UA|United Air Lines ...|
|       AA|American Airlines...|
|       US|     US Airways Inc.|
|       F9|Frontier Airlines...|
|       B6|     JetBlue Airways|
|       OO|Skywest Airlines ...|
|       AS|Alaska Airlines Inc.|
|       NK|    Spirit Air Lines|
|       WN|Southwest Airline...|
|       DL|Delta Air Lines Inc.|
|       EV|Atlantic Southeas...|
|       HA|Hawaiian Airlines...|
|       MQ|American Eagle Ai...|
|       VX|      Virgin America|
+---------+--------------------+



In [42]:
airports = (spark
 .read
 .format('csv')
 .option('inferSchema', True)
 .option('header', True)
 .csv(fn_airports)
)
airports = lcase_cols(airports)

In [43]:
airports.show()

+---------+--------------------+-------------+-----+-------+--------+----------+
|iata_code|             airport|         city|state|country|latitude| longitude|
+---------+--------------------+-------------+-----+-------+--------+----------+
|      ABE|Lehigh Valley Int...|    Allentown|   PA|    USA|40.65236|  -75.4404|
|      ABI|Abilene Regional ...|      Abilene|   TX|    USA|32.41132|  -99.6819|
|      ABQ|Albuquerque Inter...|  Albuquerque|   NM|    USA|35.04022|-106.60919|
|      ABR|Aberdeen Regional...|     Aberdeen|   SD|    USA|45.44906| -98.42183|
|      ABY|Southwest Georgia...|       Albany|   GA|    USA|31.53552| -84.19447|
|      ACK|Nantucket Memoria...|    Nantucket|   MA|    USA|41.25305| -70.06018|
|      ACT|Waco Regional Air...|         Waco|   TX|    USA|31.61129| -97.23052|
|      ACV|      Arcata Airport|Arcata/Eureka|   CA|    USA|40.97812|-124.10862|
|      ACY|Atlantic City Int...|Atlantic City|   NJ|    USA|39.45758| -74.57717|
|      ADK|        Adak Airp

In [44]:
flights = (spark
 .read
 .format('csv')
 .option('samplingRatio', 0.01)
 .option('inferSchema', True)
 .option('header', True)
 .csv(fn_flights)
)
flights = lcase_cols(flights)

In [45]:
flights.count()

5819079

In [46]:
# match columns with example from book
flights = (flights
          .withColumnRenamed('origin_airport', 'origin')
          .withColumnRenamed('destination_airport', 'destination')
           .withColumnRenamed('departure_delay', 'dep_delay')
           .withColumnRenamed('arrival_delay', 'delay')
          )

flights.createOrReplaceTempView('us_delay_flights_tbl')

In [47]:
flights.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight_number: integer (nullable = true)
 |-- tail_number: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)
 |-- scheduled_departure: integer (nullable = true)
 |-- departure_time: integer (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- taxi_out: integer (nullable = true)
 |-- wheels_off: integer (nullable = true)
 |-- scheduled_time: integer (nullable = true)
 |-- elapsed_time: integer (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- wheels_on: integer (nullable = true)
 |-- taxi_in: integer (nullable = true)
 |-- scheduled_arrival: integer (nullable = true)
 |-- arrival_time: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- diverted: in

In [49]:
flights.select('airline', 'delay', 'delay',
              'scheduled_departure', 'departure_time',
              'scheduled_time', 'elapsed_time', 'air_time').show()

+-------+-----+-----+-------------------+--------------+--------------+------------+--------+
|airline|delay|delay|scheduled_departure|departure_time|scheduled_time|elapsed_time|air_time|
+-------+-----+-----+-------------------+--------------+--------------+------------+--------+
|     AS|  -22|  -22|                  5|          2354|           205|         194|     169|
|     AA|   -9|   -9|                 10|             2|           280|         279|     263|
|     US|    5|    5|                 20|            18|           286|         293|     266|
|     AA|   -9|   -9|                 20|            15|           285|         281|     258|
|     AS|  -21|  -21|                 25|            24|           235|         215|     199|
|     DL|    8|    8|                 25|            20|           217|         230|     206|
|     NK|  -17|  -17|                 25|            19|           181|         170|     154|
|     US|  -10|  -10|                 30|            44|    

In [50]:
flights.show(3)

+----+-----+---+-----------+-------+-------------+-----------+------+-----------+-------------------+--------------+---------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-----+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|year|month|day|day_of_week|airline|flight_number|tail_number|origin|destination|scheduled_departure|departure_time|dep_delay|taxi_out|wheels_off|scheduled_time|elapsed_time|air_time|distance|wheels_on|taxi_in|scheduled_arrival|arrival_time|delay|diverted|cancelled|cancellation_reason|air_system_delay|security_delay|airline_delay|late_aircraft_delay|weather_delay|
+----+-----+---+-----------+-------+-------------+-----------+------+-----------+-------------------+--------------+---------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-----+--------+---------+

## SQL Query-1: Find all flights whose distance between origin and destination is greater than 1000 miles in Scala or Python

In [51]:
spark.sql("""
select distance, origin, destination 
from us_delay_flights_tbl
where distance > 1000
order by distance desc
limit 10
""").show()

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4983|   JFK|        HNL|
|    4983|   HNL|        JFK|
|    4983|   HNL|        JFK|
|    4983|   JFK|        HNL|
|    4983|   HNL|        JFK|
|    4983|   HNL|        JFK|
|    4983|   JFK|        HNL|
|    4983|   JFK|        HNL|
|    4983|   JFK|        HNL|
|    4983|   HNL|        JFK|
+--------+------+-----------+



In [52]:
(flights
 .select('distance', 'origin', 'destination')
 .filter('distance > 1000')
 .orderBy('distance', ascending=False)
 .show(10))

# or orderBy(desc('distance'))

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4983|   JFK|        HNL|
|    4983|   HNL|        JFK|
|    4983|   JFK|        HNL|
|    4983|   JFK|        HNL|
|    4983|   HNL|        JFK|
|    4983|   HNL|        JFK|
|    4983|   HNL|        JFK|
|    4983|   JFK|        HNL|
|    4983|   JFK|        HNL|
|    4983|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [53]:
from pyspark.sql.functions import desc

(flights
 .select('distance', 'origin', 'destination')
 .filter('distance > 1000')
 .orderBy(desc('distance'))
 .show(10))

# or orderBy(desc('distance'))

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4983|   JFK|        HNL|
|    4983|   HNL|        JFK|
|    4983|   JFK|        HNL|
|    4983|   JFK|        HNL|
|    4983|   HNL|        JFK|
|    4983|   HNL|        JFK|
|    4983|   HNL|        JFK|
|    4983|   JFK|        HNL|
|    4983|   JFK|        HNL|
|    4983|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



## SQL Query-2: Find all flights with at least 2 hour delays between San Francisco (SFO) and Chicago (ORD) in Scala or Python



In [54]:
spark.sql("""
select year, month, day, delay, origin, destination
from us_delay_flights_tbl
where 1=1
and delay > 120
and origin = 'SFO'
and destination = 'ORD'
order by delay desc
""").show(10)

+----+-----+---+-----+------+-----------+
|year|month|day|delay|origin|destination|
+----+-----+---+-----+------+-----------+
|2015|   12| 13| 1187|   SFO|        ORD|
|2015|   12| 18| 1008|   SFO|        ORD|
|2015|   12| 18|  951|   SFO|        ORD|
|2015|    6| 20|  506|   SFO|        ORD|
|2015|   11| 21|  488|   SFO|        ORD|
|2015|    3| 31|  471|   SFO|        ORD|
|2015|   12| 26|  458|   SFO|        ORD|
|2015|    8| 23|  422|   SFO|        ORD|
|2015|    6| 23|  409|   SFO|        ORD|
|2015|    1|  5|  408|   SFO|        ORD|
+----+-----+---+-----+------+-----------+
only showing top 10 rows



In [55]:
(flights
 .select('year', 'month', 'day', 'delay', 'origin', 'destination')
 .filter('delay > 120')
 .filter('origin = "SFO"')
 .filter('destination = "ORD"')
 .orderBy('delay', ascending=False)
 .show(10))

+----+-----+---+-----+------+-----------+
|year|month|day|delay|origin|destination|
+----+-----+---+-----+------+-----------+
|2015|   12| 13| 1187|   SFO|        ORD|
|2015|   12| 18| 1008|   SFO|        ORD|
|2015|   12| 18|  951|   SFO|        ORD|
|2015|    6| 20|  506|   SFO|        ORD|
|2015|   11| 21|  488|   SFO|        ORD|
|2015|    3| 31|  471|   SFO|        ORD|
|2015|   12| 26|  458|   SFO|        ORD|
|2015|    8| 23|  422|   SFO|        ORD|
|2015|    6| 23|  409|   SFO|        ORD|
|2015|    1|  5|  408|   SFO|        ORD|
+----+-----+---+-----+------+-----------+
only showing top 10 rows



In [56]:
spark.sql("""
select year, count(*) as cnt
from us_delay_flights_tbl
group by year
order by cnt
""").show(30)

+----+-------+
|year|    cnt|
+----+-------+
|2015|5819079|
+----+-------+



In [57]:
flights.groupBy('year').count().show()

+----+-------+
|year|  count|
+----+-------+
|2015|5819079|
+----+-------+



In [58]:
flights.groupBy('month').count().orderBy('month').show()

+-----+------+
|month| count|
+-----+------+
|    1|469968|
|    2|429191|
|    3|504312|
|    4|485151|
|    5|496993|
|    6|503897|
|    7|520718|
|    8|510536|
|    9|464946|
|   10|486165|
|   11|467972|
|   12|479230|
+-----+------+



## SQL Query-3: A more complicated query in SQL: let’s label all US flights with a human readable label: Very Long Delay (> 6 hours), Lon g Delay (2 - 6 hours), etc. in a new column called “Flight_Delays” in Scala or Python

In [59]:
spark.sql("""
select delay,
    origin,
    destination,
    case when delay > 360 then 'very long'
         when delay > 120 then 'long'
         when delay > 60 then 'short'
         when delay > 0 then 'tolerable'
         when delay = 0 then 'no delay'
         else 'early'
    end as flight_delays
from us_delay_flights_tbl
order by delay desc
""").show(20)

+-----+------+-----------+-------------+
|delay|origin|destination|flight_delays|
+-----+------+-----------+-------------+
| 1971|   BHM|        DFW|    very long|
| 1898|   RIC|        DFW|    very long|
| 1665|   SAN|        DFW|    very long|
| 1638|   DTW|        ORD|    very long|
| 1636|   ABQ|        DFW|    very long|
| 1636|   IND|        LAX|    very long|
| 1627|   STL|        MIA|    very long|
| 1598|   OMA|        DFW|    very long|
| 1593|   LAS|        LAX|    very long|
| 1576|   HNL|        LAX|    very long|
| 1574|   HNL|        LAX|    very long|
| 1557|   MSP|        ORD|    very long|
| 1556|   MCO|        JFK|    very long|
| 1555| 14747|      11298|    very long|
| 1554|   SAT|        DFW|    very long|
| 1554|   SAN|        JFK|    very long|
| 1546|   FAT|        DFW|    very long|
| 1528| 11612|      13930|    very long|
| 1514|   SMF|        DFW|    very long|
| 1508|   RIC|        DFW|    very long|
+-----+------+-----------+-------------+
only showing top

In [60]:
from pyspark.sql.functions import col, when

In [61]:
(flights
 .select('delay', 
         'origin', 
         'destination',
         when(col('delay') > 360, 'very long')
         .when(col('delay') > 120, 'long')
         .when(col('delay') > 60, 'short')
         .when(col('delay') > 0, 'tolerable')
         .when(col('delay') == 0, 'no delay')
         .when(col('delay') < 0, 'early')
         .otherwise('unknown')
         .alias('flight_delays'))
 .orderBy(desc('delay'))
#  .groupBy('flight_delays')
#  .count()
.show())

+-----+------+-----------+-------------+
|delay|origin|destination|flight_delays|
+-----+------+-----------+-------------+
| 1971|   BHM|        DFW|    very long|
| 1898|   RIC|        DFW|    very long|
| 1665|   SAN|        DFW|    very long|
| 1638|   DTW|        ORD|    very long|
| 1636|   ABQ|        DFW|    very long|
| 1636|   IND|        LAX|    very long|
| 1627|   STL|        MIA|    very long|
| 1598|   OMA|        DFW|    very long|
| 1593|   LAS|        LAX|    very long|
| 1576|   HNL|        LAX|    very long|
| 1574|   HNL|        LAX|    very long|
| 1557|   MSP|        ORD|    very long|
| 1556|   MCO|        JFK|    very long|
| 1555| 14747|      11298|    very long|
| 1554|   SAT|        DFW|    very long|
| 1554|   SAN|        JFK|    very long|
| 1546|   FAT|        DFW|    very long|
| 1528| 11612|      13930|    very long|
| 1514|   SMF|        DFW|    very long|
| 1508|   RIC|        DFW|    very long|
+-----+------+-----------+-------------+
only showing top

In [62]:
spark

# CREATING SQL DATABASES AND TABLES

In [63]:
spark.sql("""create database learn_spark_db""")
spark.sql("use learn_spark_db")

DataFrame[]

## create a managed table

In [64]:
(flights
 .select('year', 'month', 'day',
         'delay', 
         'distance',
         'origin', 
         'destination')
).printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [65]:
# sql

spark.sql("""
create table managed_us_delay_flights_tbl
(year int, month int, day int, delay int, distance int,
origin string, destination string)""")

AnalysisException: "Hive support is required to CREATE Hive TABLE (AS SELECT);;\n'CreateTable `managed_us_delay_flights_tbl`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, ErrorIfExists\n"

In [66]:
# in python DataFrame API

flights.write.saveAsTable('managed_us_delay_flights_tbl')

AnalysisException: "Can not create the managed table('`managed_us_delay_flights_tbl`'). The associated location('file:/Users/bartev/dev/github-bv/san-tan/lrn-spark/spark-warehouse/learn_spark_db.db/managed_us_delay_flights_tbl') already exists.;"

## create an unmanaged table

In [67]:
# in python DataFrame API

data_dir = '/Users/bartev/dev/github-bv/san-tan/lrn-spark/tmp/data/us_flights_delay'
(flights.select('year', 'month', 'day', 'delay', 'distance', 'origin', 'destination')
    .write
    .option('path', data_dir)
    .saveAsTable('us_delay_flights_tbl'))

## create views

Note: 

Views can be

* global - visible across all SparkSessions on a given cluster
* temporary - visible only to a single SparkSession

Views disappear after your Spark application or notebook terminates

to query the global temp view, use the prefix `global_temp`

In [68]:
# in python DataFrame API

In [69]:
df_sfo = spark.sql("""
select year,
    month,
    day,
    delay,
    distance,
    origin,
    destination 
from us_delay_flights_tbl 
where origin = 'SFO'""")

df_jfk = spark.sql("""
select year,
    month,
    day,
    delay,
    distance,
    origin,
    destination 
from us_delay_flights_tbl 
where origin = 'JFK'""")


In [70]:
df_sfo.createOrReplaceGlobalTempView('us_origin_airport_SFO_global_tmp_view')

df_jfk.createOrReplaceTempView('us_origin_airport_JFK_tmp_view')

In [71]:
(spark
 .sql("select * from global_temp.us_origin_airport_SFO_global_tmp_view")
 .show())

+----+-----+---+-----+--------+------+-----------+
|year|month|day|delay|distance|origin|destination|
+----+-----+---+-----+--------+------+-----------+
|2015|    1|  1|    5|    2296|   SFO|        CLT|
|2015|    1|  1|    8|    1589|   SFO|        MSP|
|2015|    1|  1|  -13|    1464|   SFO|        DFW|
|2015|    1|  1|   -7|    1635|   SFO|        IAH|
|2015|    1|  1|   26|     967|   SFO|        DEN|
|2015|    1|  1|   -2|     967|   SFO|        DEN|
|2015|    1|  1|  -16|    2139|   SFO|        ATL|
|2015|    1|  1|   -4|    1846|   SFO|        ORD|
|2015|    1|  1|  -11|     337|   SFO|        LAX|
|2015|    1|  1|    3|     651|   SFO|        PHX|
|2015|    1|  1| null|    1464|   SFO|        DFW|
|2015|    1|  1|    1|    1635|   SFO|        IAH|
|2015|    1|  1|  -23|    2565|   SFO|        EWR|
|2015|    1|  1|  -14|     354|   SFO|        LGB|
|2015|    1|  1|   -2|     599|   SFO|        SLC|
|2015|    1|  1|   -2|    1635|   SFO|        IAH|
|2015|    1|  1|   -3|    2139|

In [72]:
spark.sql("select * from us_origin_airport_JFK_tmp_view").show()

+----+-----+---+-----+--------+------+-----------+
|year|month|day|delay|distance|origin|destination|
+----+-----+---+-----+--------+------+-----------+
|2015|    1|  1|   19|    1598|   JFK|        SJU|
|2015|    1|  1|   69|    1089|   JFK|        MIA|
|2015|    1|  1|  -19|    1576|   JFK|        BQN|
|2015|    1|  1|  -22|    1028|   JFK|        PBI|
|2015|    1|  1|  -19|     944|   JFK|        MCO|
|2015|    1|  1|   -1|    1005|   JFK|        TPA|
|2015|    1|  1|  -19|     760|   JFK|        ATL|
|2015|    1|  1|   -4|    2586|   JFK|        SFO|
|2015|    1|  1|   -2|    1069|   JFK|        FLL|
|2015|    1|  1|  -23|    1598|   JFK|        SJU|
|2015|    1|  1|    5|    2153|   JFK|        PHX|
|2015|    1|  1|    4|    2475|   JFK|        LAX|
|2015|    1|  1|  -17|    1074|   JFK|        RSW|
|2015|    1|  1|    2|    1182|   JFK|        MSY|
|2015|    1|  1|   45|    2248|   JFK|        LAS|
|2015|    1|  1|  -12|     187|   JFK|        BOS|
|2015|    1|  1|   -7|    1089|

In [73]:
spark.read.table("us_origin_airport_JFK_tmp_view").show()

+----+-----+---+-----+--------+------+-----------+
|year|month|day|delay|distance|origin|destination|
+----+-----+---+-----+--------+------+-----------+
|2015|    1|  1|   19|    1598|   JFK|        SJU|
|2015|    1|  1|   69|    1089|   JFK|        MIA|
|2015|    1|  1|  -19|    1576|   JFK|        BQN|
|2015|    1|  1|  -22|    1028|   JFK|        PBI|
|2015|    1|  1|  -19|     944|   JFK|        MCO|
|2015|    1|  1|   -1|    1005|   JFK|        TPA|
|2015|    1|  1|  -19|     760|   JFK|        ATL|
|2015|    1|  1|   -4|    2586|   JFK|        SFO|
|2015|    1|  1|   -2|    1069|   JFK|        FLL|
|2015|    1|  1|  -23|    1598|   JFK|        SJU|
|2015|    1|  1|    5|    2153|   JFK|        PHX|
|2015|    1|  1|    4|    2475|   JFK|        LAX|
|2015|    1|  1|  -17|    1074|   JFK|        RSW|
|2015|    1|  1|    2|    1182|   JFK|        MSY|
|2015|    1|  1|   45|    2248|   JFK|        LAS|
|2015|    1|  1|  -12|     187|   JFK|        BOS|
|2015|    1|  1|   -7|    1089|

### Question
What is the difference between a SparkSession and a Spark application

## Drop view

### drop using sql

In [74]:
spark.sql("select count(*) as jfk from us_origin_airport_JFK_tmp_view").show()
spark.sql("select count(*) as sfo from global_temp.us_origin_airport_SFO_global_tmp_view").show()

+-----+
|  jfk|
+-----+
|93811|
+-----+

+------+
|   sfo|
+------+
|148008|
+------+



In [75]:
spark.sql("""drop view if exists us_origin_airport_SFO_global_tmp_view""")

spark.sql("""drop view if exists us_origin_airport_JFK_tmp_view""")

DataFrame[]

### create again

In [76]:
df_sfo.createOrReplaceGlobalTempView('us_origin_airport_SFO_global_tmp_view')
df_jfk.createOrReplaceTempView('us_origin_airport_JFK_tmp_view')

### drop using `catalog`

In [77]:
spark.catalog.dropGlobalTempView('us_origin_airport_SFO_global_tmp_view')
spark.catalog.dropTempView('us_origin_airport_JFK_tmp_view')

## Look at Metadata via `catalog`

### `listDatabases`

In [78]:
dbs = spark.catalog.listDatabases()

In [79]:
[print(f'{db}\n') for db in dbs]

Database(name='default', description='default database', locationUri='file:/Users/bartev/dev/github-bv/san-tan/lrn-spark/spark-warehouse')

Database(name='learn_spark_db', description='', locationUri='file:/Users/bartev/dev/github-bv/san-tan/lrn-spark/spark-warehouse/learn_spark_db.db')



[None, None]

In [80]:
len(dbs)

2

In [81]:
dbs[0]

Database(name='default', description='default database', locationUri='file:/Users/bartev/dev/github-bv/san-tan/lrn-spark/spark-warehouse')

In [82]:
dbs[1]

Database(name='learn_spark_db', description='', locationUri='file:/Users/bartev/dev/github-bv/san-tan/lrn-spark/spark-warehouse/learn_spark_db.db')

### `listTables`

In [83]:
tbls = spark.catalog.listTables()

In [84]:
[print(f'{tbl}\n') for tbl in tbls]

Table(name='us_delay_flights_tbl', database='learn_spark_db', description=None, tableType='EXTERNAL', isTemporary=False)

Table(name='us_delay_flights_tbl', database=None, description=None, tableType='TEMPORARY', isTemporary=True)



[None, None]

### `listColumns`

In [85]:
spark.catalog.listColumns('us_delay_flights_tbl')

[Column(name='year', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='month', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='day', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='delay', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='distance', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='origin', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='destination', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False)]

## Cache sql tables

In [86]:
spark.sql("cache table us_delay_flights_tbl")

DataFrame[]

In [87]:
spark.sql("uncache table us_delay_flights_tbl")

DataFrame[]

# Read table into dataframe

In [88]:
us_flights_df = spark.sql('select * from us_delay_flights_tbl')

In [89]:
# No need for `read` here

us_flights_df2 = spark.table('us_delay_flights_tbl')

In [90]:
us_flights_df.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight_number: integer (nullable = true)
 |-- tail_number: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)
 |-- scheduled_departure: integer (nullable = true)
 |-- departure_time: integer (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- taxi_out: integer (nullable = true)
 |-- wheels_off: integer (nullable = true)
 |-- scheduled_time: integer (nullable = true)
 |-- elapsed_time: integer (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- wheels_on: integer (nullable = true)
 |-- taxi_in: integer (nullable = true)
 |-- scheduled_arrival: integer (nullable = true)
 |-- arrival_time: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- diverted: in

# Data Sources for DataFrames and SQL Tables

## DataFrameReader

* access from a `spark` session

`DataFrameReader.format(args).option("key", "value").schema(args).load()`

## DataFrameWriter

* access from a `DataFrame`

```
DataFrameWriter.format(args).option(args).bucketBy(args).partitionBy(args).save()
DataFrameWriter.format(args).option(args).sortBy(args).saveAsTable()
DataFrame.write or DataFrame.writeStream
```

Options

* format: parque (default), csv, txt, ...
* mode: append, overwrite, ignore, error, errorifexists (default)
* save: /path/to/data/source (can be empty if specified path in options)
* saveAsTable: table_name

## Parquet

### read DataFrame from parquet

In [91]:
file = 'people.parquet'
spark.read.format('parquet').load(file).show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



In [92]:
spark.read.load(file).show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



### Read parquet file into a spark sql table

In [93]:
q = """
create or replace temporary view peeps_tbl
using parquet
options(
    path 'people.parquet')
    """

In [94]:
spark.sql(q)

DataFrame[]

then read into a DataFrame

In [95]:
spark.sql('select * from peeps_tbl').show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



### write DataFrame to parquet

In [96]:
from pyspark.sql.functions import col, lit

In [97]:
df = (spark.sql('select * from peeps_tbl')
     .withColumn('foo', lit('red')))

df.show()

+-----+---+---------+---+
| name|age|      job|foo|
+-----+---+---------+---+
|Jorge| 30|Developer|red|
|  Bob| 32|Developer|red|
+-----+---+---------+---+



#### overwrite

In [98]:
(df.write
    .mode('overwrite')
    .option('compression', 'snappy')
    .save('/tmp/data/parquet/df_peeps_foo'))

In [99]:
spark.read.load('/tmp/data/parquet/df_peeps_foo/').show()

+-----+---+---------+---+
| name|age|      job|foo|
+-----+---+---------+---+
|Jorge| 30|Developer|red|
|  Bob| 32|Developer|red|
+-----+---+---------+---+



#### overwrite with a new column

In [100]:
(df.withColumn('bar', lit('blue'))
    .write
    .mode('overwrite')
    .option('compression', 'snappy')
    .save('/tmp/data/parquet/df_peeps_foo/'))

In [101]:
spark.read.load('/tmp/data/parquet/df_peeps_foo/').show()

+-----+---+---------+---+----+
| name|age|      job|foo| bar|
+-----+---+---------+---+----+
|Jorge| 30|Developer|red|blue|
|  Bob| 32|Developer|red|blue|
+-----+---+---------+---+----+



#### append

In [102]:
(df.withColumn('bar', lit('orange'))
    .write
    .mode('append')
    .option('compression', 'snappy')
    .save('/tmp/data/parquet/df_peeps_foo/'))

In [103]:
spark.read.load('/tmp/data/parquet/df_peeps_foo/').show()

+-----+---+---------+---+------+
| name|age|      job|foo|   bar|
+-----+---+---------+---+------+
|Jorge| 30|Developer|red|orange|
|  Bob| 32|Developer|red|orange|
|Jorge| 30|Developer|red|  blue|
|  Bob| 32|Developer|red|  blue|
+-----+---+---------+---+------+



#### Add a column (won't show up in table)

In [104]:
(df.withColumn('baz', lit('green'))
    .write
    .mode('append')
    .option('compression', 'snappy')
    .save('/tmp/data/parquet/df_peeps_foo/'))

In [105]:
spark.read.load('/tmp/data/parquet/df_peeps_foo/').show()

+-----+---+---------+---+-----+
| name|age|      job|foo|  baz|
+-----+---+---------+---+-----+
|Jorge| 30|Developer|red| null|
|  Bob| 32|Developer|red| null|
|Jorge| 30|Developer|red|green|
|  Bob| 32|Developer|red|green|
|Jorge| 30|Developer|red| null|
|  Bob| 32|Developer|red| null|
+-----+---+---------+---+-----+



In [106]:
(df
    .write
    .mode('append')
    .option('compression', 'snappy')
    .save('/tmp/data/parquet/df_peeps_foo/'))

In [107]:
spark.read.load('/tmp/data/parquet/df_peeps_foo/').show()

+-----+---+---------+---+-----+
| name|age|      job|foo|  baz|
+-----+---+---------+---+-----+
|Jorge| 30|Developer|red| null|
|  Bob| 32|Developer|red| null|
|Jorge| 30|Developer|red|green|
|  Bob| 32|Developer|red|green|
|Jorge| 30|Developer|red| null|
|  Bob| 32|Developer|red| null|
|Jorge| 30|Developer|red| null|
|  Bob| 32|Developer|red| null|
+-----+---+---------+---+-----+



## json

In [108]:
import numpy as np
import pandas as pd

In [109]:
from numpy.random  import MT19937
from numpy.random import RandomState, SeedSequence

### Create random df and write to json

#### aside: numpy `RandomState` - how does this work?

In [118]:
rs = RandomState(MT19937(SeedSequence(1)))

np.random.randint(0, 15, size=(5,3))

array([[10, 12,  6],
       [13,  2, 12],
       [ 1, 12,  4],
       [ 9, 11,  4],
       [12, 13, 12]])

#### using `np.random.seed`

In [125]:
np.random.seed(1)

# np.random.randint(0, 15, size=(5,3))

data = np.random.randint(0, 5*3, size=(5, 3))
columns = ['a', 'b', 'c']
toy_df = pd.DataFrame(data, columns=columns)
toy_df

Unnamed: 0,a,b,c
0,5,11,12
1,8,9,11
2,5,0,0
3,1,12,7
4,13,12,6


In [160]:
file = 'toy_df2.json'
print(toy_df.to_json(orient='records', indent=2))

[
  {
    "a":5,
    "b":11,
    "c":12
  },
  {
    "a":8,
    "b":9,
    "c":11
  },
  {
    "a":5,
    "b":0,
    "c":0
  },
  {
    "a":1,
    "b":12,
    "c":7
  },
  {
    "a":13,
    "b":12,
    "c":6
  }
]


### read json file into DataFrame

If the json has records on separate lines (as when I save with `indent=2`), then set `multiLine = True`

In [170]:
# this is more human readable

toy_df.to_json(file, orient='records', indent=2)

In [164]:
# Fail
spark.read.format('json').option('multiLine', False).load(file).show()

AnalysisException: 'Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the\nreferenced columns only include the internal corrupt record column\n(named _corrupt_record by default). For example:\nspark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count()\nand spark.read.schema(schema).json(file).select("_corrupt_record").show().\nInstead, you can cache or save the parsed results and then send the same query.\nFor example, val df = spark.read.schema(schema).json(file).cache() and then\ndf.filter($"_corrupt_record".isNotNull).count().;'

In [165]:
# Succeed
spark.read.format('json').option('multiLine', True).load(file).show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  5| 11| 12|
|  8|  9| 11|
|  5|  0|  0|
|  1| 12|  7|
| 13| 12|  6|
+---+---+---+



In [171]:
# less readable

toy_df.to_json(file, orient='records', indent=0)

In [172]:
# Succeed
spark.read.format('json').option('multiLine', True).load(file).show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  5| 11| 12|
|  8|  9| 11|
|  5|  0|  0|
|  1| 12|  7|
| 13| 12|  6|
+---+---+---+



In [173]:
# Succeed
spark.read.format('json').option('multiLine', False).load(file).show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  5| 11| 12|
|  8|  9| 11|
|  5|  0|  0|
|  1| 12|  7|
| 13| 12|  6|
+---+---+---+



### read json file into spark sql table

In [174]:
spark.sql(f"""
create or replace temporary view toy_tbl
using json
options (
path = '{file}')
""")

DataFrame[]

In [175]:
spark.sql('select * from toy_tbl').show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  5| 11| 12|
|  8|  9| 11|
|  5|  0|  0|
|  1| 12|  7|
| 13| 12|  6|
+---+---+---+



### write DataFrame to json file

In [179]:
toy = spark.read.format('json').option('multiLine', True).load(file)

In [181]:
toy.show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  5| 11| 12|
|  8|  9| 11|
|  5|  0|  0|
|  1| 12|  7|
| 13| 12|  6|
+---+---+---+



In [189]:
(toy.write.format('json')
    .mode('overwrite')
    .save('/tmp/data/json/toy.json'))

In [190]:
spark.stop()