# Description
----
The other version of this notebook did not have the Databricks data.
This version uses the data provided by Databricks.

# Setup

In [1]:
# Widen the notebook's `.container` element to 95% of the window.
# `display` and `HTML` are taken from IPython.display; importing `display`
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Imports

In [3]:
import os
import os.path as path

## Setup Spark

In [2]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used throughout this notebook, registered under
# the application name "SparkSQLExampleApp".
spark = SparkSession.builder.appName("SparkSQLExampleApp").getOrCreate()

In [11]:
def db_fname(fname, data_dir='~/dev/github-bv/LearningSparkV2/databricks-datasets/learning-spark-v2/'):
    """Return the full path of a file inside the sample-datasets directory.

    Parameters
    ----------
    fname : str
        Path of the data file relative to ``data_dir``,
        e.g. ``'flights/departuredelays.csv'``.
    data_dir : str, optional
        Root directory of the datasets. A leading ``~`` is expanded to the
        current user's home directory. Defaults to the local checkout
        location this notebook was written against; pass another directory
        to run the notebook on a different machine.

    Returns
    -------
    str
        ``data_dir`` joined with ``fname``, with ``~`` expanded.
    """
    # Imported locally so the helper does not depend on the imports cell
    # having been executed first.
    import os.path as path
    return path.expanduser(path.join(data_dir, fname))

In [44]:
# Source file: the departure-delay records from the sample datasets.
csv_file = db_fname('flights/departuredelays.csv')

# Column names and types are declared up front rather than inferred.
schema = """
date STRING,
delay INTEGER,
distance INT,
origin STRING,
destination STRING
"""

# Read the CSV with that schema; the file's first line is a header row.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .schema(schema)
    .load(csv_file)
)

In [45]:
# Register `df` as a temporary view named us_delay_flights_tbl so that the
# SQL queries below can refer to it by that name.
df.createOrReplaceTempView('us_delay_flights_tbl')

In [46]:
# Print the column names, types and nullability of `df` to confirm that the
# declared schema was applied.
df.printSchema()

root
 |-- date: string (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [47]:
# Preview the first rows of the loaded flight data.
df.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
|01061215|   -6|     602|   ABE|        ATL|
|01061725|   69|     602|   ABE|        ATL|
|01061230|    0|     369|   ABE|        DTW|
|01060625|   -3|     602|   ABE|        ATL|
|01070600|    0|     369|   ABE|        DTW|
|01071725|    0|     602|   ABE|        ATL|
|01071230|    0|     369|   ABE|        DTW|
|01070625|    0|     602|   ABE|        ATL|
|01071219|    0|     569|   ABE|        ORD|
|01080600|

## Try some SQL queries

In [48]:
# Flights whose distance exceeds 1000, greatest distance first.
# Runs against the us_delay_flights_tbl temp view; only 10 rows are printed.
q = """
select distance,
    origin,
    destination
from us_delay_flights_tbl
where distance > 1000
order by distance desc
"""
spark.sql(q).show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [43]:
# Flights from SFO to ORD with a delay above 120, longest delay first.
# Runs against the us_delay_flights_tbl temp view; only 10 rows are printed.
q = """
select date,
    delay,
    origin,
    destination
from us_delay_flights_tbl
where delay > 120
and origin = 'SFO' 
and destination = 'ORD'
order by delay desc
"""
spark.sql(q).show(10)

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|02190925| 1638|   SFO|        ORD|
|01031755|  396|   SFO|        ORD|
|01022330|  326|   SFO|        ORD|
|01051205|  320|   SFO|        ORD|
|01190925|  297|   SFO|        ORD|
|02171115|  296|   SFO|        ORD|
|01071040|  279|   SFO|        ORD|
|01051550|  274|   SFO|        ORD|
|03120730|  266|   SFO|        ORD|
|01261104|  258|   SFO|        ORD|
+--------+-----+------+-----------+
only showing top 10 rows



In [50]:
# Label every flight with a delay category using a CASE expression.
# The WHEN branches are listed from the largest threshold down, so a delay
# gets the label of the first branch it satisfies.
# Results are ordered by origin, then by delay from longest to shortest.
q = """
select delay,
    origin,
    destination,
    case when delay > 360 then 'very long delays'
        when delay > 120 then 'long delays'
        when delay > 60 then 'short delays'
        when delay > 0 then 'tolerable delays'
        when delay = 0 then 'no delays'
        when delay < 0 then 'early'
    end as flight_delays
from us_delay_flights_tbl
order by origin, delay desc
"""
spark.sql(q).show(10)

+-----+------+-----------+-------------+
|delay|origin|destination|flight_delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  long delays|
|  305|   ABE|        ATL|  long delays|
|  275|   ABE|        ATL|  long delays|
|  257|   ABE|        ATL|  long delays|
|  247|   ABE|        ATL|  long delays|
|  247|   ABE|        DTW|  long delays|
|  219|   ABE|        ORD|  long delays|
|  211|   ABE|        ATL|  long delays|
|  197|   ABE|        DTW|  long delays|
|  192|   ABE|        ORD|  long delays|
+-----+------+-----------+-------------+
only showing top 10 rows



## Using DataFrame API queries

In [56]:
from pyspark.sql.functions import col, desc, when

In [52]:
# DataFrame-API version of the first SQL query above: distance above 1000,
# greatest distance first, 10 rows printed.
(
    df.select('distance', 'origin', 'destination')
    .filter(col('distance') > 1000)
    .sort(desc('distance'))
    .show(10)
)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [54]:
# Same query once more, this time requesting the descending sort through
# the `ascending` keyword rather than desc().
(
    df.select('distance', 'origin', 'destination')
    .filter(df['distance'] > 1000)
    .orderBy('distance', ascending=False)
    .show(10)
)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [59]:
# Build the delay-category column first. The conditions are chained from the
# largest threshold down, so a row gets the label of the first one it meets.
delay = col('delay')
delay_label = (
    when(delay > 360, 'very long delays')
    .when(delay > 120, 'long delays')
    .when(delay > 60, 'short delays')
    .when(delay > 0, 'tolerable delays')
    .when(delay == 0, 'no delays')
    .when(delay < 0, 'early')
    .alias('flight_delays')
)

# Select the labelled rows, ordered by origin and then by longest delay.
(
    df.select('delay', 'origin', 'destination', delay_label)
    .sort('origin', delay.desc())
    .show()
)

+-----+------+-----------+-------------+
|delay|origin|destination|flight_delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  long delays|
|  305|   ABE|        ATL|  long delays|
|  275|   ABE|        ATL|  long delays|
|  257|   ABE|        ATL|  long delays|
|  247|   ABE|        DTW|  long delays|
|  247|   ABE|        ATL|  long delays|
|  219|   ABE|        ORD|  long delays|
|  211|   ABE|        ATL|  long delays|
|  197|   ABE|        DTW|  long delays|
|  192|   ABE|        ORD|  long delays|
|  180|   ABE|        ATL|  long delays|
|  173|   ABE|        DTW|  long delays|
|  165|   ABE|        ATL|  long delays|
|  159|   ABE|        ORD|  long delays|
|  159|   ABE|        ATL|  long delays|
|  158|   ABE|        ATL|  long delays|
|  151|   ABE|        DTW|  long delays|
|  127|   ABE|        ATL|  long delays|
|  121|   ABE|        DTW|  long delays|
|  118|   ABE|        DTW| short delays|
+-----+------+-----------+-------------+
only showing top

# SQL Tables and Views

## Create SQL Databases and Tables

The default database is called `default`.

In [63]:
# IF NOT EXISTS keeps this cell re-runnable: a bare CREATE DATABASE raises
# when learn_spark_db is already present.
spark.sql('CREATE DATABASE IF NOT EXISTS learn_spark_db')
# Make learn_spark_db the current database for the statements that follow.
spark.sql('USE learn_spark_db')

DataFrame[]

From here on, any tables will be created in `learn_spark_db`.

### Create Managed Table

This fails: without Hive support in the SparkSession, a plain `CREATE TABLE` statement raises an `AnalysisException`.

In [64]:
from pyspark.sql.utils import AnalysisException

# Attempt to create the table with plain SQL DDL.
# On a session without Hive support this statement is rejected with an
# AnalysisException ("Hive support is required to CREATE Hive TABLE").
# The error is caught and printed so that it is still shown as the
# demonstration, without aborting a "Run All" of the notebook.
try:
    spark.sql("""create TABLE managed_us_delay_flights_tbl
(date STRING,
delay INT,
distance INT,
origin STRING, 
destination STRING)""")
except AnalysisException as err:
    print('AnalysisException:', err)

AnalysisException: "Hive support is required to CREATE Hive TABLE (AS SELECT);;\n'CreateTable `managed_us_delay_flights_tbl`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, ErrorIfExists\n"

Try using the DataFrame API instead.

In [65]:
# Source file: the departure-delay records from the sample datasets.
csv_file = db_fname('flights/departuredelays.csv')

# Specify schema
schema = """
date STRING,
delay INTEGER,
distance INT,
origin STRING,
destination STRING
"""

# header=True: the first line of the file holds the column names (the earlier
# load of this same file sets the header option as well). Without it that
# line is read as a data row and ends up in the saved tables.
flights_df = spark.read.csv(csv_file, schema=schema, header=True)

In [67]:
# Save flights_df as the table managed_us_delay_flights_tbl. No `path` option
# is set here, unlike the unmanaged-table example that follows.
flights_df.write.saveAsTable('managed_us_delay_flights_tbl')

### Create Unmanaged Table

In [68]:
# Save the same DataFrame as a table, with the data location given
# explicitly through the `path` option.
flights_df.write.option('path', '/tmp/data/us_flights_delay').saveAsTable('us_delay_flights_tbl')