# Description
----
The other version of this notebook did not have the Databricks data.
This version uses the data provided by Databricks

# Setup

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Imports

In [3]:
import os
import os.path as path

## Setup Spark

In [2]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = (SparkSession
  .builder
  .appName("SparkSQLExampleApp")
  .getOrCreate())

In [11]:
def db_fname(fname):
    import os.path as path
    data_dir = '~/dev/github-bv/LearningSparkV2/databricks-datasets/learning-spark-v2/'
    return path.expanduser(path.join(data_dir, fname))

# Try some sql queries

In [44]:
csv_file = db_fname('flights/departuredelays.csv')

# Specify schema
schema = """
date STRING,
delay INTEGER,
distance INT,
origin STRING,
destination STRING
"""

df = (spark.read.format('csv')
#     .option('inferSchema', 'true')
      .schema(schema)
      .option('header', 'true')
      .load(csv_file))

In [45]:
# Create a temporary view
df.createOrReplaceTempView('us_delay_flights_tbl')

In [46]:
df.printSchema()

root
 |-- date: string (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [47]:
df.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
|01061215|   -6|     602|   ABE|        ATL|
|01061725|   69|     602|   ABE|        ATL|
|01061230|    0|     369|   ABE|        DTW|
|01060625|   -3|     602|   ABE|        ATL|
|01070600|    0|     369|   ABE|        DTW|
|01071725|    0|     602|   ABE|        ATL|
|01071230|    0|     369|   ABE|        DTW|
|01070625|    0|     602|   ABE|        ATL|
|01071219|    0|     569|   ABE|        ORD|
|01080600|

In [48]:
q = """
select distance,
    origin,
    destination
from us_delay_flights_tbl
where distance > 1000
order by distance desc
"""
spark.sql(q).show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [43]:
q = """
select date,
    delay,
    origin,
    destination
from us_delay_flights_tbl
where delay > 120
and origin = 'SFO' 
and destination = 'ORD'
order by delay desc
"""
spark.sql(q).show(10)

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|02190925| 1638|   SFO|        ORD|
|01031755|  396|   SFO|        ORD|
|01022330|  326|   SFO|        ORD|
|01051205|  320|   SFO|        ORD|
|01190925|  297|   SFO|        ORD|
|02171115|  296|   SFO|        ORD|
|01071040|  279|   SFO|        ORD|
|01051550|  274|   SFO|        ORD|
|03120730|  266|   SFO|        ORD|
|01261104|  258|   SFO|        ORD|
+--------+-----+------+-----------+
only showing top 10 rows



In [50]:
q = """
select delay,
    origin,
    destination,
    case when delay > 360 then 'very long delays'
        when delay > 120 then 'long delays'
        when delay > 60 then 'short delays'
        when delay > 0 then 'tolerable delays'
        when delay = 0 then 'no delays'
        when delay < 0 then 'early'
    end as flight_delays
from us_delay_flights_tbl
order by origin, delay desc
"""
spark.sql(q).show(10)

+-----+------+-----------+-------------+
|delay|origin|destination|flight_delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  long delays|
|  305|   ABE|        ATL|  long delays|
|  275|   ABE|        ATL|  long delays|
|  257|   ABE|        ATL|  long delays|
|  247|   ABE|        ATL|  long delays|
|  247|   ABE|        DTW|  long delays|
|  219|   ABE|        ORD|  long delays|
|  211|   ABE|        ATL|  long delays|
|  197|   ABE|        DTW|  long delays|
|  192|   ABE|        ORD|  long delays|
+-----+------+-----------+-------------+
only showing top 10 rows



## Using DataFrame API query

In [56]:
from pyspark.sql.functions import col, desc, when

In [52]:
(df.select('distance', 'origin', 'destination')
    .where(col('distance') > 1000)
    .orderBy(desc('distance'))
    .show(10))

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [54]:
# or, same as above

(df.select('distance', 'origin', 'destination')
    .where(col('distance') > 1000)
    .orderBy('distance', ascending=False)
    .show(10))

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [59]:
(df.select('delay', 'origin', 'destination',
          when(col('delay') > 360, 'very long delays')
          .when(col('delay') > 120, 'long delays')
          .when(col('delay') > 60, 'short delays')
          .when(col('delay') > 0, 'tolerable delays')
          .when(col('delay') == 0, 'no delays')
          .when(col('delay') < 0, 'early').alias('flight_delays'))
    .orderBy('origin', desc('delay'))
    .show())

+-----+------+-----------+-------------+
|delay|origin|destination|flight_delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  long delays|
|  305|   ABE|        ATL|  long delays|
|  275|   ABE|        ATL|  long delays|
|  257|   ABE|        ATL|  long delays|
|  247|   ABE|        DTW|  long delays|
|  247|   ABE|        ATL|  long delays|
|  219|   ABE|        ORD|  long delays|
|  211|   ABE|        ATL|  long delays|
|  197|   ABE|        DTW|  long delays|
|  192|   ABE|        ORD|  long delays|
|  180|   ABE|        ATL|  long delays|
|  173|   ABE|        DTW|  long delays|
|  165|   ABE|        ATL|  long delays|
|  159|   ABE|        ORD|  long delays|
|  159|   ABE|        ATL|  long delays|
|  158|   ABE|        ATL|  long delays|
|  151|   ABE|        DTW|  long delays|
|  127|   ABE|        ATL|  long delays|
|  121|   ABE|        DTW|  long delays|
|  118|   ABE|        DTW| short delays|
+-----+------+-----------+-------------+
only showing top

# SQL Tables and Views

## Create SQL Databases and Tables

Default database is called `default`

In [63]:
spark.sql('create database learn_spark_db')
spark.sql('USE learn_spark_db')

DataFrame[]

From here on, any tables will be created in `learn_spark_db`

### Create Managed Table

Fails - needs Hive support

In [64]:
spark.sql("""create TABLE managed_us_delay_flights_tbl
(date STRING,
delay INT,
distance INT,
origin STRING, 
desitination STRING)""")

AnalysisException: "Hive support is required to CREATE Hive TABLE (AS SELECT);;\n'CreateTable `managed_us_delay_flights_tbl`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, ErrorIfExists\n"

Try using DataFrame API

In [65]:
csv_file = db_fname('flights/departuredelays.csv')

# Specify schema
schema = """
date STRING,
delay INTEGER,
distance INT,
origin STRING,
destination STRING
"""

flights_df = spark.read.csv(csv_file, schema=schema)

In [67]:
flights_df.write.saveAsTable('managed_us_delay_flights_tbl')

### Create Unmanaged Table

In [68]:
(flights_df
    .write
    .option('path', '/tmp/data/us_flights_delay')
    .saveAsTable('us_delay_flights_tbl'))

## Create Views

* Views can be global (visible across all `SparkSession`s on a cluster) or session scoped

* Create and query like a table
* Views disappear after spark application terminates.
* tables persist
----
When querying against global view, must use the prefix `global_temp.`

### Create using SQL

In [69]:
q = """
create or replace temp view us_origin_airport_jfk_tmp_view as
select date, delay, origin, destination
from us_delay_flights_tbl
where origin = 'JFK'
"""
spark.sql(q)

DataFrame[]

In [70]:
spark.sql("""select * from us_origin_airport_jfk_tmp_view limit 10""").show()

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|01010900|   14|   JFK|        LAX|
|01011200|   -3|   JFK|        LAX|
|01011900|    2|   JFK|        LAX|
|01011700|   11|   JFK|        LAS|
|01010800|   -1|   JFK|        SFO|
|01011540|   -4|   JFK|        DFW|
|01011705|    5|   JFK|        SAN|
|01011530|   -3|   JFK|        SFO|
|01011630|   -3|   JFK|        SJU|
|01011345|    2|   JFK|        LAX|
+--------+-----+------+-----------+



### Create using Python and DataFrame API

In [71]:
df_sfo = spark.sql("""select date, delay, origin, destination
    from us_delay_flights_tbl where origin = 'SFO'""")
df_jfk = spark.sql("""select date, delay, origin, destination
    from us_delay_flights_tbl where origin = 'JFK'""")

Create temp and global temp view

In [72]:
df_sfo.createOrReplaceGlobalTempView("us_origin_airport_SFO_global_tmp_view")

In [73]:
df_jfk.createOrReplaceTempView("us_origin_airport_jfk_tmp_view")

### Query View

In [74]:
q = """select * from global_temp.us_origin_airport_SFO_global_tmp_view"""
spark.sql(q).show()

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|01011250|   55|   SFO|        JFK|
|01012230|    0|   SFO|        JFK|
|01010705|   -7|   SFO|        JFK|
|01010620|   -3|   SFO|        MIA|
|01010915|   -3|   SFO|        LAX|
|01011005|   -8|   SFO|        DFW|
|01011800|    0|   SFO|        ORD|
|01011740|   -7|   SFO|        LAX|
|01012015|   -7|   SFO|        LAX|
|01012110|   -1|   SFO|        MIA|
|01011610|  134|   SFO|        DFW|
|01011240|   -6|   SFO|        MIA|
|01010755|   -3|   SFO|        DFW|
|01010020|    0|   SFO|        DFW|
|01010705|   -6|   SFO|        LAX|
|01010925|   -3|   SFO|        ORD|
|01010555|   -6|   SFO|        ORD|
|01011105|   -8|   SFO|        DFW|
|01012330|   32|   SFO|        ORD|
|01011330|    3|   SFO|        DFW|
+--------+-----+------+-----------+
only showing top 20 rows



In [75]:
q = """select * from us_origin_airport_jfk_tmp_view"""
spark.sql(q).show()

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|01010900|   14|   JFK|        LAX|
|01011200|   -3|   JFK|        LAX|
|01011900|    2|   JFK|        LAX|
|01011700|   11|   JFK|        LAS|
|01010800|   -1|   JFK|        SFO|
|01011540|   -4|   JFK|        DFW|
|01011705|    5|   JFK|        SAN|
|01011530|   -3|   JFK|        SFO|
|01011630|   -3|   JFK|        SJU|
|01011345|    2|   JFK|        LAX|
|01011545|   -3|   JFK|        LAX|
|01011510|   -1|   JFK|        MIA|
|01011745|    7|   JFK|        SFO|
|01011250|    3|   JFK|        BOS|
|01011645|  142|   JFK|        LAX|
|01012135|   -2|   JFK|        LAX|
|01011715|   18|   JFK|        ORD|
|01011615|   25|   JFK|        IAH|
|01011850|   -2|   JFK|        SEA|
|01011725|   -5|   JFK|        BOS|
+--------+-----+------+-----------+
only showing top 20 rows



In [76]:
spark.read.table("us_origin_airport_jfk_tmp_view").show()

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|01010900|   14|   JFK|        LAX|
|01011200|   -3|   JFK|        LAX|
|01011900|    2|   JFK|        LAX|
|01011700|   11|   JFK|        LAS|
|01010800|   -1|   JFK|        SFO|
|01011540|   -4|   JFK|        DFW|
|01011705|    5|   JFK|        SAN|
|01011530|   -3|   JFK|        SFO|
|01011630|   -3|   JFK|        SJU|
|01011345|    2|   JFK|        LAX|
|01011545|   -3|   JFK|        LAX|
|01011510|   -1|   JFK|        MIA|
|01011745|    7|   JFK|        SFO|
|01011250|    3|   JFK|        BOS|
|01011645|  142|   JFK|        LAX|
|01012135|   -2|   JFK|        LAX|
|01011715|   18|   JFK|        ORD|
|01011615|   25|   JFK|        IAH|
|01011850|   -2|   JFK|        SEA|
|01011725|   -5|   JFK|        BOS|
+--------+-----+------+-----------+
only showing top 20 rows



### Drop view

In [77]:
spark.sql("drop view if exists us_origin_airport_SFO_global_tmp_view")

DataFrame[]

In [79]:
try:
    spark.read.table('us_origin_airport_SFO_global_tmp_view')
except Exception:
    print('failed')

failed


In [80]:
spark.catalog.dropTempView('us_origin_airport_jfk_tmp_view')

In [81]:
try:
    spark.read.table('us_origin_airport_jfk_tmp_view')
except Exception:
    print('failed')

failed


## View the metadata

In [82]:
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/Users/bartev/dev/github-bv/san-tan/lrn-spark/spark-warehouse'),
 Database(name='learn_spark_db', description='', locationUri='file:/Users/bartev/dev/github-bv/san-tan/lrn-spark/spark-warehouse/learn_spark_db.db')]

In [90]:
spark.catalog.listTables()

[Table(name='managed_us_delay_flights_tbl', database='learn_spark_db', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='us_delay_flights_tbl', database='learn_spark_db', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='us_delay_flights_tbl', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [88]:
[(t.database, t.name) for t in spark.catalog.listTables()]

[('learn_spark_db', 'managed_us_delay_flights_tbl'),
 ('learn_spark_db', 'us_delay_flights_tbl'),
 (None, 'us_delay_flights_tbl')]

In [89]:
spark.catalog.listColumns('us_delay_flights_tbl')

[Column(name='date', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='delay', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='distance', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='origin', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='destination', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False)]

## Cachine SQL Tables

In [91]:
spark.sql("cache table us_delay_flights_tbl")

DataFrame[]

In [92]:
spark.sql("uncache table us_delay_flights_tbl")

DataFrame[]

Lazy cache

In [93]:
spark.sql("cache lazy table us_delay_flights_tbl")

DataFrame[]

In [94]:
spark.sql("uncache table us_delay_flights_tbl")

DataFrame[]

## Read table into DataFrame

In [95]:
us_flights_df = spark.sql("select * from us_delay_flights_tbl")

In [96]:
us_flights_df2 = spark.table('us_delay_flights_tbl')

In [98]:
us_flights_df.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
|01061215|   -6|     602|   ABE|        ATL|
|01061725|   69|     602|   ABE|        ATL|
|01061230|    0|     369|   ABE|        DTW|
|01060625|   -3|     602|   ABE|        ATL|
|01070600|    0|     369|   ABE|        DTW|
|01071725|    0|     602|   ABE|        ATL|
|01071230|    0|     369|   ABE|        DTW|
|01070625|    0|     602|   ABE|        ATL|
|01071219|    0|     569|   ABE|        ORD|
|01080600|

In [97]:
us_flights_df2.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
|01030605|    0|     602|   ABE|        ATL|
|01041243|   10|     602|   ABE|        ATL|
|01040605|   28|     602|   ABE|        ATL|
|01051245|   88|     602|   ABE|        ATL|
|01050605|    9|     602|   ABE|        ATL|
|01061215|   -6|     602|   ABE|        ATL|
|01061725|   69|     602|   ABE|        ATL|
|01061230|    0|     369|   ABE|        DTW|
|01060625|   -3|     602|   ABE|        ATL|
|01070600|    0|     369|   ABE|        DTW|
|01071725|    0|     602|   ABE|        ATL|
|01071230|    0|     369|   ABE|        DTW|
|01070625|    0|     602|   ABE|        ATL|
|01071219|    0|     569|   ABE|        ORD|
|01080600|

# Data Sources for DataFrames and SQL Tables

## `DataFrameReader`

use `SparkSession.read` to get an instance of a `DataFrameReader`

In [107]:
file = db_fname('flights/summary-data/parquet/2010-summary.parquet')

In [108]:
file

'/Users/bartev/dev/github-bv/LearningSparkV2/databricks-datasets/learning-spark-v2/flights/summary-data/parquet/2010-summary.parquet'

In [110]:
df = spark.read.format('parquet').load(file)

In [111]:
df.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
+-----------------+-------------------+-----+
only showing top 3 rows



In [112]:
df2 = spark.read.load(file)
df2.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
+-----------------+-------------------+-----+
only showing top 3 rows



In [113]:
file3 = db_fname('flights/summary-data/csv/*')
file3

'/Users/bartev/dev/github-bv/LearningSparkV2/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*'

In [114]:
df3 = (spark.read.format('csv')
          .option("inferSchema", "true")
          .option('header', 'true')
          .option('mode', 'PERMISSIVE')
          .load(file3))

In [115]:
df3.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
+-----------------+-------------------+-----+
only showing top 3 rows



In [117]:
file4 = db_fname('flights/summary-data/json/*')
df4 = (spark.read.format('json')
          .load(file4))
df4.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
+-----------------+-------------------+-----+
only showing top 3 rows



## `DataFrameWriter`
----
To get an instance, use

`DataFrame.write`

## Parquet

### Reading parquet file into a spark sql table

In [119]:
parq_fname = db_fname('flights/summary-data/parquet/2010-summary.parquet')

In [122]:
q = f"""create or replace temporary view us_delay_flights_tbl
using parquet
options(path "{parq_fname}")"""

In [123]:
print(q)

create or replace temporary view us_delay_flights_tbl
using parquet
options(path "/Users/bartev/dev/github-bv/LearningSparkV2/databricks-datasets/learning-spark-v2/flights/summary-data/parquet/2010-summary.parquet")


In [124]:
spark.sql(q)

DataFrame[]

In [126]:
df = spark.sql("select * from us_delay_flights_tbl")
df.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
+-----------------+-------------------+-----+
only showing top 3 rows



### Write DataFrame to parquet files

In [127]:
(df.write.format('parquet')
    .mode('overwrite')
    .option('compression', 'snappy')
    .save('/tmp/data/parquet/df_parquet'))

### Write DataFrame to spark sql table

In [131]:
(df.write
    .mode('overwrite')
    .saveAsTable('us_delay_flights_tbl'))

In [130]:
!ls -la spark-warehouse/learn_spark_db.db

total 0
drwxr-xr-x   4 bartev  staff  128 Oct  4 23:29 [1m[36m.[m[m
drwxr-xr-x   8 bartev  staff  256 Sep 17 10:48 [1m[36m..[m[m
drwxr-xr-x  20 bartev  staff  640 Oct  4 01:50 [1m[36mmanaged_us_delay_flights_tbl[m[m
drwxr-xr-x   6 bartev  staff  192 Oct  4 23:29 [1m[36mus_delay_flights_tbl[m[m


## JSON

In [132]:
json_fname = db_fname('flights/summary-data/json/*')

In [133]:
df_j = spark.read.format('json').load(json_fname)

In [135]:
df_j.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
+-----------------+-------------------+-----+
only showing top 3 rows



### Reading json file into a spark sql table

In [231]:
spark.sql("drop table if exists us_delay_flights_tbl")

DataFrame[]

In [235]:
try:
    df_j = spark.table('us_delay_flights_tbl')
    df_j.show()
except Exception:
    print('failed')

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [233]:
q = f"""create or replace temporary view us_delay_flights_tbl
using json
options(path "{json_fname}")"""
spark.sql(q)

DataFrame[]

In [234]:
try:
    df = spark.table('us_delay_flights_tbl')
    df.show(3)
except Exception:
    print('failed')

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
+-----------------+-------------------+-----+
only showing top 3 rows



In [237]:
df_j.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



### Write DataFrame to JSON file

In [179]:
(df.limit(10).show())

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+



In [181]:
(df.limit(10)
    .write
    .format('json')
    .mode('overwrite')
#     .option('compression', 'snappy')
    .save('/tmp/data/json/df_json'))

In [185]:
(df.limit(10)
    .write
    .format('json')
    .mode('overwrite')
    .option('multiLine', 'true')
#     .option('compression', 'snappy')
    .save('/tmp/data/json/df_json_multi'))

## CSV

In [238]:
csv_fname = db_fname('flights/summary-data/csv/*')

In [239]:
schema = """
DEST_COUNTRY_NAME STRING, 
ORIGIN_COUNTRY_NAME STRING, 
count INT"""

In [240]:
df_csv = (spark.read.format('csv')
             .option('header', 'true')
             .schema(schema)
             .option('mode', 'FAILFAST')
             .option('nullValue', '')
             .load(csv_fname))

In [241]:
df_csv.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
+-----------------+-------------------+-----+
only showing top 3 rows



In [242]:
df_csv.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



### Reading csv file into a spark sql table

In [243]:
q = f"""
create or replace temporary view us_delay_flights_tbl
using csv
options (
    path "{csv_fname}",
    header "true",
    mode 'FAILFAST',
    nullvalue '',
    inferSchema 'true'
    )
"""
print(q)


create or replace temporary view us_delay_flights_tbl
using csv
options (
    path "/Users/bartev/dev/github-bv/LearningSparkV2/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*",
    header "true",
    mode 'FAILFAST',
    nullvalue '',
    inferSchema 'true'
    )



In [244]:
spark.sql("drop table if exists us_delay_flights_tbl")

DataFrame[]

In [248]:
try:
    df_c1 = spark.table('us_delay_flights_tbl')
    df_c1.show()
except Exception:
    print('failed')

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [249]:
spark.sql(q)

DataFrame[]

In [250]:
df_c1.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [251]:
q = f"""
create or replace temporary view us_delay_flights_tbl
using csv
options (
    path "{csv_fname}",
    header "true",
    mode 'FAILFAST',
    nullvalue '',
    schema "{schema}"
    )
"""
print(q)


create or replace temporary view us_delay_flights_tbl
using csv
options (
    path "/Users/bartev/dev/github-bv/LearningSparkV2/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*",
    header "true",
    mode 'FAILFAST',
    nullvalue '',
    schema "
DEST_COUNTRY_NAME STRING, 
ORIGIN_COUNTRY_NAME STRING, 
count INT"
    )



In [252]:
spark.sql("drop table if exists us_delay_flights_tbl")

DataFrame[]

In [253]:
try:
    df_c = spark.table('us_delay_flights_tbl')
    df_c.show()
except Exception:
    print('failed')

failed


In [254]:
spark.sql(q)

DataFrame[]

In [255]:
try:
    df_c = spark.table('us_delay_flights_tbl')
    df_c.show()
except Exception:
    print('failed')

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [256]:
df_c.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: string (nullable = true)

