In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *

In [2]:
# Start (or reuse) the Spark entry points used by every cell below.
# getOrCreate() keeps this cell safe to re-run: constructing a second
# pyspark.SparkContext() in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
# The builder attaches the session to the active SparkContext obtained above.
ss = SparkSession.builder.getOrCreate()

In [11]:
# Load the departure-delays CSV into a cached DataFrame (header row used for
# column names, column types inferred) and save it as the table
# `us_delay_flights_tbl` so the SQL cells below can query it by name.
fil = "./LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
delays = ss.read.csv(fil, header=True, inferSchema=True).cache()
# Alternative kept for reference: register a temp view instead of a saved table.
#delays.createOrReplaceTempView("us_delay_flights_tbl")
# mode("overwrite") keeps the cell re-runnable; the default save mode raises
# an error when the table already exists.
delays.write.mode("overwrite").saveAsTable('us_delay_flights_tbl')

In [12]:
# Delays on long flights: rows with distance > 1000, ordered by distance
# (largest first) and then by delay (largest first).
long_flights_sql = """select * from us_delay_flights_tbl where distance > 1000 order by distance DESC, delay DESC;"""
long_flights = ss.sql(long_flights_sql)
long_flights.show()

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|2121625|  932|    4330|   HNL|        JFK|
|2130900|  922|    4330|   JFK|        HNL|
|1030900|  784|    4330|   JFK|        HNL|
|2140900|  175|    4330|   JFK|        HNL|
|2220900|  134|    4330|   JFK|        HNL|
|3010900|  123|    4330|   JFK|        HNL|
|1220900|  118|    4330|   JFK|        HNL|
|1211625|  115|    4330|   HNL|        JFK|
|1040900|  111|    4330|   JFK|        HNL|
|1021625|  110|    4330|   HNL|        JFK|
|1050900|   98|    4330|   JFK|        HNL|
|1260900|   38|    4330|   JFK|        HNL|
|3291530|   37|    4330|   HNL|        JFK|
|1230900|   20|    4330|   JFK|        HNL|
|1051625|   18|    4330|   HNL|        JFK|
|1160900|   17|    4330|   JFK|        HNL|
|2071625|   16|    4330|   HNL|        JFK|
|3021625|   14|    4330|   HNL|        JFK|
|1080900|   14|    4330|   JFK|        HNL|
|3281530|   14|    4330|   HNL| 

In [13]:
# SFO -> ORD flights with delay > 120 (the "2+ hr" threshold, presumably in
# minutes), worst delay first.
sfo_ord_sql = """select * from us_delay_flights_tbl where origin='SFO' and destination='ORD' and delay > 120 order by delay desc"""
sfo_ord_delays = ss.sql(sfo_ord_sql)
sfo_ord_delays.show()

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|2190925| 1638|    1604|   SFO|        ORD|
|1031755|  396|    1604|   SFO|        ORD|
|1022330|  326|    1604|   SFO|        ORD|
|1051205|  320|    1604|   SFO|        ORD|
|1190925|  297|    1604|   SFO|        ORD|
|2171115|  296|    1604|   SFO|        ORD|
|1071040|  279|    1604|   SFO|        ORD|
|1051550|  274|    1604|   SFO|        ORD|
|3120730|  266|    1604|   SFO|        ORD|
|1261104|  258|    1604|   SFO|        ORD|
|1161210|  225|    1604|   SFO|        ORD|
|2091800|  223|    1604|   SFO|        ORD|
|1221040|  215|    1604|   SFO|        ORD|
|3121155|  203|    1604|   SFO|        ORD|
|2111256|  197|    1604|   SFO|        ORD|
|3311405|  196|    1604|   SFO|        ORD|
|1031920|  193|    1604|   SFO|        ORD|
|1021410|  190|    1604|   SFO|        ORD|
|3171215|  189|    1604|   SFO|        ORD|
|1101410|  184|    1604|   SFO| 

In [14]:
# Define (or replace) a temporary view restricted to flights whose origin is
# SFO, then display how many rows the view contains.
create_sfo_view_sql = """create or replace temp view us_origin_airport_SFO_tmp_view as select * from us_delay_flights_tbl where origin='SFO';"""
count_sfo_view_sql = """select count(*) from us_origin_airport_SFO_tmp_view;"""
ss.sql(create_sfo_view_sql)
sfo_view_count = ss.sql(count_sfo_view_sql)
sfo_view_count.show()

+--------+
|count(1)|
+--------+
|   39483|
+--------+



In [15]:
# Inspect the session catalog: databases, then tables/views, then the columns
# of the flights table. Each listing is printed before the next is fetched.
catalog = ss.catalog
databases = catalog.listDatabases()
print(databases)
tables = catalog.listTables()
print(tables)
# NOTE(review): the original author remarked that this call does not work when
# the name was created as a view; not verified here.
flight_columns = catalog.listColumns('us_delay_flights_tbl')
print(flight_columns)

[Database(name='default', description='default database', locationUri='file:/C:/Users/howean/Documents/spark_learning/spark-warehouse')]
[Table(name='us_delay_flights_tbl', database='default', description=None, tableType='MANAGED', isTemporary=False), Table(name='us_delay_flights_tbl', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='us_origin_airport_sfo_global_tmp_view', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='us_origin_airport_sfo_tmp_view', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]
[Column(name='date', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False), Column(name='delay', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False), Column(name='distance', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False), Column(name='origin', description=None, dataType='string', nullable=Tr

In [None]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()