In [1]:
""" 
    Spark can create views on top of existing tables. 
    Views can be global (visible across all SparkSessions on a given cluster) 
    or session-scoped (visible only to a single SparkSession),
    and they are temporary: they disappear after your Spark application terminates.
"""

"""
  Once you create a view, you can query it as you would a table. 
  The difference between a view and a table is that views don’t 
  actually hold the data; tables persist after your Spark application terminates, 
  but views disappear.
"""

"""
 You can create a view from an existing table using SQL.
 For example, if you wish to work on only the subset of 
 the US flight delays data set with origin airports of 
 New York (JFK) and San Francisco (SFO), the following 
 queries will create global temporary and temporary
 views consisting of just that slice of the table:
"""

'\n You can create a view from an existing table using SQL.\n For example, if you wish to work on only the subset of \n the US flight delays data set with origin airports of \n New York (JFK) and San Francisco (SFO), the following \n queries will create global temporary and temporary\n views consisting of just that slice of the table:\n'

In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()


# DataFrame API syntax: load the flight-delay CSV using an explicit DDL schema.
# NOTE(review): no `header` option is passed to the reader; if the CSV starts
# with a header row it will be read as data -- confirm against the file.
csv_file = "airlinedelaycauses_DelayedFlights.csv"
schema = "date STRING, delay INT, distance INT, origin STRING, destination STRING"
flights_df = spark.read.csv(csv_file, schema=schema)

# The views below select from a managed table. Create it from the CSV only when
# the catalog does not already list it, so the cell runs on a fresh catalog and
# can be re-run without failing because the table already exists.
source_table = "managed_us_delay_flights_tbl1"
if source_table not in [tbl.name for tbl in spark.catalog.listTables()]:
    flights_df.write.saveAsTable(source_table)


# In SQL: a global temporary view holding the SFO-origin slice of the table ...
spark.sql(" CREATE OR REPLACE GLOBAL TEMP VIEW us_origin_airport_SFO_global_tmp_view \
AS SELECT date, delay, origin, destination from managed_us_delay_flights_tbl1 WHERE origin = 'SFO'")

# ... and a session-scoped temporary view holding the JFK-origin slice.
# Kept as the last expression so the cell still displays the resulting DataFrame.
spark.sql("CREATE OR REPLACE TEMP VIEW us_origin_airport_JFK_tmp_view AS \
SELECT date, delay, origin, destination from managed_us_delay_flights_tbl1 WHERE origin = 'JFK'")

DataFrame[]

In [9]:
# You can accomplish the same thing with the DataFrame API as follows:

# In Python
# Read the managed table once and keep only the four columns the views expose,
# instead of repeating the same SQL string for every origin airport.
flights_selected = spark.table("managed_us_delay_flights_tbl1").select(
    "date", "delay", "origin", "destination"
)

# One filtered DataFrame per origin airport.
df_sfo = flights_selected.where(flights_selected["origin"] == "SFO")
df_jfk = flights_selected.where(flights_selected["origin"] == "JFK")

# Create a temporary and global temporary view
df_sfo.createOrReplaceGlobalTempView("us_origin_airport_SFO_global_tmp_view")
df_jfk.createOrReplaceTempView("us_origin_airport_JFK_tmp_view")


In [10]:
"""
   Once you’ve created these views, you can issue queries against them just as you would against a table.
"""
"""
   Keep in mind that when accessing a global temporary view
   you must use the prefix global_temp.<view_name>,
   because Spark creates global temporary views in a 
   global temporary database called global_temp.
"""
# In SQL: the view name is qualified with the `global_temp` database prefix.
spark.sql(
    "SELECT * FROM "
    "global_temp.us_origin_airport_SFO_global_tmp_view"
)


DataFrame[date: string, delay: int, origin: string, destination: string]