In [1]:
from pyspark.sql import SparkSession

In [None]:
""" 
    Spark by default uses the Apache Hive metastore, 
    located at /user/hive/warehouse, to persist all the metadata about your tables. 

    Spark allows you to create two types of tables: managed and unmanaged. 
   
   For a managed table, Spark manages both the metadata and the data in the file store.
   This could be a local filesystem, HDFS, or an object store such as Amazon S3 or Azure Blob. 
   
   For an unmanaged table, Spark only manages the metadata,
   while you manage the data yourself (i.e., you specify the location
   where the data will be stored) in an external data source such as Cassandra.

   With a managed table, because Spark manages everything, 
   a SQL command such as DROP TABLE table_name deletes both 
   the metadata and the data. With an unmanaged table, the same 
   command will delete only the metadata, not the actual data. 
   We will look at some examples of how to create managed and
   unmanaged tables in the next section 

"""

In [3]:
# Tables reside within a database.
# By default, Spark creates tables under the default database.
# To create your own database, you can issue a SQL command
# from your Spark application or notebook. Using the US flight
# delays data set, let’s create both a managed and an unmanaged table.
# To begin, we’ll create a database called learn_spark_db and tell
# Spark we want to use that database.

# Create a Spark session.
# The Spark session is the entry point to access Spark APIs.
spark = SparkSession.builder.getOrCreate()

# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE raises
# an error when learn_spark_db is already present from an earlier run.
spark.sql("CREATE DATABASE IF NOT EXISTS learn_spark_db")

# Make learn_spark_db the current database so the unqualified table names
# used in the rest of the notebook resolve inside it.
spark.sql("USE learn_spark_db")

DataFrame[]

In [15]:
# To create a managed table within the database learn_spark_db,
# we can issue a SQL query like the following:
#
#   SQL syntax
#   spark.sql("CREATE TABLE managed_us_delay_flights_tbl "
#             "(date STRING, delay INT, distance INT, origin STRING, destination STRING)")

# DataFrame API syntax
csv_file = "airlinedelaycauses_DelayedFlights.csv"
schema = "date STRING, delay INT, distance INT, origin STRING, destination STRING"
# NOTE(review): no header option is passed, so a header row in the CSV (if
# any) would be read as data — confirm against the actual file.
flights_df = spark.read.csv(csv_file, schema=schema)

# No path is given, so Spark stores the data itself (a managed table).
# The table name matches the one used by the SQL example above and by the
# query on managed_us_delay_flights_tbl elsewhere in this notebook.
# mode("overwrite") lets the cell be re-run; the default mode raises an error
# when the table already exists.
(
    flights_df.write
    .mode("overwrite")
    .saveAsTable("managed_us_delay_flights_tbl")
)


21/11/26 23:12:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/11/26 23:12:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/11/26 23:12:26 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
                                                                                

21/11/26 20:01:46 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/11/26 20:01:46 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/11/26 20:01:51 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
                                                                                

In [6]:
# Select every column of the managed table through SQL. The call is the last
# expression of the cell, so the resulting DataFrame is what gets displayed.
managed_flights_query = "Select * from managed_us_delay_flights_tbl"
spark.sql(managed_flights_query)

DataFrame[date: string, delay: int, distance: int, origin: string, destination: string]

In [None]:
# Create unmanaged tables.

# SQL syntax: the table is declared over a CSV file at an explicit PATH, so
# Spark records only the metadata and the data stays where it is.
# IF NOT EXISTS keeps the statement re-runnable.
spark.sql("""CREATE TABLE IF NOT EXISTS us_delay_flights_tbl(date STRING, delay INT,
      distance INT, origin STRING, destination STRING)
      USING csv OPTIONS (PATH
      '/databricks-datasets/learning-spark-v2/flights/departuredelays.csv')""")

# And within the DataFrame API use the "path" option to choose where the data
# is stored. flights_df is the DataFrame read from the CSV file in an earlier
# cell.
# The chain is wrapped in parentheses because a method chain split across
# lines is only valid Python inside brackets.
# mode("overwrite") is needed because the SQL statement above already created
# us_delay_flights_tbl; with the default mode saveAsTable would fail on the
# existing table.
(
    flights_df
    .write
    .mode("overwrite")
    .option("path", "/tmp/data/us_flights_delay")
    .saveAsTable("us_delay_flights_tbl")
)