In [None]:
from pyspark.sql import SparkSession 
# Create a SparkSession
spark = (SparkSession
 .builder
 .appName("SparkSQLExampleApp")
 .getOrCreate())
# Path to data set
csv_file = "/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
# Read and create a temporary view
# Infer schema (note that for larger files you 
# may want to specify the schema)
df = (spark.read.format("csv")
 .option("inferSchema", "true")
 .option("header", "true")
 .load(csv_file))
df.createOrReplaceTempView("us_delay_flights_tbl")


In [1]:
# If you want to specify a schema, you can use a DDL-formatted string. For example:

# In Python
schema = "`date` STRING, `delay` INT, `distance` INT, 
`origin` STRING, `destination` STRING"


SyntaxError: unterminated string literal (detected at line 4) (1447530214.py, line 4)

In [None]:
spark.sql("""SELECT distance, origin, destination 
FROM us_delay_flights_tbl WHERE distance > 1000 
ORDER BY distance DESC""").show(10)

In [None]:
spark.sql("""SELECT date, delay, origin, destination 
FROM us_delay_flights_tbl 
WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD' 
ORDER by delay DESC""").show(10)

In [None]:
spark.sql("""SELECT delay, origin, destination,
 CASE
 WHEN delay > 360 THEN 'Very Long Delays'
 WHEN delay > 120 AND delay < 360 THEN 'Long Delays'
 WHEN delay > 60 AND delay < 120 THEN 'Short Delays'
 WHEN delay > 0 and delay < 60 THEN 'Tolerable Delays'
 WHEN delay = 0 THEN 'No Delays'
 ELSE 'Early'
 END AS Flight_Delays
 FROM us_delay_flights_tbl
 ORDER BY origin, delay DESC""").show(10)


In [None]:
from pyspark.sql.functions import col, desc
(df.select("distance", "origin", "destination")
 .where(col("distance") > 1000)
 .orderBy(desc("distance"))).show(10)


In [None]:
# Or
(df.select("distance", "origin", "destination")
 .where("distance > 1000")
 .orderBy("distance", ascending=False).show(10))


In [None]:
spark.sql("CREATE DATABASE learn_spark_db")
spark.sql("USE learn_spark_db")


In [None]:
spark.sql("CREATE TABLE managed_us_delay_flights_tbl (date STRING, delay INT, 
 distance INT, origin STRING, destination STRING)")


In [None]:
# Path to our US flight delays CSV file 
csv_file = "/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"


In [None]:
# Schema as defined in the preceding example
schema="date STRING, delay INT, distance INT, origin STRING, destination STRING"
flights_df = spark.read.csv(csv_file, schema=schema)
flights_df.write.saveAsTable("managed_us_delay_flights_tbl")

In [None]:
spark.sql("""CREATE TABLE us_delay_flights_tbl(date STRING, delay INT, 
 distance INT, origin STRING, destination STRING) 
 USING csv OPTIONS (PATH 
 '/databricks-datasets/learning-spark-v2/flights/departuredelays.csv')""")


In [None]:
(flights_df
 .write
 .option("path", "/tmp/data/us_flights_delay")
 .saveAsTable("us_delay_flights_tbl"))
