### Spark SQL

Normally, in a standalone Spark application, you will create a SparkSession instance manually. However, in a Spark shell (or Databricks notebook), the SparkSession is created for you and accessible via the appropriately named variable spark.

In [49]:
from pyspark.sql import SparkSession 
# Create a SparkSession
spark = (SparkSession
 .builder
 .appName("SparkSQLExampleApp")
 .getOrCreate())

# Path to data set
csv_file = "C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"

# Read and create a temporary view
# Infer schema (note that for larger files you may want to specify the schema)
df = (spark.read.format("csv")
 .option("inferSchema", "true")
 .option("header", "true")
 .load(csv_file))

df.createOrReplaceTempView("us_delay_flights_tbl")

If you want to specify a schema, you can use a DDL-formatted string. For example:
schema = "'date' STRING, 'delay' INT, 'distance' INT, 'origin' STRING, 'destination' STRING"

In [50]:
# Now that we have a temporary view, we can issue SQL queries using Spark SQL.
#These queries are no different from those you might issue against a SQL table.

spark.sql("""SELECT distance, origin, destination
FROM us_delay_flights_tbl 
WHERE distance > 1000
ORDER BY distance DESC""").show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
+--------+------+-----------+
only showing top 10 rows



In [51]:
# All flights between San Francisco (SFO) and Chicago (ORD) with at least a 2-hour delay:

spark.sql("""SELECT date, delay, origin, destination
FROM us_delay_flights_tbl
WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD'
ORDER BY delay DESC""").show(10)

+-------+-----+------+-----------+
|   date|delay|origin|destination|
+-------+-----+------+-----------+
|2190925| 1638|   SFO|        ORD|
|1031755|  396|   SFO|        ORD|
|1022330|  326|   SFO|        ORD|
|1051205|  320|   SFO|        ORD|
|1190925|  297|   SFO|        ORD|
|2171115|  296|   SFO|        ORD|
|1071040|  279|   SFO|        ORD|
|1051550|  274|   SFO|        ORD|
|3120730|  266|   SFO|        ORD|
|1261104|  258|   SFO|        ORD|
+-------+-----+------+-----------+
only showing top 10 rows



In [52]:
# As an exercise, convert the date column into a readable format. 
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Define a UDF to convert the date format into a legible format.(UDF = user defined function)

def to_date_format_udf(d_str):
  l = [char for char in d_str]
  return "".join(l[0:2]) + "/" +  "".join(l[2:4]) + " " + " " +"".join(l[4:6]) + ":" + "".join(l[6:])

to_date_format_udf("02190925")

'02/19  09:25'

In [53]:
# Register the UDF

spark.udf.register("to_date_format_udf", to_date_format_udf, StringType())

<function __main__.to_date_format_udf(d_str)>

In [54]:
# Read US departure flight data

df = (spark.read.format("csv")
      .schema("date STRING, delay INT, distance INT, origin STRING, destination STRING")
      .option("header", "true")
      .option("path", "C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv")
      .load())

display(df)

DataFrame[date: string, delay: int, distance: int, origin: string, destination: string]

In [55]:
# Test the UDF

df.selectExpr("to_date_format_udf(date) as data_format").show(10, truncate=False)

+------------+
|data_format |
+------------+
|01/01  12:45|
|01/02  06:00|
|01/02  12:45|
|01/02  06:05|
|01/03  12:45|
|01/03  06:05|
|01/04  12:43|
|01/04  06:05|
|01/05  12:45|
|01/05  06:05|
+------------+
only showing top 10 rows



In [56]:
# Create a temporary view to which we can issue SQL queries

df.createOrReplaceTempView("us_delay_flights_tbl")

In [57]:
# Cache Table so queries are expedient
spark.sql("CACHE TABLE us_delay_flights_tbl")

DataFrame[]

In [58]:
spark.sql("SELECT *, date, to_date_format_udf(date) AS date_fm FROM us_delay_flights_tbl").show(10, truncate=False)

+--------+-----+--------+------+-----------+--------+------------+
|date    |delay|distance|origin|destination|date    |date_fm     |
+--------+-----+--------+------+-----------+--------+------------+
|01011245|6    |602     |ABE   |ATL        |01011245|01/01  12:45|
|01020600|-8   |369     |ABE   |DTW        |01020600|01/02  06:00|
|01021245|-2   |602     |ABE   |ATL        |01021245|01/02  12:45|
|01020605|-4   |602     |ABE   |ATL        |01020605|01/02  06:05|
|01031245|-4   |602     |ABE   |ATL        |01031245|01/03  12:45|
|01030605|0    |602     |ABE   |ATL        |01030605|01/03  06:05|
|01041243|10   |602     |ABE   |ATL        |01041243|01/04  12:43|
|01040605|28   |602     |ABE   |ATL        |01040605|01/04  06:05|
|01051245|88   |602     |ABE   |ATL        |01051245|01/05  12:45|
|01050605|9    |602     |ABE   |ATL        |01050605|01/05  06:05|
+--------+-----+--------+------+-----------+--------+------------+
only showing top 10 rows



In [60]:
spark.sql("CREATE TEMPORARY VIEW delay_temp2 AS (SELECT *, date, to_date_format_udf(date) AS date_fm FROM us_delay_flights_tbl WHERE date IS NOT NULL)")

DataFrame[]

In [61]:
spark.sql("SELECT * FROM delay_temp2").show(10, truncate=False)

+--------+-----+--------+------+-----------+--------+------------+
|date    |delay|distance|origin|destination|date    |date_fm     |
+--------+-----+--------+------+-----------+--------+------------+
|01011245|6    |602     |ABE   |ATL        |01011245|01/01  12:45|
|01020600|-8   |369     |ABE   |DTW        |01020600|01/02  06:00|
|01021245|-2   |602     |ABE   |ATL        |01021245|01/02  12:45|
|01020605|-4   |602     |ABE   |ATL        |01020605|01/02  06:05|
|01031245|-4   |602     |ABE   |ATL        |01031245|01/03  12:45|
|01030605|0    |602     |ABE   |ATL        |01030605|01/03  06:05|
|01041243|10   |602     |ABE   |ATL        |01041243|01/04  12:43|
|01040605|28   |602     |ABE   |ATL        |01040605|01/04  06:05|
|01051245|88   |602     |ABE   |ATL        |01051245|01/05  12:45|
|01050605|9    |602     |ABE   |ATL        |01050605|01/05  06:05|
+--------+-----+--------+------+-----------+--------+------------+
only showing top 10 rows



In [62]:
spark.sql("SELECT delay, origin, destination, date, MONTH(date_fm) AS Month FROM delay_temp2").show(10)

+-----+------+-----------+--------+-----+
|delay|origin|destination|    date|Month|
+-----+------+-----------+--------+-----+
|    6|   ABE|        ATL|01011245| null|
|   -8|   ABE|        DTW|01020600| null|
|   -2|   ABE|        ATL|01021245| null|
|   -4|   ABE|        ATL|01020605| null|
|   -4|   ABE|        ATL|01031245| null|
|    0|   ABE|        ATL|01030605| null|
|   10|   ABE|        ATL|01041243| null|
|   28|   ABE|        ATL|01040605| null|
|   88|   ABE|        ATL|01051245| null|
|    9|   ABE|        ATL|01050605| null|
+-----+------+-----------+--------+-----+
only showing top 10 rows



In [63]:
# we want to label all US flights with an indication of the delays: Very Long Delays, Long Delays...

spark.sql("""SELECT delay, origin, destination,
 CASE
 WHEN delay > 360 THEN 'Very Long Delays'
 WHEN delay > 120 AND delay < 360 THEN 'Long Delays'
 WHEN delay > 60 AND delay < 120 THEN 'Short Delays'
 WHEN delay > 0 and delay < 60 THEN 'Tolerable Delays'
 WHEN delay = 0 THEN 'No Delays'
 ELSE 'Early'
 END AS Flight_Delays
 FROM us_delay_flights_tbl
 ORDER BY origin, delay DESC""").show(10)

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



In [64]:
# All SQL queries can be expressed with an equivalent Data‐Frame API query

from pyspark.sql.functions import col, desc

(df.select("distance", "origin", "destination")
 .where(col("distance") > 1000)
 .orderBy(desc("distance"))).show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [65]:
from pyspark.sql.functions import *

(df.select("date", "delay", "origin", "destination")
 .where(col("delay") > 120)
 .where(col("origin") == "SFO")
 .where(col("destination") == "ORD")
 .orderBy(desc("delay"))).show(10)

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|02190925| 1638|   SFO|        ORD|
|01031755|  396|   SFO|        ORD|
|01022330|  326|   SFO|        ORD|
|01051205|  320|   SFO|        ORD|
|01190925|  297|   SFO|        ORD|
|02171115|  296|   SFO|        ORD|
|01071040|  279|   SFO|        ORD|
|01051550|  274|   SFO|        ORD|
|03120730|  266|   SFO|        ORD|
|01261104|  258|   SFO|        ORD|
+--------+-----+------+-----------+
only showing top 10 rows



In [66]:
from pyspark.sql.functions import *

df.withColumn("Flight_Delays",
                    when(col("delay") > 360, "Very long delays")
                    .when((df.delay > 120) & (df.delay < 360), "Long delays")
                    .when((df.delay > 60) & (df.delay < 120), "Short delays")
                    .when((df.delay > 0) & (df.delay < 60), "Tolerable delays")
                    .when((df.delay > 120) & (df.delay < 360), "Long delays")
                    .when(df.delay == 0, "No delays")
                    .otherwise("Early")
                   ).show(10)


+--------+-----+--------+------+-----------+----------------+
|    date|delay|distance|origin|destination|   Flight_Delays|
+--------+-----+--------+------+-----------+----------------+
|01011245|    6|     602|   ABE|        ATL|Tolerable delays|
|01020600|   -8|     369|   ABE|        DTW|           Early|
|01021245|   -2|     602|   ABE|        ATL|           Early|
|01020605|   -4|     602|   ABE|        ATL|           Early|
|01031245|   -4|     602|   ABE|        ATL|           Early|
|01030605|    0|     602|   ABE|        ATL|       No delays|
|01041243|   10|     602|   ABE|        ATL|Tolerable delays|
|01040605|   28|     602|   ABE|        ATL|Tolerable delays|
|01051245|   88|     602|   ABE|        ATL|    Short delays|
|01050605|    9|     602|   ABE|        ATL|Tolerable delays|
+--------+-----+--------+------+-----------+----------------+
only showing top 10 rows



##### SQL Tables and Views

In [35]:
spark.sql("CREATE DATABASE learn_spark_db")
spark.sql("USE learn_spark_db")

DataFrame[]

In [36]:
# Create a managed table

spark.sql("CREATE TABLE managed_us_delay_flights_tbl (date STRING, delay INT, distance INT, origin STRING, destination STRING)")

DataFrame[]

You can do the same thing using the DataFrame API like this:

#In Python
#Path to US flight delays CSV file 

csv_file = "C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv")
     
#Schema as defined in the preceding example
schema="date STRING, delay INT, distance INT, origin STRING, destination STRING"
flights_df = spark.read.csv(csv_file, schema=schema)
flights_df.write.saveAsTable("managed_us_delay_flights_tbl")

Create a unmanaged table
You can create unmanaged tables from your own data sources—say, Par‐quet, CSV, or JSON files stored in a file store accessible to your Spark application.

In [37]:
#Create an unmanaged table

spark.sql("""CREATE TABLE us_delay_flights_tbl(date STRING, delay INT, 
 distance INT, origin STRING, destination STRING) 
 USING csv OPTIONS (PATH 
 'C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv')""")


DataFrame[]

#Within the DataFrame API use:

(flights_df
 .write
 .option("path", "/tmp/data/us_flights_delay")
 .saveAsTable("us_delay_flights_tbl"))

In [39]:
# Create a global temporary view with SQL

spark.sql("""CREATE OR REPLACE GLOBAL TEMP VIEW us_origin_airport_SFO_global_tmp_view AS
 SELECT date, delay, origin, destination from us_delay_flights_tbl WHERE
 origin = 'SFO';""")

DataFrame[]

In [40]:
spark.sql("""CREATE OR REPLACE TEMP VIEW us_origin_airport_JFK_tmp_view AS
 SELECT date, delay, origin, destination from us_delay_flights_tbl WHERE
 origin = 'JFK'""")

DataFrame[]

In [42]:
# You can accomplish the same thing with the DataFrame API as follows:

df_sfo = spark.sql("SELECT date, delay, origin, destination FROM us_delay_flights_tbl WHERE origin = 'SFO'")
df_jfk = spark.sql("SELECT date, delay, origin, destination FROM us_delay_flights_tbl WHERE origin = 'JFK'")

# Create a temporary and global temporary view
df_sfo.createOrReplaceGlobalTempView("us_origin_airport_SFO_global_tmp_view")
df_jfk.createOrReplaceTempView("us_origin_airport_JFK_tmp_view")

In [43]:
# When accessing a global temporary view you must use the prefix global_temp.<view_name>

spark.sql("""SELECT * FROM global_temp.us_origin_airport_SFO_global_tmp_view""")

DataFrame[date: string, delay: int, origin: string, destination: string]

In [44]:
# You can access the normal temporary view without the global_temp prefix

spark.sql("""SELECT * FROM us_origin_airport_JFK_tmp_view""")

DataFrame[date: string, delay: int, origin: string, destination: string]

In [45]:
# You can simply use SQL to query the table and assign the returned result to a DataFrame:

us_flights_df = spark.sql("SELECT * FROM us_delay_flights_tbl")
us_flights_df2 = spark.table("us_delay_flights_tbl")

Spark manages the metadata associated with each managed or unmanaged table. This is captured in the Catalog, a high-level abstraction in Spark SQL for storing metadata. 

In [28]:
# You can access all the stored metadata

spark.catalog.listDatabases()

[Database(name='default', description='Default Hive database', locationUri='file:/C:/Windows/system32/spark-warehouse'),
 Database(name='learn_spark_db', description='', locationUri='file:/C:/Windows/system32/spark-warehouse/learn_spark_db.db')]

In [29]:
spark.catalog.listTables()

[Table(name='firecalls', database='default', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='delay_temp', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='us_delay_flights_tbl', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

DataFrameWriter 
Saves or writes data to a specified built-in data source. Unlike with DataFrameReader, you access its instance not from a 
SparkSession but from the DataFrame you wish to save.

PARQUET is an open source columnar file format, it's the default data source in Spark. 

In [31]:
# Write a DataFrame to Parquet file

(df.write.format("parquet")
 .mode("overwrite")
 .option("compression", "snappy")
 .save("/tmp/data/parquet/df_parquet"))

In [32]:
# Write DataFrame to Spark SQL table

(df.write
 .mode("overwrite")
 .saveAsTable("us_delay_flights_tbl"))

In [34]:
# Reading a CSV file into a DataFrame

file = "C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*"
schema = "DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count INT"
df = (spark.read.format("csv")
 .option("header", "true")
 .schema(schema)
 .option("mode", "FAILFAST") # Exit if any errors
 .option("nullValue", "") # Replace any null data field with quotes
 .load(file))

In [36]:
# Create a SQL table from a CSV data source (it is no different from Parquet or JSON)

spark.sql("""CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
 USING csv
 OPTIONS (
 path "C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*",
 header "true",
 inferSchema "true",
 mode "FAILFAST"
 )"""
)

DataFrame[]

In [37]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show(10)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|    United States|   Marshall Islands|   44|
+-----------------+-------------------+-----+
only showing top 10 rows



In [38]:
# Save a DataFrame as a CSV file

df.write.format("csv").mode("overwrite").save("/tmp/data/csv/df_csv")

In [40]:
# Read an ORC file into a DataFrame

file = "C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/orc/*"
df = spark.read.format("orc").option("path", file).load()
df.show(10, False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [41]:
# Read an ORC file into a Spark SQL table

spark.sql("""CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
 USING orc
 OPTIONS (
 path "C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/orc/*",
 header "true",
 inferSchema "true",
 mode "FAILFAST"
 )"""
)

spark.sql("SELECT * FROM us_delay_flights_tbl").show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [42]:
# Write a DataFrame to ORC file

(df.write.format("orc")
 .mode("overwrite")
 .option("compression", "snappy")
 .save("/tmp/data/orc/flights_orc"))

In [43]:
# Reading an image file into a DataFrame

from pyspark.ml import image

image_dir = "C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/cctvVideos/train_images/"
images_df = spark.read.format("image").load(image_dir)
images_df.printSchema()


root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: integer (nullable = true)



In [44]:
images_df.select("image.height", "image.width", "image.nChannels", "image.mode",
 "label").show(5, truncate=False)

+------+-----+---------+----+-----+
|height|width|nChannels|mode|label|
+------+-----+---------+----+-----+
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |1    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
+------+-----+---------+----+-----+
only showing top 5 rows



The DataFrameReader converts each binary file into a single DataFrame row (record) that contains the raw content and metadata of the file.

In [45]:
# Read a binary file into a DataFrame

path = "C:/Users/alice.marchi/Downloads/LearningSparkV2-master/databricks-datasets/learning-spark-v2/cctvVideos/train_images/"
binary_files_df = (spark.read.format("binaryFile")
 .option("pathGlobFilter", "*.jpg")
 .load(path))
binary_files_df.show(5)

+--------------------+--------------------+------+--------------------+-----+
|                path|    modificationTime|length|             content|label|
+--------------------+--------------------+------+--------------------+-----+
|file:/C:/Users/al...|2022-04-26 18:34:...| 55037|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/al...|2022-04-26 18:34:...| 54634|[FF D8 FF E0 00 1...|    1|
|file:/C:/Users/al...|2022-04-26 18:34:...| 54624|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/al...|2022-04-26 18:34:...| 54505|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/al...|2022-04-26 18:34:...| 54475|[FF D8 FF E0 00 1...|    0|
+--------------------+--------------------+------+--------------------+-----+
only showing top 5 rows



In [46]:
binary_files_df = (spark.read.format("binaryFile")
 .option("pathGlobFilter", "*.jpg")
 .option("recursiveFileLookup", "true")
 .load(path))
binary_files_df.show(5)

+--------------------+--------------------+------+--------------------+
|                path|    modificationTime|length|             content|
+--------------------+--------------------+------+--------------------+
|file:/C:/Users/al...|2022-04-26 18:34:...| 55037|[FF D8 FF E0 00 1...|
|file:/C:/Users/al...|2022-04-26 18:34:...| 54634|[FF D8 FF E0 00 1...|
|file:/C:/Users/al...|2022-04-26 18:34:...| 54624|[FF D8 FF E0 00 1...|
|file:/C:/Users/al...|2022-04-26 18:34:...| 54505|[FF D8 FF E0 00 1...|
|file:/C:/Users/al...|2022-04-26 18:34:...| 54475|[FF D8 FF E0 00 1...|
+--------------------+--------------------+------+--------------------+
only showing top 5 rows



The binary file data source does not support writing a DataFrame back to the original file format.

Leer los AVRO, Parquet, JSON y CSV escritos en el cap.3

In [68]:
file = "C:/tmp/output/fire_csv/test-fire.csv/*"
df = spark.read.format("csv").option("path", file).load()
df.show(5, False)

+--------+---+-------+----------------+----------+----------+-----+----------------------+---------------------------+---+-----+----+----+----+----+----+----+-----+----+----+------+----+----+----+---------------------+-------------------------------------+-------------+---------+
|_c0     |_c1|_c2    |_c3             |_c4       |_c5       |_c6  |_c7                   |_c8                        |_c9|_c10 |_c11|_c12|_c13|_c14|_c15|_c16|_c17 |_c18|_c19|_c20  |_c21|_c22|_c23|_c24                 |_c25                                 |_c26         |_c27     |
+--------+---+-------+----------------+----------+----------+-----+----------------------+---------------------------+---+-----+----+----+----+----+----+----+-----+----+----+------+----+----+----+---------------------+-------------------------------------+-------------+---------+
|20110016|T13|2003235|Structure Fire  |01/11/2002|01/10/2002|Other|01/11/2002 01:51:44 AM|2000 Block of CALIFORNIA ST|SF |94109|B04 |38  |3362|3   |3   |3   

In [69]:
file = "C:/tmp/output/fire_json/*"
df = spark.read.format("json").option("path", file).load()
df.show(5, False)

+-------+---------------------------+----------------------+---------+----+----------+--------------------+----------+----------------+-------------+----+---------+-------------+----------------------+--------------+-------------------------------------+---------------------+---------+----------------+--------+-------------+-----------+------------------+------+--------------------------+--------+----------+-------+
|ALSUnit|Address                    |AvailableDtTm         |Battalion|Box |CallDate  |CallFinalDisposition|CallNumber|CallType        |CallTypeGroup|City|Delay    |FinalPriority|FirePreventionDistrict|IncidentNumber|Location                             |Neighborhood         |NumAlarms|OriginalPriority|Priority|RowID        |StationArea|SupervisorDistrict|UnitID|UnitSequenceInCallDispatch|UnitType|WatchDate |Zipcode|
+-------+---------------------------+----------------------+---------+----+----------+--------------------+----------+----------------+-------------+----+------

In [70]:
file = "C:/tmp/output/fire_parquet/*"
df = spark.read.format("parquet").option("path", file).load()
df.show(5, False)

+----------+------+--------------+----------------+----------+----------+--------------------+----------------------+------------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+------------------+-------------------------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|CallType        |CallDate  |WatchDate |CallFinalDisposition|AvailableDtTm         |Address                 |City|Zipcode|Battalion|StationArea|Box |OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|Neighborhood      |Location                             |RowID        |Delay    |
+----------+------+--------------+----------------+----------+----------+--------------------+----------------------+------------------------+----+-------+---------+-------