<H1 align="Center"> Spark Dataframe Basics </H1>
This notebook shows how to create a Spark DataFrame from a CSV, JSON, or Parquet file, and also from a Spark temporary view (temp table).
<BR><BR>

In [1]:
from pyspark.sql import SparkSession

In [2]:
import getpass
# Login name of the user running this kernel; interpolated into the
# per-user warehouse directory when the Spark session is configured.
username = getpass.getuser()

In [3]:
# Build (or reuse) the Spark session for this notebook.
#   * spark.ui.port = 0        -> let Spark bind the web UI to any free port
#   * spark.sql.warehouse.dir  -> per-user warehouse directory under /user/<username>
#   * Hive support enabled, YARN as the cluster manager
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [13]:
# Read every file matched by the orders_wh/* glob as CSV.
# "header" takes the column names from the first line; "inferSchema" derives
# the column types from the data, which costs an extra pass over the input.
orders_df = (
    spark.read
    .format("csv")
    .option("header", "True")
    .option("inferSchema", "True")
    .load("/public/trendytech/orders_wh/*")
)

In [14]:
# Print the leading rows (show() defaults to 20) without shortening long values.
orders_df.show(truncate=False)

+--------+---------------------+-----------+---------------+
|order_id|order_date           |customer_id|order_status   |
+--------+---------------------+-----------+---------------+
|1       |2013-07-25 00:00:00.0|11599      |CLOSED         |
|2       |2013-07-25 00:00:00.0|256        |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111      |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827       |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318      |COMPLETE       |
|6       |2013-07-25 00:00:00.0|7130       |COMPLETE       |
|7       |2013-07-25 00:00:00.0|4530       |COMPLETE       |
|8       |2013-07-25 00:00:00.0|2911       |PROCESSING     |
|9       |2013-07-25 00:00:00.0|5657       |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00.0|5648       |PENDING_PAYMENT|
|11      |2013-07-25 00:00:00.0|918        |PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00.0|1837       |CLOSED         |
|13      |2013-07-25 00:00:00.0|9149       |PENDING_PAYMENT|
|14      |2013-07-25 00:

In [27]:
# Load orders from a JSON file; no options are passed, so the schema is
# inferred from the records themselves.
json_df = spark.read.json("/public/trendytech/datasets/orders.json")

In [41]:
# Print the leading rows of the JSON-backed DataFrame with full-width values.
json_df.show(truncate=False)

+-----------+---------------------+--------+---------------+
|customer_id|order_date           |order_id|order_status   |
+-----------+---------------------+--------+---------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED         |
|256        |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111      |2013-07-25 00:00:00.0|3       |COMPLETE       |
|8827       |2013-07-25 00:00:00.0|4       |CLOSED         |
|11318      |2013-07-25 00:00:00.0|5       |COMPLETE       |
|7130       |2013-07-25 00:00:00.0|6       |COMPLETE       |
|4530       |2013-07-25 00:00:00.0|7       |COMPLETE       |
|2911       |2013-07-25 00:00:00.0|8       |PROCESSING     |
|5657       |2013-07-25 00:00:00.0|9       |PENDING_PAYMENT|
|5648       |2013-07-25 00:00:00.0|10      |PENDING_PAYMENT|
|918        |2013-07-25 00:00:00.0|11      |PAYMENT_REVIEW |
|1837       |2013-07-25 00:00:00.0|12      |CLOSED         |
|9149       |2013-07-25 00:00:00.0|13      |PENDING_PAYMENT|
|9842       |2013-07-25 

In [42]:
# Load orders from Parquet. Parquet stores the schema with the data, so no
# header / inferSchema options are needed here.
parquet_ds = spark.read.parquet("/public/trendytech/datasets/ordersparquet")

In [44]:
# Print the leading rows of the Parquet-backed DataFrame with full-width values.
parquet_ds.show(truncate=False)

+-----------+---------------------+--------+---------------+
|customer_id|order_date           |order_id|order_status   |
+-----------+---------------------+--------+---------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED         |
|256        |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111      |2013-07-25 00:00:00.0|3       |COMPLETE       |
|8827       |2013-07-25 00:00:00.0|4       |CLOSED         |
|11318      |2013-07-25 00:00:00.0|5       |COMPLETE       |
|7130       |2013-07-25 00:00:00.0|6       |COMPLETE       |
|4530       |2013-07-25 00:00:00.0|7       |COMPLETE       |
|2911       |2013-07-25 00:00:00.0|8       |PROCESSING     |
|5657       |2013-07-25 00:00:00.0|9       |PENDING_PAYMENT|
|5648       |2013-07-25 00:00:00.0|10      |PENDING_PAYMENT|
|918        |2013-07-25 00:00:00.0|11      |PAYMENT_REVIEW |
|1837       |2013-07-25 00:00:00.0|12      |CLOSED         |
|9149       |2013-07-25 00:00:00.0|13      |PENDING_PAYMENT|
|9842       |2013-07-25 

In [31]:
# Register the Parquet-backed DataFrame under the name "Orders" so it can be
# queried with Spark SQL; an existing temp view of that name is replaced.
parquet_ds.createOrReplaceTempView("Orders")

In [34]:
# Query the "Orders" temp view with SQL, keeping only CLOSED orders.
# The result is a DataFrame left as the cell's last expression, so the
# notebook renders it.
# NOTE(review): the rows shown in the output suggest eager evaluation is
# enabled in this environment; if it is not, call .show() on the result.
spark.sql("Select * from Orders WHERE order_status = 'CLOSED'")

customer_id,order_date,order_id,order_status
11599,2013-07-25 00:00:...,1,CLOSED
8827,2013-07-25 00:00:...,4,CLOSED
1837,2013-07-25 00:00:...,12,CLOSED
1205,2013-07-25 00:00:...,18,CLOSED
11441,2013-07-25 00:00:...,24,CLOSED
9503,2013-07-25 00:00:...,25,CLOSED
5863,2013-07-25 00:00:...,37,CLOSED
12271,2013-07-25 00:00:...,51,CLOSED
7073,2013-07-25 00:00:...,57,CLOSED
4791,2013-07-25 00:00:...,61,CLOSED


In [39]:
# Turn the "Orders" temp view back into a DataFrame by name.
table_ds = spark.read.table("Orders")

In [40]:
# Print the leading rows with show()'s defaults: long values (e.g. order_date)
# are cut short and end in "...", unlike the truncate = False calls above.
table_ds.show()

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
|       7130|2013-07-25 00:00:...|       6|       COMPLETE|
|       4530|2013-07-25 00:00:...|       7|       COMPLETE|
|       2911|2013-07-25 00:00:...|       8|     PROCESSING|
|       5657|2013-07-25 00:00:...|       9|PENDING_PAYMENT|
|       5648|2013-07-25 00:00:...|      10|PENDING_PAYMENT|
|        918|2013-07-25 00:00:...|      11| PAYMENT_REVIEW|
|       1837|2013-07-25 00:00:...|      12|         CLOSED|
|       9149|2013-07-25 00:00:...|      13|PENDING_PAYMENT|
|       9842|2013-07-25 00:00:...|      