<H2 align = "Center"> Spark Dataframe & Spark SQL </H2>

#### Answering questions regarding Order Dataset using both Spark Dataframe and Spark SQL
<BR>

In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()

In [2]:
spark = SparkSession. \
builder. \
config('spark.ui.port', '0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [3]:
orders_df = spark.read \
.format("csv") \
.option("header", "true") \
.option("inferSchema","true") \
.load("/public/trendytech/orders_wh/*")

In [4]:
orders_df.createOrReplaceTempView("Orders")

In [5]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [6]:
orders_df.show(truncate = False)

+--------+---------------------+-----------+---------------+
|order_id|order_date           |customer_id|order_status   |
+--------+---------------------+-----------+---------------+
|1       |2013-07-25 00:00:00.0|11599      |CLOSED         |
|2       |2013-07-25 00:00:00.0|256        |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111      |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827       |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318      |COMPLETE       |
|6       |2013-07-25 00:00:00.0|7130       |COMPLETE       |
|7       |2013-07-25 00:00:00.0|4530       |COMPLETE       |
|8       |2013-07-25 00:00:00.0|2911       |PROCESSING     |
|9       |2013-07-25 00:00:00.0|5657       |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00.0|5648       |PENDING_PAYMENT|
|11      |2013-07-25 00:00:00.0|918        |PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00.0|1837       |CLOSED         |
|13      |2013-07-25 00:00:00.0|9149       |PENDING_PAYMENT|
|14      |2013-07-25 00:

In [12]:
spark.sql("Select * from Orders")

order_id,order_date,customer_id,order_status
1,2013-07-25 00:00:...,11599,CLOSED
2,2013-07-25 00:00:...,256,PENDING_PAYMENT
3,2013-07-25 00:00:...,12111,COMPLETE
4,2013-07-25 00:00:...,8827,CLOSED
5,2013-07-25 00:00:...,11318,COMPLETE
6,2013-07-25 00:00:...,7130,COMPLETE
7,2013-07-25 00:00:...,4530,COMPLETE
8,2013-07-25 00:00:...,2911,PROCESSING
9,2013-07-25 00:00:...,5657,PENDING_PAYMENT
10,2013-07-25 00:00:...,5648,PENDING_PAYMENT


#### 1. Top 15 customers who placed most number of orders

In [13]:
orders_df.groupBy("customer_id").count().sort("count",ascending = False).limit(15)

customer_id,count
5897,16
12431,16
569,16
6316,16
12284,15
4320,15
5624,15
5283,15
221,15
5654,15


In [20]:
spark.sql("SELECT customer_id, count(1) \
            FROM Orders \
            GROUP BY customer_id \
            ORDER BY count(1) desc \
            LIMIT 15")

customer_id,count(1)
6316,16
12431,16
5897,16
569,16
4320,15
221,15
12284,15
5283,15
5654,15
5624,15


#### 2. Find the number of orders under each order status

In [24]:
orders_df.groupBy("order_status").count().sort("count",ascending = False).show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|       COMPLETE|22899|
|PENDING_PAYMENT|15030|
|     PROCESSING| 8275|
|        PENDING| 7610|
|         CLOSED| 7556|
|        ON_HOLD| 3798|
|SUSPECTED_FRAUD| 1558|
|       CANCELED| 1428|
| PAYMENT_REVIEW|  729|
+---------------+-----+



In [28]:
spark.sql("SELECT order_status, count(1) \
           FROM Orders \
           GROUP BY order_status \
           ORDER BY count(1) DESC \
          ")

order_status,count(1)
COMPLETE,22899
PENDING_PAYMENT,15030
PROCESSING,8275
PENDING,7610
CLOSED,7556
ON_HOLD,3798
SUSPECTED_FRAUD,1558
CANCELED,1428
PAYMENT_REVIEW,729


#### 3. Number of active customers(customers who have placed atleast one order)

In [38]:
orders_df.select("customer_id").distinct().count()

12405

In [34]:
spark.sql("SELECT count(distinct customer_id) FROM Orders")

count(DISTINCT customer_id)
12405


#### 4. Customers with most number of closed orders

In [41]:
orders_df.filter("order_status = 'CLOSED'").groupBy("customer_id").count().sort("count",ascending = False)

customer_id,count
1833,6
1363,5
1687,5
5493,5
7948,4
2768,4
10263,4
7850,4
2403,4
437,4


In [43]:
spark.sql("SELECT customer_id, count(1) \
           FROM Orders \
           WHERE order_status = 'CLOSED' \
           GROUP BY 1 \
           ORDER BY 2 DESC\
          ")

customer_id,count(1)
1833,6
1687,5
1363,5
5493,5
7948,4
2768,4
10263,4
2236,4
2403,4
437,4
