<a href="https://colab.research.google.com/github/jagatabhay/pysparktest/blob/main/FHDAL1Coding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/89/db/e18cfd78e408de957821ec5ca56de1250645b05f8523d169803d8df35a64/pyspark-3.1.2.tar.gz (212.4MB)
[K     |████████████████████████████████| 212.4MB 64kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 19.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=34ceaf8443694d92a44155099a8228953a7288aad0f36f5f988c7de697951586
  Stored in directory: /root/.cache/pip/wheels/40/1b/2c/30f43be2627857ab80062bef1527c0128f7b4070b6b2d02139
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, isnan, count, col, when, year, month, sum as _sum, create_map, lit
import calendar
from itertools import chain

In [None]:
spark = SparkSession.builder.appName('FHDAL1Code').getOrCreate()

In [None]:
spark

In [None]:
df = spark.read.csv('drive/MyDrive/FoodHubDA/orders_test.csv',header=True,inferSchema=True)
df_customer = spark.read.csv('drive/MyDrive/FoodHubDA/customer_test.csv',header=True,inferSchema=True)
df_store = spark.read.csv('drive/MyDrive/FoodHubDA/store_test.csv',header=True,inferSchema=True)

Customer Dataframe Schema

In [None]:
df_customer.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)



Order Dataframe Schema

In [None]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- total: double (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- order_date: string (nullable = true)



Store dataframe schema

In [None]:
df_store.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)



In [None]:
df_customer.columns

['id', 'first_name', 'last_name', 'email']

In [None]:
df.columns

['id', 'total', 'customer_id', 'store_id', 'order_date']

In [None]:
df_store.columns

['id', 'name', 'address']

First 5 rows of order dataframe

In [None]:
df.show(5)

+---+-----+-----------+--------+----------+
| id|total|customer_id|store_id|order_date|
+---+-----+-----------+--------+----------+
|  1|19.36|         21|       1|2020-03-03|
|  2| 8.85|         88|       8|2020-04-02|
|  3| 5.53|         41|       3|2020-03-03|
|  4| 12.9|         96|       8|2020-03-15|
|  5| 8.19|         25|       7|2020-01-21|
+---+-----+-----------+--------+----------+
only showing top 5 rows



First 5 rows of Customer Dataframe

In [None]:
df_customer.show(5)

+---+----------+---------+--------------------+
| id|first_name|last_name|               email|
+---+----------+---------+--------------------+
|  1|      Sara|  Ramirez|samantha67@yahoo.com|
|  2|    Joshua|  Jimenez|richardtimothy@ho...|
|  3|    Nicole|  Navarro|nicholsonwilliam@...|
|  4|      John| Anderson|jenniferhowell@ya...|
|  5|Alexandria| Alvarado|sjohnston@young-b...|
+---+----------+---------+--------------------+
only showing top 5 rows



first 5 rows of store dataframe

In [None]:
df_store.show(5)

+-------------+------------+------------------+
|           id|        name|           address|
+-------------+------------+------------------+
|            1|  Valdez Inc|18321 Joseph Lodge|
|Christineland|   NH 69026"|              null|
|            2|Stevens-Barr|Unit 0902 Box 4445|
|DPO AE 19637"|        null|              null|
|            3|  Taylor Ltd|  3433 Hill Forest|
+-------------+------------+------------------+
only showing top 5 rows



Count Of Both Null and Missing values of Order Dataframe

In [None]:
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+-----+-----------+--------+----------+
| id|total|customer_id|store_id|order_date|
+---+-----+-----------+--------+----------+
|  0|    0|          0|       0|         0|
+---+-----+-----------+--------+----------+



Count Of Both Null and Missing values of Customer Dataframe

In [None]:
df_customer.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_customer.columns]).show()

+---+----------+---------+-----+
| id|first_name|last_name|email|
+---+----------+---------+-----+
|  0|         0|        0|    0|
+---+----------+---------+-----+



Count Of Both Null and Missing values of Store Dataframe

In [None]:
df_store.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_store.columns]).show()

+---+----+-------+
| id|name|address|
+---+----+-------+
|  0|   2|     10|
+---+----+-------+



There is no null or missing values in all dataframe



Conversion of Order Date Column to get the Month name

In [None]:
df = df.withColumn('order_date_1',to_date("order_date"))
df = df.withColumn('YEAR',year("order_date_1"))
df = df.withColumn('INTMONTH',month("order_date_1"))

In [None]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- total: double (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_date_1: date (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- INTMONTH: integer (nullable = true)



In [None]:
df.show(5)

+---+-----+-----------+--------+----------+------------+----+--------+
| id|total|customer_id|store_id|order_date|order_date_1|YEAR|INTMONTH|
+---+-----+-----------+--------+----------+------------+----+--------+
|  1|19.36|         21|       1|2020-03-03|  2020-03-03|2020|       3|
|  2| 8.85|         88|       8|2020-04-02|  2020-04-02|2020|       4|
|  3| 5.53|         41|       3|2020-03-03|  2020-03-03|2020|       3|
|  4| 12.9|         96|       8|2020-03-15|  2020-03-15|2020|       3|
|  5| 8.19|         25|       7|2020-01-21|  2020-01-21|2020|       1|
+---+-----+-----------+--------+----------+------------+----+--------+
only showing top 5 rows



In [None]:
monthDict = dict((index,month) \
            for index, month in enumerate(calendar.month_name) \
            if month)

for _ in monthDict.items():
    print(_)

(1, 'January')
(2, 'February')
(3, 'March')
(4, 'April')
(5, 'May')
(6, 'June')
(7, 'July')
(8, 'August')
(9, 'September')
(10, 'October')
(11, 'November')
(12, 'December')


In [None]:
mapping = create_map([lit(x) for x in chain(*monthDict.items())])
df = df.withColumn('MONTH',mapping[df['INTMONTH']])

In [None]:
df.show(5)

+---+-----+-----------+--------+----------+------------+----+--------+-------+
| id|total|customer_id|store_id|order_date|order_date_1|YEAR|INTMONTH|  MONTH|
+---+-----+-----------+--------+----------+------------+----+--------+-------+
|  1|19.36|         21|       1|2020-03-03|  2020-03-03|2020|       3|  March|
|  2| 8.85|         88|       8|2020-04-02|  2020-04-02|2020|       4|  April|
|  3| 5.53|         41|       3|2020-03-03|  2020-03-03|2020|       3|  March|
|  4| 12.9|         96|       8|2020-03-15|  2020-03-15|2020|       3|  March|
|  5| 8.19|         25|       7|2020-01-21|  2020-01-21|2020|       1|January|
+---+-----+-----------+--------+----------+------------+----+--------+-------+
only showing top 5 rows



In [None]:
df = df.select(['id','total','customer_id','store_id','YEAR','MONTH'])
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- total: double (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- MONTH: string (nullable = true)



Final Dataframe, required for solution of Question1

In [None]:
df.show(5)

+---+-----+-----------+--------+----+-------+
| id|total|customer_id|store_id|YEAR|  MONTH|
+---+-----+-----------+--------+----+-------+
|  1|19.36|         21|       1|2020|  March|
|  2| 8.85|         88|       8|2020|  April|
|  3| 5.53|         41|       3|2020|  March|
|  4| 12.9|         96|       8|2020|  March|
|  5| 8.19|         25|       7|2020|January|
+---+-----+-----------+--------+----+-------+
only showing top 5 rows



Aggregate table showingthe total orders and revenueeach store had each month

In [None]:
df.groupBy(['YEAR','MONTH','store_id'])\
       .agg(count('total'),_sum('total'))\
       .withColumnRenamed('count(total)','NumberOfOrders')\
       .withColumnRenamed('sum(total)','TotalRevenue')\
       .orderBy(['YEAR','MONTH','store_id'])\
       .show()

+----+--------+--------+--------------+------------------+
|YEAR|   MONTH|store_id|NumberOfOrders|      TotalRevenue|
+----+--------+--------+--------------+------------------+
|2020|   April|       1|            26|406.60999999999996|
|2020|   April|       2|            24|            343.31|
|2020|   April|       3|            24| 393.1499999999999|
|2020|   April|       4|            27|            354.82|
|2020|   April|       5|            30| 416.1200000000001|
|2020|   April|       6|            20|391.17999999999995|
|2020|   April|       7|            25|            391.25|
|2020|   April|       8|            23|361.03999999999996|
|2020|   April|       9|            24|319.28999999999996|
|2020|   April|      10|            26|366.11999999999995|
|2020|February|       1|            25|            383.29|
|2020|February|       2|            24|322.04999999999995|
|2020|February|       3|            27| 343.6599999999999|
|2020|February|       4|            24|436.3100000000000

a list of users who have placed less than 10 orders

In [None]:
df.groupBy('customer_id')\
  .agg(count('id'))\
  .where(col('count(id)')<10)\
  .show()

+-----------+---------+
|customer_id|count(id)|
+-----------+---------+
|         85|        6|
|         65|        7|
|         53|        9|
|         28|        7|
|         27|        9|
|         44|        5|
|         12|        8|
|         93|        7|
|         47|        6|
|          1|        9|
|         13|        5|
|         86|        7|
|         20|        9|
|         40|        7|
|         57|        9|
|         48|        9|
|          5|        4|
|         19|        5|
|         64|        6|
|         15|        9|
+-----------+---------+
only showing top 20 rows

