In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, min, max, sum, sumDistinct, avg, month, year, dayofmonth, expr, hour, collect_list, size
import pandas as pd
import matplotlib as plt
import seaborn as sns


# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('Challenge 3')
    .getOrCreate()
)

# Read the parquet data stored under the ReportYear=2019 path and preview it.
sales_df = spark.read.format('parquet').load('sales.parquet/ReportYear=2019')
sales_df.show()

+-------+--------------------+--------+------+-------------------+--------------------+-------------+-----+-----+
|OrderID|             Product|Quantity| Price|          OrderDate|        StoreAddress|         City|State|Month|
+-------+--------------------+--------+------+-------------------+--------------------+-------------+-----+-----+
| 295665|  Macbook Pro Laptop|       1|1700.0|2019-12-30 00:01:00|136 Church St, Ne...|New York City|  NY |   12|
| 295666|  LG Washing Machine|       1| 600.0|2019-12-29 07:03:00|562 2nd St, New Y...|New York City|  NY |   12|
| 295667|USB-C Charging Cable|       1| 11.95|2019-12-12 18:21:00|277 Main St, New ...|New York City|  NY |   12|
| 295668|    27in FHD Monitor|       1|149.99|2019-12-22 15:13:00|410 6th St, San F...|San Francisco|  CA |   12|
| 295669|USB-C Charging Cable|       1| 11.95|2019-12-18 12:38:00|43 Hill St, Atlan...|      Atlanta|  GA |   12|
| 295670|AA Batteries (4-p...|       1|  3.84|2019-12-31 22:58:00|200 Jefferson St,...|N

## Best month for sales

In [2]:
# Revenue of an order line is Price * Quantity; total it per month and rank
# the months from highest to lowest total.
sales_per_month_df = (
    sales_df
    .withColumn('Sales', col('Price') * col('Quantity'))
    .groupBy('Month')
    .agg(sum('Sales').alias('TotalSales'))
    .orderBy(col('TotalSales').desc())
)
sales_per_month_df.show()

+-----+------------------+
|Month|        TotalSales|
+-----+------------------+
|   12|  4613443.31607008|
|   10| 3736726.860444069|
|    4|3390670.2231714725|
|   11| 3199603.184257984|
|    5|3152606.7349271774|
|    3|2807100.3656582832|
|    7|2647775.7468385696|
|    6| 2577802.247295618|
|    8|2244467.8685896397|
|    2| 2202022.408319235|
|    9|2097560.1194250584|
|    1| 1813586.431374073|
+-----+------------------+



## Quantity sold by city

In [3]:
# Total units sold in each city, largest total first.
quantity_by_city = (
    sales_df
    .groupBy('City')
    .agg(sum('Quantity').alias('TotalQuantity'))
    .sort(col('TotalQuantity').desc())
)
quantity_by_city.show()

+-------------+-------------+
|         City|TotalQuantity|
+-------------+-------------+
|San Francisco|        50226|
|  Los Angeles|        33288|
|New York City|        27916|
|       Boston|        22524|
|       Dallas|        16728|
|      Atlanta|        16602|
|      Seattle|        16552|
|     Portland|        14051|
|       Austin|        11151|
+-------------+-------------+



### What time should we display advertisements to maximize the likelihood
### of customers buying products?

In [22]:
# Bucket every order line by the hour of its OrderDate and total the units
# sold in each hour, busiest hour first. Built as a single chain so the name
# is bound only once, to the aggregated result (the previous version also ran
# an identical groupBy/agg/orderBy whose result was discarded).
# The aggregate column keeps Spark's default name, 'sum(Quantity)'.
advertisement_time_df = (
    sales_df
    .withColumn('HourOfDay', hour('OrderDate'))
    .groupBy('HourOfDay')
    .agg(sum('Quantity'))
    .orderBy('sum(Quantity)', ascending=False)
)
advertisement_time_df.show()
# Best hours of the day to advertise: 18-19 and 11-12 (the four busiest hours
# in the result are 19, 12, 11 and 18).

+---------+-------------+
|HourOfDay|sum(Quantity)|
+---------+-------------+
|       19|        14470|
|       12|        14202|
|       11|        14005|
|       18|        13802|
|       20|        13768|
|       13|        13685|
|       14|        12362|
|       10|        12308|
|       21|        12244|
|       17|        12229|
|       16|        11662|
|       15|        11391|
|       22|         9899|
|        9|         9816|
|       23|         7065|
|        8|         7002|
|        7|         4556|
|        0|         4412|
|        6|         2810|
|        1|         2610|
+---------+-------------+
only showing top 20 rows



## Which products are often sold together in the state of 'NY'?

In [33]:
# Preview the raw sales rows again (first 20 rows, long strings truncated).
sales_df.show(n=20, truncate=True)

+-------+--------------------+--------+------+-------------------+--------------------+-------------+-----+-----+
|OrderID|             Product|Quantity| Price|          OrderDate|        StoreAddress|         City|State|Month|
+-------+--------------------+--------+------+-------------------+--------------------+-------------+-----+-----+
| 295665|  Macbook Pro Laptop|       1|1700.0|2019-12-30 00:01:00|136 Church St, Ne...|New York City|  NY |   12|
| 295666|  LG Washing Machine|       1| 600.0|2019-12-29 07:03:00|562 2nd St, New Y...|New York City|  NY |   12|
| 295667|USB-C Charging Cable|       1| 11.95|2019-12-12 18:21:00|277 Main St, New ...|New York City|  NY |   12|
| 295668|    27in FHD Monitor|       1|149.99|2019-12-22 15:13:00|410 6th St, San F...|San Francisco|  CA |   12|
| 295669|USB-C Charging Cable|       1| 11.95|2019-12-18 12:38:00|43 Hill St, Atlan...|      Atlanta|  GA |   12|
| 295670|AA Batteries (4-p...|       1|  3.84|2019-12-31 22:58:00|200 Jefferson St,...|N

In [49]:
# Count how often each combination of products appears together in one NY order.
sold_together_df = (
    sales_df
    # State values in this dataset carry trailing whitespace ('NY '), so trim
    # before comparing instead of relying on the exact padding.
    .where(expr("trim(State) = 'NY'"))
    .groupBy('OrderID', 'State')
    # collect_list gives no ordering guarantee after the groupBy shuffle, so an
    # orderBy placed before it is not reliable. Sort each list explicitly so
    # the same set of products always yields the same ProductList key.
    .agg(expr('sort_array(collect_list(Product))').alias('ProductList'))
    # Keep only orders that contain more than one product line.
    .where(size('ProductList') > 1)
    .groupBy('ProductList')
    .count()
    .orderBy('count', ascending=False)
)

sold_together_df.show(truncate=False)

+------------------------------------------------------+-----+
|ProductList                                           |count|
+------------------------------------------------------+-----+
|[Google Phone, USB-C Charging Cable]                  |127  |
|[Lightning Charging Cable, iPhone]                    |126  |
|[Google Phone, Wired Headphones]                      |53   |
|[USB-C Charging Cable, Vareebadd Phone]               |50   |
|[Wired Headphones, iPhone]                            |46   |
|[Apple Airpods Headphones, iPhone]                    |45   |
|[Bose SoundSport Headphones, Google Phone]            |24   |
|[Apple Airpods Headphones, Wired Headphones]          |19   |
|[Vareebadd Phone, Wired Headphones]                   |17   |
|[AAA Batteries (4-pack), Lightning Charging Cable]    |16   |
|[AA Batteries (4-pack), Lightning Charging Cable]     |16   |
|[USB-C Charging Cable, Wired Headphones]              |14   |
|[AA Batteries (4-pack), AAA Batteries (4-pack)]       