In [160]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, DateType, IntegerType
from pyspark.sql.functions import *

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("GCSExample")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Create data frame, we use GS bucket as data sink

# Google Storage File Path
gsc_file_path = 'gs://dejads_input_assignment2/' 

dataSchema = StructType(
    [StructField("Order ID", LongType(), True),
     StructField("Product", StringType(), True),
     StructField("Quantity Ordered", IntegerType(), True),
     StructField("Price Each", DoubleType(), True),
     StructField("Order Date", StringType(), True),
     StructField("Purchase Address", StringType(), True)
     ])

#Read the data of all months
sales_df = spark.read.format("csv").schema(dataSchema).option("header", "true") \
       .load(gsc_file_path+'*.csv')

#Drop null rows
sales_df = sales_df.na.drop("any")

#Convert "Order Date" to data type "timestamp"
sales_df = sales_april_df.withColumn("timestamp",to_timestamp(col("Order Date"),'MM/dd/yy HH:mm')) \

sales_df.printSchema()
sales_df.show(10)



root
 |-- Order ID: long (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- totalprice: double (nullable = true)

+--------+--------------------+----------------+----------+--------------+--------------------+-------------------+----------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|          timestamp|totalprice|
+--------+--------------------+----------------+----------+--------------+--------------------+-------------------+----------+
|  295665|  Macbook Pro Laptop|               1|    1700.0|12/30/19 00:01|136 Church St, Ne...|2019-12-30 00:01:00|    1700.0|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|2019-12-29 07:03:00|     600.0|
|  295667|USB-C

# Return the names of ordered products from the largest order of each month

In [166]:
from pyspark.sql.window import Window

#Find total price for each line
sales_df = sales_df.withColumn("totalprice",col("Price Each") * col("Quantity Ordered"))

#Add total Order Amount to the table
order_amounts = sales_df.groupBy("Order ID").agg(expr("sum(totalprice)").alias("Order Amount"))
joined_data = sales_df.join(order_amounts, ["Order ID"], "left" )

#Rank the orders based on the total amount for given month

joined_data = joined_data.withColumn("month", month("timestamp"))

window = Window.partitionBy("month").orderBy(col("Order Amount").desc())
joined_data = joined_data.withColumn("rank", rank().over(window))

joined_data = joined_data.na.drop("any")

#Print the products of the largest order for given day
joined_data.where((col("rank") == 1)).select("*").distinct().orderBy(col("month")).show(100)



+--------+--------------------+----------------+----------+--------------+--------------------+-------------------+----------+------------+-----+----+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|          timestamp|totalprice|Order Amount|month|rank|
+--------+--------------------+----------------+----------+--------------+--------------------+-------------------+----------+------------+-----+----+
|  149611|     ThinkPad Laptop|               2|    999.99|01/31/19 17:47|850 6th St, New Y...|2019-01-31 17:47:00|   1999.98|     1999.98|    1|   1|
|  150518|  Macbook Pro Laptop|               1|    1700.0|02/26/19 12:38|847 10th St, San ...|2019-02-26 12:38:00|    1700.0|      2400.0|    2|   1|
|  150518|              iPhone|               1|     700.0|02/26/19 12:38|847 10th St, San ...|2019-02-26 12:38:00|     700.0|      2400.0|    2|   1|
|  163510|  Macbook Pro Laptop|               1|    1700.0|03/27/19 17:07|55 Wilson St, San...

# Return the most sold product of each month

In [None]:
sales_april_df = spark.read.format("csv").schema(dataSchema).option("header", "true") \
       .load(gsc_file_path+'Sales_April_2019.csv')

#Drop null rows
sales_april_df = sales_april_df.na.drop("all")

#Convert "Order Date" to data type "timestamp"
sales_april_df = sales_april_df.withColumn("timestamp",to_timestamp(col("Order Date"),'MM/dd/yy HH:mm')) \

sales_april_df.printSchema()
sales_april_df.show(5)

In [167]:
spark.stop()