In [130]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, DateType, IntegerType
from pyspark.sql.functions import *

# Settings for this notebook's Spark application: standalone master URL,
# application name, and driver/executor resource sizing.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("GCSExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine;
# getOrCreate() hands back an already-running session if there is one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Google Cloud Storage connector classes on the Hadoop
# configuration so that paths using the gs:// scheme can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set(
    "fs.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
)
conf.set(
    "fs.AbstractFileSystem.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
)

# Create the data frame; the Google Cloud Storage bucket is the data source.

# Google Storage File Path
gsc_file_path = 'gs://dejads_input_assignment2/'

# Explicit schema for the sales CSV. "Order Date" is read as a plain string
# here and parsed into a proper timestamp column further down.
dataSchema = StructType([
    StructField("Order ID", LongType(), True),
    StructField("Product", StringType(), True),
    StructField("Quantity Ordered", IntegerType(), True),
    StructField("Price Each", DoubleType(), True),
    StructField("Order Date", StringType(), True),
    StructField("Purchase Address", StringType(), True),
])

# Load, clean and enrich in one expression so that re-running the cell always
# rebuilds sales_april_df from the file (no dangling line continuations).
# NOTE(review): only rows where *every* column is null are dropped; rows whose
# "Order ID" is null but other columns are populated are kept -- confirm that
# this is intended.
sales_april_df = (
    spark.read.format("csv")
    .schema(dataSchema)
    .option("header", "true")
    .load(gsc_file_path + 'Sales_April_2019.csv')
    # Drop rows in which all columns are null
    .na.drop("all")
    # Parse "Order Date" (pattern MM/dd/yy HH:mm) into a new "timestamp" column
    .withColumn("timestamp", to_timestamp(col("Order Date"), 'MM/dd/yy HH:mm'))
)

sales_april_df.printSchema()
sales_april_df.show(5)



root
 |-- Order ID: long (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)

+--------+--------------------+----------------+----------+--------------+--------------------+-------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|          timestamp|
+--------+--------------------+----------------+----------+--------------+--------------------+-------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|2019-04-19 08:46:00|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|2019-04-07 22:30:00|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|2019-04-12 14:38:

# Return the names of the products in the largest order (by total order amount) of each day

In [137]:
from pyspark.sql.window import Window

#Find total price for each order line (unit price * quantity)
sales_april_df = sales_april_df.withColumn("totalprice", col("Price Each") * col("Quantity Ordered"))

#Compute the total amount per order and attach it to every line of that order
order_amounts = sales_april_df.groupBy("Order ID").agg(expr("sum(totalprice)").alias("Order Amount"))
joined_data = sales_april_df.join(order_amounts, ["Order ID"], "left")

#Derive the calendar date of each line. Ranking must be done per calendar date:
#dayofmonth() alone would merge different dates that share a day number
#(e.g. 04/01 and 05/01) into one partition. "day" is still added so the
#resulting columns remain a superset of what this cell produced before.
joined_data = (
    joined_data
    .withColumn("order_date", to_date(col("timestamp")))
    .withColumn("day", dayofmonth("timestamp"))
)

#Rank the lines within each calendar date by the total amount of their order.
#rank() gives every line of the same order the same rank because they share
#"Order Amount", so all products of the largest order end up with rank 1.
#(Named daily_window so it does not shadow pyspark.sql.functions.window.)
daily_window = Window.partitionBy("order_date").orderBy(col("Order Amount").desc())
joined_data = joined_data.withColumn("rank", rank().over(daily_window))

#Print the products of the largest order for the given day (yyyy-MM-dd)
target_date = "2019-04-08"
joined_data.where(
    (col("rank") == 1) & (col("order_date") == to_date(lit(target_date)))
).select("Product").distinct().show()



+------------------+
|           Product|
+------------------+
|Macbook Pro Laptop|
+------------------+



In [138]:
# Stop the Spark session created at the top of the notebook now that the analysis is done.
spark.stop()