In [1]:
import findspark
# Point findspark at the local Spark installation so that pyspark becomes importable.
findspark.init(spark_home="/opt/manual/spark")

In [2]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [3]:
# Create (or reuse) the Spark session for this case study, running locally on all cores.
# The executor memory key was previously misspelled ("spark.executer.memory"); Spark
# accepts unknown keys without complaint, so that setting was silently ignored.
# NOTE(review): with master "local[*]" tasks run inside the driver process, so the
# driver memory setting is presumably the one that matters here - confirm.
spark = (
    SparkSession.builder
    .appName("case_study_2")
    .master("local[*]")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

In [4]:
# Load the categories table; the first CSV row is the header and column types are inferred.
categories = spark.read.csv(
    "file:///home/train/datasets/retail_db/categories.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Show the inferred column names and types of the categories table.
categories.printSchema()

root
 |-- categoryId: integer (nullable = true)
 |-- categoryDepartmentId: integer (nullable = true)
 |-- categoryName: string (nullable = true)



In [6]:
# Preview three rows of categories, converted to pandas for a tabular display.
categories.limit(3).toPandas()

Unnamed: 0,categoryId,categoryDepartmentId,categoryName
0,1,2,Football
1,2,2,Soccer
2,3,2,Baseball & Softball


In [7]:
# Load the customers table; the first CSV row is the header and column types are inferred.
customers = spark.read.csv(
    "file:///home/train/datasets/retail_db/customers.csv",
    header=True,
    inferSchema=True,
)

In [8]:
# Load the departments table; the first CSV row is the header and column types are inferred.
departments = spark.read.csv(
    "file:///home/train/datasets/retail_db/departments.csv",
    header=True,
    inferSchema=True,
)

In [9]:
# Load the order_items table; the first CSV row is the header and column types are inferred.
order_items = spark.read.csv(
    "file:///home/train/datasets/retail_db/order_items.csv",
    header=True,
    inferSchema=True,
)

In [10]:
# Load the orders table; the first CSV row is the header and column types are inferred.
orders = spark.read.csv(
    "file:///home/train/datasets/retail_db/orders.csv",
    header=True,
    inferSchema=True,
)

In [11]:
# Load the products table; the first CSV row is the header and column types are inferred.
products = spark.read.csv(
    "file:///home/train/datasets/retail_db/products.csv",
    header=True,
    inferSchema=True,
)

In [12]:
# Show the inferred column names and types of the customers table.
customers.printSchema()

root
 |-- customerId: integer (nullable = true)
 |-- customerFName: string (nullable = true)
 |-- customerLName: string (nullable = true)
 |-- customerEmail: string (nullable = true)
 |-- customerPassword: string (nullable = true)
 |-- customerStreet: string (nullable = true)
 |-- customerCity: string (nullable = true)
 |-- customerState: string (nullable = true)
 |-- customerZipcode: integer (nullable = true)



In [13]:
# Preview three rows of customers, converted to pandas for a tabular display.
# NOTE(review): the preview includes name and street-address columns - confirm the
# dataset is synthetic before sharing the notebook with outputs.
customers.limit(3).toPandas()

Unnamed: 0,customerId,customerFName,customerLName,customerEmail,customerPassword,customerStreet,customerCity,customerState,customerZipcode
0,1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521
1,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126
2,3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,725


In [14]:
# Show the inferred column names and types of the departments table.
departments.printSchema()

root
 |-- departmentId: integer (nullable = true)
 |-- departmentName: string (nullable = true)



In [15]:
# Preview three rows of departments, converted to pandas for a tabular display.
departments.limit(3).toPandas()

Unnamed: 0,departmentId,departmentName
0,2,Fitness
1,3,Footwear
2,4,Apparel


In [16]:
# Show the inferred column names and types of the order_items table.
order_items.printSchema()

root
 |-- orderItemName: integer (nullable = true)
 |-- orderItemOrderId: integer (nullable = true)
 |-- orderItemProductId: integer (nullable = true)
 |-- orderItemQuantity: integer (nullable = true)
 |-- orderItemSubTotal: double (nullable = true)
 |-- orderItemProductPrice: double (nullable = true)



In [17]:
# Preview three rows of order_items, converted to pandas for a tabular display.
order_items.limit(3).toPandas()

Unnamed: 0,orderItemName,orderItemOrderId,orderItemProductId,orderItemQuantity,orderItemSubTotal,orderItemProductPrice
0,1,1,957,1,299.98,299.98
1,2,2,1073,1,199.99,199.99
2,3,2,502,5,250.0,50.0


In [18]:
# Show the inferred column names and types of the orders table.
orders.printSchema()

root
 |-- orderId: integer (nullable = true)
 |-- orderDate: string (nullable = true)
 |-- orderCustomerId: integer (nullable = true)
 |-- orderStatus: string (nullable = true)



In [19]:
# Preview three rows of orders, converted to pandas for a tabular display.
orders.limit(3).toPandas()

Unnamed: 0,orderId,orderDate,orderCustomerId,orderStatus
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE


In [20]:
# Count the rows of order_items. Order ids repeat across rows (see the preview),
# so this is a row count, not the number of distinct orders.
(
    order_items
    .select("orderItemOrderId")
    .count()
)

172198

In [21]:
# Print the row counts of order_items and orders, one per line, for comparison.
print(order_items.count(), orders.count(), sep="\n")

172198
68883


In [22]:
# Disable automatic broadcast joins (-1 turns the size threshold off) so Spark does not
# choose a broadcast join based on table size for the joins below.
# The key was previously written "spark.sql,autoBroadcastJoinTreshold" (comma instead of
# a dot, and "Treshold"), which set an unrelated, unused key and left the default active.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [23]:
# Attach each order item to its order (inner join on the order id) and keep only the
# columns needed for the cancellation analysis.
orders_and_items = (
    order_items
    .join(orders, on=order_items["orderItemOrderId"] == orders["orderId"])
    .select("orderId", "orderItemProductId", "orderItemSubTotal", "orderStatus")
)

In [24]:
# Preview three rows of the order/order-item join.
orders_and_items.limit(3).toPandas()

Unnamed: 0,orderId,orderItemProductId,orderItemSubTotal,orderStatus
0,1,957,299.98,CLOSED
1,2,1073,199.99,PENDING_PAYMENT
2,2,502,250.0,PENDING_PAYMENT


In [25]:
# Show the inferred column names and types of the products table.
products.printSchema()

root
 |-- productId: integer (nullable = true)
 |-- productCategoryId: integer (nullable = true)
 |-- productName: string (nullable = true)
 |-- productDescription: string (nullable = true)
 |-- productPrice: double (nullable = true)
 |-- productImage: string (nullable = true)



In [26]:
# Pair every product with the name of its category (inner join on the category id).
cat_product = (
    categories
    .join(products, on=categories["categoryId"] == products["productCategoryId"])
    .select("productId", "productName", "categoryName")
)

In [27]:
# Preview three rows of the product/category join.
cat_product.limit(3).toPandas()

Unnamed: 0,productId,productName,categoryName
0,1,Quest Q64 10 FT. x 10 FT. Slant Leg Instant U,Soccer
1,2,Under Armour Men's Highlight MC Football Clea,Soccer
2,3,Under Armour Men's Renegade D Mid Football Cl,Soccer


In [28]:
# Combine order items with product and category names (inner join on the product id).
# Both key columns (orderItemProductId and productId) are kept in the result.
final_table = orders_and_items.join(
    cat_product,
    on=orders_and_items["orderItemProductId"] == cat_product["productId"],
)

In [29]:
# Preview three rows of the fully joined table.
final_table.limit(3).toPandas()

Unnamed: 0,orderId,orderItemProductId,orderItemSubTotal,orderStatus,productId,productName,categoryName
0,57760,858,199.99,PENDING_PAYMENT,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
1,57847,858,199.99,COMPLETE,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
2,58071,858,199.99,PENDING,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs


In [30]:
# Total sub-total of cancelled order items per product, highest total first.
# The status filter is applied before any projection: the previous version selected only
# productName/orderItemSubTotal and then filtered on orderStatus, which worked only
# because Spark's analyzer re-adds the dropped column behind the scenes.
most_cancelled_products = (
    final_table
    .filter(F.col("orderStatus") == "CANCELED")
    .groupBy("productName")
    .agg(F.sum("orderItemSubTotal").alias("Totalprice"))
    .orderBy(F.desc("Totalprice"))
)

In [31]:
# Persist the per-product result as a single Parquet part file, replacing any
# existing content of the output directory.
(
    most_cancelled_products
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet("file:///home/train/my_pyspark/case studies/output_data")
)

In [32]:
! ls /home/train/my_pyspark/case\ studies/output_data

part-00000-03a54b1b-627c-4c4c-b115-191f31cbe2bb-c000.snappy.parquet  _SUCCESS


In [33]:
# Total sub-total of cancelled order items per category, highest total first.
# The status filter is applied before any projection: the previous version selected only
# categoryName/orderItemSubTotal and then filtered on orderStatus, which worked only
# because Spark's analyzer re-adds the dropped column behind the scenes.
most_cancelled_category = (
    final_table
    .filter(F.col("orderStatus") == "CANCELED")
    .groupBy("categoryName")
    .agg(F.sum("orderItemSubTotal").alias("Totalprice"))
    .orderBy(F.desc("Totalprice"))
)

In [34]:
most_cancelled_category.coalesce(1) \
.write \
.format("parquet") \
.mode("overwrite") \
.save("file:///home/train/my_pyspark/case studies/output_data/")

In [35]:
! ls /home/train/my_pyspark/case\ studies/output_data

part-00000-21c594e2-4922-45e6-b1d3-ccce046fb843-c000.snappy.parquet  _SUCCESS


In [36]:
# Shut down the Spark session now that all results have been written.
spark.stop()