In [2]:
from pyspark import SparkContext, SparkConf

In [3]:
# Configure the application name shown for this notebook's Spark job.
conf = SparkConf().setAppName("pyspark")
# getOrCreate returns the already-running SparkContext when this cell is
# re-executed in a live kernel; constructing SparkContext(conf=conf) a second
# time raises because only one context may be active per process.
sc = SparkContext.getOrCreate(conf=conf)

# Join disparate datasets together using Spark.
# Problem statement: get the revenue and the number of orders per day by joining orders with order_items.

In [5]:
# Read the orders file into an RDD of text lines.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
ordersRDD = sc.textFile(
    name="E:/code/git-2018/data/data-master/retail_db/orders/part-00000"
)

In [6]:
# Peek at the first five raw order lines.
for order_line in ordersRDD.take(5):
    print(order_line)

1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE


In [7]:
# Read the order_items file into an RDD of text lines.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
orderItemsRDD = sc.textFile(
    name="E:/code/git-2018/data/data-master/retail_db/order_items/part-00000"
)

In [8]:
# Peek at the first five raw order_items lines.
for item_line in orderItemsRDD.take(5):
    print(item_line)

1,1,957,1,299.98,299.98
2,2,1073,1,199.99,199.99
3,2,502,5,250.0,50.0
4,2,403,1,129.99,129.99
5,4,897,2,49.98,24.99


In [10]:
# Key both datasets by the integer they are joined on, keeping the full line as
# the value: the first comma-separated field of an orders line and the second
# field of an order_items line.
ordersParsedRDD = ordersRDD.keyBy(lambda line: int(line.split(",")[0]))
orderItemsParsedRDD = orderItemsRDD.keyBy(lambda line: int(line.split(",")[1]))

In [11]:
# Inner-join order items to their order; each element has the shape
# (key, (order_items line, orders line)).
ordersJoinOrderItems = orderItemsParsedRDD.join(ordersParsedRDD)
# Project each joined pair to (orders field 1, order_items field 4 as float);
# presumably (order date, item subtotal) — confirm against the dataset schema.
revenuePerOrderPerDay = ordersJoinOrderItems.values().map(
    lambda pair: (pair[1].split(",")[1], float(pair[0].split(",")[4]))
)

In [12]:
# Inspect the first five joined records.
for joined_record in ordersJoinOrderItems.take(5):
    print(joined_record)

(4, ('5,4,897,2,49.98,24.99', '4,2013-07-25 00:00:00.0,8827,CLOSED'))
(4, ('6,4,365,5,299.95,59.99', '4,2013-07-25 00:00:00.0,8827,CLOSED'))
(4, ('7,4,502,3,150.0,50.0', '4,2013-07-25 00:00:00.0,8827,CLOSED'))
(4, ('8,4,1014,4,199.92,49.98', '4,2013-07-25 00:00:00.0,8827,CLOSED'))
(8, ('17,8,365,3,179.97,59.99', '8,2013-07-25 00:00:00.0,2911,PROCESSING'))


In [13]:
# Inspect the first five (date, amount) pairs.
for date_amount in revenuePerOrderPerDay.take(5):
    print(date_amount)

('2013-07-25 00:00:00.0', 49.98)
('2013-07-25 00:00:00.0', 299.95)
('2013-07-25 00:00:00.0', 150.0)
('2013-07-25 00:00:00.0', 199.92)
('2013-07-25 00:00:00.0', 179.97)


In [20]:
# Order count per day, step 1: build one "date,key" string per joined record
# and de-duplicate, so an order with several items is counted only once.
ordersPerDay = ordersJoinOrderItems.map(
    lambda joined: ",".join((joined[1][1].split(",")[1], str(joined[0])))
).distinct()
# Step 2: reduce each distinct string to a (date, 1) pair ready for summing.
ordersPerDayParsedRDD = ordersPerDay.map(
    lambda entry: (entry.partition(",")[0], 1)
)

In [21]:
# Add up the per-order 1s for each date key to get the daily order count.
totalOrdersPerDay = ordersPerDayParsedRDD.reduceByKey(
    lambda running, nxt: running + nxt
)

In [22]:
# Sum the per-item amounts from the joined data for each date key.
totalRevenuePerDay = revenuePerOrderPerDay.reduceByKey(
    lambda acc, amount: acc + amount
)

In [24]:
# Bring every (date, revenue) pair back to the driver and print it.
# NOTE(review): collect() materialises the whole RDD and prints every row;
# the other preview cells use take(5) — confirm the full dump is intended.
for day_revenue in totalRevenuePerDay.collect():
    print(day_revenue)

('2013-07-28 00:00:00.0', 87123.07999999997)
('2013-07-29 00:00:00.0', 137287.09000000008)
('2013-08-08 00:00:00.0', 76501.65999999996)
('2013-08-10 00:00:00.0', 129574.80000000005)
('2013-08-24 00:00:00.0', 128883.82000000007)
('2013-08-27 00:00:00.0', 91634.92)
('2013-08-28 00:00:00.0', 55189.21999999996)
('2013-08-29 00:00:00.0', 99960.57000000002)
('2013-08-30 00:00:00.0', 57008.49999999997)
('2013-08-31 00:00:00.0', 75923.71999999997)
('2013-09-04 00:00:00.0', 44563.16999999997)
('2013-09-08 00:00:00.0', 84483.54999999999)
('2013-09-11 00:00:00.0', 52436.90999999997)
('2013-09-14 00:00:00.0', 135308.52000000002)
('2013-09-17 00:00:00.0', 67185.92999999995)
('2013-09-20 00:00:00.0', 82662.50999999998)
('2013-09-25 00:00:00.0', 141775.64000000007)
('2013-09-27 00:00:00.0', 136537.86000000004)
('2013-09-30 00:00:00.0', 118054.81000000006)
('2013-10-07 00:00:00.0', 51592.149999999965)
('2013-10-08 00:00:00.0', 130025.08000000009)
('2013-10-11 00:00:00.0', 104617.32000000005)
('2013-10

In [15]:
# Inspect the first five distinct "date,key" strings.
for date_and_key in ordersPerDay.take(5):
    print(date_and_key)

2013-07-25 00:00:00.0,16
2013-07-25 00:00:00.0,44
2013-07-25 00:00:00.0,64
2013-07-26 00:00:00.0,116
2013-07-26 00:00:00.0,120


In [17]:
# Inspect the first five (date, order count) pairs.
for date_count in totalOrdersPerDay.take(5):
    print(date_count)

('2013-07-28 00:00:00.0', 158)
('2013-07-29 00:00:00.0', 216)
('2013-08-08 00:00:00.0', 130)
('2013-08-10 00:00:00.0', 221)
('2013-08-24 00:00:00.0', 216)


In [23]:
# Combine the two per-day aggregates on their shared date key; each element is
# (date, (order count, revenue)).
finalJoinRDD = totalOrdersPerDay.join(other=totalRevenuePerDay)

In [25]:
# Inspect the first five combined (date, (order count, revenue)) records.
for daily_summary in finalJoinRDD.take(5):
    print(daily_summary)

('2013-07-28 00:00:00.0', (158, 87123.07999999997))
('2013-07-29 00:00:00.0', (216, 137287.09000000008))
('2013-08-08 00:00:00.0', (130, 76501.65999999996))
('2013-08-10 00:00:00.0', (221, 129574.80000000005))
('2013-08-24 00:00:00.0', (216, 128883.82000000007))
