In [1]:
#remember to clone https://github.com/brcondor/Architectures_for_Big_Data into '/home/jovyan/work/'
import sys
sys.path.append("/home/jovyan/work/Architectures_for_Big_Data/")
import pyspark
# Reuse the already-running SparkContext when this cell is executed again:
# calling the SparkContext constructor a second time in the same kernel fails
# because only one context may be active per process.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster("local[3]"))

In [35]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *

# Build the synthetic log: each record carries the keys lineId, itemId,
# creationDate and qty (one generator per key).
dataset = datasetGenerator()
dataset.addGenerator(
    idGenerator(), prefix="line", min=1, max=5, keyName="lineId"
)
dataset.addGenerator(
    idGenerator(), prefix="item", min=1, max=15, keyName="itemId"
)
dataset.addGenerator(
    tickGenerator(), minTick=600, maxTick=1800, keyName="creationDate"
)
dataset.addGenerator(intGenerator(), keyName="qty")

# 25000 generated records, distributed and cached for the cells that follow.
logRdd = sc.parallelize(dataset.generateDataset(25000)).persist()
logRdd.first()

{'lineId': 'line_04',
 'itemId': 'item_010',
 'creationDate': datetime.datetime(2010, 1, 1, 0, 13, 10),
 'qty': 215}

In [36]:
# Five earliest creation timestamps, rendered as strings.
# takeOrdered(5) returns the five smallest elements in ascending order without
# sorting (and shuffling) the whole RDD first.
[str(ts) for ts in logRdd.map(lambda row: row.get("creationDate")).takeOrdered(5)]

['2010-01-01 00:13:10',
 '2010-01-01 00:33:45',
 '2010-01-01 00:37:26',
 '2010-01-01 01:01:42',
 '2010-01-01 01:06:33']

In [4]:
# Every distinct (lineId, itemId) pair present in the log, ordered by line and
# then by item.
(
    logRdd
    .map(lambda row: (row.get("lineId"), row.get("itemId")))
    .distinct()
    .sortBy(lambda pair: (pair[0], pair[1]))
    .collect()
)

[('line_01', 'item_001'),
 ('line_01', 'item_002'),
 ('line_01', 'item_003'),
 ('line_01', 'item_004'),
 ('line_01', 'item_005'),
 ('line_01', 'item_006'),
 ('line_01', 'item_007'),
 ('line_01', 'item_008'),
 ('line_01', 'item_009'),
 ('line_01', 'item_010'),
 ('line_01', 'item_011'),
 ('line_01', 'item_012'),
 ('line_01', 'item_013'),
 ('line_01', 'item_014'),
 ('line_01', 'item_015'),
 ('line_02', 'item_001'),
 ('line_02', 'item_002'),
 ('line_02', 'item_003'),
 ('line_02', 'item_004'),
 ('line_02', 'item_005'),
 ('line_02', 'item_006'),
 ('line_02', 'item_007'),
 ('line_02', 'item_008'),
 ('line_02', 'item_009'),
 ('line_02', 'item_010'),
 ('line_02', 'item_011'),
 ('line_02', 'item_012'),
 ('line_02', 'item_013'),
 ('line_02', 'item_014'),
 ('line_02', 'item_015'),
 ('line_03', 'item_001'),
 ('line_03', 'item_002'),
 ('line_03', 'item_003'),
 ('line_03', 'item_004'),
 ('line_03', 'item_005'),
 ('line_03', 'item_006'),
 ('line_03', 'item_007'),
 ('line_03', 'item_008'),
 ('line_03',

In [5]:
# Number of distinct items observed on each line, ordered by lineId.
(
    logRdd
    .map(lambda row: (row.get("lineId"), row.get("itemId")))
    .distinct()
    .map(lambda pair: (pair[0], 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda kv: kv[0])
    .collect()
)

[('line_01', 15),
 ('line_02', 15),
 ('line_03', 15),
 ('line_04', 15),
 ('line_05', 15)]

In [37]:
# Latest creation timestamp in the log. max() scans the whole RDD on its own,
# so the former sortBy() + identity map() only added a shuffle without
# affecting the result.
maxDate = logRdd.map(lambda row: row.get("creationDate")).max()
str(maxDate)


'2010-09-17 13:43:23'

In [38]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *

# Build the synthetic master data: records keyed by lineId/itemId with a
# registryInsertTimestamp bounded by the latest log timestamp (maxDate) and a
# cost value.
dataset = datasetGenerator()
dataset.addGenerator(
    idGenerator(), prefix="line", min=1, max=5, keyName="lineId"
)
dataset.addGenerator(
    idGenerator(), prefix="item", min=1, max=15, keyName="itemId"
)
dataset.addGenerator(
    dateGenerator(), endDate=maxDate, keyName="registryInsertTimestamp"
)
dataset.addGenerator(intGenerator(), "cost")

# 1500 generated records, distributed and cached.
masterDataRdd = sc.parallelize(dataset.generateDataset(1500)).persist()
masterDataRdd.first()

{'lineId': 'line_03',
 'itemId': 'item_002',
 'registryInsertTimestamp': datetime.datetime(2010, 3, 5, 3, 6, 13),
 'cost': 417}

In [39]:
# Latest registryInsertTimestamp found in the master data.
(
    masterDataRdd
    .map(lambda row: row.get("registryInsertTimestamp"))
    .max()
)

datetime.datetime(2010, 9, 17, 12, 34, 52)

# Bean Counter

In [40]:
# Show one log record as a reminder of the available keys.
logRdd.first()

{'lineId': 'line_04',
 'itemId': 'item_010',
 'creationDate': datetime.datetime(2010, 1, 1, 0, 13, 10),
 'qty': 215}

## How many lines per item?

In [41]:
# Count the log records of each item: emit (itemId, 1) and sum per key.
(
    logRdd
    .map(lambda row: (row.get("itemId"), 1))
    .reduceByKey(lambda left, right: left + right)
    .take(5)
)

[('item_010', 1682),
 ('item_003', 1663),
 ('item_001', 1634),
 ('item_014', 1680),
 ('item_013', 1705)]

## How much qty per item?

In [42]:
def f(x,y):
    """Reducer: add two partial quantities and return their sum."""
    return x + y
# Total qty of each item: emit (itemId, qty) and sum per key with f.
(
    logRdd
    .map(lambda row: (row.get("itemId"), row.get("qty")))
    .reduceByKey(lambda left, right: f(left, right))
    .take(5)
)

[('item_010', 836945),
 ('item_003', 827941),
 ('item_001', 808763),
 ('item_014', 837709),
 ('item_013', 857160)]

## How much qty on average per item?

In [43]:
def f(x,y):
    """Merge two (qty_sum, record_count) partial aggregates element-wise."""
    return x[0] + y[0], x[1] + y[1]
    
def prepare(x):
    """Turn (key, (qty_sum, record_count)) into (key, average qty)."""
    key, totals = x[0], x[1]
    return key, totals[0] / totals[1]
    
# Average qty per item: accumulate (qty_sum, count) per itemId, then divide.
(
    logRdd
    .map(lambda row: (row.get("itemId"), (row.get("qty"), 1.0)))
    .reduceByKey(lambda left, right: f(left, right))
    .map(lambda kv: prepare(kv))
    .take(5)
)

[('item_010', 497.58917954815695),
 ('item_003', 497.8598917618761),
 ('item_001', 494.9589963280294),
 ('item_014', 498.63630952380953),
 ('item_013', 502.73313782991204)]

In [44]:
def f(x,y):
    """Combine two (cumulated qty, number of records) pairs by adding each slot."""
    return (x[0] + y[0], x[1] + y[1])
    
def prepare(x):
    """Reduce (key, (qty_sum, record_count)) to the bare average qty."""
    qty_sum, record_count = x[1][0], x[1][1]
    return qty_sum / record_count
    
# Summary statistics (count/mean/stdev/max/min) over the per-item average qty.
(
    logRdd
    .map(lambda row: (row.get("itemId"), (row.get("qty"), 1.0)))
    .reduceByKey(lambda left, right: f(left, right))
    .map(lambda kv: prepare(kv))
    .stats()
)

(count: 15, mean: 502.0338628276392, stdev: 5.501691605710327, max: 511.8249097472924, min: 493.08837485172006)

## Exercise (1): Similar statistics but on item,line couple

In [14]:
#...

## Let's imagine we have 100 executors and want to distribute the computation of the average over lineId

In [45]:
def f(x,y):
    """Add two (qty_sum, record_count) accumulators slot by slot."""
    qty_sum = x[0] + y[0]
    record_count = x[1] + y[1]
    return qty_sum, record_count
    
def prepare(x):
    """Return the average qty from a (key, (qty_sum, record_count)) pair."""
    totals = x[1]
    return totals[0] / totals[1]
    
# Summary statistics over the per-line average qty, aggregating directly on
# lineId (a single reduce stage).
(
    logRdd
    .map(lambda row: ((row.get("lineId")), (row.get("qty"), 1.0)))
    .reduceByKey(lambda left, right: f(left, right))
    .map(lambda kv: prepare(kv))
    .stats()
)

(count: 5, mean: 501.9888627551801, stdev: 1.9292706973781275, max: 505.33687162026837, min: 499.4560844453296)

In [46]:
def f(x,y):
    """Element-wise sum of two (qty_sum, record_count) partial results."""
    return x[0] + y[0], x[1] + y[1]
    
def prepare(x):
    """Divide the accumulated qty by the record count of a (key, (sum, count)) pair."""
    accumulated = x[1]
    return accumulated[0] / accumulated[1]
    
# Same per-line statistics, computed in two reduce stages: first aggregate on
# (random salt in 1..100, lineId), then drop the salt and aggregate on lineId.
(
    logRdd
    .map(lambda row: ((randint(1, 100), row.get("lineId")), (row.get("qty"), 1.0)))
    .reduceByKey(lambda left, right: f(left, right))
    .map(lambda kv: (kv[0][1], kv[1]))
    .reduceByKey(lambda left, right: f(left, right))
    .map(lambda kv: prepare(kv))
    .stats()
)

(count: 5, mean: 501.9888627551801, stdev: 1.9292706973781275, max: 505.33687162026837, min: 499.4560844453296)

## Cycle Time

In [47]:
# Key every log record by its (lineId, itemId) pair so it can be joined.
semiLogRdd = logRdd.map(
    lambda row: ((row.get("lineId"), row.get("itemId")), row)
).persist()
semiLogRdd.count()

25000

In [48]:
# Self-join on the (lineId, itemId) key: each record is paired with every
# record sharing its key, itself included (see the sample in the next cell).
joinedRDD = semiLogRdd.join(semiLogRdd).persist()
joinedRDD.count()

8355274

In [49]:
# Inspect one joined element: (key, (left record, right record)).
joinedRDD.first()

(('line_02', 'item_014'),
 ({'lineId': 'line_02',
   'itemId': 'item_014',
   'creationDate': datetime.datetime(2010, 1, 1, 2, 10, 10),
   'qty': 898},
  {'lineId': 'line_02',
   'itemId': 'item_014',
   'creationDate': datetime.datetime(2010, 1, 1, 2, 10, 10),
   'qty': 898}))

In [51]:
# Keep only the pairs whose left record was created strictly before the right
# one; self-pairs and mirrored pairs are dropped.
triangleJoinedRdd = joinedRDD.filter(
    lambda kv: kv[1][0].get("creationDate") < kv[1][1].get("creationDate")
).persist()
triangleJoinedRdd.count()

4165137

In [52]:
# Inspect one surviving pair: the left creationDate precedes the right one.
triangleJoinedRdd.first()

(('line_02', 'item_014'),
 ({'lineId': 'line_02',
   'itemId': 'item_014',
   'creationDate': datetime.datetime(2010, 1, 1, 2, 10, 10),
   'qty': 898},
  {'lineId': 'line_02',
   'itemId': 'item_014',
   'creationDate': datetime.datetime(2010, 1, 1, 5, 35, 25),
   'qty': 867}))

In [54]:
# Re-key each pair by ((lineId, itemId), left creationDate) and attach the
# elapsed time between the two records together with the original pair.
tempRdd = triangleJoinedRdd.map(
    lambda kv: (
        (kv[0], kv[1][0].get("creationDate")),
        (kv[1][1].get("creationDate") - kv[1][0].get("creationDate"), kv),
    )
)
tempRdd.first()

((('line_02', 'item_014'), datetime.datetime(2010, 1, 1, 2, 10, 10)),
 (datetime.timedelta(seconds=12315),
  (('line_02', 'item_014'),
   ({'lineId': 'line_02',
     'itemId': 'item_014',
     'creationDate': datetime.datetime(2010, 1, 1, 2, 10, 10),
     'qty': 898},
    {'lineId': 'line_02',
     'itemId': 'item_014',
     'creationDate': datetime.datetime(2010, 1, 1, 5, 35, 25),
     'qty': 867}))))

In [56]:
# Pick one ((lineId, itemId), creationDate) key and count how many later
# records it was paired with.
sampleLine = tempRdd.first()[0]
print(sampleLine)
(
    triangleJoinedRdd
    .map(lambda kv: (
        (kv[0], kv[1][0].get("creationDate")),
        (kv[1][1].get("creationDate") - kv[1][0].get("creationDate"), kv),
    ))
    .filter(lambda kv: kv[0] == sampleLine)
    .count()
)

343

In [57]:
def f(x,y):
    """Keep the candidate whose first element is strictly smaller; ties go to y."""
    return x if x[0] < y[0] else y
def prepare(x):
    """Return the left log record enriched with its cycle time.

    x has the shape (key, (delta, (pair_key, (left_record, right_record)))).
    The result is a copy of left_record with an extra "cycleTime" entry set to
    delta. The input record is left untouched: functions passed to RDD
    transformations should not mutate their arguments, because the same
    objects may be reused when a partition is cached or recomputed.
    """
    delta = x[1][0]
    left_record = x[1][1][1][0]
    return dict(left_record, cycleTime=delta)
    
# For each ((lineId, itemId), creationDate) key keep the pair with the
# smallest elapsed time (f), then emit the left record with its cycleTime
# (prepare).
enrichedLogRdd = (
    triangleJoinedRdd
    .map(lambda kv: (
        (kv[0], kv[1][0].get("creationDate")),
        (kv[1][1].get("creationDate") - kv[1][0].get("creationDate"), kv),
    ))
    .reduceByKey(lambda left, right: f(left, right))
    .map(lambda kv: prepare(kv))
    .persist()
)
enrichedLogRdd.first()

{'lineId': 'line_02',
 'itemId': 'item_014',
 'creationDate': datetime.datetime(2010, 1, 1, 19, 58, 39),
 'qty': 193,
 'cycleTime': datetime.timedelta(seconds=25023)}

## Exercise (2): why do we see fewer items when we count?

In [25]:
# Number of enriched records, to compare with the 25000 records of the log.
enrichedLogRdd.count()

24925

## Exercise (3): I want to see the histogram of cycle times

## Extract Snapshot from Master Data

In [59]:
# Master data reshaped as ((lineId, itemId), (cost, registryInsertTimestamp)).
tempRdd = masterDataRdd.map(
    lambda row: (
        (row.get("lineId"), row.get("itemId")),
        (row.get("cost"), row.get("registryInsertTimestamp")),
    )
)
tempRdd.first()

(('line_03', 'item_002'), (417, datetime.datetime(2010, 3, 5, 3, 6, 13)))

In [27]:
# How many distinct (lineId, itemId) pairs does the master data cover?
(
    masterDataRdd
    .map(lambda row: (row.get("lineId"), row.get("itemId")))
    .distinct()
    .count()
)

75

In [65]:
# Cost history of one (lineId, itemId) pair, ordered by the timestamp string.
sampleLine = tempRdd.first()[0]
print(sampleLine)
(
    masterDataRdd
    .map(lambda row: (
        (row.get("lineId"), row.get("itemId")),
        (row.get("cost"), str(row.get("registryInsertTimestamp"))),
    ))
    .filter(lambda kv: kv[0] == sampleLine)
    .sortBy(lambda kv: kv[1][1])
    .collect()
)

('line_03', 'item_002')


[(('line_03', 'item_002'), (544, '2010-01-13 04:19:03')),
 (('line_03', 'item_002'), (333, '2010-01-25 07:25:49')),
 (('line_03', 'item_002'), (272, '2010-01-30 17:07:57')),
 (('line_03', 'item_002'), (989, '2010-02-03 10:26:20')),
 (('line_03', 'item_002'), (356, '2010-02-16 23:05:31')),
 (('line_03', 'item_002'), (417, '2010-03-05 03:06:13')),
 (('line_03', 'item_002'), (751, '2010-03-20 20:18:58')),
 (('line_03', 'item_002'), (321, '2010-04-06 11:14:35')),
 (('line_03', 'item_002'), (683, '2010-05-13 06:18:56')),
 (('line_03', 'item_002'), (453, '2010-06-20 19:24:05')),
 (('line_03', 'item_002'), (675, '2010-06-22 23:25:25')),
 (('line_03', 'item_002'), (22, '2010-07-17 22:53:08')),
 (('line_03', 'item_002'), (822, '2010-07-30 16:40:45')),
 (('line_03', 'item_002'), (10, '2010-07-31 02:13:30')),
 (('line_03', 'item_002'), (486, '2010-08-19 18:57:10')),
 (('line_03', 'item_002'), (767, '2010-09-02 04:11:42')),
 (('line_03', 'item_002'), (287, '2010-09-02 07:58:23')),
 (('line_03', 'i

In [67]:
# Log timestamps of the sampled (lineId, itemId) pair, in chronological order.
print(sampleLine)
(
    logRdd
    .map(lambda row: ((row.get("lineId"), row.get("itemId")), str(row.get("creationDate"))))
    .filter(lambda kv: kv[0] == sampleLine)
    # kv[1] is the timestamp string itself, so sort on it directly; indexing it
    # with [1] would sort on the second character of the string only.
    .sortBy(lambda kv: kv[1])
    .collect()
)

('line_03', 'item_002')


[(('line_03', 'item_002'), '2010-01-01 07:33:51'),
 (('line_03', 'item_002'), '2010-01-02 12:52:43'),
 (('line_03', 'item_002'), '2010-01-02 17:14:29'),
 (('line_03', 'item_002'), '2010-01-04 06:14:10'),
 (('line_03', 'item_002'), '2010-01-06 13:42:03'),
 (('line_03', 'item_002'), '2010-01-07 10:12:32'),
 (('line_03', 'item_002'), '2010-01-08 10:45:47'),
 (('line_03', 'item_002'), '2010-01-11 00:27:59'),
 (('line_03', 'item_002'), '2010-01-11 14:53:17'),
 (('line_03', 'item_002'), '2010-01-11 23:53:07'),
 (('line_03', 'item_002'), '2010-01-13 04:23:52'),
 (('line_03', 'item_002'), '2010-01-13 12:16:45'),
 (('line_03', 'item_002'), '2010-01-14 09:17:46'),
 (('line_03', 'item_002'), '2010-01-15 01:51:40'),
 (('line_03', 'item_002'), '2010-01-15 13:02:34'),
 (('line_03', 'item_002'), '2010-01-16 11:48:19'),
 (('line_03', 'item_002'), '2010-01-18 15:51:18'),
 (('line_03', 'item_002'), '2010-01-21 17:07:49'),
 (('line_03', 'item_002'), '2010-01-22 04:13:26'),
 (('line_03', 'item_002'), '201

## Step1: start with all possible combinations of log/master regardless of the timestamp

In [68]:
# Join log and master data on (lineId, itemId): every log record is paired
# with every master record of the same key, whatever their timestamps.
step1 = (
    logRdd
    .map(lambda row: ((row.get("lineId"), row.get("itemId")), row))
    .join(masterDataRdd.map(lambda row: ((row.get("lineId"), row.get("itemId")), row)))
)
step1.first()

(('line_02', 'item_014'),
 ({'lineId': 'line_02',
   'itemId': 'item_014',
   'creationDate': datetime.datetime(2010, 1, 1, 2, 10, 10),
   'qty': 898},
  {'lineId': 'line_02',
   'itemId': 'item_014',
   'registryInsertTimestamp': datetime.datetime(2010, 2, 3, 23, 47, 55),
   'cost': 626}))

## Step2: reshape the rdd to create the unique ID needed - (lineId,ItemId,creationDate)

In [73]:
#log data is leading
def linePrepare(row):
    """Reshape a joined (pair_key, (log record, master record)) element.

    Returns a dict with:
      "key":   (pair_key, log creationDate), identifying the log record;
      "value": the two records plus "deltaTime", i.e. log creationDate minus
               master registryInsertTimestamp.
    """
    logLine = row[1][0]
    masterLine = row[1][1]
    return {
        "key": (row[0], logLine.get("creationDate")),
        "value": {
            "deltaTime": logLine.get("creationDate") - masterLine.get("registryInsertTimestamp"),
            "logLine": logLine,
            "masterLine": masterLine,
        },
    }

# Apply linePrepare to every joined element and cache the result.
step2 = step1.map(lambda joined: linePrepare(joined)).persist()
step2.first()

{'key': (('line_02', 'item_014'), datetime.datetime(2010, 1, 1, 2, 10, 10)),
 'value': {'deltaTime': datetime.timedelta(days=-34, seconds=8535),
  'logLine': {'lineId': 'line_02',
   'itemId': 'item_014',
   'creationDate': datetime.datetime(2010, 1, 1, 2, 10, 10),
   'qty': 898},
  'masterLine': {'lineId': 'line_02',
   'itemId': 'item_014',
   'registryInsertTimestamp': datetime.datetime(2010, 2, 3, 23, 47, 55),
   'cost': 626}}}

## Step3: Reducing to Select only the correct value

In [146]:
def reduceFun(val1,val2):
    """Keep the candidate with the strictly smaller deltaTime; ties go to val2.

    Negative deltas compare as smaller than positive ones, so they win here.
    """
    if val1.get("deltaTime") < val2.get("deltaTime"):
        return val1
    return val2

# One candidate per ((lineId, itemId), creationDate) key, chosen by reduceFun.
step3 = (
    step2
    .map(lambda entry: (entry.get("key"), entry.get("value")))
    .reduceByKey(lambda left, right: reduceFun(left, right))
)
step3.count()

25000

In [147]:
# The count looks right, but check the selection itself: is there a chosen
# match whose deltaTime is negative?
step3.filter(lambda kv: kv[1].get("deltaTime") < timedelta(0)).first()

((('line_02', 'item_014'), datetime.datetime(2010, 1, 1, 19, 58, 39)),
 {'deltaTime': datetime.timedelta(days=-257, seconds=77029),
  'logLine': {'lineId': 'line_02',
   'itemId': 'item_014',
   'creationDate': datetime.datetime(2010, 1, 1, 19, 58, 39),
   'qty': 193},
  'masterLine': {'lineId': 'line_02',
   'itemId': 'item_014',
   'registryInsertTimestamp': datetime.datetime(2010, 9, 14, 22, 34, 50),
   'cost': 576}})

In [148]:
def lineFormatter(row):
    """Render the master record of a step2 element as "[timestamp] - cost€"."""
    master = row.get("value").get("masterLine")
    template = "[{ts}] - {cost}€"
    return template.format(
        cost=master.get("cost"),
        ts=master.get("registryInsertTimestamp"),
    )
# Take the key of the first step2 element and list up to 20 master records
# that were joined to it.
sampleRowKey = step2.first().get("key")
print(
    "{line}-{item} @{ts}".format(
        line=sampleRowKey[0][0],
        item=sampleRowKey[0][1],
        ts=str(sampleRowKey[1]),
    )
)
print(
    "\t" + "\n\t".join(
        [
            lineFormatter(entry)
            for entry in step2.filter(lambda e: e.get("key") == sampleRowKey).take(20)
        ]
    )
)

line_02-item_014 @2010-01-01 02:10:10
	[2010-02-03 23:47:55] - 626€
	[2010-09-14 22:34:50] - 576€
	[2010-01-17 23:41:33] - 953€
	[2010-08-06 10:17:02] - 582€
	[2010-06-24 01:36:10] - 252€
	[2010-07-05 08:38:42] - 681€
	[2010-05-15 17:20:07] - 426€
	[2010-01-08 19:46:20] - 560€
	[2010-03-20 17:23:30] - 910€
	[2010-01-30 17:50:57] - 199€
	[2010-07-24 20:10:26] - 484€
	[2010-08-04 05:19:52] - 775€
	[2010-07-03 21:10:58] - 970€
	[2010-06-08 08:45:19] - 212€
	[2010-07-13 23:46:05] - 596€
	[2010-04-11 18:13:46] - 814€
	[2010-03-11 13:14:34] - 947€
	[2010-09-02 02:29:54] - 781€
	[2010-07-27 21:55:53] - 756€
	[2010-05-28 02:20:16] - 430€


In [149]:
# Same inspection for a log record created after 2010-04-01: show its 25
# most recent master records (largest registryInsertTimestamp first).
sampleRowKey2 = (
    step2
    .filter(lambda e: e.get("key")[1] > datetime(2010, 4, 1))
    .first()
    .get("key")
)
print(
    "{line}-{item} @{ts}".format(
        line=sampleRowKey2[0][0],
        item=sampleRowKey2[0][1],
        ts=str(sampleRowKey2[1]),
    )
)
print(
    "\t" + "\n\t".join(
        [
            lineFormatter(entry)
            for entry in step2.filter(lambda e: e.get("key") == sampleRowKey2).top(
                25,
                key=lambda e: e.get("value").get("masterLine").get("registryInsertTimestamp"),
            )
        ]
    )
)

line_02-item_014 @2010-04-01 13:58:43
	[2010-09-14 22:34:50] - 576€
	[2010-09-03 16:22:04] - 90€
	[2010-09-02 02:29:54] - 781€
	[2010-08-24 15:50:21] - 503€
	[2010-08-13 04:27:10] - 565€
	[2010-08-06 10:17:02] - 582€
	[2010-08-04 05:19:52] - 775€
	[2010-07-27 21:55:53] - 756€
	[2010-07-24 20:10:26] - 484€
	[2010-07-13 23:46:05] - 596€
	[2010-07-08 10:01:31] - 704€
	[2010-07-05 08:38:42] - 681€
	[2010-07-03 21:10:58] - 970€
	[2010-06-24 01:36:10] - 252€
	[2010-06-13 11:20:15] - 379€
	[2010-06-08 08:45:19] - 212€
	[2010-06-06 00:46:00] - 289€
	[2010-05-28 02:20:16] - 430€
	[2010-05-19 02:45:21] - 488€
	[2010-05-15 17:20:07] - 426€
	[2010-04-30 07:12:45] - 349€
	[2010-04-25 11:42:55] - 211€
	[2010-04-11 18:13:46] - 814€
	[2010-03-20 17:23:30] - 910€
	[2010-03-20 09:07:16] - 233€


## Step3: let's try again!

In [150]:
def reduceFun(val1,val2):
    """Return whichever candidate has the strictly smaller deltaTime (val2 on ties)."""
    first_is_smaller = val1.get("deltaTime") < val2.get("deltaTime")
    return val1 if first_is_smaller else val2
# Second attempt: discard candidates with a negative deltaTime before
# reducing, so only master records not newer than the log record compete.
step3_1 = (
    step2
    .map(lambda entry: (entry.get("key"), entry.get("value")))
    .filter(lambda kv: kv[1].get("deltaTime") >= timedelta(0))
    .reduceByKey(lambda left, right: reduceFun(left, right))
)
step3_1.count()

23883

why 23883?? we lost some rows

In [127]:
def lineMaker(row):
    """Classify a step2 element by the sign of its deltaTime.

    Returns (key, counts) where counts is a one-hot triple
    (negative, zero, positive), ready to be summed per key.

    The zero case is tested against timedelta(0): comparing a timedelta with
    the integer 0 is never equal, which used to send zero deltas to the
    negative bucket.
    """
    delta = row.get("value").get("deltaTime")
    zero = timedelta(0)
    if delta > zero:
        counts = (0, 0, 1)
    elif delta == zero:
        counts = (0, 1, 0)
    else:
        counts = (1, 0, 0)
    return (row.get("key"), counts)
    
# Per log record, count its negative / zero / positive deltaTime candidates,
# then list the records that have no positive candidate at all.
rowDistribution = (
    step2
    .map(lambda entry: lineMaker(entry))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
    .persist()
)
[
    (kv[0][0], kv[1])
    for kv in rowDistribution.collect()
    if kv[1][2] == 0
]

[(('line_02', 'item_014'), (33, 0, 0)),
 (('line_02', 'item_014'), (33, 0, 0)),
 (('line_02', 'item_014'), (33, 0, 0)),
 (('line_02', 'item_014'), (33, 0, 0)),
 (('line_03', 'item_009'), (20, 0, 0)),
 (('line_03', 'item_009'), (20, 0, 0)),
 (('line_05', 'item_008'), (14, 0, 0)),
 (('line_01', 'item_003'), (8, 0, 0)),
 (('line_01', 'item_003'), (8, 0, 0)),
 (('line_01', 'item_003'), (8, 0, 0)),
 (('line_01', 'item_003'), (8, 0, 0)),
 (('line_01', 'item_003'), (8, 0, 0)),
 (('line_01', 'item_003'), (8, 0, 0)),
 (('line_01', 'item_003'), (8, 0, 0)),
 (('line_01', 'item_003'), (8, 0, 0)),
 (('line_01', 'item_003'), (8, 0, 0)),
 (('line_01', 'item_003'), (8, 0, 0)),
 (('line_04', 'item_013'), (20, 0, 0)),
 (('line_03', 'item_004'), (23, 0, 0)),
 (('line_03', 'item_004'), (23, 0, 0)),
 (('line_01', 'item_005'), (26, 0, 0)),
 (('line_01', 'item_005'), (26, 0, 0)),
 (('line_01', 'item_005'), (26, 0, 0)),
 (('line_04', 'item_003'), (28, 0, 0)),
 (('line_04', 'item_003'), (28, 0, 0)),
 (('line_0

In [144]:
# Count the log records without any positive-deltaTime candidate once, on the
# cluster, instead of collecting rowDistribution to the driver twice.
# NOTE(review): the second value adds step3.count(); if the goal is to explain
# the 23883 rows of step3_1, step3_1.count() may have been intended — confirm.
missingCount = rowDistribution.filter(lambda kv: kv[1][2] == 0).count()
missingCount, missingCount + step3.count()

(1117, 26117)

## Step3.ter maybe this is the good one

In [151]:
def reduceFun(val1,val2):
    """Pick the better of two candidates according to their deltaTime.

    - exactly one non-negative delta: that candidate wins;
    - both non-negative: the strictly smaller delta wins (val2 on ties);
    - both negative: the strictly larger delta, i.e. the one closest to zero,
      wins (val2 on ties).
    """
    zero = timedelta(0)
    delta1 = val1.get("deltaTime")
    delta2 = val2.get("deltaTime")
    first_ok = delta1 >= zero
    second_ok = delta2 >= zero

    if first_ok != second_ok:
        return val1 if first_ok else val2
    if first_ok:
        return val1 if delta1 < delta2 else val2
    return val1 if delta1 > delta2 else val2
        
def lineParser(row):
    """Flatten a reduced (((lineId, itemId), creationDate), value) element.

    The result is a dict with lineId, itemId, creationDate, the selected
    deltaTime, the qty of the log record and the cost of the master record.
    """
    key, value = row[0], row[1]
    flat = {"lineId": key[0][0], "itemId": key[0][1], "creationDate": key[1]}
    flat["deltaTime"] = value.get("deltaTime")
    flat["qty"] = value.get("logLine").get("qty")
    flat["cost"] = value.get("masterLine").get("cost")
    return flat


# Third attempt: reduce every key with the sign-aware reduceFun, then flatten
# the winners with lineParser.
step3_2 = (
    step2
    .map(lambda entry: (entry.get("key"), entry.get("value")))
    .reduceByKey(lambda left, right: reduceFun(left, right))
    .map(lambda kv: lineParser(kv))
)
step3_2.count()

25000

In [153]:
# step3 flattened: (records with a negative deltaTime, records with a
# non-negative deltaTime).
step3_0 = step3.map(lambda kv: lineParser(kv))
(
    step3_0.filter(lambda rec: rec.get("deltaTime") < timedelta(0)).count(),
    step3_0.filter(lambda rec: rec.get("deltaTime") >= timedelta(0)).count(),
)

(23889, 1111)

In [154]:
# step3_1 flattened: (records with a negative deltaTime, records with a
# non-negative deltaTime).
step3_1_0 = step3_1.map(lambda kv: lineParser(kv))
(
    step3_1_0.filter(lambda rec: rec.get("deltaTime") < timedelta(0)).count(),
    step3_1_0.filter(lambda rec: rec.get("deltaTime") >= timedelta(0)).count(),
)

(0, 23883)

In [155]:
# step3_2: (records with a negative deltaTime, records with a non-negative
# deltaTime).
(
    step3_2.filter(lambda rec: rec.get("deltaTime") < timedelta(0)).count(),
    step3_2.filter(lambda rec: rec.get("deltaTime") >= timedelta(0)).count(),
)

(1117, 23883)