In [1]:
#remember to clone https://github.com/brcondor/Architectures_for_Big_Data into '/home/jovyan/work/'
import sys
sys.path.append("/home/jovyan/work/Architectures_for_Big_Data/")
import pyspark
sc = pyspark.SparkContext("local[3]")

In [2]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *

dataset = datasetGenerator()
dataset.addGenerator(idGenerator(),prefix="line",min=1,max=5,keyName="lineId")
dataset.addGenerator(idGenerator(),prefix="item",min=1,max=15,keyName="itemId")
dataset.addGenerator(tickGenerator(),minTick=600,maxTick=1800,keyName="creationDate")
dataset.addGenerator(intGenerator(),keyName="qty")

logRdd = sc.parallelize( dataset.generateDataset(25000)).persist()
logRdd.first()

{'lineId': 'line_05',
 'itemId': 'item_007',
 'creationDate': datetime.datetime(2010, 1, 1, 0, 20, 22),
 'qty': 124}

In [3]:
logRdd.map(lambda x: x.get("creationDate")).sortBy(lambda x: x).map(lambda x: str(x)).take(5)

['2010-01-01 00:20:22',
 '2010-01-01 00:34:26',
 '2010-01-01 00:48:02',
 '2010-01-01 00:54:07',
 '2010-01-01 01:13:01']

In [4]:
logRdd.map(lambda x: (x.get("lineId"),x.get("itemId"))).distinct().sortBy(lambda x: (x[0],x[1])).collect()

[('line_01', 'item_001'),
 ('line_01', 'item_002'),
 ('line_01', 'item_003'),
 ('line_01', 'item_004'),
 ('line_01', 'item_005'),
 ('line_01', 'item_006'),
 ('line_01', 'item_007'),
 ('line_01', 'item_008'),
 ('line_01', 'item_009'),
 ('line_01', 'item_010'),
 ('line_01', 'item_011'),
 ('line_01', 'item_012'),
 ('line_01', 'item_013'),
 ('line_01', 'item_014'),
 ('line_01', 'item_015'),
 ('line_02', 'item_001'),
 ('line_02', 'item_002'),
 ('line_02', 'item_003'),
 ('line_02', 'item_004'),
 ('line_02', 'item_005'),
 ('line_02', 'item_006'),
 ('line_02', 'item_007'),
 ('line_02', 'item_008'),
 ('line_02', 'item_009'),
 ('line_02', 'item_010'),
 ('line_02', 'item_011'),
 ('line_02', 'item_012'),
 ('line_02', 'item_013'),
 ('line_02', 'item_014'),
 ('line_02', 'item_015'),
 ('line_03', 'item_001'),
 ('line_03', 'item_002'),
 ('line_03', 'item_003'),
 ('line_03', 'item_004'),
 ('line_03', 'item_005'),
 ('line_03', 'item_006'),
 ('line_03', 'item_007'),
 ('line_03', 'item_008'),
 ('line_03',

In [5]:
logRdd.map(lambda x: (x.get("lineId"),x.get("itemId"))).distinct().map(lambda x: (x[0],1))\
        .reduceByKey(lambda x,y: x+y).sortBy(lambda x: x[0]).collect()

[('line_01', 15),
 ('line_02', 15),
 ('line_03', 15),
 ('line_04', 15),
 ('line_05', 15)]

In [6]:
maxDate = logRdd.map(lambda x: x.get("creationDate")).sortBy(lambda x: x).map(lambda x: x).max()
str(maxDate)


'2010-09-18 11:06:15'

In [7]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *

dataset = datasetGenerator()
dataset.addGenerator(idGenerator(),prefix="line",min=1,max=5,keyName="lineId")
dataset.addGenerator(idGenerator(),prefix="item",min=1,max=15,keyName="itemId")
dataset.addGenerator(dateGenerator(),endDate=maxDate,keyName="registryInsertTimestamp")
dataset.addGenerator(intGenerator(),"cost")

masterDataRdd = sc.parallelize( dataset.generateDataset(1500)).persist()
masterDataRdd.first()

{'lineId': 'line_03',
 'itemId': 'item_014',
 'registryInsertTimestamp': datetime.datetime(2010, 5, 15, 7, 1, 28),
 'cost': 266}

In [8]:
masterDataRdd.map(lambda x: x.get("registryInsertTimestamp")).max()

datetime.datetime(2010, 9, 18, 6, 26, 44)

# Bean Counter

In [9]:
logRdd.first()

{'lineId': 'line_05',
 'itemId': 'item_007',
 'creationDate': datetime.datetime(2010, 1, 1, 0, 20, 22),
 'qty': 124}

## How many lines per item?

In [10]:
logRdd.map(lambda x: (x.get("itemId"),1)).reduceByKey(lambda x,y: x+y).take(5)

[('item_007', 1717),
 ('item_005', 1705),
 ('item_014', 1710),
 ('item_013', 1634),
 ('item_001', 1657)]

## How much qty per item?

In [51]:
def f(x,y):
    numberOfItems = x+y
    return numberOfItems
logRdd.map(lambda x: (x.get("itemId"),x.get("qty"))).reduceByKey(lambda x,y: f(x,y)).top(100)

[('item_015', 823102),
 ('item_014', 850553),
 ('item_013', 804934),
 ('item_012', 832732),
 ('item_011', 845031),
 ('item_010', 853389),
 ('item_009', 833888),
 ('item_008', 842459),
 ('item_007', 863019),
 ('item_006', 817875),
 ('item_005', 872277),
 ('item_004', 854293),
 ('item_003', 804348),
 ('item_002', 835764),
 ('item_001', 835694)]

In [52]:
def f(x,y):
    numberOfItems = x+y
    return numberOfItems
logRdd.map(lambda x: (x.get("itemId"),x.get("qty"))).reduceByKey(lambda x,y: f(x,y))\
                    .top(100,key=lambda x: 1/int(x[0].split("_")[1]))

[('item_001', 835694),
 ('item_002', 835764),
 ('item_003', 804348),
 ('item_004', 854293),
 ('item_005', 872277),
 ('item_006', 817875),
 ('item_007', 863019),
 ('item_008', 842459),
 ('item_009', 833888),
 ('item_010', 853389),
 ('item_011', 845031),
 ('item_012', 832732),
 ('item_013', 804934),
 ('item_014', 850553),
 ('item_015', 823102)]

## How much qty in everage per item?

In [12]:
def f(x,y):
    cumulateQty = x[0]+y[0]
    numberOfItems = x[1]+y[1]
    return cumulateQty,numberOfItems
    
def prepare(x):
    return x[0],x[1][0]/x[1][1]
    
logRdd.map(lambda x: (x.get("itemId"),(x.get("qty"),1.0))).reduceByKey(lambda x,y: f(x,y)).map(lambda x: prepare(x)).take(5)

[('item_007', 502.63191613278974),
 ('item_005', 511.5994134897361),
 ('item_014', 497.39941520467835),
 ('item_013', 492.6156670746634),
 ('item_001', 504.34158117079056)]

In [13]:
def f(x,y):
    cumulateQty = x[0]+y[0]
    numberOfItems = x[1]+y[1]
    return cumulateQty,numberOfItems
    
def prepare(x):
    return x[1][0]/x[1][1]
    
logRdd.map(lambda x: (x.get("itemId"),(x.get("qty"),1.0))).reduceByKey(lambda x,y: f(x,y)).map(lambda x: prepare(x))\
                .stats()

(count: 15, mean: 502.8372373894986, stdev: 7.113083233556609, max: 518.4363076923077, min: 492.6156670746634)

## Exercise (1): Similar statistics but on item,line couple

In [14]:
#...

## lets Imagine to have 100 executors, and we want to distribute the average over lineId

In [15]:
def f(x,y):
    cumulateQty = x[0]+y[0]
    numberOfItems = x[1]+y[1]
    return cumulateQty,numberOfItems
    
def prepare(x):
    return x[1][0]/x[1][1]
    
logRdd.map(lambda x: ((x.get("lineId")),(x.get("qty"),1.0))).reduceByKey(lambda x,y: f(x,y))\
                    .map(lambda x: prepare(x))\
                .stats()

(count: 5, mean: 502.78731436467535, stdev: 2.8349413569255955, max: 507.68448764682057, min: 500.0643309580364)

In [16]:
def f(x,y):
    cumulateQty = x[0]+y[0]
    numberOfItems = x[1]+y[1]
    return cumulateQty,numberOfItems
    
def prepare(x):
    return x[1][0]/x[1][1]
    
logRdd.map(lambda x: ((randint(1,100),x.get("lineId")),(x.get("qty"),1.0))).reduceByKey(lambda x,y: f(x,y))\
                    .map(lambda x: (x[0][1],x[1])).reduceByKey(lambda x,y: f(x,y))\
                    .map(lambda x: prepare(x))\
                .stats()

(count: 5, mean: 502.78731436467535, stdev: 2.8349413569256026, max: 507.68448764682057, min: 500.0643309580364)

## Cycle Time

In [17]:
semiLogRdd = logRdd.map(lambda x: ((x.get("lineId"),x.get("itemId")),x)).persist()
semiLogRdd.count()

25000

In [18]:
joinedRDD = semiLogRdd.join(semiLogRdd).persist()
joinedRDD.count()

8352658

In [19]:
joinedRDD.first()

(('line_05', 'item_008'),
 ({'lineId': 'line_05',
   'itemId': 'item_008',
   'creationDate': datetime.datetime(2010, 1, 1, 2, 19, 37),
   'qty': 481},
  {'lineId': 'line_05',
   'itemId': 'item_008',
   'creationDate': datetime.datetime(2010, 1, 1, 2, 19, 37),
   'qty': 481}))

In [20]:
triangleJoinedRdd = joinedRDD.filter(lambda x: x[1][0].get("creationDate")<x[1][1].get("creationDate")).persist()
triangleJoinedRdd.count()

4163829

In [21]:
triangleJoinedRdd.first()

(('line_05', 'item_008'),
 ({'lineId': 'line_05',
   'itemId': 'item_008',
   'creationDate': datetime.datetime(2010, 1, 1, 2, 19, 37),
   'qty': 481},
  {'lineId': 'line_05',
   'itemId': 'item_008',
   'creationDate': datetime.datetime(2010, 1, 2, 6, 8, 37),
   'qty': 390}))

In [22]:
tempRdd = triangleJoinedRdd.map(lambda x: ((x[0],x[1][0].get("creationDate")), 
                                 (x[1][1].get("creationDate")-x[1][0].get("creationDate"),x)))
tempRdd.first()

((('line_05', 'item_008'), datetime.datetime(2010, 1, 1, 2, 19, 37)),
 (datetime.timedelta(days=1, seconds=13740),
  (('line_05', 'item_008'),
   ({'lineId': 'line_05',
     'itemId': 'item_008',
     'creationDate': datetime.datetime(2010, 1, 1, 2, 19, 37),
     'qty': 481},
    {'lineId': 'line_05',
     'itemId': 'item_008',
     'creationDate': datetime.datetime(2010, 1, 2, 6, 8, 37),
     'qty': 390}))))

In [23]:
sampleLine = tempRdd.first()[0]
print(sampleLine)
triangleJoinedRdd.map(lambda x: ((x[0],x[1][0].get("creationDate")), 
                                 (x[1][1].get("creationDate")-x[1][0].get("creationDate"),x)))\
                .filter(lambda x: x[0]==sampleLine).count()

(('line_05', 'item_008'), datetime.datetime(2010, 1, 1, 2, 19, 37))


331

In [24]:
def f(x,y):
    if x[0]<y[0]:
        return x
    return y
def prepare(x):
    x[1][1][1][0]["cycleTime"] = x[1][0]
    return x[1][1][1][0]
    
enrichedLogRdd = triangleJoinedRdd.map(lambda x: ((x[0],x[1][0].get("creationDate")), 
                                                  (x[1][1].get("creationDate")-x[1][0].get("creationDate"),x)))\
                    .reduceByKey(lambda x,y: f(x,y)).map(lambda x: prepare(x)).persist()
enrichedLogRdd.first()

{'lineId': 'line_05',
 'itemId': 'item_008',
 'creationDate': datetime.datetime(2010, 1, 10, 10, 21, 45),
 'qty': 852,
 'cycleTime': datetime.timedelta(days=1, seconds=36181)}

## Exercise (2): why if we count, we see fewer items?

In [25]:
enrichedLogRdd.count()

24925

## Exercise (3): I want to see the histogram of cycle times

## Extract Snapshot from Master Data

In [26]:
tempRdd = masterDataRdd.map(lambda x: ((x.get("lineId"),x.get("itemId")), (x.get("cost"),x.get("registryInsertTimestamp")) )  )
tempRdd.first()

(('line_03', 'item_014'), (266, datetime.datetime(2010, 5, 15, 7, 1, 28)))

In [27]:
masterDataRdd.map(lambda x: ((x.get("lineId"),x.get("itemId")))).distinct().count()

75

In [28]:
sampleLine = tempRdd.first()[0]
print(sampleLine)
masterDataRdd.map(lambda x: ((x.get("lineId"),x.get("itemId")), (x.get("cost"),str(x.get("registryInsertTimestamp"))) )  )\
            .filter(lambda x: x[0]==sampleLine).sortBy(lambda x: x[1][1]).collect()

('line_03', 'item_014')


[(('line_03', 'item_014'), (500, '2010-01-01 07:17:46')),
 (('line_03', 'item_014'), (719, '2010-02-10 07:27:31')),
 (('line_03', 'item_014'), (600, '2010-03-16 14:29:19')),
 (('line_03', 'item_014'), (775, '2010-03-18 07:06:28')),
 (('line_03', 'item_014'), (848, '2010-03-28 13:38:06')),
 (('line_03', 'item_014'), (378, '2010-04-09 21:46:34')),
 (('line_03', 'item_014'), (829, '2010-05-06 05:24:26')),
 (('line_03', 'item_014'), (962, '2010-05-07 05:05:58')),
 (('line_03', 'item_014'), (266, '2010-05-15 07:01:28')),
 (('line_03', 'item_014'), (541, '2010-05-30 10:47:38')),
 (('line_03', 'item_014'), (68, '2010-06-04 09:18:50')),
 (('line_03', 'item_014'), (613, '2010-06-05 06:20:29')),
 (('line_03', 'item_014'), (150, '2010-06-08 15:53:09')),
 (('line_03', 'item_014'), (932, '2010-06-20 08:47:08')),
 (('line_03', 'item_014'), (329, '2010-06-25 13:37:22')),
 (('line_03', 'item_014'), (416, '2010-06-25 16:50:14')),
 (('line_03', 'item_014'), (424, '2010-06-27 09:58:43')),
 (('line_03', '

In [29]:
print(sampleLine)
logRdd.map(lambda x: ((x.get("lineId"),x.get("itemId")), str(x.get("creationDate"))) )\
            .filter(lambda x: x[0]==(sampleLine)).sortBy(lambda x: x[1][1]).collect()

('line_03', 'item_014')


[(('line_03', 'item_014'), '2010-01-01 02:37:50'),
 (('line_03', 'item_014'), '2010-01-01 20:08:35'),
 (('line_03', 'item_014'), '2010-01-02 16:15:40'),
 (('line_03', 'item_014'), '2010-01-03 07:25:50'),
 (('line_03', 'item_014'), '2010-01-04 07:03:10'),
 (('line_03', 'item_014'), '2010-01-04 18:17:14'),
 (('line_03', 'item_014'), '2010-01-06 12:27:57'),
 (('line_03', 'item_014'), '2010-01-06 22:32:21'),
 (('line_03', 'item_014'), '2010-01-07 03:38:43'),
 (('line_03', 'item_014'), '2010-01-07 05:15:52'),
 (('line_03', 'item_014'), '2010-01-09 15:38:56'),
 (('line_03', 'item_014'), '2010-01-09 21:08:01'),
 (('line_03', 'item_014'), '2010-01-10 19:06:41'),
 (('line_03', 'item_014'), '2010-01-10 20:04:22'),
 (('line_03', 'item_014'), '2010-01-11 09:15:39'),
 (('line_03', 'item_014'), '2010-01-13 00:23:29'),
 (('line_03', 'item_014'), '2010-01-14 01:33:02'),
 (('line_03', 'item_014'), '2010-01-14 17:29:43'),
 (('line_03', 'item_014'), '2010-01-14 19:10:45'),
 (('line_03', 'item_014'), '201

## Step1: start with all possible combinations of log/master regardless the timestamp

In [30]:
step1 = logRdd.map(lambda x: ((x.get("lineId"),x.get("itemId")),x))\
                .join(masterDataRdd.map(lambda x: ((x.get("lineId"),x.get("itemId")),x)))
step1.first()

(('line_05', 'item_008'),
 ({'lineId': 'line_05',
   'itemId': 'item_008',
   'creationDate': datetime.datetime(2010, 1, 1, 2, 19, 37),
   'qty': 481},
  {'lineId': 'line_05',
   'itemId': 'item_008',
   'registryInsertTimestamp': datetime.datetime(2010, 6, 11, 15, 23, 55),
   'cost': 102}))

## Step2: reshape the rdd to create the unique ID needed - (lineId,ItemId,creationDate)

In [31]:
#log data is leading
def linePrepare(row):
    key = (row[0],row[1][0].get("creationDate"))
    value = {"deltaTime":row[1][0].get("creationDate")-row[1][1].get("registryInsertTimestamp"),
            "logLine":row[1][0],"masterLine":row[1][1]}
    return {"key":key,"value":value}

step2 = step1.map(lambda x: linePrepare(x)).persist()
step2.first()

{'key': (('line_05', 'item_008'), datetime.datetime(2010, 1, 1, 2, 19, 37)),
 'value': {'deltaTime': datetime.timedelta(days=-162, seconds=39342),
  'logLine': {'lineId': 'line_05',
   'itemId': 'item_008',
   'creationDate': datetime.datetime(2010, 1, 1, 2, 19, 37),
   'qty': 481},
  'masterLine': {'lineId': 'line_05',
   'itemId': 'item_008',
   'registryInsertTimestamp': datetime.datetime(2010, 6, 11, 15, 23, 55),
   'cost': 102}}}

## Step3: Reducing to Select only the correct value

In [32]:
def reduceFun(val1,val2):
    return val1 if val1.get("deltaTime") < val2.get("deltaTime") else val2

step3 = step2.map(lambda x: (x.get("key"),x.get("value"))).reduceByKey(lambda x,y: reduceFun(x,y))
step3.count()

25000

In [33]:
## it looks like correct... lets make a check
step3.filter(lambda x: x[1].get("deltaTime")<timedelta(0)).first() ##a negative value?

((('line_05', 'item_008'), datetime.datetime(2010, 1, 10, 10, 21, 45)),
 {'deltaTime': datetime.timedelta(days=-247, seconds=22550),
  'logLine': {'lineId': 'line_05',
   'itemId': 'item_008',
   'creationDate': datetime.datetime(2010, 1, 10, 10, 21, 45),
   'qty': 852},
  'masterLine': {'lineId': 'line_05',
   'itemId': 'item_008',
   'registryInsertTimestamp': datetime.datetime(2010, 9, 14, 4, 5, 55),
   'cost': 129}})

In [34]:
def lineFormatter(row):
    masterLine = row.get("value").get("masterLine")
    return "[{ts}] - {cost}€".format(ts=masterLine.get("registryInsertTimestamp"),cost=masterLine.get("cost"))
sampleRowKey = step2.first().get("key")
print("{line}-{item} @{ts}".format(ts=str(sampleRowKey[1]),
                          line=sampleRowKey[0][0],item=sampleRowKey[0][1]))
print("\t"+"\n\t".join([lineFormatter(k) for k in step2.filter(lambda x: x.get("key") == sampleRowKey).take(20)]))

line_05-item_008 @2010-01-01 02:19:37
	[2010-06-11 15:23:55] - 102€
	[2010-03-07 13:32:58] - 869€
	[2010-03-01 21:53:17] - 985€
	[2010-07-29 14:52:25] - 756€
	[2010-03-24 17:44:15] - 246€
	[2010-03-23 02:03:38] - 970€
	[2010-03-16 13:41:56] - 988€
	[2010-02-06 12:32:15] - 921€
	[2010-09-04 00:37:45] - 31€
	[2010-07-16 18:30:33] - 210€
	[2010-03-16 01:04:02] - 19€
	[2010-04-01 10:20:51] - 790€
	[2010-09-14 04:05:55] - 129€
	[2010-01-14 11:01:39] - 266€
	[2010-04-29 16:28:54] - 371€
	[2010-09-13 23:17:04] - 123€
	[2010-05-16 15:48:11] - 576€
	[2010-03-09 20:55:56] - 569€
	[2010-06-26 09:46:53] - 83€
	[2010-02-16 23:33:41] - 20€


In [35]:
sampleRowKey2 = step2.filter(lambda x: x.get("key")[1]>datetime(2010,4,1)).first().get("key")
print("{line}-{item} @{ts}".format(ts=str(sampleRowKey2[1]),
                          line=sampleRowKey2[0][0],item=sampleRowKey2[0][1]))
print("\t"+"\n\t".join([lineFormatter(k) for k in step2.filter(lambda x: x.get("key") == sampleRowKey2)\
                        .top(25,key=lambda x: x.get("value").get("masterLine").get("registryInsertTimestamp"))]))

line_05-item_008 @2010-04-01 15:04:20
	[2010-09-14 04:05:55] - 129€
	[2010-09-13 23:17:04] - 123€
	[2010-09-04 00:37:45] - 31€
	[2010-08-28 13:12:35] - 485€
	[2010-07-29 14:52:25] - 756€
	[2010-07-16 18:30:33] - 210€
	[2010-06-26 09:46:53] - 83€
	[2010-06-19 07:02:57] - 356€
	[2010-06-11 15:23:55] - 102€
	[2010-05-16 15:48:11] - 576€
	[2010-05-16 09:46:38] - 178€
	[2010-04-29 16:28:54] - 371€
	[2010-04-02 04:02:39] - 959€
	[2010-04-01 10:20:51] - 790€
	[2010-03-29 21:39:31] - 757€
	[2010-03-24 17:44:15] - 246€
	[2010-03-23 02:03:38] - 970€
	[2010-03-16 13:41:56] - 988€
	[2010-03-16 01:04:02] - 19€
	[2010-03-15 15:48:22] - 176€
	[2010-03-09 20:55:56] - 569€
	[2010-03-09 05:58:05] - 139€
	[2010-03-08 04:17:50] - 148€
	[2010-03-07 13:32:58] - 869€
	[2010-03-01 21:53:17] - 985€


## Step3: lets try again!

In [36]:
def reduceFun(val1,val2):
    return val1 if val1.get("deltaTime") < val2.get("deltaTime") else val2
step3_1 = step2.map(lambda x: (x.get("key"),x.get("value"))).filter(lambda x: x[1].get("deltaTime")>=timedelta(0))\
            .reduceByKey(lambda x,y: reduceFun(x,y))
step3_1.count()

23796

why 23883?? we lost some rows

In [37]:
def lineMaker(row):
    return (row.get("key"),(0,0,1) if row.get("value").get("deltaTime")>timedelta(0) 
                                            else (0,1,0) if row.get("value").get("deltaTime") == 0
                                            else (1,0,0))
    
rowDistribution = step2.map(lambda x: lineMaker(x)).reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1],x[2]+y[2])).persist()
[(k[0][0],k[1]) for k in rowDistribution.collect() if k[1][2] == 0]

[(('line_05', 'item_008'), (29, 0, 0)),
 (('line_02', 'item_011'), (18, 0, 0)),
 (('line_02', 'item_011'), (18, 0, 0)),
 (('line_04', 'item_002'), (22, 0, 0)),
 (('line_04', 'item_002'), (22, 0, 0)),
 (('line_01', 'item_005'), (18, 0, 0)),
 (('line_01', 'item_005'), (18, 0, 0)),
 (('line_03', 'item_007'), (25, 0, 0)),
 (('line_01', 'item_003'), (22, 0, 0)),
 (('line_03', 'item_004'), (14, 0, 0)),
 (('line_03', 'item_005'), (21, 0, 0)),
 (('line_03', 'item_009'), (21, 0, 0)),
 (('line_03', 'item_009'), (21, 0, 0)),
 (('line_03', 'item_009'), (21, 0, 0)),
 (('line_03', 'item_009'), (21, 0, 0)),
 (('line_02', 'item_014'), (25, 0, 0)),
 (('line_05', 'item_007'), (17, 0, 0)),
 (('line_05', 'item_007'), (17, 0, 0)),
 (('line_05', 'item_007'), (17, 0, 0)),
 (('line_05', 'item_007'), (17, 0, 0)),
 (('line_02', 'item_003'), (20, 0, 0)),
 (('line_02', 'item_003'), (20, 0, 0)),
 (('line_04', 'item_010'), (23, 0, 0)),
 (('line_04', 'item_010'), (23, 0, 0)),
 (('line_04', 'item_010'), (23, 0, 0)),


In [38]:
len([(k[0][0],k[1]) for k in rowDistribution.collect() if k[1][2] == 0]),len([(k[0][0],k[1]) for k in rowDistribution.collect() if k[1][2] == 0]) + step3.count()

(1204, 26204)

## Step3.ter maybe this is the good one

In [39]:
def reduceFun(val1,val2):
    val1Delta = val1.get("deltaTime")
    val2Delta = val2.get("deltaTime")
    if val1Delta >= timedelta(0) and val2Delta >=timedelta(0):
        return val1 if  val1Delta < val2Delta else val2
    elif val1Delta < timedelta(0) and val2Delta <timedelta(0): 
        return val1 if  val1Delta > val2Delta else val2
    else: 
        return val1 if val1Delta>=timedelta(0) else val2
        
def lineParser(row):
    return {"lineId":row[0][0][0],"itemId":row[0][0][1],"creationDate":row[0][1], "deltaTime":row[1].get("deltaTime"),
                 "qty":row[1].get("logLine").get("qty"),"cost":row[1].get("masterLine").get("cost")}


step3_2 = step2.map(lambda x: (x.get("key"),x.get("value")))\
            .reduceByKey(lambda x,y: reduceFun(x,y)).map(lambda x: lineParser(x))
step3_2.count()

25000

In [40]:
step3_0 = step3.map(lambda x: lineParser(x))
step3_0.filter(lambda x: x.get("deltaTime")<timedelta(0)).count(),step3_0.filter(lambda x: x.get("deltaTime")>=timedelta(0)).count()

(23794, 1206)

In [41]:
step3_1_0 = step3_1.map(lambda x: lineParser(x))
step3_1_0.filter(lambda x: x.get("deltaTime")<timedelta(0)).count(),step3_1_0.filter(lambda x: x.get("deltaTime")>=timedelta(0)).count()

(0, 23796)

In [42]:
step3_2.filter(lambda x: x.get("deltaTime")<timedelta(0)).count(),step3_2.filter(lambda x: x.get("deltaTime")>=timedelta(0)).count()

(1204, 23796)