In [1]:
# Remember to clone https://github.com/brcondor/Architectures_for_Big_Data into '/home/jovyan/work/'
import sys
import pyspark

# Make the cloned repository importable. The guard keeps sys.path free of
# duplicate entries when this cell is executed more than once.
REPO_PATH = "/home/jovyan/work/Architectures_for_Big_Data/"
if REPO_PATH not in sys.path:
    sys.path.append(REPO_PATH)

# Local Spark context with 3 worker threads. getOrCreate() returns the already
# running context on re-execution instead of raising, which the plain
# SparkContext(...) constructor does when a context is already active.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster("local[3]"))

In [2]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *

# Build the synthetic production log. Each addGenerator call contributes one
# field to every generated row; the sample output of this cell shows rows as
# dicts with the keys 'lineId', 'itemId', 'ts' and 'qty'.
dataset = datasetGenerator()
# Line identifiers with prefix "line" and range 1..5 (values such as 'line_04').
dataset.addGenerator(idGenerator(),prefix="line",min=1,max=5,keyName="lineId")
# Item identifiers with prefix "item" and range 1..15 (values such as 'item_005').
dataset.addGenerator(idGenerator(),prefix="item",min=1,max=15,keyName="itemId")
# Timestamp field. NOTE(review): presumably each row advances the clock by a
# random step between minTick and maxTick -- confirm against dataGenerator.
dataset.addGenerator(tickGenerator(),minTick=600,maxTick=1800,keyName="ts")
# Integer quantity field, using the generator's default settings.
dataset.addGenerator(intGenerator(),keyName="qty")

# Distribute 25000 generated rows as an RDD and keep it cached, since every
# later cell starts from it. NOTE(review): no random seed is set in the visible
# cells, so the sample outputs may differ between runs.
logRdd = sc.parallelize( dataset.generateDataset(25000)).persist()
logRdd.first()

{'lineId': 'line_04',
 'itemId': 'item_005',
 'ts': datetime.datetime(2010, 1, 1, 0, 14, 7),
 'qty': 46}

In [3]:
# The five earliest timestamps of the log, rendered as strings.
# takeOrdered(5) returns the 5 smallest elements in ascending order without
# sorting (and shuffling) the whole RDD just to keep its first five entries.
[str(ts) for ts in logRdd.map(lambda row: row.get("ts")).takeOrdered(5)]

['2010-01-01 00:14:07',
 '2010-01-01 00:41:49',
 '2010-01-01 01:06:11',
 '2010-01-01 01:20:34',
 '2010-01-01 01:49:32']

In [4]:
# Every distinct (lineId, itemId) couple that appears in the log, ordered by
# line first and item second.
(
    logRdd
    .map(lambda row: (row.get("lineId"), row.get("itemId")))
    .distinct()
    .sortBy(lambda couple: (couple[0], couple[1]))
    .collect()
)

[('line_01', 'item_001'),
 ('line_01', 'item_002'),
 ('line_01', 'item_003'),
 ('line_01', 'item_004'),
 ('line_01', 'item_005'),
 ('line_01', 'item_006'),
 ('line_01', 'item_007'),
 ('line_01', 'item_008'),
 ('line_01', 'item_009'),
 ('line_01', 'item_010'),
 ('line_01', 'item_011'),
 ('line_01', 'item_012'),
 ('line_01', 'item_013'),
 ('line_01', 'item_014'),
 ('line_01', 'item_015'),
 ('line_02', 'item_001'),
 ('line_02', 'item_002'),
 ('line_02', 'item_003'),
 ('line_02', 'item_004'),
 ('line_02', 'item_005'),
 ('line_02', 'item_006'),
 ('line_02', 'item_007'),
 ('line_02', 'item_008'),
 ('line_02', 'item_009'),
 ('line_02', 'item_010'),
 ('line_02', 'item_011'),
 ('line_02', 'item_012'),
 ('line_02', 'item_013'),
 ('line_02', 'item_014'),
 ('line_02', 'item_015'),
 ('line_03', 'item_001'),
 ('line_03', 'item_002'),
 ('line_03', 'item_003'),
 ('line_03', 'item_004'),
 ('line_03', 'item_005'),
 ('line_03', 'item_006'),
 ('line_03', 'item_007'),
 ('line_03', 'item_008'),
 ('line_03',

In [5]:
# Number of distinct items seen on each line: deduplicate the (lineId, itemId)
# couples, then count one per couple for every lineId.
(
    logRdd
    .map(lambda row: (row.get("lineId"), row.get("itemId")))
    .distinct()
    .map(lambda couple: (couple[0], 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda line_count: line_count[0])
    .collect()
)

[('line_01', 15),
 ('line_02', 15),
 ('line_03', 15),
 ('line_04', 15),
 ('line_05', 15)]

In [6]:
# Latest timestamp in the log; reused as the upper bound when generating the
# master data. max() scans the RDD once, so no sort is needed beforehand.
maxDate = logRdd.map(lambda row: row.get("ts")).max()
str(maxDate)


'2010-09-17 19:51:06'

In [7]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *

# Build the synthetic master data (cost records per line/item over time).
# Note: this rebinds `dataset`, replacing the generator used for the log.
dataset = datasetGenerator()
# Same identifier ranges as the log, so the two datasets share their keys.
dataset.addGenerator(idGenerator(),prefix="line",min=1,max=5,keyName="lineId")
dataset.addGenerator(idGenerator(),prefix="item",min=1,max=15,keyName="itemId")
# Timestamps bounded above by the latest log timestamp computed earlier.
dataset.addGenerator(dateGenerator(),endDate=maxDate,keyName="ts")
# "cost" is passed positionally here; the sample output of this cell shows it
# ends up as the row key 'cost'.
dataset.addGenerator(intGenerator(),"cost")

# 1500 master-data rows, cached for the snapshot-extraction cells.
masterDataRdd = sc.parallelize( dataset.generateDataset(1500)).persist()
masterDataRdd.first()

{'lineId': 'line_05',
 'itemId': 'item_010',
 'ts': datetime.datetime(2010, 1, 16, 22, 18, 38),
 'cost': 341}

In [8]:
# Latest timestamp present in the master data.
(
    masterDataRdd
    .map(lambda row: row.get("ts"))
    .max()
)

datetime.datetime(2010, 9, 17, 13, 52, 5)

# Bean Counter

In [9]:
# Peek at one log record to recall the fields available for the aggregations.
logRdd.first()

{'lineId': 'line_04',
 'itemId': 'item_005',
 'ts': datetime.datetime(2010, 1, 1, 0, 14, 7),
 'qty': 46}

## How many log lines (records) per item?

In [10]:
# Count the log records of each item: emit (itemId, 1) and sum the ones per key.
(
    logRdd
    .map(lambda row: (row.get("itemId"), 1))
    .reduceByKey(lambda left, right: left + right)
    .take(5)
)

[('item_005', 1666),
 ('item_013', 1583),
 ('item_010', 1649),
 ('item_007', 1725),
 ('item_014', 1668)]

## How much qty per item?

In [11]:
def f(x, y):
    """Combine two partial quantities of the same key by adding them."""
    return x + y
# Total quantity per item: emit (itemId, qty) and add the quantities per key.
(
    logRdd
    .map(lambda row: (row.get("itemId"), row.get("qty")))
    .reduceByKey(f)
    .take(5)
)

[('item_005', 834748),
 ('item_013', 785499),
 ('item_010', 825618),
 ('item_007', 881358),
 ('item_014', 847889)]

## How much qty on average per item?

In [12]:
def f(x, y):
    """Merge two (quantity_sum, record_count) partial aggregates element-wise."""
    return x[0] + y[0], x[1] + y[1]
    
def prepare(x):
    """Turn (key, (quantity_sum, record_count)) into (key, mean quantity)."""
    key = x[0]
    totals = x[1]
    return key, totals[0] / totals[1]
    
# Average quantity per item: carry (qty, 1.0) per record so that one
# reduceByKey yields both the sum and the count, then divide.
(
    logRdd
    .map(lambda row: (row.get("itemId"), (row.get("qty"), 1.0)))
    .reduceByKey(f)
    .map(prepare)
    .take(5)
)

[('item_005', 501.04921968787517),
 ('item_013', 496.2090966519267),
 ('item_010', 500.67798665858095),
 ('item_007', 510.93217391304347),
 ('item_014', 508.32673860911274)]

In [13]:
def f(x, y):
    """Merge two (quantity_sum, record_count) partial aggregates element-wise."""
    return x[0] + y[0], x[1] + y[1]
    
def prepare(x):
    """Reduce a (key, (quantity_sum, record_count)) entry to its mean quantity."""
    totals = x[1]
    return totals[0] / totals[1]
    
# Summary statistics (count, mean, stdev, max, min) of the per-item averages.
(
    logRdd
    .map(lambda row: (row.get("itemId"), (row.get("qty"), 1.0)))
    .reduceByKey(f)
    .map(prepare)
    .stats()
)

(count: 15, mean: 500.7442721979763, stdev: 5.537131200704034, max: 510.93217391304347, min: 489.468085106383)

## Exercise (1): Compute similar statistics, but per (item, line) couple

In [14]:
#...

## Let's imagine we have 100 executors and want to distribute the computation of the average per lineId

In [15]:
def f(x, y):
    """Merge two (quantity_sum, record_count) partial aggregates element-wise."""
    return x[0] + y[0], x[1] + y[1]
    
def prepare(x):
    """Reduce a (key, (quantity_sum, record_count)) entry to its mean quantity."""
    totals = x[1]
    return totals[0] / totals[1]
    
# Baseline: average quantity per lineId computed with a single reduceByKey,
# then summarised with stats(). With only five keys, at most five reducers
# do the final merge.
(
    logRdd
    .map(lambda row: (row.get("lineId"), (row.get("qty"), 1.0)))
    .reduceByKey(f)
    .map(prepare)
    .stats()
)

(count: 5, mean: 500.75982327713666, stdev: 2.3358911677342413, max: 503.2509113001215, min: 496.51540332387515)

In [16]:
def f(x, y):
    """Merge two (quantity_sum, record_count) partial aggregates element-wise."""
    return x[0] + y[0], x[1] + y[1]
    
def prepare(x):
    """Reduce a (key, (quantity_sum, record_count)) entry to its mean quantity."""
    totals = x[1]
    return totals[0] / totals[1]
    
# Same average per lineId, computed in two stages with a "salted" key:
#   1. aggregate on (random salt, lineId) so the partial sums are spread over
#      many more keys than the five lineIds;
#   2. drop the salt and merge the partial (sum, count) pairs per lineId.
# NOTE(review): randint is not imported in the visible cells; presumably it is
# provided by one of the star imports -- confirm.
(
    logRdd
    .map(lambda row: ((randint(1,100), row.get("lineId")), (row.get("qty"), 1.0)))
    .reduceByKey(f)
    .map(lambda salted: (salted[0][1], salted[1]))
    .reduceByKey(f)
    .map(prepare)
    .stats()
)

(count: 5, mean: 500.75982327713666, stdev: 2.335891167734239, max: 503.2509113001215, min: 496.51540332387515)

## Cycle Time

In [17]:
# Key every log record by its (lineId, itemId) couple, keeping the whole record
# as the value, and cache the result for the self-join that follows.
semiLogRdd = logRdd.keyBy(lambda row: (row.get("lineId"), row.get("itemId"))).persist()
semiLogRdd.count()

25000

In [18]:
# Self-join on the (lineId, itemId) key: every record is paired with every
# record sharing its key, itself included, which is why the result is far
# larger than the 25000 input records.
joinedRDD = semiLogRdd.join(semiLogRdd).persist()
joinedRDD.count()

8355026

In [19]:
# Inspect one joined element: (key, (left record, right record)).
joinedRDD.first()

(('line_04', 'item_013'),
 ({'lineId': 'line_04',
   'itemId': 'item_013',
   'ts': datetime.datetime(2010, 1, 1, 0, 41, 49),
   'qty': 259},
  {'lineId': 'line_04',
   'itemId': 'item_013',
   'ts': datetime.datetime(2010, 1, 1, 0, 41, 49),
   'qty': 259}))

In [20]:
# Keep only the pairs whose left record is strictly earlier than the right one.
# This drops the self-pairs and one of the two orderings of every other pair.
triangleJoinedRdd = (
    joinedRDD
    .filter(lambda pair: pair[1][0].get("ts") < pair[1][1].get("ts"))
    .persist()
)
triangleJoinedRdd.count()

4165013

In [21]:
# Inspect one surviving pair: the left record's ts precedes the right record's.
triangleJoinedRdd.first()

(('line_04', 'item_013'),
 ({'lineId': 'line_04',
   'itemId': 'item_013',
   'ts': datetime.datetime(2010, 1, 1, 0, 41, 49),
   'qty': 259},
  {'lineId': 'line_04',
   'itemId': 'item_013',
   'ts': datetime.datetime(2010, 1, 1, 16, 53, 3),
   'qty': 415}))

In [22]:
# Re-key each pair by (couple, timestamp of the earlier record) and carry
# (time gap to the later record, original pair) as the value.
(
    triangleJoinedRdd
    .map(lambda pair: (
        (pair[0], pair[1][0].get("ts")),
        (pair[1][1].get("ts") - pair[1][0].get("ts"), pair),
    ))
    .first()
)

((('line_04', 'item_013'), datetime.datetime(2010, 1, 1, 0, 41, 49)),
 (datetime.timedelta(seconds=58274),
  (('line_04', 'item_013'),
   ({'lineId': 'line_04',
     'itemId': 'item_013',
     'ts': datetime.datetime(2010, 1, 1, 0, 41, 49),
     'qty': 259},
    {'lineId': 'line_04',
     'itemId': 'item_013',
     'ts': datetime.datetime(2010, 1, 1, 16, 53, 3),
     'qty': 415}))))

In [23]:
# Count the re-keyed pairs that start from one specific (couple, timestamp).
(
    triangleJoinedRdd
    .map(lambda pair: (
        (pair[0], pair[1][0].get("ts")),
        (pair[1][1].get("ts") - pair[1][0].get("ts"), pair),
    ))
    .filter(lambda keyed: keyed[0] == (('line_03', 'item_005'), datetime(2010, 1, 1, 0, 13, 41)))
    .count()
)

0

In [24]:
def f(x, y):
    """Keep whichever (delta, payload) candidate has the strictly smaller delta.

    When the deltas are equal, the second argument is returned.
    """
    return x if x[0] < y[0] else y
def prepare(x):
    """Return the earlier record of a pair, enriched with its cycle time.

    Args:
        x: ((couple, ts), (delta, (couple, (earlier_record, later_record))))
           as produced by the re-keying map followed by the reduceByKey that
           keeps the smallest delta.

    Returns:
        A new dict holding the earlier record's fields plus "cycleTime" set to
        the delta. The input record is left untouched, so applying the function
        again to the same element gives the same result and no record held by
        an upstream RDD is modified.
    """
    cycle_time = x[1][0]
    earlier_record = x[1][1][1][0]
    enriched = dict(earlier_record)
    enriched["cycleTime"] = cycle_time
    return enriched
    
# For every (couple, earlier timestamp) keep the pair with the smallest time
# gap, i.e. the next record of the same couple, and store that gap on the
# earlier record as "cycleTime".
enrichedLogRdd = (
    triangleJoinedRdd
    .map(lambda pair: (
        (pair[0], pair[1][0].get("ts")),
        (pair[1][1].get("ts") - pair[1][0].get("ts"), pair),
    ))
    .reduceByKey(f)
    .map(prepare)
    .persist()
)
enrichedLogRdd.first()

{'lineId': 'line_04',
 'itemId': 'item_013',
 'ts': datetime.datetime(2010, 1, 1, 16, 53, 3),
 'qty': 415,
 'cycleTime': datetime.timedelta(seconds=29394)}

## Exercise (2): Why do we see fewer items when we count?

In [25]:
# Number of log records that received a cycleTime; compare with the 25000
# records of the original log.
enrichedLogRdd.count()

24925

## Exercise (3): I want to see the histogram of cycle times

## Extract Snapshot from Master Data

In [26]:
# Shape the master data as ((lineId, itemId), (cost, ts)) and show one element.
(
    masterDataRdd
    .map(lambda row: (
        (row.get("lineId"), row.get("itemId")),
        (row.get("cost"), row.get("ts")),
    ))
    .first()
)

(('line_05', 'item_010'), (341, datetime.datetime(2010, 1, 16, 22, 18, 38)))

In [27]:
# How many distinct (lineId, itemId) couples the master data covers.
(
    masterDataRdd
    .map(lambda row: (row.get("lineId"), row.get("itemId")))
    .distinct()
    .count()
)

75

In [28]:
# Cost history of a single couple, ordered by its timestamp string.
# The couple requested here returns no rows for the dataset generated above.
(
    masterDataRdd
    .map(lambda row: (
        (row.get("lineId"), row.get("itemId")),
        (row.get("cost"), str(row.get("ts"))),
    ))
    .filter(lambda entry: entry[0] == (('line_025', 'item_078')))
    .sortBy(lambda entry: entry[1][1])
    .collect()
)

[]

In [29]:
# Log timestamps of a single couple, in chronological order.
# Each value is a plain timestamp string, so the sort key is x[1] itself;
# indexing it again (x[1][1]) would sort by one character of that string.
# The couple requested here returns no rows for the dataset generated above.
logRdd.map(lambda x: ((x.get("lineId"),x.get("itemId")), str(x.get("ts"))) )\
            .filter(lambda x: x[0]==(('line_025', 'item_078'))).sortBy(lambda x: x[1]).collect()

[]

In [30]:
# The five couples with the most master-data records.
(
    masterDataRdd
    .map(lambda row: ((row.get("lineId"), row.get("itemId")), 1))
    .reduceByKey(lambda left, right: left + right)
    .top(5, key=lambda couple_count: couple_count[1])
)

[(('line_01', 'item_014'), 32),
 (('line_01', 'item_007'), 29),
 (('line_03', 'item_003'), 27),
 (('line_05', 'item_003'), 27),
 (('line_04', 'item_015'), 26)]