In [1]:
#remember to clone https://github.com/brcondor/Architectures_for_Big_Data into '/home/jovyan/work/'
import sys
sys.path.append("/home/jovyan/work/Architectures_for_Big_Data/")
import pyspark
sc = pyspark.SparkContext("local[3]")

In [2]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *
from datetime import datetime

dataset = datasetGenerator()
dataset.addGenerator(idGenerator(),prefix="book",min=1,max=100,keyName="bookId")
dataset.addGenerator(idGenerator(),prefix="token",min=1,max=5000,keyName="tokenId")

bookTokens = sc.parallelize( dataset.generateDataset(250000)).persist()
bookTokens.first()

{'bookId': 'book_0013', 'tokenId': 'token_01928'}

# Black Belt Histogram

In [3]:
## Compute the term frequency for each token w.r.t. each book
bookTermFrequency = bookTokens.map(lambda x: ((x.get("bookId"),x.get("tokenId")),1)).reduceByKey(lambda x,y: x+y)\
                    .map(lambda x: {"bookId":x[0][0],"tokenId":x[0][1], "termFrequency":x[1]} ).persist()
bookTermFrequency.count()

196871

In [4]:
## Compute the hist for token frequencies
bookTermFrequency.map(lambda x: (x.get("termFrequency"),1)).reduceByKey(lambda x,y: x+y).top(15,key=lambda x: 1/x[1])

[(7, 1), (6, 5), (5, 83), (4, 778), (3, 6221), (2, 37990), (1, 151793)]

# TF-IDF 
## Reading from a CDC Master table

In [37]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *
from datetime import datetime

dataset = datasetGenerator()
dataset.addGenerator(idGenerator(),prefix="book",min=1,max=100,keyName="bookId")
dataset.addGenerator(idGenerator(),prefix="token",min=1,max=500,keyName="tokenId")
dataset.addGenerator(intGenerator(),min=1,max=1000,keyName="termFrequency")
dataset.addGenerator(dateGenerator(),startdate=datetime(2018,1,1),max=datetime(2020,1,1),keyName="insertDate")


bookTermFrequencyUniform = sc.parallelize( dataset.generateDataset(250000)).persist()
bookTermFrequencyUniform.first()

{'bookId': 'book_0088',
 'tokenId': 'token_0283',
 'termFrequency': 60,
 'insertDate': datetime.datetime(2013, 7, 18, 0, 35, 54)}

In [38]:
## add exponentiality
import numpy as np
def exp(row):
    row["oldTermFrequency"]  =  row["termFrequency"] 
    row["termFrequency"] = int(np.random.exponential()* row["termFrequency"])
    return row
    
bookTermFrequency = bookTermFrequencyUniform.map(lambda x: exp(x)).persist()
bookTermFrequency.first()

{'bookId': 'book_0088',
 'tokenId': 'token_0283',
 'termFrequency': 2,
 'insertDate': datetime.datetime(2013, 7, 18, 0, 35, 54),
 'oldTermFrequency': 60}

### Extract Master Data Snapshot

In [39]:
## lets get the snapshot of bookTermFrequency (is a Master Data Registry)
def getKeys(row):
    return row.get("bookId"),row.get("tokenId")
def getTs(row):
    return row.get("insertDate")
    
bookTermFrequencySnapshot_step0 = bookTermFrequency.map(lambda x: (getKeys(x),(getTs(x),x)))
bookTermFrequencySnapshot_step0.first()

(('book_0088', 'token_0283'),
 (datetime.datetime(2013, 7, 18, 0, 35, 54),
  {'bookId': 'book_0088',
   'tokenId': 'token_0283',
   'termFrequency': 2,
   'insertDate': datetime.datetime(2013, 7, 18, 0, 35, 54),
   'oldTermFrequency': 60}))

In [40]:
bookTermFrequencySnapshot_step1 = bookTermFrequencySnapshot_step0.reduceByKey(lambda x,y: x if x[0]>y[0] else y)
bookTermFrequencySnapshot_step1.first()

(('book_0093', 'token_0333'),
 (datetime.datetime(2018, 7, 18, 15, 34, 30),
  {'bookId': 'book_0093',
   'tokenId': 'token_0333',
   'termFrequency': 11,
   'insertDate': datetime.datetime(2018, 7, 18, 15, 34, 30),
   'oldTermFrequency': 188}))

In [41]:
bookTermFrequencySnapshot = bookTermFrequencySnapshot_step1.map(lambda x: x[1][1]).persist()
bookTermFrequencySnapshot.count()

49666

## inverseDocumentFrequency (iDF)

In [42]:
## compute iDF
iDF = bookTermFrequencySnapshot.map(lambda x: (x.get("tokenId"),1)).reduceByKey(lambda x,y: x+y).persist()
iDF.take(5)

[('token_0333', 98),
 ('token_0442', 100),
 ('token_0229', 98),
 ('token_0135', 98),
 ('token_0419', 100)]

In [43]:
iDF.take(5)
# iDF = log(N/N_t)

[('token_0333', 98),
 ('token_0442', 100),
 ('token_0229', 98),
 ('token_0135', 98),
 ('token_0419', 100)]

In [44]:
# compute iDF (again)
import numpy as np
totBooks = bookTermFrequencySnapshot.map(lambda x: (x.get("bookId"))).distinct().count()
iDF = bookTermFrequencySnapshot.map(lambda x: (x.get("tokenId"),1)).reduceByKey(lambda x,y: x+y)\
                                .map(lambda x:(x[0],np.log(totBooks/x[1]))).persist()
iDF.take(5)

[('token_0333', 0.02020270731751947),
 ('token_0442', 0.0),
 ('token_0229', 0.02020270731751947),
 ('token_0135', 0.02020270731751947),
 ('token_0419', 0.0)]

# Exercise (1) - solo and small teams
The actual dataset generator always generate data from a random distribution.

e.g., id generator returns 
```python
return prefix+"_"+str(randint(min, max)).zfill(len(str(max))+1)
```

If we would like to use it in these exercises, we would need to create stronger id generator where the sampling strategy could be changed. Provide an implementation of 
```python
class nonUniformeIdGenerator(typeGenerator)
class expIntGenerator(typeGenerator)
class expFloatGenerator(typeGenerator)
```

# Exercise (2) - additional (mandatory for Big Team)
Extend datasetGenerator() 
```python
class logDataset(datasetGenerator)
class registryDataset(typeGenerator)
class cdcRegistryDataset(typeGenerator)
```

In [63]:
## combine bookTermFrequency with iDF to obtain 
## {'bookId': 'book_0097', 'tokenId': 'token_0359', 'termFrequency': 576,"iDF":0.12, "TF-iDF":576*0.12}
def parser(x):
    x[1][0]["iDF"] = x[1][1]
    x[1][0]["TF-iDF"] = x[1][0]["termFrequency"]*x[1][1]
    return x[1][0]

bookTFiDF = bookTermFrequency.map(lambda x: (x.get("tokenId"),x)).join(iDF).map(lambda x: parser(x))
bookTFiDF.first()

{'bookId': 'book_0093',
 'tokenId': 'token_0333',
 'termFrequency': 1000,
 'insertDate': datetime.datetime(2016, 9, 24, 14, 59, 58),
 'oldTermFrequency': 552,
 'iDF': 0.02020270731751947,
 'TF-iDF': 20.20270731751947}

In [64]:
bookTFiDF.map(lambda x: x.get("TF-iDF")).stats()

(count: 250000, mean: 3.3292810066317293, stdev: 8.045016580100247, max: 207.90641809165973, min: 0.0)

# Book Similarity

In [65]:
bookTFiDF.first()

{'bookId': 'book_0093',
 'tokenId': 'token_0333',
 'termFrequency': 1000,
 'insertDate': datetime.datetime(2016, 9, 24, 14, 59, 58),
 'oldTermFrequency': 552,
 'iDF': 0.02020270731751947,
 'TF-iDF': 20.20270731751947}

```python
sim(bookA,bookB) = sum(bookA_token_i * bookB_token_i)/(norm(bookA)*norm(bookA))
```

In [66]:
bookA_bookB_contributions = bookTFiDF.map(lambda x: (x.get("tokenId"),x)).join(bookTFiDF.map(lambda x: (x.get("tokenId"),x)))
bookA_bookB_contributions.first()

('token_0333',
 ({'bookId': 'book_0093',
   'tokenId': 'token_0333',
   'termFrequency': 1000,
   'insertDate': datetime.datetime(2016, 9, 24, 14, 59, 58),
   'oldTermFrequency': 552,
   'iDF': 0.02020270731751947,
   'TF-iDF': 20.20270731751947},
  {'bookId': 'book_0093',
   'tokenId': 'token_0333',
   'termFrequency': 1000,
   'insertDate': datetime.datetime(2016, 9, 24, 14, 59, 58),
   'oldTermFrequency': 552,
   'iDF': 0.02020270731751947,
   'TF-iDF': 20.20270731751947}))

In [67]:
# lets make a triangular matrix
bookA_bookB_contrTriang = bookA_bookB_contributions.filter(lambda x: (x[1][0].get("bookId")>x[1][1].get("bookId")) )
bookA_bookB_contrTriang.first()

('token_0333',
 ({'bookId': 'book_0093',
   'tokenId': 'token_0333',
   'termFrequency': 1000,
   'insertDate': datetime.datetime(2016, 9, 24, 14, 59, 58),
   'oldTermFrequency': 552,
   'iDF': 0.02020270731751947,
   'TF-iDF': 20.20270731751947},
  {'bookId': 'book_0055',
   'tokenId': 'token_0333',
   'termFrequency': 843,
   'insertDate': datetime.datetime(2015, 4, 28, 2, 49, 56),
   'oldTermFrequency': 223,
   'iDF': 0.02020270731751947,
   'TF-iDF': 17.03088226866891}))

In [74]:
# now we need to build the line to apply the sum 
eachTokenContr = bookA_bookB_contrTriang.map(lambda x: (x[1]))\
                .map(lambda x: ((x[0].get("bookId"),x[1].get("bookId")),x[0].get("TF-iDF")*x[1].get("TF-iDF")) )
eachTokenContr.first()

(('book_0093', 'book_0055'), 344.06992983304997)

In [75]:
bookToBookSimilarity = eachTokenContr.reduceByKey(lambda x,y: x+y)
bookToBookSimilarity.take(5)

[(('book_0093', 'book_0040'), 490589.8779948853),
 (('book_0093', 'book_0052'), 394761.6625497298),
 (('book_0055', 'book_0025'), 376350.4987694563),
 (('book_0055', 'book_0009'), 366855.5669745611),
 (('book_0055', 'book_0027'), 397924.2586771737)]

In [80]:
#extract for each book to 5 closer books
bookToBookSimilarity.map(lambda x: (x[0][0],[(x[1],x[0][1])])).reduceByKey(lambda x,y: sorted(x+y,reverse=True)[0:5]).take(5)

[('book_0018',
  [(478090.16730927187, 'book_0001'),
   (452870.694649205, 'book_0014'),
   (410907.90912833787, 'book_0005'),
   (407297.0970469734, 'book_0002'),
   (404272.1360993062, 'book_0017')]),
 ('book_0053',
  [(413900.020425716, 'book_0040'),
   (408906.899518296, 'book_0001'),
   (384913.35462971963, 'book_0004'),
   (376800.3168083103, 'book_0019'),
   (374543.0645824918, 'book_0047')]),
 ('book_0082',
  [(380752.9970726997, 'book_0040'),
   (376981.4355760529, 'book_0001'),
   (367719.8962979362, 'book_0038'),
   (367315.3053465798, 'book_0004'),
   (364728.0030140588, 'book_0045')]),
 ('book_0071',
  [(399861.93696522387, 'book_0047'),
   (394469.3769090045, 'book_0002'),
   (389053.1655425688, 'book_0018'),
   (384981.52358427766, 'book_0010'),
   (380516.32053608156, 'book_0044')]),
 ('book_0016',
  [(381357.1081600139, 'book_0002'),
   (362543.9564661828, 'book_0014'),
   (339107.5043046193, 'book_0005'),
   (337506.0343681572, 'book_0001'),
   (329709.30267198593, 'b

# Exercise 2 (optional)
In the previous example, we computed only the book_A_i*book_B_i... we missed the part below...