In [1]:
#remember to clone https://github.com/brcondor/Architectures_for_Big_Data into '/home/jovyan/work/'
import sys
sys.path.append("/home/jovyan/work/Architectures_for_Big_Data/")
import pyspark
sc = pyspark.SparkContext("local[3]")

In [4]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *
from datetime import datetime

dataset = datasetGenerator()
dataset.addGenerator(idGenerator(),prefix="book",min=1,max=100,keyName="bookId")
dataset.addGenerator(idGenerator(),prefix="token",min=1,max=5000,keyName="tokenId")

bookTokens = sc.parallelize( dataset.generateDataset(250000)).persist()
bookTokens.first()

{'bookId': 'book_0065', 'tokenId': 'token_00482'}

# Black Belt Histogram

In [3]:
## Compute the term frequency for each token w.r.t. each book
bookTermFrequency = bookTokens.map(lambda x: ((x.get("bookId"),x.get("tokenId")),1)).reduceByKey(lambda x,y: x+y)\
                    .map(lambda x: {"bookId":x[0][0],"tokenId":x[0][1], "termFrequency":x[1]} ).persist()
bookTermFrequency.count()

196871

In [4]:
## Compute the hist for token frequencies
bookTermFrequency.map(lambda x: (x.get("termFrequency"),1)).reduceByKey(lambda x,y: x+y).top(15,key=lambda x: 1/x[1])

[(7, 1), (6, 5), (5, 83), (4, 778), (3, 6221), (2, 37990), (1, 151793)]

# TF-IDF 
## Reading from a CDC Master table

In [5]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *
from datetime import datetime

dataset = datasetGenerator()
dataset.addGenerator(idGenerator(),prefix="book",min=1,max=100,keyName="bookId")
dataset.addGenerator(idGenerator(),prefix="token",min=1,max=500,keyName="tokenId")
dataset.addGenerator(intGenerator(),min=1,max=1000,keyName="termFrequency")
dataset.addGenerator(dateGenerator(),startdate=datetime(2018,1,1),max=datetime(2020,1,1),keyName="insertDate")


bookTermFrequencyUniform = sc.parallelize( dataset.generateDataset(250000)).persist()
bookTermFrequencyUniform.first()

{'bookId': 'book_0002',
 'tokenId': 'token_0295',
 'termFrequency': 699,
 'insertDate': datetime.datetime(2017, 6, 1, 10, 25, 31)}

In [7]:
## add exponentiality ==> np.random.exponential()
import numpy as np
def exp(row):
    row["termFrequency"] = int(np.random.exponential() * row["termFrequency"])
    return row
bookTermFrequency = bookTermFrequencyUniform.map(lambda x: exp(x))
bookTermFrequency.first()

{'bookId': 'book_0002',
 'tokenId': 'token_0295',
 'termFrequency': 900,
 'insertDate': datetime.datetime(2017, 6, 1, 10, 25, 31)}

### Extract Master Data Snapshot

In [39]:
## lets get the snapshot of bookTermFrequency (is a Master Data Registry)
def getKeys(row):
    pass 
def getTs(row):
    pass
    
bookTermFrequencySnapshot_step0 = bookTermFrequency.map(
bookTermFrequencySnapshot_step0.first()

(('book_0088', 'token_0283'),
 (datetime.datetime(2013, 7, 18, 0, 35, 54),
  {'bookId': 'book_0088',
   'tokenId': 'token_0283',
   'termFrequency': 2,
   'insertDate': datetime.datetime(2013, 7, 18, 0, 35, 54),
   'oldTermFrequency': 60}))

In [9]:

def getKeys(row):
    return row["bookId"], row["tokenId"]
def getTs(row):
    return row["insertDate"]
    
bookTermFrequencySnapshot_step0 = bookTermFrequency.map(lambda x: (getKeys(x), (getTs(x),x)))
bookTermFrequencySnapshot_step0.first()

(('book_0002', 'token_0295'),
 (datetime.datetime(2017, 6, 1, 10, 25, 31),
  {'bookId': 'book_0002',
   'tokenId': 'token_0295',
   'termFrequency': 900,
   'insertDate': datetime.datetime(2017, 6, 1, 10, 25, 31)}))

In [10]:
# take only the last value
bookTermFrequencySnapshot_step1 = bookTermFrequencySnapshot_step0.reduceByKey(lambda x,y: x if x[0]>y[0] else y)
bookTermFrequencySnapshot_step1.first()

(('book_0002', 'token_0295'),
 (datetime.datetime(2019, 4, 10, 13, 58, 29),
  {'bookId': 'book_0002',
   'tokenId': 'token_0295',
   'termFrequency': 6,
   'insertDate': datetime.datetime(2019, 4, 10, 13, 58, 29)}))

In [11]:
# come back to the dict per row format
bookTermFrequencySnapshot = bookTermFrequencySnapshot_step1.map(lambda x: x[1][1])
bookTermFrequencySnapshot.count()

49644

In [12]:
bookTermFrequencySnapshot.first()

{'bookId': 'book_0002',
 'tokenId': 'token_0295',
 'termFrequency': 6,
 'insertDate': datetime.datetime(2019, 4, 10, 13, 58, 29)}

## inverseDocumentFrequency (iDF)

In [14]:
## compute iDF - start with distribution of tokens
iDF = bookTermFrequencySnapshot.map(lambda x: (x.get("tokenId"),1)).reduceByKey(lambda x,y: x+y)
iDF.take(5)

[('token_0295', 100),
 ('token_0446', 99),
 ('token_0261', 98),
 ('token_0500', 100),
 ('token_0365', 100)]

In [43]:
iDF.take(5)
# iDF(term1) = log(N_tot/N_with_term1)...

[('token_0333', 98),
 ('token_0442', 100),
 ('token_0229', 98),
 ('token_0135', 98),
 ('token_0419', 100)]

In [18]:
# compute iDF (again)
import numpy as np
totBooks = bookTermFrequencySnapshot.map(lambda x: x.get("bookId")).distinct().count() 
iDF = bookTermFrequencySnapshot.map(lambda x: (x.get("tokenId"),1)).reduceByKey(lambda x,y: x+y)\
                .map(lambda x: (x[0],np.log(x[1]/totBooks)))
iDF.take(5)

[('token_0295', 0.0),
 ('token_0446', -0.01005033585350145),
 ('token_0261', -0.020202707317519466),
 ('token_0500', 0.0),
 ('token_0365', 0.0)]

# Exercise (1) - solo and small teams
The actual dataset generator always generate data from a random distribution.

e.g., id generator returns 
```python
return prefix+"_"+str(randint(min, max)).zfill(len(str(max))+1)
```

If we would like to use it in these exercises, we would need to create stronger id generator where the sampling strategy could be changed. Provide an implementation of 
```python
class nonUniformeIdGenerator(typeGenerator)
class expIntGenerator(typeGenerator)
class expFloatGenerator(typeGenerator)
```

# Exercise (2) - additional (mandatory for Big Team)
Extend datasetGenerator() 
```python
class logDataset(datasetGenerator)
class registryDataset(typeGenerator)
class cdcRegistryDataset(typeGenerator)
```

In [23]:
## combine bookTermFrequency with iDF to obtain 
## {'bookId': 'book_0097', 'tokenId': 'token_0359', 'termFrequency': 576,"iDF":0.12, "TF-iDF":576*0.12}
def parser(x):
    x[1][0]["iDF"] = x[1][1]
    x[1][0]["TF-iDF"] = x[1][1]*x[1][0].get("termFrequency")
    return x[1][0]

bookTFiDF = bookTermFrequency.map(lambda x: (x.get("tokenId"),x)).join(iDF).map(lambda x: parser(x))
bookTFiDF.first()

{'bookId': 'book_0002',
 'tokenId': 'token_0295',
 'termFrequency': 55,
 'insertDate': datetime.datetime(2017, 6, 1, 10, 25, 31),
 'iDF': 0.0,
 'TF-iDF': 0.0}

In [24]:
bookTFiDF.map(lambda x: x.get("TF-iDF")).stats()

(count: 250000, mean: -3.586772870725633, stdev: 8.19504185972357, max: 0.0, min: -218.3710633950679)

# Book Similarity

In [65]:
bookTFiDF.first()

{'bookId': 'book_0093',
 'tokenId': 'token_0333',
 'termFrequency': 1000,
 'insertDate': datetime.datetime(2016, 9, 24, 14, 59, 58),
 'oldTermFrequency': 552,
 'iDF': 0.02020270731751947,
 'TF-iDF': 20.20270731751947}

```python
sim(bookA,bookB) = sum(bookA_token_i * bookB_token_i)/(norm(bookA)*norm(bookA))
```

In [66]:
bookA_bookB_contributions = bookTFiDF.map(...
bookA_bookB_contributions.first()

('token_0333',
 ({'bookId': 'book_0093',
   'tokenId': 'token_0333',
   'termFrequency': 1000,
   'insertDate': datetime.datetime(2016, 9, 24, 14, 59, 58),
   'oldTermFrequency': 552,
   'iDF': 0.02020270731751947,
   'TF-iDF': 20.20270731751947},
  {'bookId': 'book_0093',
   'tokenId': 'token_0333',
   'termFrequency': 1000,
   'insertDate': datetime.datetime(2016, 9, 24, 14, 59, 58),
   'oldTermFrequency': 552,
   'iDF': 0.02020270731751947,
   'TF-iDF': 20.20270731751947}))

In [67]:
# lets make a triangular matrix
bookA_bookB_contrTriang = bookA_bookB_contributions...
bookA_bookB_contrTriang.first()

('token_0333',
 ({'bookId': 'book_0093',
   'tokenId': 'token_0333',
   'termFrequency': 1000,
   'insertDate': datetime.datetime(2016, 9, 24, 14, 59, 58),
   'oldTermFrequency': 552,
   'iDF': 0.02020270731751947,
   'TF-iDF': 20.20270731751947},
  {'bookId': 'book_0055',
   'tokenId': 'token_0333',
   'termFrequency': 843,
   'insertDate': datetime.datetime(2015, 4, 28, 2, 49, 56),
   'oldTermFrequency': 223,
   'iDF': 0.02020270731751947,
   'TF-iDF': 17.03088226866891}))

In [74]:
# now we need to build the line to apply the sum 
eachTokenContr = bookA_bookB_contrTriang.map(...
                
eachTokenContr.first()

(('book_0093', 'book_0055'), 344.06992983304997)

In [75]:
bookToBookSimilarity = eachTokenContr...
bookToBookSimilarity.take(5)

[(('book_0093', 'book_0040'), 490589.8779948853),
 (('book_0093', 'book_0052'), 394761.6625497298),
 (('book_0055', 'book_0025'), 376350.4987694563),
 (('book_0055', 'book_0009'), 366855.5669745611),
 (('book_0055', 'book_0027'), 397924.2586771737)]

In [79]:
#extract for each book to 5 closer books
bookToBookSimilarity.map(lambda x: ...

[('book_0018',
  [(308422.42330327886, 'book_0015'),
   (333297.3672153354, 'book_0013'),
   (338483.7670688783, 'book_0016'),
   (342978.31636367145, 'book_0008'),
   (353904.65181296325, 'book_0007')]),
 ('book_0053',
  [(271124.8324598133, 'book_0016'),
   (280498.2250452823, 'book_0013'),
   (283393.5724598963, 'book_0012'),
   (283597.4902153876, 'book_0039'),
   (284306.2925381678, 'book_0044')]),
 ('book_0082',
  [(253262.12105578918, 'book_0039'),
   (264230.7942164881, 'book_0013'),
   (267181.32763212104, 'book_0075'),
   (275957.7782837865, 'book_0024'),
   (278301.92212257243, 'book_0050')]),
 ('book_0071',
  [(273745.62675940956, 'book_0007'),
   (275134.91709174804, 'book_0013'),
   (281544.6283921305, 'book_0008'),
   (288046.5346574101, 'book_0015'),
   (291442.57781192847, 'book_0026')]),
 ('book_0016',
  [(255996.59628928234, 'book_0015'),
   (265398.08730256563, 'book_0008'),
   (268749.9762100572, 'book_0013'),
   (284225.9609940841, 'book_0007'),
   (294623.3695300

# Exercise 2
In the previous example, we computed only the book_A_i*book_B_i... we missed the part below...