In [1]:
from pyspark.sql import SparkSession
sparkSession = SparkSession.builder.appName("myApp").getOrCreate()

In [8]:
import re

## Exercise 1.2

In [22]:
# Creat a pair RDD from user_libraries.txt using the user hash as key and liked paper(s) as value(s)
userRatingsRDD = \
    sparkSession.sparkContext.textFile("./users_libraries.txt") \
            .map(lambda line: line.split(';')) \
            .flatMapValues(lambda value: value.split(','))
# Display first elements of the RDD
userRatingsRDD.take(3)

[('28d3f81251d94b09735497477a5e4e02', '3929762'),
 ('28d3f81251d94b09735497477a5e4e02', '503574'),
 ('28d3f81251d94b09735497477a5e4e02', '5819422')]

In [23]:
# Create a pair RDD from papers.csv mapping paper IDs to their words
paperTermsRDD = \
    papersRDD = sparkSession.sparkContext.textFile("./papers.csv") \
            .map(lambda line: line.split(',')) \
            .map(lambda split: (split[0], ','.join(split[12:]))) \
            .flatMapValues(lambda text: re.split('[^A-Za-z]+', text)) \
            .filter(lambda pair: len(pair[1]) > 0)
# Display first elements of the RDD
paperTermsRDD.take(10)

[('80546', 'the'),
 ('80546', 'arbitrariness'),
 ('80546', 'of'),
 ('80546', 'the'),
 ('80546', 'genetic'),
 ('80546', 'code'),
 ('80546', 'the'),
 ('80546', 'genetic'),
 ('80546', 'code'),
 ('80546', 'has')]

## Exercise 1.3

In [13]:
# Collect all stopwords in memory and clean the RDD against them
stopWords = sparkSession.sparkContext.textFile("./stopwords_en.txt").collect()
stopWordsBrdcast = sparkSession.sparkContext.broadcast(stopWords)
cleanedPaperTermsRDD = paperTermsRDD.filter(lambda pair: pair[1] not in stopWordsBrdcast.value)

cleanedPaperTermsRDD.take(10)

[('80546', 'arbitrariness'),
 ('80546', 'genetic'),
 ('80546', 'code'),
 ('80546', 'genetic'),
 ('80546', 'code'),
 ('80546', 'regarded'),
 ('80546', 'arbitrary'),
 ('80546', 'sense'),
 ('80546', 'codon'),
 ('80546', 'amino')]

So far the calculations were not costy. But, as the evaluations are lazy on Spark, it's better to do the expensive calculations in one go.

In [36]:
userRatingsRDD \
    .map(lambda pair: (pair[1], pair[0])) \
    .join(cleanedPaperTermsRDD) \
    .map(lambda pair: (pair[1], 1)) \
    .reduceByKey(lambda i, j: i + j) \
    .map(lambda x: ((x[0][0], (x[0][1], x[1])))) \
    .groupByKey() \
    .mapValues(lambda kws: sorted(list(kws), key=lambda x: x[1], reverse=True)[:10]) \
    .mapValues(lambda x: [t[0] for t in x]) \
    .saveAsTextFile("final_results")

A step-by-step explanation of the script above is represented in the following section. The shape of data in each step is also represented.

### Explanation

In [17]:
# Join the pair RDDs
userTerms = userRatingsRDD \
    .map(lambda pair: (pair[1], pair[0])) \
    .join(cleanedPaperTermsRDD) \
    .map(lambda pair: pair[1])
    
userTerms.take(10)

[('28d3f81251d94b09735497477a5e4e02', 'cosmic'),
 ('28d3f81251d94b09735497477a5e4e02', 'acceleration'),
 ('28d3f81251d94b09735497477a5e4e02', 'slowing'),
 ('81c1b56cea8dd0186219a7def2886350', 'cosmic'),
 ('81c1b56cea8dd0186219a7def2886350', 'acceleration'),
 ('81c1b56cea8dd0186219a7def2886350', 'slowing'),
 ('fe85965ab25e37621a184e7f4dccdd30', 'cosmic'),
 ('fe85965ab25e37621a184e7f4dccdd30', 'acceleration'),
 ('fe85965ab25e37621a184e7f4dccdd30', 'slowing'),
 ('34dbe7dc7c81c83667f4bbb3b2bd0f06', 'cosmic')]

In [24]:
# Simple word count, but also considering user token
userTermFrequencies = userTerms \
    .map(lambda pair: (pair, 1)) \
    .reduceByKey(lambda i, j: i + j)

userTermFrequencies.take(10)

[(('beaca02b21b7cad6cb738c5e2682af8d', 'successfully'), 6),
 (('d816955a32c36c82fdce10baf0472c1c', 'metabolic'), 1),
 (('289136b1ae39ff03d70950f6641caf5a', 'periphery'), 3),
 (('eef7bb9df43870abc3dc4137fd526f24', 'young'), 2),
 (('67dfe95d7c27571f3935bf9ed76e6860', 'misunderstood'), 1),
 (('abc1d60f9d772bfc76247fa779ae9df0', 'amounts'), 1),
 (('ccc99b52ecd364c6798a98a718ab819b', 'meaning'), 2),
 (('d11ef8f8dc92dca1127c5ea632fc7174', 'information'), 9),
 (('1fb822f9918c7d6a02cb156b514831d0', 'complex'), 26),
 (('60b658a3714fcbec85075dc057ee7c0f', 'elements'), 7)]

In [25]:
# Change the pairing in order to have the token as the key
rearrangedRDD = userTermFrequencies.map(lambda x: ((x[0][0], (x[0][1], x[1]))))

rearrangedRDD.take(10)

[('2aee76aa67126cace3f38aafe195ecd2', ('bioinformatics', 7)),
 ('807175702493c11eff8e5c312b59afb7', ('paper', 13)),
 ('3be1f660f56216bd8e832b07fed40455', ('guide', 21)),
 ('cd17c8d5ca03a1381ad3466679d87000', ('binding', 761)),
 ('77edc9dc774ba3daa0e56d42d15daab2', ('study', 25)),
 ('95ea796353515bb02bcbc226e037e4a3', ('paper', 4)),
 ('a754bd77cec20e3b8cd2af71954ec7ef', ('considerably', 1)),
 ('2ee2936259a62957bf7b68021e989c7f', ('documented', 6)),
 ('0530b8ed834fa63f24e13ce2e1205d15', ('cities', 1)),
 ('9397ae5efa95bcbee7ad8e408eadfe54', ('measures', 9))]

In [26]:
# Sort words by user
groupedSortedRDD = rearrangedRDD \
    .groupByKey() \
    .map(lambda x: (x[0], sorted(list(x[1]), key=lambda x: x[1], reverse=True)))

groupedSortedRDD.take(5)

[('a14def34540418e7fdcb76a68b514802',
  [('data', 494),
   ('sequencing', 435),
   ('genome', 267),
   ('variants', 204),
   ('analysis', 192),
   ('human', 156),
   ('methods', 155),
   ('research', 144),
   ('generation', 143),
   ('sequence', 143),
   ('exome', 129),
   ('based', 122),
   ('reads', 119),
   ('genetic', 115),
   ('disease', 113),
   ('high', 108),
   ('clinical', 105),
   ('information', 99),
   ('software', 95),
   ('results', 93),
   ('genomic', 93),
   ('tools', 90),
   ('read', 89),
   ('studies', 87),
   ('alignment', 86),
   ('cancer', 85),
   ('gene', 85),
   ('genes', 84),
   ('number', 83),
   ('variation', 82),
   ('dna', 81),
   ('quality', 77),
   ('single', 76),
   ('large', 76),
   ('short', 73),
   ('mapping', 72),
   ('developed', 70),
   ('coding', 66),
   ('mutations', 65),
   ('genomes', 64),
   ('tool', 63),
   ('technologies', 63),
   ('approach', 63),
   ('functional', 62),
   ('diseases', 59),
   ('computational', 58),
   ('error', 57),
   ('ra

In [27]:
topTenGroupRDD = groupedSortedRDD.map(lambda x: ([x[0]] ,x[1][:10]))

topTenGroupRDD.take(5)

[(['a14def34540418e7fdcb76a68b514802'],
  [('data', 494),
   ('sequencing', 435),
   ('genome', 267),
   ('variants', 204),
   ('analysis', 192),
   ('human', 156),
   ('methods', 155),
   ('research', 144),
   ('generation', 143),
   ('sequence', 143)]),
 (['119620e3318b1aa51146838b87df6430'],
  [('tick', 50),
   ('deer', 32),
   ('disease', 31),
   ('tbe', 30),
   ('human', 26),
   ('host', 26),
   ('infection', 25),
   ('borne', 24),
   ('analysis', 23),
   ('risk', 22)]),
 (['4b348f9a180f539867029399f40a6515'],
  [('software', 61),
   ('agile', 48),
   ('development', 45),
   ('research', 27),
   ('estimation', 25),
   ('test', 25),
   ('based', 23),
   ('factors', 17),
   ('paper', 17),
   ('project', 17)]),
 (['60ad64a511550a28d10eec341524a912'],
  [('br', 28),
   ('technology', 22),
   ('design', 21),
   ('communication', 20),
   ('internet', 19),
   ('social', 18),
   ('mobile', 18),
   ('interaction', 13),
   ('body', 12),
   ('life', 12)]),
 (['c3391f00cde062923c8d2abfd032a6a

In [29]:
finalResultRDD = topTenGroupRDD.mapValues(lambda x: [t[0] for t in x])
finalResultRDD.take(10)

[(['a14def34540418e7fdcb76a68b514802'],
  ['data',
   'sequencing',
   'genome',
   'variants',
   'analysis',
   'human',
   'methods',
   'research',
   'generation',
   'sequence']),
 (['119620e3318b1aa51146838b87df6430'],
  ['tick',
   'deer',
   'disease',
   'tbe',
   'human',
   'host',
   'infection',
   'borne',
   'analysis',
   'risk']),
 (['4b348f9a180f539867029399f40a6515'],
  ['software',
   'agile',
   'development',
   'research',
   'estimation',
   'test',
   'based',
   'factors',
   'paper',
   'project']),
 (['60ad64a511550a28d10eec341524a912'],
  ['br',
   'technology',
   'design',
   'communication',
   'internet',
   'social',
   'mobile',
   'interaction',
   'body',
   'life']),
 (['c3391f00cde062923c8d2abfd032a6a8'],
  ['phylogenetic',
   'genes',
   'based',
   'tree',
   'gene',
   'data',
   'sequence',
   'protein',
   'proteins',
   'method']),
 (['8279ef5b7bac521b42cdc7c86fd577c7'],
  ['molecular',
   'transport',
   'arrays',
   'single',
   'molecule

In [30]:
finalResultRDD.saveAsTextFile("output_files")