# Experiment 1
Yashveere Ramful, Mohammad-Ali Arabi

In [4]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as Funct
from pyspark.sql.window import Window
from datetime import datetime
import csv
import re

sparkSession = SparkSession.builder.appName("myApp").getOrCreate()
#Get default configurations
sparkSession.sparkContext._conf.getAll()
#Update default configurations
conf = sparkSession.sparkContext._conf.setAll([('spark.executor.memory', '16g')\
                                        , ('spark.app.name', 'Spark Updated Conf')\
                                        , ('spark.executor.cores', '12')\
                                        , ('spark.cores.max', '12')\
                                        , ('spark.driver.memory','16g')\
                                        ,('spark.driver.maxResultSize','16g')])
#Stop the current Spark Session
sparkSession.sparkContext.stop()
#Create a Spark Session
sparkSession = SparkSession.builder.config(conf=conf).getOrCreate()


## Exercise 1.2 (Loading the dataset into an RDD)
Loading the dataset in the following RDD:<br>
a) userRatingsRDD: pair RDD from **user_libraries.txt** using the user hash as the key and the liked paper(s) as value(s).<br>
b) paperTermsRDD: pair RDD from **papers.csv** using the paper_id as the key and the words contained in the abstract as the value(s).<br>

In [5]:
# Creat a pair RDD from user_libraries.txt using the user hash as key and liked paper(s) as value(s)
userRatingsRDD = \
    sparkSession.sparkContext.textFile("./users_libraries.txt") \
            .map(lambda line: line.split(';')) \
            .flatMapValues(lambda value: value.split(','))
# Display first elements of the RDD
userRatingsRDD.take(3)

[('28d3f81251d94b09735497477a5e4e02', '3929762'),
 ('28d3f81251d94b09735497477a5e4e02', '503574'),
 ('28d3f81251d94b09735497477a5e4e02', '5819422')]

In [6]:
# Create a pair RDD from papers.csv mapping paper IDs to their words
paperTermsRDD = \
    papersRDD = sparkSession.sparkContext.textFile("./papers.csv") \
            .map(lambda line: line.split(',')) \
            .map(lambda split: (split[0], ','.join(split[12:]))) \
            .flatMapValues(lambda text: re.split('[^A-Za-z]+', text)) \
            .filter(lambda pair: len(pair[1]) > 0)
# Display first elements of the RDD
paperTermsRDD.take(10)

[('80546', 'the'),
 ('80546', 'arbitrariness'),
 ('80546', 'of'),
 ('80546', 'the'),
 ('80546', 'genetic'),
 ('80546', 'code'),
 ('80546', 'the'),
 ('80546', 'genetic'),
 ('80546', 'code'),
 ('80546', 'has')]

## Exercise 1.3 (Joining Collections)
Compute for each user the top-10 most frequent words appearing in the papers she likes. Exclude the stop words listed in **stopwords_en.txt**.<br>
Store the results into a file which contains in each line the user hash and the list of her retrieved words sorted by frequency (top 1 is the most frequent).

In [7]:
# Collect all stopwords in memory and clean the RDD against them
stopWords = sparkSession.sparkContext.textFile("./stopwords_en.txt").collect()
stopWordsBrdcast = sparkSession.sparkContext.broadcast(stopWords)
cleanedPaperTermsRDD = paperTermsRDD.filter(lambda pair: pair[1] not in stopWordsBrdcast.value)

cleanedPaperTermsRDD.take(10)

[('80546', 'arbitrariness'),
 ('80546', 'genetic'),
 ('80546', 'code'),
 ('80546', 'genetic'),
 ('80546', 'code'),
 ('80546', 'regarded'),
 ('80546', 'arbitrary'),
 ('80546', 'sense'),
 ('80546', 'codon'),
 ('80546', 'amino')]

So far the calculations were not costy. But, as the evaluations are lazy on Spark, it's better to do the expensive calculations in one go. The following script ran in 5 minutes.

In [9]:
then = datetime.now()

userRatingsRDD \
    .map(lambda pair: (pair[1], pair[0])) \
    .join(cleanedPaperTermsRDD) \
    .map(lambda pair: (pair[1], 1)) \
    .reduceByKey(lambda i, j: i + j) \
    .map(lambda x: ((x[0][0], (x[0][1], x[1])))) \
    .groupByKey() \
    .mapValues(lambda kws: sorted(list(kws), key=lambda x: x[1], reverse=True)[:10]) \
    .mapValues(lambda x: [t[0] for t in x]) \
    .saveAsTextFile("final_results")

datetime.now() - then

datetime.timedelta(0, 293, 898320)

A step-by-step explanation of the script above is represented in the following section. The shape of data in each step is also represented.

### Explanation

In [10]:
# Join the pair RDDs
userTerms = userRatingsRDD \
    .map(lambda pair: (pair[1], pair[0])) \
    .join(cleanedPaperTermsRDD) \
    .map(lambda pair: pair[1])
    
userTerms.take(10)

[('28d3f81251d94b09735497477a5e4e02', 'cosmic'),
 ('28d3f81251d94b09735497477a5e4e02', 'acceleration'),
 ('28d3f81251d94b09735497477a5e4e02', 'slowing'),
 ('81c1b56cea8dd0186219a7def2886350', 'cosmic'),
 ('81c1b56cea8dd0186219a7def2886350', 'acceleration'),
 ('81c1b56cea8dd0186219a7def2886350', 'slowing'),
 ('fe85965ab25e37621a184e7f4dccdd30', 'cosmic'),
 ('fe85965ab25e37621a184e7f4dccdd30', 'acceleration'),
 ('fe85965ab25e37621a184e7f4dccdd30', 'slowing'),
 ('34dbe7dc7c81c83667f4bbb3b2bd0f06', 'cosmic')]

In [11]:
# Simple word count, but also considering user token
userTermFrequencies = userTerms \
    .map(lambda pair: (pair, 1)) \
    .reduceByKey(lambda i, j: i + j)

userTermFrequencies.take(10)

[(('6c03ffded7d902b39b5b63fc805db513', 'time'), 19),
 (('7c0081293b39880655fe3d2189b79a4b', 'world'), 15),
 (('f1e1cd4ff25018273aafc0c68fbb5a2f', 'territory'), 1),
 (('324fb28bdc7aba229e75bf86f65e234d', 'sciences'), 1),
 (('0370ffe8719d4c7c85a23288de9be95a', 'achieve'), 1),
 (('2f5994df953ffd8e5ab39cd69a49ce79', 'manage'), 1),
 (('8d898a2171f552b3dc8129bed0971cd6', 'capture'), 3),
 (('73d7ad289fbbe9863d450b8e7226c489', 'elimination'), 7),
 (('fd2e2779a2353a21f2e1a77df419fb75', 'upwards'), 1),
 (('514ffc4d29d415df46cce4575c89acaf', 'future'), 2)]

In [12]:
# Change the pairing in order to have the token as the key
rearrangedRDD = userTermFrequencies.map(lambda x: ((x[0][0], (x[0][1], x[1]))))

rearrangedRDD.take(10)

[('c25c28b1436dca865993150a03e3bf0b', ('atregnet', 2)),
 ('b7640351a824f4e554921e14cb2fc021', ('registration', 1)),
 ('f05bcffe7951de9e5a32fff4a42eb088', ('coming', 2)),
 ('e26a2573c01281fe18bd3f4c1e76e6f7', ('applications', 4)),
 ('453e87c3dec542289de8fc85185e4a5e', ('universities', 1)),
 ('f83821331f4526800702c8f8a17494d5', ('represented', 6)),
 ('c0c3605edd53ebbb088e53046f35ea56', ('wide', 22)),
 ('a168122af0f745b75202e7c03b9908c9', ('conclude', 4)),
 ('76fe25728899c26cdd48e7009403483d', ('difusi', 1)),
 ('22e77936b198e99cd5730da94bb20cc7', ('free', 9))]

In [13]:
# Sort words by user
groupedSortedRDD = rearrangedRDD \
    .groupByKey() \
    .map(lambda x: (x[0], sorted(list(x[1]), key=lambda x: x[1], reverse=True)))

groupedSortedRDD.take(5)

[('184d03774ad01f7dcc0221279fe0f950',
  [('knowledge', 179),
   ('organizational', 42),
   ('information', 35),
   ('social', 30),
   ('sharing', 29),
   ('work', 25),
   ('organizations', 24),
   ('transfer', 23),
   ('management', 19),
   ('learning', 17),
   ('based', 15),
   ('knowing', 14),
   ('practice', 14),
   ('capital', 13),
   ('advantage', 13),
   ('cities', 13),
   ('firm', 13),
   ('organization', 12),
   ('firms', 12),
   ('people', 12),
   ('theory', 11),
   ('dynamic', 10),
   ('communication', 10),
   ('study', 10),
   ('society', 10),
   ('literature', 10),
   ('article', 10),
   ('distance', 9),
   ('findings', 9),
   ('hiding', 9),
   ('informational', 9),
   ('network', 9),
   ('studies', 8),
   ('intellectual', 8),
   ('competitive', 8),
   ('processes', 8),
   ('process', 8),
   ('show', 8),
   ('creation', 8),
   ('networks', 8),
   ('experience', 8),
   ('framework', 8),
   ('paper', 8),
   ('collaboration', 7),
   ('case', 7),
   ('economy', 7),
   ('innovat

In [14]:
topTenGroupRDD = groupedSortedRDD.map(lambda x: (x[0] ,x[1][:10]))

topTenGroupRDD.take(5)

[('184d03774ad01f7dcc0221279fe0f950',
  [('knowledge', 179),
   ('organizational', 42),
   ('information', 35),
   ('social', 30),
   ('sharing', 29),
   ('work', 25),
   ('organizations', 24),
   ('transfer', 23),
   ('management', 19),
   ('learning', 17)]),
 ('59bc8788b0e74f3a7b81d33c4c4e424c',
  [('species', 32),
   ('genome', 29),
   ('diversity', 20),
   ('genomes', 18),
   ('gene', 18),
   ('tree', 17),
   ('microbial', 17),
   ('archaea', 17),
   ('genetic', 17),
   ('data', 16)]),
 ('23dbf2c357633df36d4dae487b7d1e84',
  [('genome', 74),
   ('gene', 69),
   ('genes', 61),
   ('sequences', 59),
   ('microbial', 59),
   ('species', 56),
   ('marine', 55),
   ('diversity', 54),
   ('analysis', 53),
   ('rrna', 48)]),
 ('db24a533207e2b28d1d2d2ff3c9c1698',
  [('health', 298),
   ('care', 140),
   ('data', 129),
   ('models', 101),
   ('analysis', 59),
   ('methods', 55),
   ('multilevel', 53),
   ('model', 51),
   ('research', 50),
   ('spatial', 46)]),
 ('a1f1822091aaa3009120db95f3

In [15]:
finalResultRDD = topTenGroupRDD.mapValues(lambda x: [t[0] for t in x])
finalResultRDD.take(10)

[('184d03774ad01f7dcc0221279fe0f950',
  ['knowledge',
   'organizational',
   'information',
   'social',
   'sharing',
   'work',
   'organizations',
   'transfer',
   'management',
   'learning']),
 ('59bc8788b0e74f3a7b81d33c4c4e424c',
  ['species',
   'genome',
   'diversity',
   'genomes',
   'gene',
   'tree',
   'microbial',
   'archaea',
   'genetic',
   'data']),
 ('23dbf2c357633df36d4dae487b7d1e84',
  ['genome',
   'gene',
   'genes',
   'sequences',
   'microbial',
   'species',
   'marine',
   'diversity',
   'analysis',
   'rrna']),
 ('db24a533207e2b28d1d2d2ff3c9c1698',
  ['health',
   'care',
   'data',
   'models',
   'analysis',
   'methods',
   'multilevel',
   'model',
   'research',
   'spatial']),
 ('a1f1822091aaa3009120db95f3f50699',
  ['monads',
   'type',
   'programming',
   'database',
   'york',
   'query',
   'functional',
   'ny',
   'paper',
   'data']),
 ('162ea35a71a5503bacc986fec9e62fe5',
  ['web',
   'data',
   'de',
   'information',
   'biodiversity',


## Exercise 1.4 (Basic Analysis for Recommender Systems)
Basic analysis to get an idea of the characteristics of the dataset.

In [17]:
#Reading txt and csv files
userLibRDD = sparkSession.sparkContext.textFile("./users_libraries.txt")
stopWordRDD = sparkSession.sparkContext.textFile("./stopwords_en.txt")
papersRDD = sparkSession.sparkContext.textFile("./papers.csv")

In [18]:
#Exercise 1.4 a)
#Number of distinct users
#userLibRDD.count()
userLibRDD.map(lambda line: line.split(';')[0]).distinct().count()

28416

In [19]:
#Exercise 1.4 a)
#Number of distinct items
papersRDD.map(lambda line: line.split(',')[0]).distinct().count()

172079

In [20]:
#Exercise 1.4 a)
#Number of total ratings by users
userLibRDD.map(lambda line: (line.split(';')[0], line.split(';')[1])) \
            .flatMap(lambda x: x[1].split(',')) \
            .count()

828481

In [21]:
#Exercise 1.4 b)
#Min number of ratings a user has given
userLibRDD.map(lambda line: (line.split(';')[0], line.split(';')[1])) \
            .mapValues(lambda x: len(x.split(','))) \
            .map(lambda x: x[1]) \
            .min()

1

In [22]:
#Exercise 1.4 c)
#Max number of ratings a user has given
userLibRDD.map(lambda line: (line.split(';')[0], line.split(';')[1])) \
            .mapValues(lambda x: len(x.split(','))) \
            .map(lambda x: x[1]) \
            .max()

1922

In [23]:
#Exercise 1.4 d)
#Average number of ratings of users
userLibRDD.map(lambda line: (line.split(';')[0], line.split(';')[1])) \
            .mapValues(lambda x: len(x.split(','))) \
            .map(lambda x: x[1]) \
            .mean()


29.155440596846848

In [24]:
#Exercise 1.4 e)
#Standard deviation for ratings of users
userLibRDD.map(lambda line: (line.split(';')[0], line.split(';')[1])) \
            .mapValues(lambda x: len(x.split(','))) \
            .map(lambda x: x[1]) \
            .stdev()


81.1751761366871

In [25]:
#Exercise 1.4 f)
#Min number of ratings an item has received
userLibRDD.map(lambda line: (line.split(';')[0], line.split(';')[1]))\
            .map(lambda x: x[1].split(','))\
            .flatMap(lambda x: x)\
            .map(lambda x: (x, 1))\
            .reduceByKey(lambda a, b: a + b)\
            .map(lambda x: x[1])\
            .min()

3

In [26]:
#Exercise 1.4 g)
#Max number of ratings an item has received
userLibRDD.map(lambda line: (line.split(';')[0], line.split(';')[1]))\
            .map(lambda x: x[1].split(','))\
            .flatMap(lambda x: x)\
            .map(lambda x: (x, 1))\
            .reduceByKey(lambda a, b: a + b)\
            .map(lambda x: x[1])\
            .max()

924

In [27]:
#Exercise 1.4 h)
#Average number of ratings of items
userLibRDD.map(lambda line: (line.split(';')[0], line.split(';')[1]))\
            .map(lambda x: x[1].split(','))\
            .flatMap(lambda x: x)\
            .map(lambda x: (x, 1))\
            .reduceByKey(lambda a, b: a + b)\
            .map(lambda x: x[1])\
            .mean()

4.8145386711917055

In [28]:
#Exercise 1.4 i)
#Standard deviation for ratings of items
userLibRDD.map(lambda line: (line.split(';')[0], line.split(';')[1]))\
            .map(lambda x: x[1].split(','))\
            .flatMap(lambda x: x)\
            .map(lambda x: (x, 1))\
            .reduceByKey(lambda a, b: a + b)\
            .map(lambda x: x[1])\
            .stdev()

5.477802292314533

## Exercise 1.5 (Loading the dataset into Dataframes)

In [29]:
#Exercise 1.5
#loading data from users_libraries.txt into a dataframe
df = sparkSession.read.csv(path="./users_libraries.txt", sep=';')
df.show(5, truncate=True)

+--------------------+--------------------+
|                 _c0|                 _c1|
+--------------------+--------------------+
|28d3f81251d94b097...|3929762,503574,58...|
|d0c9aaa788153daea...|2080631,6343346,5...|
|f05bcffe7951de9e5...|1158654,478707,12...|
|ca4f1ba4094011d9a...|              278019|
|d1d41a15201915503...|6610569,6493797,6...|
+--------------------+--------------------+
only showing top 5 rows



In [32]:
#Exercise 1.5
#loading data from users_libraries.txt into a dataframe
#Loading the data in and RDD and splitting the paper_id id values
userRatingsRDD = \
    sparkSession.sparkContext.textFile("./users_libraries.txt") \
            .map(lambda line: line.split(';')) \
            .flatMapValues(lambda value: value.split(','))

#Creating the schema for the dataframe
schemaString = "user_hash_id paper_id"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
#Creating the dataframe
userLibDF = sparkSession.createDataFrame(userRatingsRDD, schema)
userLibDF.show(5, truncate=True)

+--------------------+--------+
|        user_hash_id|paper_id|
+--------------------+--------+
|28d3f81251d94b097...| 3929762|
|28d3f81251d94b097...|  503574|
|28d3f81251d94b097...| 5819422|
|28d3f81251d94b097...| 4238883|
|28d3f81251d94b097...| 5788061|
+--------------------+--------+
only showing top 5 rows



In [33]:
#Exercise 1.5
#Loading papers.csv data into a dataframe
#Defining the column names
columns = ['paper_id', 'type', 'journal', 'book_title', \
           'series', 'publisher', 'pages', 'volume', \
           'number', 'year', 'month', 'postedate',\
           'address', 'title', 'abstract']
papersDF = sparkSession.read\
            .load("./papers.csv", format="csv", sep=",", inferSchema="true", quote='"', header="false")\
            .toDF(*columns)
papersDF.show(2, truncate=True)

+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------------------+
|paper_id|   type|             journal|book_title|series|publisher|pages|volume|number|year|month|          postedate|address|               title|            abstract|
+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------------------+
|   80546|article|biology and philo...|      null|  null|     null|   17|    19|     2|2004|  mar|2005-01-26 21:35:21|   null|the arbitrariness...|the genetic code ...|
| 5842862|article|      molecular cell|      null|  null| elsevier|    2|    35|     6|2009|  sep|2009-09-30 17:11:23|   null|how to choose a g...|choosing good pro...|
+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------

In [34]:
#Exercise 1.5
#Loading stopwords_en.txt data into a dataframe
stopWordsDF = sparkSession.read\
                .load("./stopwords_en.txt", format="text", sep=" ", inferSchema="true", header="false")\
                .toDF('stop_word')
stopWordsDF.show(5, truncate=True)

+---------+
|stop_word|
+---------+
|        a|
|     able|
|    about|
|    above|
|according|
+---------+
only showing top 5 rows



## Exercise 1.6 (Tasks on top of Dataframes)
### Exercise 1.4 (Basic Analysis for Recommender Systems)

In [35]:
#Exercise 1.6
#Exercise 1.4 a) Using DataFrames (Basic Analysis for Recommender Systems)
#Number of distinct users
userLibDF.select("user_hash_id").distinct().count()

28416

In [36]:
#Exercise 1.6
#Exercise 1.4 a) Using DataFrames (Basic Analysis for Recommender Systems)
#Number of distinct items
papersDF.select("paper_id").distinct().count()

172079

In [37]:
#Exercise 1.6
#Exercise 1.4 a) Using DataFrames (Basic Analysis for Recommender Systems)
#Number of ratings
userLibDF.select("paper_id").count()

828481

In [38]:
#Exercise 1.6
#Exercise 1.4 b) Using DataFrames (Basic Analysis for Recommender Systems)
#Min number of ratings a user has given
userLibDF.select("user_hash_id").groupBy("user_hash_id").count().agg(Funct.min("count")).show()

+----------+
|min(count)|
+----------+
|         1|
+----------+



In [39]:
#Exercise 1.6
#Exercise 1.4 c) Using DataFrames (Basic Analysis for Recommender Systems)
#Max number of ratings a user has given
userLibDF.select("user_hash_id").groupBy("user_hash_id").count().agg(Funct.max("count")).show()

+----------+
|max(count)|
+----------+
|      1922|
+----------+



In [40]:
#Exercise 1.6
#Exercise 1.4 d) Using DataFrames (Basic Analysis for Recommender Systems)
#Average number of ratings of users
userLibDF.select("user_hash_id").groupBy("user_hash_id").count().agg(Funct.mean("count")).show()

+------------------+
|        avg(count)|
+------------------+
|29.155440596846848|
+------------------+



In [41]:
#Exercise 1.6
#Exercise 1.4 e) Using DataFrames (Basic Analysis for Recommender Systems)
#Standard deviation for ratings of users
userLibDF.select("user_hash_id").groupBy("user_hash_id").count().agg(Funct.stddev("count")).show()

+------------------+
|stddev_samp(count)|
+------------------+
| 81.17660451011605|
+------------------+



In [42]:
#Exercise 1.6
#Exercise 1.4 f) Using DataFrames (Basic Analysis for Recommender Systems)
#Min number of ratings an item has received
userLibDF.select("paper_id").groupBy("paper_id").count().agg(Funct.min("count")).show()

+----------+
|min(count)|
+----------+
|         3|
+----------+



In [43]:
#Exercise 1.6
#Exercise 1.4 g) Using DataFrames (Basic Analysis for Recommender Systems)
#Max number of ratings an item has received
userLibDF.select("paper_id").groupBy("paper_id").count().agg(Funct.max("count")).show()

+----------+
|max(count)|
+----------+
|       924|
+----------+



In [44]:
#Exercise 1.6
#Exercise 1.4 h) Using DataFrames (Basic Analysis for Recommender Systems)
#Average number of ratings of items
userLibDF.select("paper_id").groupBy("paper_id").count().agg(Funct.mean("count")).show()

+----------------+
|      avg(count)|
+----------------+
|4.81453867119172|
+----------------+



In [45]:
#Exercise 1.6
#Exercise 1.4 i) Using DataFrames (Basic Analysis for Recommender Systems)
#Average number of ratings of items
userLibDF.select("paper_id").groupBy("paper_id").count().agg(Funct.stddev("count")).show()

+------------------+
|stddev_samp(count)|
+------------------+
| 5.477818208917287|
+------------------+



## Exercise 1.6 (Tasks on top of Dataframes)
### Exercise 1.3 (Joining Collections)

In [53]:
#Exercise 1.6
#Exercise 1.3 Using Dataframes (Joining collections)
#Joining the userLib dataframe with the papersDF dataframe over paper_id
#Dropping any row that has null value in the abstract column
joinedDF = userLibDF.join(papersDF, userLibDF.paper_id == papersDF.paper_id, "inner")\
                        .select(userLibDF.user_hash_id,\
                                Funct.explode(Funct.split(papersDF.abstract," "))\
                                .alias("word"))\
                        .na.drop("any")

joinedDF.show(20, truncate=True)

0:00:00.025654
+--------------------+---------------+
|        user_hash_id|           word|
+--------------------+---------------+
|8ac80c1b48f33b5c2...|multistability,|
|8ac80c1b48f33b5c2...|            the|
|8ac80c1b48f33b5c2...|       capacity|
|8ac80c1b48f33b5c2...|             to|
|8ac80c1b48f33b5c2...|        achieve|
|8ac80c1b48f33b5c2...|       multiple|
|8ac80c1b48f33b5c2...|       internal|
|8ac80c1b48f33b5c2...|         states|
|8ac80c1b48f33b5c2...|             in|
|8ac80c1b48f33b5c2...|       response|
|8ac80c1b48f33b5c2...|             to|
|8ac80c1b48f33b5c2...|              a|
|8ac80c1b48f33b5c2...|         single|
|8ac80c1b48f33b5c2...|            set|
|8ac80c1b48f33b5c2...|             of|
|8ac80c1b48f33b5c2...|       external|
|8ac80c1b48f33b5c2...|        inputs,|
|8ac80c1b48f33b5c2...|             is|
|8ac80c1b48f33b5c2...|            the|
|8ac80c1b48f33b5c2...|       defining|
+--------------------+---------------+
only showing top 20 rows



In [54]:
#Exercise 1.6
#Exercise 1.3 Using Dataframes (Joining collections)
#Subtracting the stop words from the user and abstract words dataframe
withoutStpWrdDF = joinedDF.join(stopWordsDF, joinedDF.word==stopWordsDF.stop_word, how="left_anti")
withoutStpWrdDF.show(20, truncate=True)

+--------------------+---------------+
|        user_hash_id|           word|
+--------------------+---------------+
|8ac80c1b48f33b5c2...|multistability,|
|8ac80c1b48f33b5c2...|       capacity|
|8ac80c1b48f33b5c2...|        achieve|
|8ac80c1b48f33b5c2...|       multiple|
|8ac80c1b48f33b5c2...|       internal|
|8ac80c1b48f33b5c2...|         states|
|8ac80c1b48f33b5c2...|       response|
|8ac80c1b48f33b5c2...|         single|
|8ac80c1b48f33b5c2...|            set|
|8ac80c1b48f33b5c2...|       external|
|8ac80c1b48f33b5c2...|        inputs,|
|8ac80c1b48f33b5c2...|       defining|
|8ac80c1b48f33b5c2...| characteristic|
|8ac80c1b48f33b5c2...|        switch.|
|8ac80c1b48f33b5c2...|     biological|
|8ac80c1b48f33b5c2...|       switches|
|8ac80c1b48f33b5c2...|      essential|
|8ac80c1b48f33b5c2...|  determination|
|8ac80c1b48f33b5c2...|           cell|
|8ac80c1b48f33b5c2...|           fate|
+--------------------+---------------+
only showing top 20 rows



In [57]:
#Exercise 1.6
#Exercise 1.3 Using Dataframes (Joining collections)
#Grouping the joinedDf dataframe over 'user_hash_id' and 'word' columns
#Sorting in ascending order the result w.r.t the 'user_hash_id' column
then = datetime.now()

groupedDF = withoutStpWrdDF.groupBy("user_hash_id", "word")\
                    .agg(Funct.count("word").alias("word_count"))\
                    .sort(Funct.asc("user_hash_id"))

groupedDF.show(10, truncate=True)

print(datetime.now() - then)

+--------------------+------------+----------+
|        user_hash_id|        word|word_count|
+--------------------+------------+----------+
|00095808cdc611fb5...|instructors,|         1|
|00095808cdc611fb5...|     induced|         1|
|00095808cdc611fb5...|      aiming|         1|
|00095808cdc611fb5...|experimental|         1|
|00095808cdc611fb5...|    acquired|         1|
|00095808cdc611fb5...|        text|         3|
|00095808cdc611fb5...|    paradigm|         1|
|00095808cdc611fb5...|       apply|         1|
|00095808cdc611fb5...|      vision|         1|
|00095808cdc611fb5...|hierarchical|         1|
+--------------------+------------+----------+
only showing top 10 rows

0:00:19.008212


In [58]:
#Exercise 1.6
#Exercise 1.3 Using Dataframes (Joining collections)
#Creating a window and using 'rank' function over the window to give a rank to the words per user and according 
#to the word count
then = datetime.now()

window = Window.partitionBy("user_hash_id").orderBy(Funct.desc("word_count"))
rankedResultDF = groupedDF.withColumn("ranking", Funct.rank().over(window))
filteredRankedResDF = rankedResultDF.filter(rankedResultDF.ranking < 10).select("user_hash_id", "word", "ranking")
filteredRankedResDF.show(30, truncate=True)

print(datetime.now() - then)

+--------------------+-------------+-------+
|        user_hash_id|         word|ranking|
+--------------------+-------------+-------+
|00095808cdc611fb5...|       errors|      1|
|00095808cdc611fb5...|         text|      2|
|00095808cdc611fb5...|       impact|      3|
|00095808cdc611fb5...|          web|      3|
|00095808cdc611fb5...|         list|      3|
|00095808cdc611fb5...|   department|      3|
|00095808cdc611fb5...|  information|      3|
|00095808cdc611fb5...|        error|      3|
|00095808cdc611fb5...|         site|      3|
|00095808cdc611fb5...|      problem|      3|
|000a87940e47aef8c...|      million|      1|
|000a87940e47aef8c...|     patterns|      1|
|000a87940e47aef8c...|      blogger|      1|
|000a87940e47aef8c...|      reveals|      1|
|000a87940e47aef8c...|      entries|      1|
|000a87940e47aef8c...|demographics,|      1|
|000a87940e47aef8c...|        blogs|      1|
|000a87940e47aef8c...|     bloggers|      1|
|000a87940e47aef8c...|     activity|      1|
|000a87940

In [59]:
#Exercise 1.6
#Exercise 1.3 Using Dataframes (Joining collections)
#Transforming the dataframe so that the top 10 most used words are combined into a list for eash user 
then = datetime.now()

finalResultDF = filteredRankedResDF.groupBy("user_hash_id").agg(Funct.collect_list("word").alias("word"))
finalResultDF.show(10, truncate=True)

print(datetime.now() - then)

+--------------------+--------------------+
|        user_hash_id|                word|
+--------------------+--------------------+
|00095808cdc611fb5...|[errors, text, im...|
|000a87940e47aef8c...|[million, pattern...|
|000ac87bf9c1623ee...|[consciousness, p...|
|000e5a48701b81078...|[sleep, arousal, ...|
|000e984f80e531b77...|[online, reviews,...|
|000ea48a81566915b...|[virtual, , users...|
|001933555c2b77453...|[game, motivation...|
|0019c5f6eb3731d3f...|[media, research,...|
|001a1d3a428a4acfd...|[place, sense, de...|
|001a28e625cbfa51c...|[information, , b...|
+--------------------+--------------------+
only showing top 10 rows

0:00:27.431064


In [60]:
#Exercise 1.6
#Exercise 1.3 Using Dataframes (Joining collections)
#Saving the result as a text file
then = datetime.now()

finalResultDF.select(finalResultDF.user_hash_id, finalResultDF.word.cast("string")) \
    .write.csv(path='dataframe_result',mode='overwrite',sep=' ')

print(datetime.now() - then)

0:00:32.554247


The dataframe version is a lot faster, doing the same procedure in just 2 minutes (comparing to 5 minutes for RDD).