In [2]:
#Import packages.
from pyspark.sql import SparkSession
from pyspark.sql import functions as Func
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, IDF
from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT, Vectors, Vector
from pyspark.ml.clustering import KMeans, LDA
from nltk.stem import PorterStemmer
import math
import numpy as np
import itertools
#import string

#Create a spark session.
sparkSession = SparkSession.builder.appName("Experiment4").getOrCreate()
#Get default configurations
sparkSession.sparkContext._conf.getAll()
#Update default configurations
conf = sparkSession.sparkContext._conf.setAll([('spark.executor.memory', '16g')\
                                        , ('spark.app.name', 'Spark Updated Conf')\
                                        , ('spark.executor.cores', '8')\
                                        , ('spark.cores.max', '8')\
                                        , ('spark.driver.memory','16g')\
                                        ,('spark.driver.maxResultSize','16g')])
#Stop the current Spark Session
sparkSession.sparkContext.stop()
#Create a Spark Session
sparkSession = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
#Loading stopwords_en.txt data into a dataframe.
stopWordsDF = sparkSession.read\
                .load("/home/jovyan/work/stopwords_en.txt", format="text", sep=" ", inferSchema="true", header="false")\
                .toDF('stop_word')
stopWordsDF.show(5, truncate=True)

+---------+
|stop_word|
+---------+
|        a|
|     able|
|    about|
|    above|
|according|
+---------+
only showing top 5 rows



In [104]:
#Loading users_libraries.txt data into a dataframe.
#Defining the column names.
user_columns = ['raw_data']
rawUsersDF = sparkSession.read\
            .load("/home/jovyan/work/mod_users_libraries3.txt", format="text", sep=";",\
                  inferSchema="true", quote='"', header="false")\
            .toDF(*user_columns)

usersDF = rawUsersDF.select(Func.split(rawUsersDF.raw_data, ";").getItem(0).alias("user_hash_id"),\
                           Func.split(rawUsersDF.raw_data, ";").getItem(1).alias("user_library"))
usersDF.show(5)

+--------------------+--------------------+
|        user_hash_id|        user_library|
+--------------------+--------------------+
|28d3f81251d94b097...|3929762,503574,58...|
|d0c9aaa788153daea...|2080631,6343346,5...|
|f05bcffe7951de9e5...|1158654,478707,12...|
|ca4f1ba4094011d9a...|              278019|
|d1d41a15201915503...|6610569,6493797,6...|
+--------------------+--------------------+
only showing top 5 rows



In [105]:
#Loading papers.csv data into a dataframe.
#Defining the column names.
paper_columns = ['paper_id', 'type', 'journal', 'book_title', \
           'series', 'publisher', 'pages', 'volume', \
           'number', 'year', 'month', 'postedate',\
           'address', 'title', 'abstract']

papersDF = sparkSession.read\
            .load("/home/jovyan/work/papers.csv", format="csv", sep=",", inferSchema="true", quote='"', header="false")\
            .toDF(*paper_columns)
papersDF.show(2, truncate=True)

+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------------------+
|paper_id|   type|             journal|book_title|series|publisher|pages|volume|number|year|month|          postedate|address|               title|            abstract|
+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------------------+
|   80546|article|biology and philo...|      null|  null|     null|   17|    19|     2|2004|  mar|2005-01-26 21:35:21|   null|the arbitrariness...|the genetic code ...|
| 5842862|article|      molecular cell|      null|  null| elsevier|    2|    35|     6|2009|  sep|2009-09-30 17:11:23|   null|how to choose a g...|choosing good pro...|
+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------

In [109]:
#touserpaper = explodUsersDF.select("paper_id").distinct()
#papersDF = touserpaper.join(papersDF, "paper_id")
#papersDF.show(2)

In [124]:
#papersDF.count()
#filteredDF.count()

In [108]:
#Concatenate the title and abstract fields of papers together.
textDF = papersDF.select(papersDF.paper_id, Func.concat_ws(" ", papersDF.title, papersDF.abstract).alias("text"))
textDF.show(5)

+--------+--------------------+
|paper_id|                text|
+--------+--------------------+
|   80546|the arbitrariness...|
| 5842862|how to choose a g...|
| 1242600|how to write cons...|
|  305755|the structure of ...|
| 6603134|how to build a mo...|
+--------+--------------------+
only showing top 5 rows



In [110]:
#Perform tokenization and remove words less than 3 characters.
#Keep the words containing "-" and "_" characters.
reTokenizer = RegexTokenizer(inputCol="text", outputCol="words", minTokenLength=4, pattern="[^-_\\w]")
wordsDF = reTokenizer.transform(textDF)
wordsDF.head()

Row(paper_id='80546', text="the arbitrariness of the genetic code the genetic code has been regarded as arbitrary in the sense that the codon-amino acid assignments could be different than they actually are. this general idea has been spelled out differently by previous, often rather implicit accounts of arbitrariness. they have drawn on the frozen accident theory, on evolutionary contingency, on alternative causal pathways, and on the absence of direct stereochemical interactions between codons and amino acids. it has also been suggested that the arbitrariness of the genetic code justifies attributing semantic information to macromolecules, notably to {dna}. i argue that these accounts of arbitrariness are unsatisfactory. i propose that the code is arbitrary in the sense of jacques monod's concept of chemical arbitrariness: the genetic code is arbitrary in that any codon requires certain chemical and structural properties to specify a particular amino acid, but these properties are no

In [111]:
#Function to remove the characters "-" and "_" from words.
def concatConnectedWords(wordList):
    wordSet = set(wordList)
    identity = str.maketrans("", "", "-_")
    wordSet = [word.translate(identity) for word in wordSet]
    return wordSet

udf_concatConnectedWords = Func.udf(concatConnectedWords, ArrayType(StringType()))

In [112]:
#removing the characters "-" and "_" from words.
removedConnectorsDF = wordsDF.select(wordsDF.paper_id,\
                                     Func.lit(udf_concatConnectedWords(wordsDF.words)).alias("processed_words"))
removedConnectorsDF.take(2)

[Row(paper_id='80546', processed_words=['these', 'actually', 'certain', 'stereochemical', 'notably', 'that', 'concept', 'particular', 'code', 'chemical', 'could', 'they', 'direct', 'propose', 'with', 'about', 'chemistry', 'general', 'regarded', 'notion', 'this', 'nucleic', 'information', 'drawn', 'specify', 'evolution', 'neither', 'absence', 'genetic', 'implicit', 'unsatisfactory', 'previous', 'frozen', 'between', 'alternative', 'amino', 'attributing', 'required', 'codonamino', 'been', 'have', 'structural', 'several', 'properties', 'maintain', 'necessary', 'requires', 'sense', 'hypotheses', 'pathways', 'acids', 'causal', 'codons', 'arbitrary', 'often', 'theory', 'also', 'macromolecules', 'assignments', 'rather', 'accounts', 'contingency', 'acid', 'monod', 'virtue', 'sufficient', 'jacques', 'principle', 'argue', 'idea', 'arbitrariness', 'suggested', 'compatible', 'than', 'recent', 'different', 'semantic', 'codon', 'interactions', 'justifies', 'accident', 'spelled', 'evolutionary', 'diff

In [113]:
#Creating a list of stop words.
stopWordsList = stopWordsDF.agg(Func.collect_list(stopWordsDF.stop_word)).rdd.flatMap(lambda row: row[0])
stopWordsList.take(10)

['a',
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards']

In [114]:
#Removing the stop words.
remover = StopWordsRemover(inputCol="processed_words", outputCol="words", stopWords=stopWordsList.collect())
withoutStopWordsDF = remover.transform(removedConnectorsDF)
withoutStopWordsDF.select("words").first()

Row(words=['stereochemical', 'notably', 'concept', 'code', 'chemical', 'direct', 'propose', 'chemistry', 'general', 'regarded', 'notion', 'nucleic', 'information', 'drawn', 'evolution', 'absence', 'genetic', 'implicit', 'unsatisfactory', 'previous', 'frozen', 'alternative', 'amino', 'attributing', 'required', 'codonamino', 'structural', 'properties', 'maintain', 'requires', 'sense', 'hypotheses', 'pathways', 'acids', 'causal', 'codons', 'arbitrary', 'theory', 'macromolecules', 'assignments', 'accounts', 'contingency', 'acid', 'monod', 'virtue', 'sufficient', 'jacques', 'principle', 'argue', 'idea', 'arbitrariness', 'suggested', 'compatible', 'recent', 'semantic', 'codon', 'interactions', 'justifies', 'accident', 'spelled', 'evolutionary', 'differently'])

In [115]:
#Function to perform stemming.
stemmer = PorterStemmer()

def stemming(wordList):
    wordSet = set(wordList)
    wordSet = [stemmer.stem(word) for word in wordList]
    return sorted(wordSet)

#User defined function to perform stemming.
udf_stemming = Func.udf(stemming, ArrayType(StringType()))

In [116]:
#Performing stemming.
stemmedWordsDF = withoutStopWordsDF.withColumn("stemmed_words", udf_stemming(withoutStopWordsDF.words))

In [117]:
#Counting the number of papers (frequency) in which a particular word appears. (Document Frequency)
explodedDF = stemmedWordsDF.select(stemmedWordsDF.paper_id,\
                                   Func.explode(stemmedWordsDF.stemmed_words).alias("single_word"))\
                            .distinct()\
                            .groupBy("single_word")\
                            .agg(Func.count("single_word")\
                                 .alias("df"))
explodedDF.show(10)

+-----------+---+
|single_word| df|
+-----------+---+
|       1159|  1|
|      input| 53|
|    persist|  8|
|     bhi040|  1|
|    disclos|  4|
|      trail|  3|
|      carlo|  8|
|     travel|  9|
|     harder|  2|
|       hope|  9|
+-----------+---+
only showing top 10 rows



In [122]:
#Counting the number of unique papers.
#Setting the upper and lower bounds.
uniqPaperCount = stemmedWordsDF.count()
upperBoundary = 0.1*uniqPaperCount
lowerBoundary = 20
#filter the dataframe to select only words that appear in more than 20 papers and
#less than 10 percent of the total number of papers.
filteredDF = explodedDF.filter((explodedDF.df>=lowerBoundary) & (explodedDF.df<=upperBoundary))
filteredDF.show(10)

+-----------+---+
|single_word| df|
+-----------+---+
|      input| 53|
|       oper| 62|
|      curat| 37|
|    explain| 51|
|    classif| 81|
|        map| 71|
|     growth| 44|
|  character| 68|
|      decis| 32|
|       grow| 28|
+-----------+---+
only showing top 10 rows



In [125]:
#Limiting the number of important terms to 1000.
termsDF = filteredDF.sort("df", ascending=False).limit(1000).select("single_word")
termsDF.show(10)

+-----------+
|single_word|
+-----------+
|     integr|
|     improv|
| understand|
|     design|
|  algorithm|
| experiment|
|       test|
|   knowledg|
|     combin|
|      relat|
+-----------+
only showing top 10 rows



In [126]:
#Generating an index to the top 1000 important terms starting with the index 0.
termsDF = termsDF.withColumn("word_id", Func.monotonically_increasing_id())

wordWinSpec = Window.orderBy("word_id")

termsDF = termsDF.withColumn("word_id",Func.row_number().over(wordWinSpec)-1)
termsDF.show(10, truncate=True)

+-----------+-------+
|single_word|word_id|
+-----------+-------+
|     improv|      0|
|     integr|      1|
| understand|      2|
|     design|      3|
|  algorithm|      4|
|     combin|      5|
|   knowledg|      6|
| experiment|      7|
|       test|      8|
|      relat|      9|
+-----------+-------+
only showing top 10 rows



In [127]:
#Counting the number of times a word appeared in a particular paper (word count per paper) (term frequency).
tempDF = stemmedWordsDF.select(stemmedWordsDF.paper_id,\
                               Func.explode(stemmedWordsDF.stemmed_words).alias("single_word"))\
                        .groupBy("paper_id", "single_word")\
                        .agg(Func.count("single_word").alias("tf"))
tempDF.show(10)

+--------+-------------+---+
|paper_id|  single_word| tf|
+--------+-------------+---+
|   90558|       common|  1|
| 1042553|bonferronityp|  1|
|  469428|       requir|  1|
| 5394760|      develop|  1|
|  820297|         oper|  1|
| 1279898|        studi|  1|
|  557229|      primari|  1|
|   94348|         vari|  1|
|  100186|      progeni|  1|
|  227173|        cumul|  1|
+--------+-------------+---+
only showing top 10 rows



In [128]:
#Creating a dataframe with term index and count of the word per paper.
#Creating a column containing a mapping of term index -> count per paper.
joinedResult = tempDF.join(termsDF, "single_word").withColumn("map", Func.create_map("word_id", "tf"))\
                    .groupBy("paper_id").agg(Func.collect_list("map").alias("map_list"))
joinedResult.show(10)

+--------+--------------------+
|paper_id|            map_list|
+--------+--------------------+
| 7065512|[[533 -> 1], [404...|
| 5910752|[[585 -> 1], [64 ...|
| 1843282|[[357 -> 1], [209...|
| 2706950|[[78 -> 1], [230 ...|
| 4169015|[[85 -> 1], [786 ...|
| 2180593|[[5 -> 1], [65 ->...|
|  227173|[[584 -> 1], [712...|
|  166220|[[397 -> 1], [249...|
| 1089946|[[31 -> 1], [359 ...|
| 1458475|[[707 -> 1], [623...|
+--------+--------------------+
only showing top 10 rows



In [129]:
#Function to create a sparse vector for each paper.
def toSparse(mapList):
    pairs = []
    for map_val in mapList:
        pairs += map_val.items()
    pairs = sorted(pairs, key=lambda pair: pair[0])
    return SparseVector(1000, [x[0] for x in pairs], [x[1] for x in pairs])

#Creating a user defined function.
udf_toSparse = Func.udf(toSparse, VectorUDT())

In [130]:
#Creating the bag of words dataframe.
featurizedDataDF = joinedResult.select("paper_id",\
                                   udf_toSparse(joinedResult.map_list)\
                                   .alias("tf_vector"))
featurizedDataDF.first()

Row(paper_id='7065512', tf_vector=SparseVector(1000, {2: 1.0, 16: 1.0, 36: 1.0, 46: 1.0, 56: 1.0, 64: 2.0, 70: 1.0, 73: 1.0, 76: 2.0, 141: 1.0, 155: 1.0, 156: 1.0, 204: 1.0, 228: 1.0, 231: 1.0, 274: 1.0, 294: 1.0, 298: 1.0, 314: 1.0, 319: 2.0, 386: 1.0, 404: 1.0, 411: 2.0, 490: 1.0, 526: 1.0, 527: 1.0, 533: 1.0, 578: 1.0, 606: 1.0, 677: 1.0, 700: 1.0, 744: 1.0}))

In [131]:
#Initializing the IDF object.
sparkMlIdf = IDF(inputCol="tf_vector", outputCol="idf_vector")
#Training the data and creating a model.
sparkMlIdfModel = sparkMlIdf.fit(featurizedDataDF)
#Adding the output produced from IDF to the dataset as a separate column.
paperTfIdf = sparkMlIdfModel.transform(featurizedDataDF)
paperTfIdf.first()

Row(paper_id='7065512', tf_vector=SparseVector(1000, {2: 1.0, 16: 1.0, 36: 1.0, 46: 1.0, 56: 1.0, 64: 2.0, 70: 1.0, 73: 1.0, 76: 2.0, 141: 1.0, 155: 1.0, 156: 1.0, 204: 1.0, 228: 1.0, 231: 1.0, 274: 1.0, 294: 1.0, 298: 1.0, 314: 1.0, 319: 2.0, 386: 1.0, 404: 1.0, 411: 2.0, 490: 1.0, 526: 1.0, 527: 1.0, 533: 1.0, 578: 1.0, 606: 1.0, 677: 1.0, 700: 1.0, 744: 1.0}), idf_vector=SparseVector(1000, {2: 2.2961, 16: 2.3756, 36: 2.5119, 46: 2.5567, 56: 2.6036, 64: 5.2557, 70: 2.6783, 73: 2.6783, 76: 5.3914, 141: 2.9546, 155: 2.9775, 156: 2.9892, 204: 3.1278, 228: 3.1834, 231: 3.1834, 274: 3.321, 294: 3.4065, 298: 3.4065, 314: 3.4429, 319: 6.9232, 386: 3.6033, 404: 3.6707, 411: 7.3414, 490: 3.8484, 526: 3.9055, 527: 3.9055, 533: 3.9055, 578: 3.9979, 606: 4.0646, 677: 4.1738, 700: 4.213, 744: 4.2964}))

In [106]:
#Splitting the list of paper_ids into individual rows.
explodUsersDF = usersDF.select(usersDF.user_hash_id,\
                               Func.explode(Func.split(usersDF.user_library, ","))\
                               .alias("paper_id"))
explodUsersDF.show(5, truncate=True)

+--------------------+--------+
|        user_hash_id|paper_id|
+--------------------+--------+
|28d3f81251d94b097...| 3929762|
|28d3f81251d94b097...|  503574|
|28d3f81251d94b097...| 5819422|
|28d3f81251d94b097...| 4238883|
|28d3f81251d94b097...| 5788061|
+--------------------+--------+
only showing top 5 rows



In [132]:
#Creating a dataframe with user_hash_id as well as paper_id, tf, idf, tf-idf information.
joinUserPaperTfIdf = explodUsersDF.join(paperTfIdf, "paper_id")
joinUserPaperTfIdf.show(5)

+--------+--------------------+--------------------+--------------------+
|paper_id|        user_hash_id|           tf_vector|          idf_vector|
+--------+--------------------+--------------------+--------------------+
| 7065512|f1e1cd4ff25018273...|(1000,[2,16,36,46...|(1000,[2,16,36,46...|
| 5910752|f1e1cd4ff25018273...|(1000,[64,225,585...|(1000,[64,225,585...|
| 1843282|5ba96b191db3c3c02...|(1000,[22,38,69,1...|(1000,[22,38,69,1...|
| 2706950|3b715ebaf1f8f81a1...|(1000,[4,55,75,78...|(1000,[4,55,75,78...|
| 4169015|7c0081293b3988065...|(1000,[13,16,22,2...|(1000,[13,16,22,2...|
+--------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [133]:
#Grouping the tf-idf vectors with respect to the user.
userTfIdfList = joinUserPaperTfIdf.groupBy("user_hash_id").agg(Func.collect_list("idf_vector").alias("tf_idf_list"))
userTfIdfList.show(10)

+--------------------+--------------------+
|        user_hash_id|         tf_idf_list|
+--------------------+--------------------+
|f1e1cd4ff25018273...|[(1000,[2,16,36,4...|
|cf9c7f356092c34be...|[(1000,[20,24,29,...|
|488fb15e8c77f8054...|[(1000,[3,4,12,18...|
|ca4f1ba4094011d9a...|[(1000,[2,3,9,22,...|
|d503571e44a0373eb...|[(1000,[0,1,5,22,...|
|f05bcffe7951de9e5...|[(1000,[0,2,5,6,1...|
|c6b59086a0bbac141...|[(1000,[0,2,3,9,1...|
|ed571b13a83199c9c...|[(1000,[0,9,19,20...|
|586c867a0688250ac...|[(1000,[3,4,10,11...|
|e17a1c14ffca94104...|[(1000,[4,425],[2...|
+--------------------+--------------------+
only showing top 10 rows



In [30]:
#Function to add elements of a list of sparse vectors.
def addSparseVec(vector_list):
    result = SparseVector(1000, list(range(0, 1000)), np.zeros(1000))
    for vector in vector_list:
        result = np.add(result, vector).tolist()

    vector_args = len(result), [i for i, x in enumerate(result) if x != 0], [x for x in result if x != 0] 
    return Vectors.sparse(*vector_args)
    
udf_addSparseVec = Func.udf(addSparseVec, VectorUDT())

In [134]:
#Creating the user profile.
userProfileDF = userTfIdfList.withColumn("features", udf_addSparseVec(userTfIdfList.tf_idf_list))
userFeaturesDF = userProfileDF.select("user_hash_id", "features")
userFeaturesDF.first()

Row(user_hash_id='f1e1cd4ff25018273aafc0c68fbb5a2f', features=SparseVector(1000, {0: 38.7362, 1: 36.4576, 2: 45.9219, 3: 25.4529, 4: 27.9114, 5: 27.9843, 6: 81.6209, 7: 18.6562, 8: 11.6601, 9: 49.101, 10: 30.3958, 11: 35.2573, 12: 11.7524, 13: 25.8554, 14: 28.3559, 15: 23.693, 16: 64.1424, 17: 26.1321, 18: 33.259, 19: 52.2642, 20: 19.1595, 21: 31.2189, 22: 45.7522, 23: 9.685, 24: 9.7118, 25: 19.4774, 26: 56.1536, 27: 31.9171, 28: 68.7444, 29: 14.7309, 30: 19.6966, 31: 44.4429, 32: 17.2833, 33: 24.8313, 34: 22.3482, 35: 62.4354, 36: 12.5596, 37: 25.1191, 38: 22.6731, 39: 15.1597, 40: 7.5799, 41: 10.1362, 42: 60.8173, 43: 17.8436, 44: 40.9071, 45: 25.5669, 46: 20.4536, 47: 46.2974, 48: 49.0174, 49: 23.2893, 51: 23.3605, 52: 31.1473, 53: 38.9342, 54: 18.1693, 55: 59.699, 56: 164.0254, 57: 5.2072, 58: 33.9509, 59: 23.5045, 60: 20.8929, 61: 26.1971, 62: 34.1623, 63: 28.9066, 64: 168.1837, 65: 23.7996, 66: 39.666, 67: 5.3224, 68: 21.3577, 69: 16.0183, 70: 45.5311, 71: 37.4962, 72: 26.783, 73

## Exercise 4.2 (Content-based recommendations: similarity metric)
To compute the similarity between the User and Item(paper) profiles.

In [135]:
#Exercise 4.2
#Function to calculate the cosine similarity metric
def cosineSim(user_profile, item_profile):
    u_leng = np.linalg.norm(user_profile, 2)
    p_leng = np.linalg.norm(item_profile, 2)

    if u_leng*p_leng == 0:
        denominator = 1
    else:
        denominator = u_leng*p_leng
        
    similarity = float(np.dot(user_profile, item_profile)/denominator)
    
    return similarity

udf_cosineSim = Func.udf(cosineSim, FloatType())    

## Exercise 4.3 (Content-based recommendations)
a) Implement CBRS tf_idf <br>
b) Implement CBRS lda <br>
c) Show top-k recommendations for user with user_hash_id = 1eac022a97d683eace8815545ce3153f <br>

In [136]:
#Exercise 4.3
#Get the papers that the specific user has rated.
specUserRatedPaper = explodUsersDF.filter(explodUsersDF.user_hash_id=="1eac022a97d683eace8815545ce3153f")
specUserRatedPaper.show(10)

+--------------------+--------+
|        user_hash_id|paper_id|
+--------------------+--------+
|1eac022a97d683eac...| 3973229|
|1eac022a97d683eac...|  322433|
|1eac022a97d683eac...| 5732042|
|1eac022a97d683eac...| 8004203|
|1eac022a97d683eac...|  421656|
|1eac022a97d683eac...| 3106933|
|1eac022a97d683eac...| 1121661|
|1eac022a97d683eac...|  368203|
|1eac022a97d683eac...| 6439894|
|1eac022a97d683eac...|12786786|
+--------------------+--------+
only showing top 10 rows



CBRS tf_idf

In [167]:
#Exercise 4.3
#Function to get the top-k recommendation of a user
def recommendTfIdf(user_id, user_profile, rated_paper_df, k):
    result_arr = []
    #Get the papers not yet rated by the user
    unratedPapers = paperTfIdf.join(rated_paper_df, "paper_id", how="left_anti")\
                            .select("paper_id", "idf_vector")
    #Loop over the unrated papers
    for row_df in unratedPapers.collect():
        #Calculate the cosine similarity given the user and paper profiles and store in a list
        result_arr.append((user_id, row_df.paper_id, cosineSim(user_profile, row_df.idf_vector)))
    #Sort the list of tuples according to the cosine similarity in descending order
    result_arr.sort(key=lambda tup: tup[2], reverse=True)
    
    if k > len(result_arr):
        k = len(result_arr)
    
    return result_arr[0:k]

In [168]:
#Exercise 4.3
#Get the id and profile of the user in focus and store the recommendations as a list of tuples
resultIdf = []
for uid, uprofile in userFeaturesDF.filter(userFeaturesDF.user_hash_id == "1eac022a97d683eace8815545ce3153f")\
                .rdd.map(lambda x: (x[0], x[1])).collect():
    resultIdf = resultIdf + recommendTfIdf(uid, uprofile, specUserRatedPaper, 10)

In [169]:
#Exercise 4.3
#Create a dataframe to contain the recommendations
specUserTop10 = sparkSession.createDataFrame(resultIdf, ["user_hash_id", "paper_id", "score"])
specUserTop10.collect()

[Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='3398098', score=0.34487821642211375),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='507529', score=0.30904766609006196),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='1321106', score=0.2986204526550599),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='3474061', score=0.2933958050265111),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='3909566', score=0.2883505521836707),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='848992', score=0.2870553009714794),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='4041004', score=0.28668993229309436),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='3483504', score=0.2828635359472608),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='3581787', score=0.28151584574154287),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='3112352', score=0.2794862

CBRS lda

In [192]:
#Initializing an LDA object.
lda = LDA(featuresCol="tf_vector", k=40, maxIter=10)
#Creating the model.
ldaModel = lda.fit(featurizedDataDF.select("tf_vector"))

In [193]:
#Creating a dataframe with the topic distibution generated from LDA.
ldaTransformed = ldaModel.transform(featurizedDataDF)
ldaTransformed.first()

Row(paper_id='7065512', tf_vector=SparseVector(1000, {2: 1.0, 16: 1.0, 36: 1.0, 46: 1.0, 56: 1.0, 64: 2.0, 70: 1.0, 73: 1.0, 76: 2.0, 141: 1.0, 155: 1.0, 156: 1.0, 204: 1.0, 228: 1.0, 231: 1.0, 274: 1.0, 294: 1.0, 298: 1.0, 314: 1.0, 319: 2.0, 386: 1.0, 404: 1.0, 411: 2.0, 490: 1.0, 526: 1.0, 527: 1.0, 533: 1.0, 578: 1.0, 606: 1.0, 677: 1.0, 700: 1.0, 744: 1.0}), topicDistribution=DenseVector([0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.7642, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0008, 0.0007, 0.0007, 0.2106, 0.0007, 0.0007, 0.0007, 0.0007]))

In [194]:
#Joining the topic distribution to the users.
joinUserLDA = explodUsersDF.join(ldaTransformed, "paper_id")
joinUserLDA.show(5, truncate=True)

+--------+--------------------+--------------------+--------------------+
|paper_id|        user_hash_id|           tf_vector|   topicDistribution|
+--------+--------------------+--------------------+--------------------+
| 7065512|f1e1cd4ff25018273...|(1000,[2,16,36,46...|[6.58188466065330...|
| 5910752|f1e1cd4ff25018273...|(1000,[64,225,585...|[0.00610750251118...|
| 1843282|5ba96b191db3c3c02...|(1000,[22,38,69,1...|[0.00221590860822...|
| 2706950|3b715ebaf1f8f81a1...|(1000,[4,55,75,78...|[0.00143317230407...|
| 4169015|7c0081293b3988065...|(1000,[13,16,22,2...|[6.58188466065329...|
+--------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [195]:
#Grouping the topic distribution with respect to the users.
grpUserTopicDistDF = joinUserLDA.groupBy("user_hash_id")\
                                .agg(Func.collect_list("topicDistribution").alias("topic_distribution_ls"))
grpUserTopicDistDF.first()

Row(user_hash_id='f1e1cd4ff25018273aafc0c68fbb5a2f', topic_distribution_ls=[DenseVector([0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.7642, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0008, 0.0007, 0.0007, 0.2106, 0.0007, 0.0007, 0.0007, 0.0007]), DenseVector([0.0061, 0.0061, 0.0062, 0.0061, 0.0063, 0.0062, 0.0061, 0.0061, 0.0062, 0.0062, 0.0061, 0.0061, 0.0061, 0.0061, 0.0061, 0.0061, 0.0062, 0.0067, 0.0061, 0.0061, 0.0061, 0.0061, 0.0061, 0.0061, 0.0061, 0.0061, 0.0061, 0.0061, 0.0061, 0.0062, 0.0065, 0.0062, 0.007, 0.0061, 0.0061, 0.7589, 0.0062, 0.0061, 0.0061, 0.0061]), DenseVector([0.0082, 0.0082, 0.0082, 0.0082, 0.0085, 0.0083, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0083, 0.6787, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0082, 0.0083, 0.

In [196]:
#Function to sum a list of dense vectors.
def addDenseVec(dense_vec_list):
    result = DenseVector(np.zeros(40))
    for dense_vector in dense_vec_list:
        result = np.add(result, dense_vector).tolist() 
    return DenseVector(result)

#Creating a user defined function.
udf_addDenseVec = Func.udf(addDenseVec, VectorUDT())

In [211]:
#Summation of the paper topics vectors.
userLDAProfile = grpUserTopicDistDF.withColumn("features",\
                                               Func.lit(udf_addDenseVec(grpUserTopicDistDF.topic_distribution_ls)))

userLDAProfile.select("user_hash_id", "features").first()

Row(user_hash_id='f1e1cd4ff25018273aafc0c68fbb5a2f', features=DenseVector([0.3822, 1.2192, 1.7299, 0.3828, 6.2319, 0.716, 0.3826, 0.3831, 8.4454, 0.3851, 0.8863, 0.3825, 0.383, 0.3823, 0.3839, 1.186, 1.0575, 35.5781, 0.3824, 0.3826, 0.3823, 0.3823, 0.3822, 0.3822, 0.3825, 0.3822, 0.3829, 0.3829, 0.3823, 0.3875, 32.8132, 1.1376, 35.1269, 0.3823, 0.3822, 25.8921, 2.0259, 0.3822, 0.3831, 0.3823]))

In [198]:
#Exercise 4.3
#Function to get the top-k recommendation of a user
def recommendLda(user_id, user_profile, rated_paper_df, k):
    result_arr = []
    #Get the papers not yet rated by the user
    unratedPapers = ldaTransformed.join(rated_paper_df, "paper_id", how="left_anti")\
                            .select("paper_id", "topicDistribution")
    #Loop over the unrated papers
    for row_df in unratedPapers.collect():
        #Calculate the cosine similarity given the user and paper profiles and store in a list
        result_arr.append((user_id, row_df.paper_id, cosineSim(user_profile, row_df.topicDistribution)))
    #Sort the list of tuples according to the cosine similarity in descending order
    result_arr.sort(key=lambda tup: tup[2], reverse=True)
    
    if k > len(result_arr):
        k = len(result_arr)
    
    return result_arr[0:k]

In [215]:
#Exercise 4.3
#Get the id and profile of the user in focus and store the recommendations as a list of tuples
resultLda = []
for uid, uprofile in userLDAProfile.filter(userLDAProfile.user_hash_id == "1eac022a97d683eace8815545ce3153f")\
                .rdd.map(lambda x: (x[0], x[2])).collect():
    resultLda = resultLda + recommendLda(uid, uprofile, specUserRatedPaper, 10)

In [216]:
#Exercise 4.3
#Create a dataframe to contain the recommendations
specUserTop10Lda = sparkSession.createDataFrame(resultLda, ["user_hash_id", "paper_id", "score"])
specUserTop10Lda.collect()

[Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='525396', score=0.9968105400696438),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='11536389', score=0.996741283895792),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='261290', score=0.9961329302196683),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='2373101', score=0.996097344438993),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='11733005', score=0.9960232326441464),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='1043194', score=0.9954499215563196),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='1320137', score=0.9954185880350954),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='8310458', score=0.9951383904363118),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='3273038', score=0.9950104584796386),
 Row(user_hash_id='1eac022a97d683eace8815545ce3153f', paper_id='1677653', score=0.99363508903

## Exercise 4.4 (Sampling and data preparation)
a) Randomly select n users in a sample set. <br>
b) Split library of each user into training(80%) and test(20%) sets. <br>
c) Compute user profile using TF-IDF and LDA

In [244]:
#Exercise 4.4
#Function to get training and test papers
def get_training(paper_id_list, type_data):
    data_set = []
    paper_set = set(paper_id_list)
    #Get length of list of papers rated by user
    length = len(paper_set)
    #Training set contains 80 percent of total papers rated by user
    train_low_idx = 0
    train_up_idx = math.floor(0.8*length)-1
    #Test set contains the rest of the papers rated by the user
    test_low_idx = train_up_idx + 1
    test_up_idx = length-1
    
    for idx, paper_id in enumerate(paper_set):
        #if TRAIN then generate training set
        if type_data == "TRAIN":
            if (idx >= train_low_idx and idx <= train_up_idx):
                data_set.append(paper_id)
        #if TEST then generate test set
        elif type_data == "TEST":
            if (idx >= test_low_idx and idx <= test_up_idx):
                data_set.append(paper_id)    
    return data_set

udf_get_training = Func.udf(get_training, ArrayType(StringType()))

In [237]:
#Exercise 4.4
#Function to get training and test sets
def sampler(n_users, users_df, paper_profile, profile_type):
    #Get n numbers of user from a sample set of users
    sampleUsersDF = users_df.sample(0.8).limit(n_users)
    
    #Create a dataframe with the user id and user library
    sampleUserLibDF = sampleUsersDF.select(sampleUsersDF.user_hash_id,\
                                           Func.explode(Func.split(sampleUsersDF.user_library, ","))\
                                           .alias("paper_id"))\
                                    .groupBy("user_hash_id")\
                                    .agg(Func.collect_list("paper_id")\
                                         .alias("paper_id_list"))
    
    #Get the training set dataframe with the training library
    trSetDF = sampleUserLibDF.withColumn("train_paper_list",\
                           udf_get_training(sampleUserLibDF.paper_id_list, Func.lit("TRAIN")))\
                                .select("user_hash_id", "train_paper_list")
    
    #Get the training set dataframe, each row containing individual paper_id
    trainSetDF = trSetDF.select("user_hash_id", Func.explode(trSetDF.train_paper_list).alias("paper_id"))
    
    #Get the test set dataframe with the test library
    tstSetDF = sampleUserLibDF.withColumn("test_paper_list",\
                           udf_get_training(sampleUserLibDF.paper_id_list, Func.lit("TEST")))\
                                .select("user_hash_id", "test_paper_list")

    #Get the test set dataframe, each row containing individual paper_id
    testSetDF = tstSetDF.select("user_hash_id", Func.explode(tstSetDF.test_paper_list).alias("paper_id"))
    
    #Joining with the profile of each paper
    joinTrItmProf = trainSetDF.join(paper_profile, "paper_id")
    
    if profile_type =="TF_IDF":
        #Generating the user profile based on the TF-IDF of the paper in the training set
        trUserProfList = joinTrItmProf.groupBy("user_hash_id").agg(Func.collect_list("idf_vector").alias("tf_idf_list"))
        trUserProfileDF = trUserTfIdfList.withColumn("features", udf_addSparseVec(trUserTfIdfList.tf_idf_list))
        trUserFeaturesDF = trUserProfileDF.select("user_hash_id", "features")
        
    elif profile_type == "LDA":
        #Generating the user profile based on the LDA of the paper in the training set
        trUserProfList = joinTrItmProf.groupBy("user_hash_id")\
                                .agg(Func.collect_list("topicDistribution").alias("topic_distribution_ls"))
        trUserProfileDF = grpTrUserTopicDistDF.withColumn("features",\
                                                           Func.lit(udf_addDenseVec(grpTrUserTopicDistDF.topic_distribution_ls)))
        trUserFeaturesDF = trUserProfileDF.select("user_hash_id", "features")
        
    return (trUserFeaturesDF, trainSetDF, tstSetDF)
        

## Exercise 4.5 (Off-line evaluation)
a) Generate 10 recommendations for each user (20) using CBRS tf-idf. <br>

In [None]:
userFeaturesIdf, trainSetIdf, testSetIdfa = sampler(20, usersDF, paperTfIdf, "TF_IDF")

In [170]:
#Exercise 4.5 a)
#Generate the top 10 and 100 recommendations for the users using CBRS tf-idf
resultIdf10 = []
resultIdf100 = []
for uid, uprofile in userFeaturesIdf.rdd.map(lambda x: (x[0], x[1])).collect():
    userRatedPaper = trainSetIdf.filter(trainSetIdf.user_hash_id==uid)
    resultIdf10 = resultIdf10 + recommendTfIdf(uid, uprofile, userRatedPaper, 10)
    resultIdf100 = resultIdf100 + recommendTfIdf(uid, uprofile, userRatedPaper, 100)

In [171]:
#Exercise 4.5 a)
#Create a dataframe to contain the top 10 recommendations
userTopIdf10 = sparkSession.createDataFrame(resultIdf10, ["user_hash_id", "paper_id", "score"])
userTopIdf10.show(15)

+--------------------+--------+-------------------+
|        user_hash_id|paper_id|              score|
+--------------------+--------+-------------------+
|f05bcffe7951de9e5...|10259199| 0.4115165624330962|
|f05bcffe7951de9e5...| 4038052|0.37463983301077103|
|f05bcffe7951de9e5...| 3486430| 0.3523025476408462|
|f05bcffe7951de9e5...|  556147| 0.3469728818394254|
|f05bcffe7951de9e5...| 3508967| 0.3437314101418278|
|f05bcffe7951de9e5...| 3752489|0.34220716025545106|
|f05bcffe7951de9e5...| 8667279| 0.3418505434804742|
|f05bcffe7951de9e5...| 3821740| 0.3405555236985056|
|f05bcffe7951de9e5...|  111881| 0.3384057593770682|
|f05bcffe7951de9e5...|11852474| 0.3350706871009224|
|a0bbf6bb9b1c818f3...| 7469728| 0.4944232566399789|
|a0bbf6bb9b1c818f3...| 5261538|0.46740456778932693|
|a0bbf6bb9b1c818f3...| 3619721| 0.4053103127442583|
|a0bbf6bb9b1c818f3...| 1223835| 0.3751995759502769|
|a0bbf6bb9b1c818f3...|12033178|0.35210770039330513|
+--------------------+--------+-------------------+
only showing

In [None]:
#Exercise 4.5 a)
#Create a dataframe to contain the top 100 recommendations
userTopIdf100 = sparkSession.createDataFrame(resultIdf100, ["user_hash_id", "paper_id", "score"])
userTopIdf100.show(15)

b) Get the hits (recommendations also found in the test set). <br>

In [174]:
#Exercise 4.5 b)
#Dataframe with the list of top 10 recommendations for each user
userTopIdfList10 = userTopIdf10.groupBy(userTopIdf10.user_hash_id).agg(Func.collect_list("paper_id").alias("recom_paper_list"))
userTopIdfList10.show(5)

+--------------------+--------------------+
|        user_hash_id|    recom_paper_list|
+--------------------+--------------------+
|f05bcffe7951de9e5...|[10259199, 403805...|
|a0bbf6bb9b1c818f3...|[7469728, 5261538...|
|01ecfc66d01b0dd84...|[7190864, 1089946...|
|c62b6c27979ffd314...|[884897, 12157068...|
|b656009a6efdc8b1a...|[1938132, 688160,...|
+--------------------+--------------------+
only showing top 5 rows



In [175]:
#Exercise 4.5 b)
#Dataframe with the list test paper list for each user
userTopTstIdf10 = testSetIdfa.join(userTopIdfList10, "user_hash_id")
userTopTstIdf10.show(5)

+--------------------+--------------------+--------------------+
|        user_hash_id|     test_paper_list|    recom_paper_list|
+--------------------+--------------------+--------------------+
|f05bcffe7951de9e5...|[3281478, 945310,...|[10259199, 403805...|
|a0bbf6bb9b1c818f3...|[1223835, 1123321...|[7469728, 5261538...|
|01ecfc66d01b0dd84...|    [970908, 808689]|[7190864, 1089946...|
|c62b6c27979ffd314...|   [884897, 1270598]|[884897, 12157068...|
|b656009a6efdc8b1a...|[1247499, 771870,...|[1938132, 688160,...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [176]:
#Exercise 4.5 b)
#Function to get the list of hits
def get_hits(test_list, recom_list):
    hits = list(set(test_list).intersection(recom_list))
    return hits

udf_get_hits = Func.udf(get_hits, ArrayType(StringType()))

In [178]:
#Exercise 4.5 b)
#Dataframe with the list of hits
userTopHitIdf10 = userTopTstIdf10.withColumn("hits", udf_get_hits(userTopTstIdf10.test_paper_list\
                                                            , userTopTstIdf10.recom_paper_list))
userTopHitIdf10.show(5)

+--------------------+--------------------+--------------------+--------------------+
|        user_hash_id|     test_paper_list|    recom_paper_list|                hits|
+--------------------+--------------------+--------------------+--------------------+
|f05bcffe7951de9e5...|[3281478, 945310,...|[10259199, 403805...|  [556147, 11852474]|
|a0bbf6bb9b1c818f3...|[1223835, 1123321...|[7469728, 5261538...|[1223835, 1123321...|
|01ecfc66d01b0dd84...|    [970908, 808689]|[7190864, 1089946...|                  []|
|c62b6c27979ffd314...|   [884897, 1270598]|[884897, 12157068...|            [884897]|
|b656009a6efdc8b1a...|[1247499, 771870,...|[1938132, 688160,...|[1779811, 688160,...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [None]:
#Exercise 4.5 b)
#Dataframe with the list of top 100 recommendations for each user
userTopIdfList100 = userTopIdf100.groupBy(userTopIdf100.user_hash_id)\
                                    .agg(Func.collect_list("paper_id").alias("recom_paper_list"))
#Dataframe with the list test paper list for each user
userTopTstIdf100 = testSetIdfa.join(userTopIdfList100, "user_hash_id")
#Dataframe with the list of hits
userTopHitIdf100 = userTopTstIdf100.withColumn("hits", udf_get_hits(userTopTstIdf100.test_paper_list\
                                                            , userTopTstIdf100.recom_paper_list))
userTopHitIdf100.show(5)

c) Evaluation of the Precision@k, Recall@k and MRR@k metrics. <br>

In [181]:
#Exercise 4.5 c)
#Get the number of users
sampleCountIdf = 20#userFeaturesIdf.count()
#sampleCount

6

In [179]:
#Exercise 4.5 c)
#Function to calculate the precision metric
def get_precision_k(sample_count, userTop_df, k):
    result = 0
    for uid, tst_ls, rec_ls, hit in userTop_df.rdd.map(lambda x: (x[0], x[1], x[2], x[3])).collect():
        result = result + float(len(hit)/k)
    
    return float(result/sample_count)

In [182]:
#Exercise 4.5 c)
#Precision metric for top 10 k
precision10 = get_precision_k(sampleCountIdf, userTopHitIdf10, 10)
precision10

0.26666666666666666

In [None]:
#Exercise 4.5 c)
#Precision metric for top 100 k
precision100 = get_precision_k(sampleCountIdf, userTopHitIdf100, 100)
precision100

In [183]:
#Exercise 4.5 c)
#Function to calculate the recall metric
def get_recall_k(sample_count, userTop_df):
    result = 0
    for uid, tst_ls, rec_ls, hit in userTop_df.rdd.map(lambda x: (x[0], x[1], x[2], x[3])).collect():
        result = result + float(len(hit)/len(tst_ls))
    
    return float(result/sample_count)

In [184]:
#Exercise 4.5 c)
#Recall metric for top 10 k
recall10 = get_recall_k(sampleCountIdf, userTopHitIdf10)
recall10

0.35107376283846864

In [None]:
#Exercise 4.5 c)
#Recall metric for top 100 k
recall100 = get_recall_k(sampleCountIdf, userTopHitIdf100)
recall100

In [190]:
#Exercise 4.5 c)
#Function to calculate the MRR metric
def get_Mrr_k(sample_count, userTop_df):
    result = 0
    for uid, tst_ls, rec_ls, hit in userTop_df.rdd.map(lambda x: (x[0], x[1], x[2], x[3])).collect():
        if len(hit)!=0:
            idx = tst_ls.index(hit[0])+1
            result = result + float(1/idx)
        
    return float(result/sample_count)

In [191]:
#Exercise 4.5 c)
#Mrr metric for top 10 k
Mrr10 = get_Mrr_k(sampleCountIdf, userTopHitIdf10)
Mrr10

0.4916666666666667

d) Generate top 10 and top 100 recommendations for each user (20) using CBRS lda. <br>

In [None]:
userFeaturesLda, trainSetLda, testSetLda = sampler(20, usersDF, ldaTransformed, "LDA")

In [222]:
#Exercise 4.5 d)
#Generate the top 10 and 100 recommendations for the users using CBRS lda
resultLda10 = []
resultLda100 = []
for uid, uprofile in userFeaturesLda.rdd.map(lambda x: (x[0], x[2])).collect():
    userRatedPaperLda = trainSetLda.filter(trainSetLda.user_hash_id==uid)
    resultLda10 = resultLda10 + recommendLda(uid, uprofile, userRatedPaperLda, 10)
    resultLda100 = resultLda100 + recommendLda(uid, uprofile, userRatedPaperLda, 100)

In [223]:
#Exercise 4.5 d)
#Create a dataframe to contain the top 10 recommendations
userTopLda10 = sparkSession.createDataFrame(resultLda10, ["user_hash_id", "paper_id", "score"])
userTopLda10.show(15)

+--------------------+--------+------------------+
|        user_hash_id|paper_id|             score|
+--------------------+--------+------------------+
|f05bcffe7951de9e5...| 8667279|0.9823014150850838|
|f05bcffe7951de9e5...|  115167|0.9822724724488839|
|f05bcffe7951de9e5...|  941108|0.9785438928148672|
|f05bcffe7951de9e5...| 6600388|0.9775181487805837|
|f05bcffe7951de9e5...| 9428511|0.9773620935949265|
|f05bcffe7951de9e5...| 3689557|0.9773391569439702|
|f05bcffe7951de9e5...|  267335|0.9772986156841782|
|f05bcffe7951de9e5...| 3479291|0.9769089607251129|
|f05bcffe7951de9e5...| 2855355|0.9768913023845585|
|f05bcffe7951de9e5...| 1325105|0.9768335104096272|
|a0bbf6bb9b1c818f3...| 2688523|0.9942583667403158|
|a0bbf6bb9b1c818f3...| 3463357|0.9939660842002734|
|a0bbf6bb9b1c818f3...|  849862| 0.993912573225263|
|a0bbf6bb9b1c818f3...|  808689|0.9938298704325079|
|a0bbf6bb9b1c818f3...|  466011|0.9938125181808284|
+--------------------+--------+------------------+
only showing top 15 rows



In [224]:
#Exercise 4.5 d)
#Dataframe with the list of top 10 recommendations for each user
userTopLdaList10 = userTopLda10.groupBy(userTopLda10.user_hash_id).agg(Func.collect_list("paper_id").alias("recom_paper_list"))
userTopLdaList10.show(5)

+--------------------+--------------------+
|        user_hash_id|    recom_paper_list|
+--------------------+--------------------+
|f05bcffe7951de9e5...|[8667279, 115167,...|
|a0bbf6bb9b1c818f3...|[2688523, 3463357...|
|01ecfc66d01b0dd84...|[154, 840811, 121...|
|c62b6c27979ffd314...|[3462570, 607999,...|
|b656009a6efdc8b1a...|[311570, 8423325,...|
+--------------------+--------------------+
only showing top 5 rows



In [225]:
#Exercise 4.5 d)
#Dataframe with the list test paper list for each user
userTopLdatst10 = testSetLda.join(userTopLdaList10, "user_hash_id")
userTopLdatst10.show(5)

+--------------------+--------------------+--------------------+
|        user_hash_id|     test_paper_list|    recom_paper_list|
+--------------------+--------------------+--------------------+
|f05bcffe7951de9e5...|[3281478, 945310,...|[8667279, 115167,...|
|a0bbf6bb9b1c818f3...|[1223835, 1123321...|[2688523, 3463357...|
|01ecfc66d01b0dd84...|    [970908, 808689]|[154, 840811, 121...|
|c62b6c27979ffd314...|   [884897, 1270598]|[3462570, 607999,...|
|b656009a6efdc8b1a...|[1247499, 771870,...|[311570, 8423325,...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [226]:
#Exercise 4.5 d)
#Dataframe with the list of hits
userTopLdaHit10 = userTopLdatst10.withColumn("hits", udf_get_hits(userTopLdatst10.test_paper_list\
                                                            , userTopLdatst10.recom_paper_list))
userTopLdaHit10.show(5)

+--------------------+--------------------+--------------------+---------+
|        user_hash_id|     test_paper_list|    recom_paper_list|     hits|
+--------------------+--------------------+--------------------+---------+
|f05bcffe7951de9e5...|[3281478, 945310,...|[8667279, 115167,...|[1325105]|
|a0bbf6bb9b1c818f3...|[1223835, 1123321...|[2688523, 3463357...|       []|
|01ecfc66d01b0dd84...|    [970908, 808689]|[154, 840811, 121...|       []|
|c62b6c27979ffd314...|   [884897, 1270598]|[3462570, 607999,...|       []|
|b656009a6efdc8b1a...|[1247499, 771870,...|[311570, 8423325,...|       []|
+--------------------+--------------------+--------------------+---------+
only showing top 5 rows



In [None]:
#Exercise 4.5 d)
#Create a dataframe to contain the top 100 recommendations
userTopLda100 = sparkSession.createDataFrame(resultLda100, ["user_hash_id", "paper_id", "score"])
#Dataframe with the list of top 10 recommendations for each user
userTopLdaList100 = userTopLda100.groupBy(userTopLda100.user_hash_id).agg(Func.collect_list("paper_id").alias("recom_paper_list"))
#Dataframe with the list test paper list for each user
userTopLdatst100 = testSetLda.join(userTopLdaList100, "user_hash_id")
#Dataframe with the list of hits
userTopLdaHit100 = userTopLdatst100.withColumn("hits", udf_get_hits(userTopLdatst100.test_paper_list\
                                                            , userTopLdatst100.recom_paper_list))
userTopLdaHit100.show(5)

In [227]:
#Exercise 4.5 d)
#Get the number of users
sampleCount = 20#userFeaturesLda.count()
#sampleCount

6

In [229]:
#Exercise 4.5 d)
#Precision metric for top 10 k
precisionLda10 = get_precision_k(sampleCount, userTopLdaHit10, 10)
precisionLda10

0.016666666666666666

In [None]:
#Exercise 4.5 d)
#Precision metric for top 100 k
precisionLda100 = get_precision_k(sampleCount, userTopLdaHit100, 100)
precisionLda100

In [230]:
#Exercise 4.5 d)
#Recall metric for top 10 k
recallLda10 = get_recall_k(sampleCount, userTopLdaHit10)
recallLda10

0.004901960784313725

In [None]:
#Exercise 4.5 d)
#Recall metric for top 100 k
recallLda100 = get_recall_k(sampleCount, userTopLdaHit100)
recallLda100

In [231]:
#Exercise 4.5 d)
#Mrr metric for top 10 k
MrrLda10 = get_Mrr_k(sampleCount, userTopLdaHit10)
MrrLda10

0.015151515151515152