In [44]:
#Import packages.
from pyspark.sql import SparkSession
from pyspark.sql import functions as Func
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, IDF
from pyspark.ml.linalg import SparseVector, VectorUDT, Vectors
from pyspark.ml.clustering import KMeans
from nltk.stem import PorterStemmer
import math
import numpy as np
#import string

#Create a spark session.
sparkSession = SparkSession.builder.appName("Experiment3").getOrCreate()

In [2]:
#Loading stopwords_en.txt data into a dataframe.
stopWordsDF = sparkSession.read\
                .load("/home/jovyan/work/stopwords_en.txt", format="text", sep=" ", inferSchema="true", header="false")\
                .toDF('stop_word')
stopWordsDF.show(5, truncate=True)

+---------+
|stop_word|
+---------+
|        a|
|     able|
|    about|
|    above|
|according|
+---------+
only showing top 5 rows



In [3]:
#Loading users_libraries.txt data into a dataframe.
#Defining the column names.
user_columns = ['raw_data']
rawUsersDF = sparkSession.read\
            .load("/home/jovyan/work/mod_users_libraries2.txt", format="text", sep=";", inferSchema="true", quote='"', header="false")\
            .toDF(*user_columns)

usersDF = rawUsersDF.select(Func.split(rawUsersDF.raw_data, ";").getItem(0).alias("user_hash_id"),\
                           Func.split(rawUsersDF.raw_data, ";").getItem(1).alias("user_library"))
usersDF.show()

+--------------------+--------------------+
|        user_hash_id|        user_library|
+--------------------+--------------------+
|d0c9aaa788153daea...|2080631,6343346,5...|
|ca4f1ba4094011d9a...|              278019|
|d1d41a15201915503...|6610569,6493797,6...|
|f2f77383828ea6d39...|943458,238121,763429|
|9c883d02115400f7b...|3509971,3509965,2...|
|1eac022a97d683eac...|3973229,322433,57...|
+--------------------+--------------------+



In [4]:
#Loading papers.csv data into a dataframe.
#Defining the column names.
paper_columns = ['paper_id', 'type', 'journal', 'book_title', \
           'series', 'publisher', 'pages', 'volume', \
           'number', 'year', 'month', 'postedate',\
           'address', 'title', 'abstract']
papersDF = sparkSession.read\
            .load("/home/jovyan/work/mod_papers.csv", format="csv", sep=",", inferSchema="true", quote='"', header="false")\
            .toDF(*paper_columns)
papersDF.show(2, truncate=True)


+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------------------+
|paper_id|   type|             journal|book_title|series|publisher|pages|volume|number|year|month|          postedate|address|               title|            abstract|
+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------------------+
|   80546|article|biology and philo...|      null|  null|     null|   17|    19|     2|2004|  mar|2005-01-26 21:35:21|   null|the arbitrariness...|the genetic code ...|
| 5842862|article|      molecular cell|      null|  null| elsevier|    2|    35|     6|2009|  sep|2009-09-30 17:11:23|   null|how to choose a g...|choosing good pro...|
+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------

## Exercise 3.1 (Vector representation for the papers)
To generate the bag-of-words representation for each paper.

In [5]:
#Exercise 3.1
#Concatenate the title and abstract fields together.
textDF = papersDF.select(papersDF.paper_id, Func.concat_ws(" ", papersDF.title, papersDF.abstract).alias("text"))
textDF.show(5)


+--------+--------------------+
|paper_id|                text|
+--------+--------------------+
|   80546|the arbitrariness...|
| 5842862|how to choose a g...|
| 1242600|how to write cons...|
| 3467077|defrosting the di...|
|  309395|why most publishe...|
+--------+--------------------+
only showing top 5 rows



In [6]:
#Exercise 3.1
#Perform tokenization and remove words less than 3 characters.
#Keep the words containing "-" and "_" characters.
reTokenizer = RegexTokenizer(inputCol="text", outputCol="words", minTokenLength=4, pattern="[^-_\\w]")
wordsDF = reTokenizer.transform(textDF)
wordsDF.head()


Row(paper_id=80546, text="the arbitrariness of the genetic code the genetic code has been regarded as arbitrary in the sense that the codon-amino acid assignments could be different than they actually are. this general idea has been spelled out differently by previous, often rather implicit accounts of arbitrariness. they have drawn on the frozen accident theory, on evolutionary contingency, on alternative causal pathways, and on the absence of direct stereochemical interactions between codons and amino acids. it has also been suggested that the arbitrariness of the genetic code justifies attributing semantic information to macromolecules, notably to {dna}. i argue that these accounts of arbitrariness are unsatisfactory. i propose that the code is arbitrary in the sense of jacques monod's concept of chemical arbitrariness: the genetic code is arbitrary in that any codon requires certain chemical and structural properties to specify a particular amino acid, but these properties are not 

In [7]:
#Exercise 3.1
#Function to remove the characters "-" and "_" from words.
def concatConnectedWords(wordList):
    wordSet = set(wordList)
    identity = str.maketrans("", "", "-_")
    wordSet = [word.translate(identity) for word in wordSet]
    return wordSet

udf_concatConnectedWords = Func.udf(concatConnectedWords, ArrayType(StringType()))

In [8]:
#Exercise 3.1
#removing the characters "-" and "_" from words.
removedConnectorsDF = wordsDF.select(wordsDF.paper_id,\
                                     Func.lit(udf_concatConnectedWords(wordsDF.words)).alias("processed_words"))
removedConnectorsDF.take(2)


[Row(paper_id=80546, processed_words=['these', 'actually', 'certain', 'stereochemical', 'notably', 'that', 'concept', 'particular', 'code', 'chemical', 'could', 'they', 'direct', 'propose', 'with', 'about', 'chemistry', 'general', 'regarded', 'notion', 'this', 'nucleic', 'information', 'drawn', 'specify', 'evolution', 'neither', 'absence', 'genetic', 'implicit', 'unsatisfactory', 'previous', 'frozen', 'between', 'alternative', 'amino', 'attributing', 'required', 'codonamino', 'been', 'have', 'structural', 'several', 'properties', 'maintain', 'necessary', 'requires', 'sense', 'hypotheses', 'pathways', 'acids', 'causal', 'codons', 'arbitrary', 'often', 'theory', 'also', 'macromolecules', 'assignments', 'rather', 'accounts', 'contingency', 'acid', 'monod', 'virtue', 'sufficient', 'jacques', 'principle', 'argue', 'idea', 'arbitrariness', 'suggested', 'compatible', 'than', 'recent', 'different', 'semantic', 'codon', 'interactions', 'justifies', 'accident', 'spelled', 'evolutionary', 'differ

In [9]:
#Exercise 3.1
#Creating a list of stop words.
stopWordsList = stopWordsDF.agg(Func.collect_list(stopWordsDF.stop_word)).rdd.flatMap(lambda row: row[0])
stopWordsList.collect()

['a',
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 'around',
 'as',
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'b',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'c',
 'came',
 'can',
 'cannot',
 'cant',
 'cause',
 'causes',
 'certain',
 'certainly',
 'changes',
 'clearly',
 'co',
 'com',
 'come',
 'comes',
 'concerning',
 'consequently',
 'consider',
 'con

In [10]:
#Exercise 3.1
#Removing the stop words.
remover = StopWordsRemover(inputCol="processed_words", outputCol="words", stopWords=stopWordsList.collect())
withoutStopWordsDF = remover.transform(removedConnectorsDF)

In [11]:
withoutStopWordsDF.select("words").collect()

[Row(words=['stereochemical', 'notably', 'concept', 'code', 'chemical', 'direct', 'propose', 'chemistry', 'general', 'regarded', 'notion', 'nucleic', 'information', 'drawn', 'evolution', 'absence', 'genetic', 'implicit', 'unsatisfactory', 'previous', 'frozen', 'alternative', 'amino', 'attributing', 'required', 'codonamino', 'structural', 'properties', 'maintain', 'requires', 'sense', 'hypotheses', 'pathways', 'acids', 'causal', 'codons', 'arbitrary', 'theory', 'macromolecules', 'assignments', 'accounts', 'contingency', 'acid', 'monod', 'virtue', 'sufficient', 'jacques', 'principle', 'argue', 'idea', 'arbitrariness', 'suggested', 'compatible', 'recent', 'semantic', 'codon', 'interactions', 'justifies', 'accident', 'spelled', 'evolutionary', 'differently']),
 Row(words=['profession', 'lack', 'explicit', 'results', 'merit', 'discussion', 'tenure', 'observation', 'choose', 'problems', 'smart', 'teachers', 'scientific', 'explicitly', 'choosing', 'scientist', 'figure', 'give', 'good', 'resul

In [12]:
#Exercise 3.1
#Function to perform stemming.
stemmer = PorterStemmer()

def stemming(wordList):
    wordSet = set(wordList)
    wordSet = [stemmer.stem(word) for word in wordList]
    return sorted(wordSet)

#User defined function to perform stemming.
udf_stemming = Func.udf(stemming, ArrayType(StringType()))

In [13]:
#Exercise 3.1
#Performing stemming.
stemmedWordsDF = withoutStopWordsDF.withColumn("stemmed_words", udf_stemming(withoutStopWordsDF.words))

In [14]:
#Exercise 3.1
#Counting the number of papers (frequency) in which a particular word appears. (Document Frequency)
explodedDF = stemmedWordsDF.select(stemmedWordsDF.paper_id,\
                                   Func.explode(stemmedWordsDF.stemmed_words).alias("single_word"))\
                            .distinct()\
                            .groupBy("single_word")\
                            .agg(Func.count("single_word")\
                                 .alias("df"))
explodedDF.show(10)

+-----------+---+
|single_word| df|
+-----------+---+
|   everyday|  2|
|      input|  2|
|    persist|  1|
| likelihood|  3|
|  geneannot|  1|
|     import|  4|
| photograph|  1|
|       oper|  1|
|    highest|  1|
|      equal|  1|
+-----------+---+
only showing top 10 rows



In [15]:
explodedDF.count()

1449

In [16]:
#Exercise 3.1
#Counting the number of unique papers.
#Setting the upper and lower bounds.
uniqPaperCount = stemmedWordsDF.count()
upperBoundary = 0.1*uniqPaperCount
lowerBoundary = 2
#filter the dataframe to select only words that appear in more than 20 papers and
#less than 10 percent of the total number of papers.
filteredDF = explodedDF.filter((explodedDF.df>=lowerBoundary) & (explodedDF.df<=upperBoundary))
filteredDF.show(10)

+-----------+---+
|single_word| df|
+-----------+---+
|   everyday|  2|
|      input|  2|
| likelihood|  3|
|     import|  4|
|    explain|  5|
|    classif|  3|
|     execut|  2|
|        map|  4|
|  character|  6|
|      uncov|  4|
+-----------+---+
only showing top 10 rows



In [17]:
#Exercise 3.1
#Limiting the number of important terms to 1000.
termsDF = filteredDF.sort("df", ascending=False).limit(1000).select("single_word")
termsDF.show(10)

+-----------+
|single_word|
+-----------+
|  scientist|
|    suggest|
|     articl|
|      major|
|       read|
|     access|
|    process|
|       wide|
|      basic|
|     exampl|
+-----------+
only showing top 10 rows



In [18]:
#Exercise 3.1
#Generating an index to the top 1000 important terms starting with the index 0.
termsDF = termsDF.withColumn("word_id", Func.monotonically_increasing_id())

wordWinSpec = Window.orderBy("word_id")

termsDF = termsDF.withColumn("word_id",Func.row_number().over(wordWinSpec)-1)
termsDF.show(10, truncate=True)

+-----------+-------+
|single_word|word_id|
+-----------+-------+
|    process|      0|
|     access|      1|
|    suggest|      2|
|       wide|      3|
|      major|      4|
|       read|      5|
|     articl|      6|
|  scientist|      7|
|      basic|      8|
|     theori|      9|
+-----------+-------+
only showing top 10 rows



In [19]:
#Exercise 3.1
#Counting the number of times a word appeared in a particular paper (word count per paper) (term frequency).
tempDF = stemmedWordsDF.select(stemmedWordsDF.paper_id,\
                               Func.explode(stemmedWordsDF.stemmed_words).alias("single_word"))\
                        .groupBy("paper_id", "single_word")\
                        .agg(Func.count("single_word").alias("tf"))
tempDF.show(10)

+--------+-----------+---+
|paper_id|single_word| tf|
+--------+-----------+---+
| 1242600|     public|  1|
| 1242600|       talk|  1|
|      99|       call|  1|
|  740681|       imit|  1|
|  740681|      model|  1|
|   99857|      degre|  1|
| 3614773|      studi|  1|
|  117535|   reweight|  1|
| 4131662|   techniqu|  1|
|  115158|     common|  1|
+--------+-----------+---+
only showing top 10 rows



In [20]:
#Exercise 3.1
#Creating a dataframe with term index and count of the word per paper.
#Creating a column containing a mapping of term index -> count per paper.
joinedResult = tempDF.join(termsDF, "single_word").withColumn("map", Func.create_map("word_id", "tf"))\
                    .groupBy("paper_id").agg(Func.collect_list("map").alias("map_list"))
joinedResult.show(10)

+--------+--------------------+
|paper_id|            map_list|
+--------+--------------------+
|  115158|[[0 -> 1], [1 -> ...|
|     101|[[0 -> 1], [3 -> ...|
|  740681|[[0 -> 1], [14 ->...|
| 4778506|[[0 -> 1], [1 -> ...|
|  105906|[[0 -> 1], [69 ->...|
|  212874|[[0 -> 1], [1 -> ...|
|  430834|[[0 -> 2], [13 ->...|
| 5394760|[[0 -> 1], [7 -> ...|
| 4200367|[[0 -> 1], [5 -> ...|
| 3721754|[[0 -> 1], [4 -> ...|
+--------+--------------------+
only showing top 10 rows



In [21]:
#Exercise 3.1
#Function to create a sparse vector for each paper.
def to_sparse(mapList):
    index = []
    value = []
    for map_val in mapList:
        for idx, val in map_val.items():
            index.append(idx)
            value.append(val)
    return SparseVector(1000, index, value)

#Creating a user defined function.
udf_to_sparse = Func.udf(to_sparse, VectorUDT())

In [22]:
#Exercise 3.1
#Creating the bag of words dataframe.
featurizedDataDF = joinedResult.select("paper_id",\
                                   udf_to_sparse(joinedResult.map_list)\
                                   .alias("tf_vector"))
featurizedDataDF.first()

Row(paper_id=115158, tf_vector=SparseVector(1000, {0: 1.0, 1: 1.0, 12: 1.0, 13: 1.0, 21: 1.0, 22: 2.0, 24: 1.0, 27: 1.0, 31: 1.0, 37: 1.0, 43: 1.0, 51: 1.0, 55: 1.0, 62: 1.0, 69: 1.0, 70: 1.0, 76: 1.0, 79: 1.0, 80: 1.0, 84: 1.0, 90: 1.0, 92: 1.0, 95: 1.0, 105: 2.0, 109: 1.0, 110: 1.0, 111: 2.0, 113: 1.0, 117: 1.0, 120: 1.0, 127: 1.0, 130: 1.0, 145: 1.0, 162: 1.0, 164: 1.0, 169: 1.0, 174: 1.0, 180: 1.0, 202: 1.0, 209: 1.0, 219: 1.0, 222: 1.0, 224: 1.0, 239: 1.0, 245: 1.0, 246: 1.0, 258: 1.0, 263: 2.0, 270: 1.0, 275: 1.0, 317: 2.0, 329: 1.0, 336: 1.0, 339: 1.0, 355: 1.0, 364: 1.0, 369: 1.0, 377: 1.0, 384: 1.0, 387: 1.0, 405: 1.0, 411: 1.0, 418: 2.0, 423: 1.0, 436: 1.0, 440: 1.0, 450: 1.0, 454: 1.0, 459: 1.0, 471: 1.0, 500: 1.0, 534: 1.0, 570: 1.0, 583: 1.0, 584: 1.0, 596: 1.0, 614: 1.0}))

## Exercise 3.2 (TF-IDF representation for the papers)


In [23]:
#Exercise 3.2
#Calculate TF and IDF using spark functionalities.
sparkMlIdf = IDF(inputCol="tf_vector", outputCol="idf_vector")
sparkMlIdfModel = sparkMlIdf.fit(featurizedDataDF)
IdfData = sparkMlIdfModel.transform(featurizedDataDF)

In [24]:
#Exercise 3.2
#Calculate TF and IDF using spark functionalities.
IdfData.first()

Row(paper_id=115158, tf_vector=SparseVector(1000, {0: 1.0, 1: 1.0, 12: 1.0, 13: 1.0, 21: 1.0, 22: 2.0, 24: 1.0, 27: 1.0, 31: 1.0, 37: 1.0, 43: 1.0, 51: 1.0, 55: 1.0, 62: 1.0, 69: 1.0, 70: 1.0, 76: 1.0, 79: 1.0, 80: 1.0, 84: 1.0, 90: 1.0, 92: 1.0, 95: 1.0, 105: 2.0, 109: 1.0, 110: 1.0, 111: 2.0, 113: 1.0, 117: 1.0, 120: 1.0, 127: 1.0, 130: 1.0, 145: 1.0, 162: 1.0, 164: 1.0, 169: 1.0, 174: 1.0, 180: 1.0, 202: 1.0, 209: 1.0, 219: 1.0, 222: 1.0, 224: 1.0, 239: 1.0, 245: 1.0, 246: 1.0, 258: 1.0, 263: 2.0, 270: 1.0, 275: 1.0, 317: 2.0, 329: 1.0, 336: 1.0, 339: 1.0, 355: 1.0, 364: 1.0, 369: 1.0, 377: 1.0, 384: 1.0, 387: 1.0, 405: 1.0, 411: 1.0, 418: 2.0, 423: 1.0, 436: 1.0, 440: 1.0, 450: 1.0, 454: 1.0, 459: 1.0, 471: 1.0, 500: 1.0, 534: 1.0, 570: 1.0, 583: 1.0, 584: 1.0, 596: 1.0, 614: 1.0}), idf_vector=SparseVector(1000, {0: 2.1972, 1: 2.1972, 12: 2.2925, 13: 2.2925, 21: 2.2925, 22: 4.7958, 24: 2.3979, 27: 2.3979, 31: 2.3979, 37: 2.3979, 43: 2.5157, 51: 2.5157, 55: 2.5157, 62: 2.5157, 69: 2

In [25]:
#Exercise 3.2
#Calculate TF and IDF using spark functionalities.
def multiply_sparse_vec(x_vec, y_vec):
    result = np.multiply(x_vec, y_vec).tolist()
    vector_args = len(result), [i for i, x in enumerate(result) if x != 0], [x for x in result if x != 0] 
    return Vectors.sparse(*vector_args)

udf_multi_sparse_vec = Func.udf(multiply_sparse_vec, VectorUDT())

In [26]:
#Exercise 3.2
#Calculate TF and IDF using spark functionalities.
paperTfIdf = IdfData.withColumn("tf_idf", udf_multi_sparse_vec(IdfData.tf_vector, IdfData.idf_vector))
paperTfIdf.first()

Row(paper_id=115158, tf_vector=SparseVector(1000, {0: 1.0, 1: 1.0, 12: 1.0, 13: 1.0, 21: 1.0, 22: 2.0, 24: 1.0, 27: 1.0, 31: 1.0, 37: 1.0, 43: 1.0, 51: 1.0, 55: 1.0, 62: 1.0, 69: 1.0, 70: 1.0, 76: 1.0, 79: 1.0, 80: 1.0, 84: 1.0, 90: 1.0, 92: 1.0, 95: 1.0, 105: 2.0, 109: 1.0, 110: 1.0, 111: 2.0, 113: 1.0, 117: 1.0, 120: 1.0, 127: 1.0, 130: 1.0, 145: 1.0, 162: 1.0, 164: 1.0, 169: 1.0, 174: 1.0, 180: 1.0, 202: 1.0, 209: 1.0, 219: 1.0, 222: 1.0, 224: 1.0, 239: 1.0, 245: 1.0, 246: 1.0, 258: 1.0, 263: 2.0, 270: 1.0, 275: 1.0, 317: 2.0, 329: 1.0, 336: 1.0, 339: 1.0, 355: 1.0, 364: 1.0, 369: 1.0, 377: 1.0, 384: 1.0, 387: 1.0, 405: 1.0, 411: 1.0, 418: 2.0, 423: 1.0, 436: 1.0, 440: 1.0, 450: 1.0, 454: 1.0, 459: 1.0, 471: 1.0, 500: 1.0, 534: 1.0, 570: 1.0, 583: 1.0, 584: 1.0, 596: 1.0, 614: 1.0}), idf_vector=SparseVector(1000, {0: 2.1972, 1: 2.1972, 12: 2.2925, 13: 2.2925, 21: 2.2925, 22: 4.7958, 24: 2.3979, 27: 2.3979, 31: 2.3979, 37: 2.3979, 43: 2.5157, 51: 2.5157, 55: 2.5157, 62: 2.5157, 69: 2

In [27]:
#Exercise 3.2
#Calculate TF and IDF without using spark functionalities.
def calcIdf(m, docFreq):
    Idf = math.log((m+1)/(docFreq+1))
    return Idf

udf_calcIdf = Func.udf(calcIdf, FloatType())

In [30]:
#Exercise 3.2
#Calculate TF and IDF without using spark functionalities.
termsWithDF = termsDF.join(filteredDF, "single_word")
termsWithDF.show(10)

+-----------+-------+---+
|single_word|word_id| df|
+-----------+-------+---+
|    process|      0| 10|
|     access|      1| 10|
|    suggest|      2| 10|
|       wide|      3| 10|
|      major|      4| 10|
|       read|      5| 10|
|     articl|      6| 10|
|  scientist|      7| 10|
|      basic|      8| 10|
|     theori|      9|  9|
+-----------+-------+---+
only showing top 10 rows



In [33]:
#Exercise 3.2
#Calculate TF and IDF without using spark functionalities.
termsWithIDF = termsWithDF.withColumn("idf",\
                                            udf_calcIdf(Func.lit(uniqPaperCount), termsWithDF.df))

termsWithIDF.show(10)

+-----------+-------+---+---------+
|single_word|word_id| df|      idf|
+-----------+-------+---+---------+
|    process|      0| 10|2.2172253|
|     access|      1| 10|2.2172253|
|    suggest|      2| 10|2.2172253|
|       wide|      3| 10|2.2172253|
|      major|      4| 10|2.2172253|
|       read|      5| 10|2.2172253|
|     articl|      6| 10|2.2172253|
|  scientist|      7| 10|2.2172253|
|      basic|      8| 10|2.2172253|
|     theori|      9|  9|2.3125355|
+-----------+-------+---+---------+
only showing top 10 rows



In [34]:
#Exercise 3.2
#Calculate TF and IDF without using spark functionalities.
termsWithTfIdf = termsWithIDF.withColumn("tf_idf", termsWithIDF.df*termsWithIDF.idf)
termsWithTfIdf.show(10)

+-----------+-------+---+---------+---------+
|single_word|word_id| df|      idf|   tf_idf|
+-----------+-------+---+---------+---------+
|    process|      0| 10|2.2172253|22.172253|
|     access|      1| 10|2.2172253|22.172253|
|    suggest|      2| 10|2.2172253|22.172253|
|       wide|      3| 10|2.2172253|22.172253|
|      major|      4| 10|2.2172253|22.172253|
|       read|      5| 10|2.2172253|22.172253|
|     articl|      6| 10|2.2172253|22.172253|
|  scientist|      7| 10|2.2172253|22.172253|
|      basic|      8| 10|2.2172253|22.172253|
|     theori|      9|  9|2.3125355| 20.81282|
+-----------+-------+---+---------+---------+
only showing top 10 rows



## Exercise 3.3 (Clustering)

In [35]:
usersDF.collect()

[Row(user_hash_id='d0c9aaa788153daeaf1f1538b3d46bbb', user_library='2080631,6343346,5184704,7756088,2653863,6607628,4236212,1277953,226864,3140015,8806369,311570,5687747,767516,4781370,2841637,2445106,1959511,2688186,2363430,6614346,853030,5336762,4226226,239571,4089758,4140337,913868,7562861,3190274,2782576,12571584,2049617,5761055,5441098,3466838,2080691,1805577,7570111,5760287,2855355,3281547,1012525,3512183,678653'),
 Row(user_hash_id='ca4f1ba4094011d9a8757b1bfcadae5b', user_library='278019'),
 Row(user_hash_id='d1d41a152019155035ceb2db7d331c44', user_library='6610569,6493797,6609079,7469737,7469738,6609102,6610585,6609240,7499869,7364976,7427963,7364991,7499833,7363519,7364990,6609245,7499801,7364971,7363487,7363498,7364986,7363513,7363526,6609222,7241881,6609169,7364933,6609099,1153734,7499794,3105791,6743243,6609238,7469728,7465494,7329626,1391614,1397150,7364987,7339380,7364973,7364823'),
 Row(user_hash_id='f2f77383828ea6d39438e525e40d54ba', user_library='943458,238121,763429')

In [36]:
#Exercise 3.3
explodUsersDF = usersDF.select(usersDF.user_hash_id,\
                               Func.explode(Func.split(usersDF.user_library, ","))\
                               .alias("paper_id"))
explodUsersDF.show()

+--------------------+--------+
|        user_hash_id|paper_id|
+--------------------+--------+
|d0c9aaa788153daea...| 2080631|
|d0c9aaa788153daea...| 6343346|
|d0c9aaa788153daea...| 5184704|
|d0c9aaa788153daea...| 7756088|
|d0c9aaa788153daea...| 2653863|
|d0c9aaa788153daea...| 6607628|
|d0c9aaa788153daea...| 4236212|
|d0c9aaa788153daea...| 1277953|
|d0c9aaa788153daea...|  226864|
|d0c9aaa788153daea...| 3140015|
|d0c9aaa788153daea...| 8806369|
|d0c9aaa788153daea...|  311570|
|d0c9aaa788153daea...| 5687747|
|d0c9aaa788153daea...|  767516|
|d0c9aaa788153daea...| 4781370|
|d0c9aaa788153daea...| 2841637|
|d0c9aaa788153daea...| 2445106|
|d0c9aaa788153daea...| 1959511|
|d0c9aaa788153daea...| 2688186|
|d0c9aaa788153daea...| 2363430|
+--------------------+--------+
only showing top 20 rows



In [37]:
explodUsersDF.select("user_hash_id").distinct().count()

6

In [38]:
#Exercise 3.3
joinUserPaperTfIdf = explodUsersDF.join(paperTfIdf, "paper_id")
joinUserPaperTfIdf.show()

+--------+--------------------+--------------------+--------------------+--------------------+
|paper_id|        user_hash_id|           tf_vector|          idf_vector|              tf_idf|
+--------+--------------------+--------------------+--------------------+--------------------+
| 1242600|1eac022a97d683eac...|(1000,[1,7,15,67,...|(1000,[1,7,15,67,...|(1000,[1,7,15,67,...|
|  255030|1eac022a97d683eac...|(1000,[1,30,47,72...|(1000,[1,30,47,72...|(1000,[1,30,47,72...|
|  238188|1eac022a97d683eac...|(1000,[3,11,24,36...|(1000,[3,11,24,36...|(1000,[3,11,24,36...|
| 1042553|1eac022a97d683eac...|(1000,[12,18,23,3...|(1000,[12,18,23,3...|(1000,[12,18,23,3...|
| 2058201|1eac022a97d683eac...|(1000,[27,159,337...|(1000,[27,159,337...|(1000,[27,159,337...|
| 2492402|1eac022a97d683eac...|(1000,[29,219,359...|(1000,[29,219,359...|(1000,[29,219,359...|
|  523772|1eac022a97d683eac...|(1000,[79,135],[1...|(1000,[79,135],[2...|(1000,[79,135],[2...|
| 3010240|1eac022a97d683eac...|  (1000,[153],[1.0]

In [39]:
#Exercise 3.3
joinUserPaperTfIdf.select("user_hash_id").distinct().count()

1

In [40]:
#Exercise 3.3
userTfIdfList = joinUserPaperTfIdf.groupBy("user_hash_id").agg(Func.collect_list("tf_idf").alias("tf_idf_list"))
userTfIdfList.show(10)

+--------------------+--------------------+
|        user_hash_id|         tf_idf_list|
+--------------------+--------------------+
|1eac022a97d683eac...|[(1000,[1,7,15,67...|
+--------------------+--------------------+



In [41]:
userTfIdfList.collect()

[Row(user_hash_id='1eac022a97d683eace8815545ce3153f', tf_idf_list=[SparseVector(1000, {1: 2.1972, 7: 2.1972, 15: 2.2925, 67: 2.6492, 69: 2.6492, 77: 2.6492, 81: 2.6492, 110: 2.8034, 148: 2.8034, 192: 2.9857, 220: 2.9857, 224: 2.9857, 301: 3.2088, 304: 3.2088, 309: 3.2088, 310: 3.2088, 314: 3.2088, 379: 3.4965, 386: 3.4965, 398: 3.4965, 407: 3.4965, 519: 3.4965, 578: 3.4965, 580: 3.4965, 590: 3.4965, 610: 3.4965, 616: 3.4965}), SparseVector(1000, {1: 2.1972, 30: 2.3979, 47: 2.5157, 72: 2.6492, 75: 2.6492, 120: 2.8034, 223: 2.9857, 280: 3.2088, 291: 3.2088, 332: 3.2088, 335: 3.2088, 518: 3.4965, 575: 3.4965}), SparseVector(1000, {3: 2.1972, 11: 2.2925, 24: 9.5916, 36: 2.3979, 58: 2.5157, 62: 2.5157, 66: 2.6492, 75: 2.6492, 84: 10.5968, 104: 2.8034, 122: 2.8034, 129: 2.8034, 141: 11.2134, 147: 2.8034, 149: 2.8034, 155: 2.9857, 166: 2.9857, 175: 2.9857, 187: 11.9427, 205: 2.9857, 212: 2.9857, 218: 2.9857, 228: 2.9857, 235: 12.8353, 259: 12.8353, 274: 3.2088, 290: 3.2088, 363: 3.4965, 410: 

In [42]:
#Exercise 3.3
#
def add_sparse_vec(vector_list):
    result = SparseVector(1000, list(range(0, 1000)), np.zeros(1000))
    for vector in vector_list:
        result = np.add(result, vector).tolist()

    vector_args = len(result), [i for i, x in enumerate(result) if x != 0], [x for x in result if x != 0] 
    return Vectors.sparse(*vector_args)
    
udf_add_sparse_vec = Func.udf(add_sparse_vec, VectorUDT())

In [56]:
userProfileDF = userTfIdfList.withColumn("features", udf_add_sparse_vec(userTfIdfList.tf_idf_list))
userFeaturesDF = userProfileDF.select("user_hash_id", "features")
userFeaturesDF.first()

Row(user_hash_id='1eac022a97d683eace8815545ce3153f', features=SparseVector(1000, {1: 4.3944, 3: 2.1972, 7: 2.1972, 11: 2.2925, 12: 2.2925, 15: 2.2925, 18: 2.2925, 23: 9.5916, 24: 9.5916, 27: 2.3979, 29: 2.3979, 30: 2.3979, 31: 2.3979, 36: 2.3979, 47: 2.5157, 58: 5.0314, 61: 10.0627, 62: 2.5157, 66: 2.6492, 67: 2.6492, 69: 2.6492, 71: 2.6492, 72: 2.6492, 75: 5.2984, 77: 2.6492, 79: 2.6492, 81: 2.6492, 84: 10.5968, 104: 2.8034, 107: 2.8034, 110: 2.8034, 113: 11.2134, 117: 11.2134, 120: 2.8034, 122: 2.8034, 129: 2.8034, 135: 2.8034, 141: 11.2134, 147: 2.8034, 148: 2.8034, 149: 2.8034, 153: 2.9857, 155: 2.9857, 157: 2.9857, 159: 2.9857, 166: 5.9714, 171: 2.9857, 175: 2.9857, 184: 11.9427, 187: 11.9427, 192: 2.9857, 194: 2.9857, 205: 2.9857, 212: 2.9857, 217: 2.9857, 218: 2.9857, 219: 2.9857, 220: 2.9857, 221: 2.9857, 223: 2.9857, 224: 2.9857, 228: 2.9857, 235: 12.8353, 259: 12.8353, 274: 3.2088, 276: 3.2088, 280: 3.2088, 290: 3.2088, 291: 3.2088, 296: 3.2088, 301: 3.2088, 304: 3.2088, 309:

In [57]:
#Exercise 3.3 b)
kmeans = KMeans(featuresCol="features", k=50, maxIter=10)
userClusterModel = kmeans.fit(userFeaturesDF.select("features"))

In [58]:
userCenters = userClusterModel.clusterCenters()
for center in userCenters:
    print(center)

[ 0.          4.39444915  0.          2.19722458  0.          0.
  0.          2.19722458  0.          0.          0.          2.29253476
  2.29253476  0.          0.          2.29253476  0.          0.
  2.29253476  0.          0.          0.          0.          9.59158109
  9.59158109  0.          0.          2.39789527  0.          2.39789527
  2.39789527  2.39789527  0.          0.          0.          0.
  2.39789527  0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          2.51567831
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          5.03135662  0.
  0.         10.06271323  2.51567831  0.          0.          0.
  2.6492097   2.6492097   0.          2.6492097   0.          2.6492097
  2.6492097   0.          0.          5.2984194   0.          2.6492097
  0.          2.6492097   0.          2.6492097   0.          0.
 10.5968388   0.          0.          0.    

In [59]:
userCenters

[array([ 0.        ,  4.39444915,  0.        ,  2.19722458,  0.        ,
         0.        ,  0.        ,  2.19722458,  0.        ,  0.        ,
         0.        ,  2.29253476,  2.29253476,  0.        ,  0.        ,
         2.29253476,  0.        ,  0.        ,  2.29253476,  0.        ,
         0.        ,  0.        ,  0.        ,  9.59158109,  9.59158109,
         0.        ,  0.        ,  2.39789527,  0.        ,  2.39789527,
         2.39789527,  2.39789527,  0.        ,  0.        ,  0.        ,
         0.        ,  2.39789527,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  2.51567831,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  5.03135662,  0.        ,
         0.        , 10.06271323,  2.51567831,  0.        ,  0.        ,
         0.        ,  2.6492097 ,  2.6492097 ,  0. 

In [53]:
userTransformed = userClusterModel.transform(userFeaturesDF).select('user_hash_id', 'prediction')
userTransformed.first()

Row(user_hash_id='1eac022a97d683eace8815545ce3153f', prediction=0)