In [2]:
#Import packages.
from pyspark.sql import SparkSession
from pyspark.sql import functions as Func
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, IDF
from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT, Vectors
from pyspark.ml.clustering import KMeans, LDA
from nltk.stem import PorterStemmer
import math
import numpy as np
import itertools
#import string

#Create a spark session.
sparkSession = SparkSession.builder.appName("Experiment3").getOrCreate()
#Get default configurations
sparkSession.sparkContext._conf.getAll()
#Update default configurations
conf = sparkSession.sparkContext._conf.setAll([('spark.executor.memory', '16g')\
                                        , ('spark.app.name', 'Spark Updated Conf')\
                                        , ('spark.executor.cores', '8')\
                                        , ('spark.cores.max', '8')\
                                        , ('spark.driver.memory','16g')\
                                        ,('spark.driver.maxResultSize','16g')])
#Stop the current Spark Session
sparkSession.sparkContext.stop()
#Create a Spark Session
sparkSession = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
#Loading stopwords_en.txt data into a dataframe.
stopWordsDF = sparkSession.read\
                .load("/home/jovyan/work/stopwords_en.txt", format="text", sep=" ", inferSchema="true", header="false")\
                .toDF('stop_word')
stopWordsDF.show(5, truncate=True)

+---------+
|stop_word|
+---------+
|        a|
|     able|
|    about|
|    above|
|according|
+---------+
only showing top 5 rows



In [4]:
#Loading users_libraries.txt data into a dataframe.
#Defining the column names.
user_columns = ['raw_data']
rawUsersDF = sparkSession.read\
            .load("/home/jovyan/work/users_libraries.txt", format="text", sep=";",\
                  inferSchema="true", quote='"', header="false")\
            .toDF(*user_columns)

usersDF = rawUsersDF.select(Func.split(rawUsersDF.raw_data, ";").getItem(0).alias("user_hash_id"),\
                           Func.split(rawUsersDF.raw_data, ";").getItem(1).alias("user_library"))
usersDF.show(5)

+--------------------+--------------------+
|        user_hash_id|        user_library|
+--------------------+--------------------+
|28d3f81251d94b097...|3929762,503574,58...|
|d0c9aaa788153daea...|2080631,6343346,5...|
|f05bcffe7951de9e5...|1158654,478707,12...|
|ca4f1ba4094011d9a...|              278019|
|d1d41a15201915503...|6610569,6493797,6...|
+--------------------+--------------------+
only showing top 5 rows



In [5]:
#Loading papers.csv data into a dataframe.
#Defining the column names.
paper_columns = ['paper_id', 'type', 'journal', 'book_title', \
           'series', 'publisher', 'pages', 'volume', \
           'number', 'year', 'month', 'postedate',\
           'address', 'title', 'abstract']

papersDF = sparkSession.read\
            .load("/home/jovyan/work/papers.csv", format="csv", sep=",", inferSchema="true", quote='"', header="false")\
            .toDF(*paper_columns)
papersDF.show(2, truncate=True)

+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------------------+
|paper_id|   type|             journal|book_title|series|publisher|pages|volume|number|year|month|          postedate|address|               title|            abstract|
+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------------------+
|   80546|article|biology and philo...|      null|  null|     null|   17|    19|     2|2004|  mar|2005-01-26 21:35:21|   null|the arbitrariness...|the genetic code ...|
| 5842862|article|      molecular cell|      null|  null| elsevier|    2|    35|     6|2009|  sep|2009-09-30 17:11:23|   null|how to choose a g...|choosing good pro...|
+--------+-------+--------------------+----------+------+---------+-----+------+------+----+-----+-------------------+-------+--------------------+--------

## Exercise 3.1 (Vector representation for the papers)
To generate the bag-of-words representation for each paper.

In [6]:
#Exercise 3.1
#Concatenate the title and abstract fields of papers together.
textDF = papersDF.select(papersDF.paper_id, Func.concat_ws(" ", papersDF.title, papersDF.abstract).alias("text"))
textDF.show(5)

+--------+--------------------+
|paper_id|                text|
+--------+--------------------+
|   80546|the arbitrariness...|
| 5842862|how to choose a g...|
| 1242600|how to write cons...|
| 3467077|defrosting the di...|
|  309395|why most publishe...|
+--------+--------------------+
only showing top 5 rows



In [7]:
#Exercise 3.1
#Perform tokenization and remove words less than 3 characters.
#Keep the words containing "-" and "_" characters.
reTokenizer = RegexTokenizer(inputCol="text", outputCol="words", minTokenLength=4, pattern="[^-_\\w]")
wordsDF = reTokenizer.transform(textDF)
wordsDF.head()

Row(paper_id=80546, text="the arbitrariness of the genetic code the genetic code has been regarded as arbitrary in the sense that the codon-amino acid assignments could be different than they actually are. this general idea has been spelled out differently by previous, often rather implicit accounts of arbitrariness. they have drawn on the frozen accident theory, on evolutionary contingency, on alternative causal pathways, and on the absence of direct stereochemical interactions between codons and amino acids. it has also been suggested that the arbitrariness of the genetic code justifies attributing semantic information to macromolecules, notably to {dna}. i argue that these accounts of arbitrariness are unsatisfactory. i propose that the code is arbitrary in the sense of jacques monod's concept of chemical arbitrariness: the genetic code is arbitrary in that any codon requires certain chemical and structural properties to specify a particular amino acid, but these properties are not 

In [8]:
#Exercise 3.1
#Function to remove the characters "-" and "_" from words.
def concatConnectedWords(wordList):
    wordSet = set(wordList)
    identity = str.maketrans("", "", "-_")
    wordSet = [word.translate(identity) for word in wordSet]
    return wordSet

udf_concatConnectedWords = Func.udf(concatConnectedWords, ArrayType(StringType()))

In [9]:
#Exercise 3.1
#removing the characters "-" and "_" from words.
removedConnectorsDF = wordsDF.select(wordsDF.paper_id,\
                                     Func.lit(udf_concatConnectedWords(wordsDF.words)).alias("processed_words"))
removedConnectorsDF.take(2)

[Row(paper_id=80546, processed_words=['these', 'actually', 'certain', 'stereochemical', 'notably', 'that', 'concept', 'particular', 'code', 'chemical', 'could', 'they', 'direct', 'propose', 'with', 'about', 'chemistry', 'general', 'regarded', 'notion', 'this', 'nucleic', 'information', 'drawn', 'specify', 'evolution', 'neither', 'absence', 'genetic', 'implicit', 'unsatisfactory', 'previous', 'frozen', 'between', 'alternative', 'amino', 'attributing', 'required', 'codonamino', 'been', 'have', 'structural', 'several', 'properties', 'maintain', 'necessary', 'requires', 'sense', 'hypotheses', 'pathways', 'acids', 'causal', 'codons', 'arbitrary', 'often', 'theory', 'also', 'macromolecules', 'assignments', 'rather', 'accounts', 'contingency', 'acid', 'monod', 'virtue', 'sufficient', 'jacques', 'principle', 'argue', 'idea', 'arbitrariness', 'suggested', 'compatible', 'than', 'recent', 'different', 'semantic', 'codon', 'interactions', 'justifies', 'accident', 'spelled', 'evolutionary', 'differ

In [11]:
#Exercise 3.1
#Creating a list of stop words.
stopWordsList = stopWordsDF.agg(Func.collect_list(stopWordsDF.stop_word)).rdd.flatMap(lambda row: row[0])
stopWordsList.take(10)

['a',
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards']

In [12]:
#Exercise 3.1
#Removing the stop words.
remover = StopWordsRemover(inputCol="processed_words", outputCol="words", stopWords=stopWordsList.collect())
withoutStopWordsDF = remover.transform(removedConnectorsDF)
withoutStopWordsDF.select("words").first()

Row(words=['stereochemical', 'notably', 'concept', 'code', 'chemical', 'direct', 'propose', 'chemistry', 'general', 'regarded', 'notion', 'nucleic', 'information', 'drawn', 'evolution', 'absence', 'genetic', 'implicit', 'unsatisfactory', 'previous', 'frozen', 'alternative', 'amino', 'attributing', 'required', 'codonamino', 'structural', 'properties', 'maintain', 'requires', 'sense', 'hypotheses', 'pathways', 'acids', 'causal', 'codons', 'arbitrary', 'theory', 'macromolecules', 'assignments', 'accounts', 'contingency', 'acid', 'monod', 'virtue', 'sufficient', 'jacques', 'principle', 'argue', 'idea', 'arbitrariness', 'suggested', 'compatible', 'recent', 'semantic', 'codon', 'interactions', 'justifies', 'accident', 'spelled', 'evolutionary', 'differently'])

In [13]:
#Exercise 3.1
#Function to perform stemming.
stemmer = PorterStemmer()

def stemming(wordList):
    wordSet = [stemmer.stem(word) for word in wordList]
    return sorted(wordSet)

#User defined function to perform stemming.
udf_stemming = Func.udf(stemming, ArrayType(StringType()))

In [14]:
#Exercise 3.1
#Performing stemming.
stemmedWordsDF = withoutStopWordsDF.withColumn("stemmed_words", udf_stemming(withoutStopWordsDF.words))

In [15]:
#Exercise 3.1
#Counting the number of papers (frequency) in which a particular word appears. (Document Frequency)
explodedDF = stemmedWordsDF.select(stemmedWordsDF.paper_id,
                                   Func.explode(stemmedWordsDF.stemmed_words).alias("single_word"))\
                            .distinct()\
                            .groupBy("single_word")\
                            .agg(Func.count("single_word")
                                 .alias("df"))
explodedDF.show(10)

+-----------+----+
|single_word|  df|
+-----------+----+
| likelihood|1388|
|      input|3770|
|  viewpoint| 444|
|    persist|1629|
|    nucleas| 117|
|     synopt|  35|
|     harder| 135|
|   ineffici| 321|
|  hematolog|  38|
|      still|  11|
+-----------+----+
only showing top 10 rows



In [16]:
#Exercise 3.1
#Counting the number of unique papers.
#Setting the upper and lower bounds.
uniqPaperCount = stemmedWordsDF.count()
upperBoundary = 0.1*uniqPaperCount
lowerBoundary = 20
#filter the dataframe to select only words that appear in more than 20 papers and
#less than 10 percent of the total number of papers.
filteredDF = explodedDF.filter((explodedDF.df>=lowerBoundary) & (explodedDF.df<=upperBoundary))
filteredDF.show(10)

+-----------+----+
|single_word|  df|
+-----------+----+
| likelihood|1388|
|      input|3770|
|  viewpoint| 444|
|    persist|1629|
|    nucleas| 117|
|     synopt|  35|
|     harder| 135|
|   ineffici| 321|
|  hematolog|  38|
|     imagin| 397|
+-----------+----+
only showing top 10 rows



In [17]:
#Exercise 3.1
#Limiting the number of important terms to 1000.
termsDF = filteredDF.sort("df", ascending=False).limit(1000).select("single_word")
termsDF.show(10)

+-----------+
|single_word|
+-----------+
|     applic|
|       time|
|   interact|
|      activ|
|     compar|
|    network|
|   identifi|
|     design|
|       find|
|       gene|
+-----------+
only showing top 10 rows



In [18]:
#Exercise 3.1
#Generating an index to the top 1000 important terms starting with the index 0.
termsDF = termsDF.withColumn("word_id", Func.monotonically_increasing_id())

wordWinSpec = Window.orderBy("word_id")

termsDF = termsDF.withColumn("word_id",Func.row_number().over(wordWinSpec)-1)
termsDF.show(10, truncate=True)

+-----------+-------+
|single_word|word_id|
+-----------+-------+
|     applic|      0|
|       time|      1|
|   interact|      2|
|      activ|      3|
|     compar|      4|
|    network|      5|
|   identifi|      6|
|     design|      7|
|       find|      8|
|       gene|      9|
+-----------+-------+
only showing top 10 rows



In [19]:
#Exercise 3.1
#Counting the number of times a word appeared in a particular paper (word count per paper) (term frequency).
tempDF = stemmedWordsDF.select(stemmedWordsDF.paper_id,
                               Func.explode(stemmedWordsDF.stemmed_words).alias("single_word"))\
                        .groupBy("paper_id", "single_word")\
                        .agg(Func.count("single_word").alias("tf"))
tempDF.show(10)

+--------+-----------+---+
|paper_id|single_word| tf|
+--------+-----------+---+
| 1242600|     public|  1|
| 1242600|       talk|  1|
|      99|       call|  1|
|  740681|       imit|  1|
|  740681|      model|  1|
|   99857|      degre|  1|
| 3614773|      studi|  1|
|  117535|   reweight|  1|
| 4131662|   techniqu|  1|
|  115158|     common|  1|
+--------+-----------+---+
only showing top 10 rows



In [20]:
#Exercise 3.1
#Creating a dataframe with term index and count of the word per paper.
#Creating a column containing a mapping of term index -> count per paper.
joinedResult = tempDF.join(termsDF, "single_word").withColumn("map", Func.create_map("word_id", "tf"))\
                    .groupBy("paper_id").agg(Func.collect_list("map").alias("map_list"))
joinedResult.show(10)

+--------+--------------------+
|paper_id|            map_list|
+--------+--------------------+
|     148|[[55 -> 1], [386 ...|
|     496|[[75 -> 1], [2 ->...|
|    1238|[[557 -> 1], [87 ...|
|    1959|[[743 -> 1], [619...|
|    4101|[[534 -> 1], [135...|
|    4935|[[761 -> 2], [161...|
|   29719|[[652 -> 1], [761...|
|   78113|         [[11 -> 1]]|
|   81501|[[45 -> 1], [82 -...|
|   89863|[[392 -> 1], [580...|
+--------+--------------------+
only showing top 10 rows



In [25]:
#Exercise 3.1
#Function to create a sparse vector for each paper.
def toSparse(mapList):
    pairs = []
    for map_val in mapList:
        pairs += map_val.items()
    pairs = sorted(pairs, key=lambda pair: pair[0])
    return SparseVector(1000, [x[0] for x in pairs], [x[1] for x in pairs])

#Creating a user defined function.
udf_toSparse = Func.udf(toSparse, VectorUDT())

In [26]:
#Exercise 3.1
#Creating the bag of words dataframe.
featurizedDataDF = joinedResult.select("paper_id",
                                   udf_toSparse(joinedResult.map_list)
                                   .alias("tf_vector"))
featurizedDataDF.first()

Row(paper_id=148, tf_vector=SparseVector(1000, {2: 1.0, 5: 2.0, 7: 1.0, 25: 1.0, 41: 1.0, 47: 1.0, 49: 1.0, 52: 1.0, 55: 1.0, 61: 1.0, 67: 1.0, 71: 1.0, 75: 1.0, 86: 1.0, 91: 1.0, 101: 1.0, 106: 1.0, 116: 1.0, 118: 1.0, 145: 1.0, 149: 1.0, 158: 1.0, 160: 1.0, 163: 1.0, 196: 1.0, 222: 1.0, 230: 1.0, 285: 1.0, 311: 1.0, 313: 1.0, 317: 1.0, 344: 1.0, 348: 1.0, 354: 1.0, 371: 1.0, 372: 1.0, 386: 1.0, 416: 1.0, 490: 1.0, 526: 1.0, 607: 1.0, 642: 1.0, 685: 1.0, 700: 1.0, 727: 1.0, 733: 1.0, 862: 1.0, 880: 1.0, 915: 1.0, 937: 2.0}))

## Exercise 3.2 (TF-IDF representation for the papers)


Calculating TF and IDF using functionality provided by pyspark.ml.feature.IDF

In [27]:
#Exercise 3.2
#Initializing the IDF object.
sparkMlIdf = IDF(inputCol="tf_vector", outputCol="idf_vector")
#Training the data and creating a model.
sparkMlIdfModel = sparkMlIdf.fit(featurizedDataDF)
#Adding the output produced from IDF to the dataset as a separate column.
IdfData = sparkMlIdfModel.transform(featurizedDataDF)
IdfData.first()

Row(paper_id=148, tf_vector=SparseVector(1000, {2: 1.0, 5: 2.0, 7: 1.0, 25: 1.0, 41: 1.0, 47: 1.0, 49: 1.0, 52: 1.0, 55: 1.0, 61: 1.0, 67: 1.0, 71: 1.0, 75: 1.0, 86: 1.0, 91: 1.0, 101: 1.0, 106: 1.0, 116: 1.0, 118: 1.0, 145: 1.0, 149: 1.0, 158: 1.0, 160: 1.0, 163: 1.0, 196: 1.0, 222: 1.0, 230: 1.0, 285: 1.0, 311: 1.0, 313: 1.0, 317: 1.0, 344: 1.0, 348: 1.0, 354: 1.0, 371: 1.0, 372: 1.0, 386: 1.0, 416: 1.0, 490: 1.0, 526: 1.0, 607: 1.0, 642: 1.0, 685: 1.0, 700: 1.0, 727: 1.0, 733: 1.0, 862: 1.0, 880: 1.0, 915: 1.0, 937: 2.0}), idf_vector=SparseVector(1000, {2: 2.2937, 5: 4.6545, 7: 2.3549, 25: 2.5051, 41: 2.6222, 47: 2.691, 49: 2.6938, 52: 2.7091, 55: 2.7245, 61: 2.7501, 67: 2.767, 71: 2.7875, 75: 2.7938, 86: 2.8662, 91: 2.9035, 101: 2.9508, 106: 2.9795, 116: 3.014, 118: 3.0221, 145: 3.1298, 149: 3.1438, 158: 3.1718, 160: 3.173, 163: 3.2011, 196: 3.291, 222: 3.3608, 230: 3.3891, 285: 3.565, 311: 3.6248, 313: 3.6291, 317: 3.6403, 344: 3.7059, 348: 3.7088, 354: 3.7118, 371: 3.7563, 372: 3

In [28]:
#Exercise 3.2
#Function to multiply 2 sparse vectors.
def multiplySparseVec(x_vec, y_vec):
    result = np.multiply(x_vec, y_vec).tolist()
    vector_args = len(result), [i for i, x in enumerate(result) if x != 0], [x for x in result if x != 0] 
    return Vectors.sparse(*vector_args)

#Creating a user defined function.
udf_multiplySparseVec = Func.udf(multiplySparseVec, VectorUDT())

In [29]:
#Exercise 3.2
#Calculating the value of TF-IDF.
paperTfIdf = IdfData.withColumn("tf_idf", udf_multiplySparseVec(IdfData.tf_vector, IdfData.idf_vector))
paperTfIdf.first()

Row(paper_id=148, tf_vector=SparseVector(1000, {2: 1.0, 5: 2.0, 7: 1.0, 25: 1.0, 41: 1.0, 47: 1.0, 49: 1.0, 52: 1.0, 55: 1.0, 61: 1.0, 67: 1.0, 71: 1.0, 75: 1.0, 86: 1.0, 91: 1.0, 101: 1.0, 106: 1.0, 116: 1.0, 118: 1.0, 145: 1.0, 149: 1.0, 158: 1.0, 160: 1.0, 163: 1.0, 196: 1.0, 222: 1.0, 230: 1.0, 285: 1.0, 311: 1.0, 313: 1.0, 317: 1.0, 344: 1.0, 348: 1.0, 354: 1.0, 371: 1.0, 372: 1.0, 386: 1.0, 416: 1.0, 490: 1.0, 526: 1.0, 607: 1.0, 642: 1.0, 685: 1.0, 700: 1.0, 727: 1.0, 733: 1.0, 862: 1.0, 880: 1.0, 915: 1.0, 937: 2.0}), idf_vector=SparseVector(1000, {2: 2.2937, 5: 4.6545, 7: 2.3549, 25: 2.5051, 41: 2.6222, 47: 2.691, 49: 2.6938, 52: 2.7091, 55: 2.7245, 61: 2.7501, 67: 2.767, 71: 2.7875, 75: 2.7938, 86: 2.8662, 91: 2.9035, 101: 2.9508, 106: 2.9795, 116: 3.014, 118: 3.0221, 145: 3.1298, 149: 3.1438, 158: 3.1718, 160: 3.173, 163: 3.2011, 196: 3.291, 222: 3.3608, 230: 3.3891, 285: 3.565, 311: 3.6248, 313: 3.6291, 317: 3.6403, 344: 3.7059, 348: 3.7088, 354: 3.7118, 371: 3.7563, 372: 3

Calculating TF and IDF WITHOUT using functionality provided by pyspark.ml.feature.IDF

In [30]:
#Exercise 3.2
#Creating a dataframe with the terms, ID and document-frequency.
termsWithDF = termsDF.join(filteredDF, "single_word")
termsWithDF.show(10)

+-----------+-------+----+
|single_word|word_id|  df|
+-----------+-------+----+
| likelihood|    993|1388|
|      input|    386|3770|
|    persist|    864|1629|
|       oper|    221|5775|
|    classif|    403|3602|
|     execut|    674|2182|
|        map|    273|4874|
|      equal|    839|1690|
|    explain|    250|5240|
|     growth|    327|4223|
+-----------+-------+----+
only showing top 10 rows



In [31]:
#Exercise 3.2
#creating a document frequency array where index of array element is equal to word_id
docFreqList = termsWithDF.select("word_id", "df").orderBy("word_id", ascending=True).rdd.map(lambda x: x[1]).collect()
docFreqList[0:5]

[17098, 17014, 16771, 16598, 16395]

In [32]:
#Exercise 3.2
#Function to calculate the IDF.
def calcIdf(m, docFreq):
    Idf = math.log((m+1)/(docFreq+1))
    return Idf

#Function to create the IDF vector.
def getIdfSparseVec(m, tf_vector, doc_freq_ls):
    index = []
    value = []
    #retrieving active indices
    active_idx = tf_vector.indices

    for idx in active_idx:
        doc_freq = doc_freq_ls[idx]
        index.append(idx)
        value.append(calcIdf(m, doc_freq))
    return SparseVector(1000, index, value)

udf_getIdfSparseVec = Func.udf(lambda x, y: getIdfSparseVec(x, y, docFreqList), VectorUDT())

In [33]:
#Exercise 3.2
#Creating a new dataframe with IDF values.
manualIdf = featurizedDataDF.withColumn("idf_vector", udf_getIdfSparseVec(Func.lit(uniqPaperCount),\
                                                                           featurizedDataDF.tf_vector))
manualIdf.first()

Row(paper_id=148, tf_vector=SparseVector(1000, {2: 1.0, 5: 2.0, 7: 1.0, 25: 1.0, 41: 1.0, 47: 1.0, 49: 1.0, 52: 1.0, 55: 1.0, 61: 1.0, 67: 1.0, 71: 1.0, 75: 1.0, 86: 1.0, 91: 1.0, 101: 1.0, 106: 1.0, 116: 1.0, 118: 1.0, 145: 1.0, 149: 1.0, 158: 1.0, 160: 1.0, 163: 1.0, 196: 1.0, 222: 1.0, 230: 1.0, 285: 1.0, 311: 1.0, 313: 1.0, 317: 1.0, 344: 1.0, 348: 1.0, 354: 1.0, 371: 1.0, 372: 1.0, 386: 1.0, 416: 1.0, 490: 1.0, 526: 1.0, 607: 1.0, 642: 1.0, 685: 1.0, 700: 1.0, 727: 1.0, 733: 1.0, 862: 1.0, 880: 1.0, 915: 1.0, 937: 2.0}), idf_vector=SparseVector(1000, {2: 2.3282, 5: 2.3618, 7: 2.3894, 25: 2.5396, 41: 2.6567, 47: 2.7255, 49: 2.7283, 52: 2.7436, 55: 2.759, 61: 2.7846, 67: 2.8015, 71: 2.822, 75: 2.8283, 86: 2.9007, 91: 2.938, 101: 2.9853, 106: 3.014, 116: 3.0485, 118: 3.0566, 145: 3.1643, 149: 3.1783, 158: 3.2063, 160: 3.2075, 163: 3.2356, 196: 3.3255, 222: 3.3953, 230: 3.4236, 285: 3.5995, 311: 3.6593, 313: 3.6636, 317: 3.6748, 344: 3.7404, 348: 3.7433, 354: 3.7463, 371: 3.7908, 372:

In [34]:
#Exercise 3.2
#Calculating the value of TF-IDF.
manualTfIdf = manualIdf.withColumn("tf_idf", udf_multiplySparseVec(manualIdf.tf_vector, manualIdf.idf_vector))
manualTfIdf.first()

Row(paper_id=148, tf_vector=SparseVector(1000, {2: 1.0, 5: 2.0, 7: 1.0, 25: 1.0, 41: 1.0, 47: 1.0, 49: 1.0, 52: 1.0, 55: 1.0, 61: 1.0, 67: 1.0, 71: 1.0, 75: 1.0, 86: 1.0, 91: 1.0, 101: 1.0, 106: 1.0, 116: 1.0, 118: 1.0, 145: 1.0, 149: 1.0, 158: 1.0, 160: 1.0, 163: 1.0, 196: 1.0, 222: 1.0, 230: 1.0, 285: 1.0, 311: 1.0, 313: 1.0, 317: 1.0, 344: 1.0, 348: 1.0, 354: 1.0, 371: 1.0, 372: 1.0, 386: 1.0, 416: 1.0, 490: 1.0, 526: 1.0, 607: 1.0, 642: 1.0, 685: 1.0, 700: 1.0, 727: 1.0, 733: 1.0, 862: 1.0, 880: 1.0, 915: 1.0, 937: 2.0}), idf_vector=SparseVector(1000, {2: 2.3282, 5: 2.3618, 7: 2.3894, 25: 2.5396, 41: 2.6567, 47: 2.7255, 49: 2.7283, 52: 2.7436, 55: 2.759, 61: 2.7846, 67: 2.8015, 71: 2.822, 75: 2.8283, 86: 2.9007, 91: 2.938, 101: 2.9853, 106: 3.014, 116: 3.0485, 118: 3.0566, 145: 3.1643, 149: 3.1783, 158: 3.2063, 160: 3.2075, 163: 3.2356, 196: 3.3255, 222: 3.3953, 230: 3.4236, 285: 3.5995, 311: 3.6593, 313: 3.6636, 317: 3.6748, 344: 3.7404, 348: 3.7433, 354: 3.7463, 371: 3.7908, 372:

## Exercise 3.3 (Clustering)

a) Calculating user profile for each user as the summation of the TF-IDF vectors of the papers.

In [35]:
#Exercise 3.3 a)
#Splitting the list of paper_ids into individual rows.
explodUsersDF = usersDF.select(usersDF.user_hash_id,\
                               Func.explode(Func.split(usersDF.user_library, ","))\
                               .alias("paper_id"))
explodUsersDF.show(5, truncate=True)

+--------------------+--------+
|        user_hash_id|paper_id|
+--------------------+--------+
|28d3f81251d94b097...| 3929762|
|28d3f81251d94b097...|  503574|
|28d3f81251d94b097...| 5819422|
|28d3f81251d94b097...| 4238883|
|28d3f81251d94b097...| 5788061|
+--------------------+--------+
only showing top 5 rows



In [36]:
#Exercise 3.3 a)
#Creating a dataframe with user_hash_id as well as paper_id, tf, idf, tf-idf information.
joinUserPaperTfIdf = explodUsersDF.join(paperTfIdf, "paper_id")
joinUserPaperTfIdf.show(5)

+--------+--------------------+--------------------+--------------------+--------------------+
|paper_id|        user_hash_id|           tf_vector|          idf_vector|              tf_idf|
+--------+--------------------+--------------------+--------------------+--------------------+
|     148|8ac80c1b48f33b5c2...|(1000,[2,5,7,25,4...|(1000,[2,5,7,25,4...|(1000,[2,5,7,25,4...|
|     148|add58a98787fee1a1...|(1000,[2,5,7,25,4...|(1000,[2,5,7,25,4...|(1000,[2,5,7,25,4...|
|     148|20e72d3b3cbe48c98...|(1000,[2,5,7,25,4...|(1000,[2,5,7,25,4...|(1000,[2,5,7,25,4...|
|     148|e571f7858c3c6d226...|(1000,[2,5,7,25,4...|(1000,[2,5,7,25,4...|(1000,[2,5,7,25,4...|
|     148|e6dc6cf6460b94a70...|(1000,[2,5,7,25,4...|(1000,[2,5,7,25,4...|(1000,[2,5,7,25,4...|
+--------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [37]:
#Exercise 3.3 a)
#Grouping the tf-idf vectors with respect to the user.
userTfIdfList = joinUserPaperTfIdf.groupBy("user_hash_id").agg(Func.collect_list("tf_idf").alias("tf_idf_list"))
userTfIdfList.show(10)

+--------------------+--------------------+
|        user_hash_id|         tf_idf_list|
+--------------------+--------------------+
|03237605301d9dd8e...|[(1000,[9,10,12,1...|
|2086aff81a8d6f9d9...|[(1000,[614],[4.2...|
|42407318182edf74b...|[(1000,[27,62,474...|
|4dccc6267cfee33b5...|[(1000,[87,557],[...|
|dcb0db5e7f1e041d8...|[(1000,[7,11,12,1...|
|e6db138f507c636de...|[(1000,[684,933],...|
|e8e6b6347145fe653...|[(1000,[0,19,29,3...|
|f0baca0d108f14901...|[(1000,[17,21,30,...|
|f1109606cf8f36542...|[(1000,[2,9,10,12...|
|f1e1cd4ff25018273...|[(1000,[1,3,22,61...|
+--------------------+--------------------+
only showing top 10 rows



In [38]:
#Exercise 3.3 a)
#Function to add elements of a list of sparse vectors.
def addSparseVec(vector_list):
    result = SparseVector(1000, list(range(0, 1000)), np.zeros(1000))
    for vector in vector_list:
        result = np.add(result, vector).tolist()

    vector_args = len(result), [i for i, x in enumerate(result) if x != 0], [x for x in result if x != 0] 
    return Vectors.sparse(*vector_args)
    
udf_addSparseVec = Func.udf(addSparseVec, VectorUDT())

In [39]:
#Exercise 3.3 a)
#Creating the user profile.
userProfileDF = userTfIdfList.withColumn("features", udf_addSparseVec(userTfIdfList.tf_idf_list))
userFeaturesDF = userProfileDF.select("user_hash_id", "features")
userFeaturesDF.first()

Row(user_hash_id='03237605301d9dd8e883b91a7f0423de', features=SparseVector(1000, {0: 11.3722, 1: 4.5587, 2: 13.7624, 3: 2.3041, 4: 25.4805, 5: 2.3273, 6: 30.3518, 7: 4.7098, 8: 9.4399, 9: 94.555, 10: 21.339, 11: 11.8563, 12: 7.1328, 13: 19.1858, 15: 12.084, 16: 14.5671, 17: 4.8583, 18: 4.898, 19: 22.0523, 20: 29.4257, 21: 26.9874, 22: 7.4243, 23: 2.4942, 24: 7.5135, 25: 20.0406, 27: 115.6691, 28: 7.5669, 29: 5.0448, 30: 134.1892, 31: 2.5366, 32: 5.0786, 33: 12.8502, 34: 10.3152, 35: 2.5822, 36: 5.1812, 37: 12.9671, 38: 15.5881, 39: 54.7407, 40: 15.6431, 41: 2.6222, 42: 15.8054, 43: 5.2718, 44: 2.6569, 45: 5.3194, 46: 2.664, 47: 2.691, 48: 10.7745, 49: 13.469, 50: 8.0972, 51: 5.4049, 53: 10.8706, 54: 2.7224, 56: 10.9312, 57: 2.7341, 58: 13.6927, 59: 8.2209, 61: 22.0004, 62: 110.1265, 63: 5.5084, 65: 8.2777, 66: 16.5765, 67: 2.767, 68: 11.0705, 69: 8.3054, 70: 27.7937, 71: 33.4505, 72: 8.3682, 74: 8.3814, 76: 8.3826, 77: 11.2076, 78: 11.2679, 79: 2.8177, 80: 14.1855, 81: 8.5172, 82: 5.67

b) Applying the K-Means algorithm to cluster the users in 50 clusters given their profiles.

In [40]:
#Exercise 3.3 b)
#Initializing a Kmeans object.
kmeans50 = KMeans(featuresCol="features", k=50, maxIter=10)
#Creating the model.
userClusterModel = kmeans50.fit(userFeaturesDF.select("features"))

c) Calculating the Davies-Bouldin index for the 50 Generated clusters.

In [41]:
#Exercise 3.3 c)
#Getting the list of centroids.
centroids50_list = userClusterModel.clusterCenters()
centroids50_list[0:1]

[array([2.80984482, 2.17845141, 2.73838195, 2.76097062, 2.18150292,
        3.16367322, 2.31596347, 3.18632578, 2.02938087, 2.25164002,
        1.78562005, 1.71686914, 2.70850743, 1.91131523, 3.12124388,
        1.86098396, 1.88025302, 2.22703824, 1.70804603, 2.4263263 ,
        2.03899846, 1.78754908, 2.07838753, 2.64912025, 2.04582905,
        2.3203221 , 2.16036808, 2.38924492, 1.83421976, 1.76515535,
        2.95020505, 2.12709653, 1.99264003, 1.66137374, 1.89430865,
        2.54181706, 1.8599884 , 1.53951585, 2.16569755, 1.71878419,
        1.9876657 , 2.17888803, 1.27929436, 1.86330959, 2.50580459,
        2.04128101, 1.91569372, 1.6112169 , 1.450521  , 1.50372528,
        1.81006994, 1.3514131 , 1.69836131, 1.71648713, 2.76856002,
        1.72894839, 1.31591783, 1.50705642, 1.5986712 , 1.69053138,
        2.37190699, 1.96538919, 2.21784286, 1.66622038, 1.89837264,
        1.80774909, 1.67063764, 1.54398705, 1.33556325, 1.72555655,
        1.98596221, 1.72369906, 1.62369048, 2.06

In [42]:
#Exercise 3.3 c)
#Creating a dataframe with the predictions of the KMeans algo.
userTransformed = userClusterModel.transform(userFeaturesDF)
userTransformed.show(5)

+--------------------+--------------------+----------+
|        user_hash_id|            features|prediction|
+--------------------+--------------------+----------+
|03237605301d9dd8e...|(1000,[0,1,2,3,4,...|        20|
|2086aff81a8d6f9d9...|(1000,[1,2,3,4,6,...|         1|
|42407318182edf74b...|(1000,[0,1,2,3,4,...|        20|
|4dccc6267cfee33b5...|(1000,[0,1,2,5,6,...|         0|
|dcb0db5e7f1e041d8...|(1000,[0,1,2,3,4,...|         1|
+--------------------+--------------------+----------+
only showing top 5 rows



In [43]:
#Exercise 3.3 c)
#Function to calculate the euclidean distance between 2 points.
def calcDistance(point_a, point_b):
    return(np.sqrt(np.sum(np.square(np.subtract(point_a, point_b)))))

#Function to calculate the inter-cluster distances between a set of points and its centroid.
def interClusterDist(prediction, user_points, centroids_list):
    distances_list = []
    index = int(prediction)
    centroid = centroids_list[index]
    for point in user_points:
        distances_list.append(calcDistance(point, centroid))
    average_distance = np.average(distances_list)
    return(float(average_distance))    

In [44]:
#Creating a user-defined function
udf_calcInterClustDist50 = Func.udf(lambda x, y: interClusterDist(x, y, centroids50_list), FloatType())

In [45]:
#Exercise 3.3 c)
#Grouping the user features with respect to the prediction.
groupedUserProfDF = userTransformed.groupBy("prediction").agg(Func.collect_list("features").alias("user_points"))
groupedUserProfDF.show(5)              

+----------+--------------------+
|prediction|         user_points|
+----------+--------------------+
|        31|[(1000,[0,1,2,3,4...|
|        34|[(1000,[0,1,2,3,4...|
|        28|[(1000,[0,1,2,3,4...|
|        27|[(1000,[0,1,2,3,4...|
|        26|[(1000,[0,1,2,3,4...|
+----------+--------------------+
only showing top 5 rows



In [46]:
#Exercise 3.3 c)
#Creating a dataframe containing the inter-cluster distance values.
interClustDist50DF = groupedUserProfDF.withColumn("inter_cluster_dist",\
                                                udf_calcInterClustDist50(groupedUserProfDF.prediction,\
                                                                         groupedUserProfDF.user_points))
interClustDist50DF.show(5)

+----------+--------------------+------------------+
|prediction|         user_points|inter_cluster_dist|
+----------+--------------------+------------------+
|        31|[(1000,[0,1,2,3,4...|         1633.9684|
|        34|[(1000,[0,1,2,3,4...|          669.6046|
|        28|[(1000,[0,1,2,3,4...|         1924.6855|
|        27|[(1000,[0,1,2,3,4...|         1550.7633|
|        26|[(1000,[0,1,2,3,4...|         928.39703|
+----------+--------------------+------------------+
only showing top 5 rows



In [47]:
#Exercise 3.3 c)
#Creating a list of inter-cluster distances.
interClust50List = interClustDist50DF.select("prediction", "inter_cluster_dist")\
                                    .orderBy("prediction", ascending=True)\
                                    .rdd.map(lambda x: x[1]).collect()
#interClust50List

In [48]:
#Exercise 3.3 c)
#Function to calculate the Davies-Boulding index.
def calcDBIndex(centroids_list, inter_clust_list):
    distance = []
    counter = 0
    #calculate Davies-Bouldin value for pairs of clusters.
    while counter < len(centroids_list):
        other_counter = counter + 1
        other_centroids = list(itertools.islice(centroids_list, counter+1, None))
        for other_centroid in other_centroids:
            inter_clust_pair = inter_clust_list[counter]+inter_clust_list[other_counter]
            #Calculate intra cluster distance (distance between 2 clusters)
            intra_clust_pair = calcDistance(centroids_list[counter], other_centroid)
            distance.append(inter_clust_pair/intra_clust_pair)
            other_counter += 1    
        counter += 1
        
    #calculate the Davies-Bouldin index.
    DB_index = sum(distance)/len(centroids_list)
    
    return(DB_index)    

In [49]:
#Exercise 3.3 c)
#Davies-Bouldin Index for a cluster of size 50
calcDBIndex(centroids50_list, interClust50List)

17.787159323220504

d) Calculating the Davies-Bouldin for the 10 generated clusters.

In [50]:
#Exercise 3.3 d)
#Initializing a Kmeans object.
kmeans10 = KMeans(featuresCol="features", k=10, maxIter=10)
#Creating a model.
userCluster10Model = kmeans10.fit(userFeaturesDF.select("features"))

In [51]:
#Exercise 3.3 d)
#Getting the list of centroids.
centroids10_list = userCluster10Model.clusterCenters()

In [52]:
#Exercise 3.3 d)
#Creating a dataframe with the predictions of the KMeans algo.
user10Transformed = userCluster10Model.transform(userFeaturesDF)
user10Transformed.show(5)

+--------------------+--------------------+----------+
|        user_hash_id|            features|prediction|
+--------------------+--------------------+----------+
|03237605301d9dd8e...|(1000,[0,1,2,3,4,...|         0|
|2086aff81a8d6f9d9...|(1000,[1,2,3,4,6,...|         0|
|42407318182edf74b...|(1000,[0,1,2,3,4,...|         4|
|4dccc6267cfee33b5...|(1000,[0,1,2,5,6,...|         0|
|dcb0db5e7f1e041d8...|(1000,[0,1,2,3,4,...|         0|
+--------------------+--------------------+----------+
only showing top 5 rows



In [53]:
#Exercise 3.3 d)
#Grouping the user features with respect to the prediction.
groupedUserProf10DF = user10Transformed.groupBy("prediction").agg(Func.collect_list("features").alias("user_points"))
groupedUserProf10DF.show(5)

+----------+--------------------+
|prediction|         user_points|
+----------+--------------------+
|         1|[(1000,[0,1,2,3,4...|
|         6|[(1000,[0,1,2,3,4...|
|         3|[(1000,[0,1,2,3,4...|
|         5|[(1000,[0,1,2,3,4...|
|         9|[(1000,[0,1,2,3,4...|
+----------+--------------------+
only showing top 5 rows



In [54]:
#Exercise 3.3 d)
#Creating a user defined function
udf_calcInterClustDist10 = Func.udf(lambda x, y: interClusterDist(x, y, centroids10_list), FloatType())

In [55]:
#Exercise 3.3 d)
#Creating a dataframe containing the inter-cluster distance values.
interClustDist10DF = groupedUserProf10DF.withColumn("inter_cluster_dist",\
                                                udf_calcInterClustDist10(groupedUserProf10DF.prediction,\
                                                                        groupedUserProf10DF.user_points))
interClustDist10DF.show(5)

+----------+--------------------+------------------+
|prediction|         user_points|inter_cluster_dist|
+----------+--------------------+------------------+
|         1|[(1000,[0,1,2,3,4...|         2373.3965|
|         6|[(1000,[0,1,2,3,4...|          3202.719|
|         3|[(1000,[0,1,2,3,4...|         3173.4805|
|         5|[(1000,[0,1,2,3,4...|         2194.3276|
|         9|[(1000,[0,1,2,3,4...|         1834.0939|
+----------+--------------------+------------------+
only showing top 5 rows



In [56]:
#Exercise 3.3 d)
#Creating a list of inter-cluster distances.
interClust10List = interClustDist10DF.select("prediction", "inter_cluster_dist")\
                                    .orderBy("prediction", ascending=True)\
                                    .rdd.map(lambda x: x[1]).collect()
#interClust10List

KeyboardInterrupt: 

In [None]:
#Exercise 3.3 d)
#Davies-Bouldin Index for a cluster of size 10
calcDBIndex(centroids10_list, interClust10List)

## Exercise 3.4 (Latent Direchlet Allocation (LDA))

a) Running LDA Algorithm with k=40 and showing the top 5 terms for each extracted topic.

In [None]:
#Exercise 3.3 a)
#Initializing an LDA object.
lda = LDA(featuresCol="tf_vector", k=40, maxIter=10)
#Creating the model.
ldaModel = lda.fit(featurizedDataDF.select("tf_vector"))

In [None]:
#Exercise 3.3 a)
#Showing the top 5 terms for each extracted latent topic.
topTopics_5 = ldaModel.describeTopics(5)
topTopics_5.show(5, truncate=False)

In [None]:
#Exercise 3.3 a)
#Showing the top 5 terms for each extracted latent topic.
topTopics_5.select("topic", Func.explode("termIndices").alias("word_id"))\
.join(termsDF, "word_id" ).show()

In [None]:
#Exercise 3.3 a)
#Creating a dataframe with the topic distibution generated from LDA.
ldaTransformed = ldaModel.transform(featurizedDataDF)
ldaTransformed.first()

In [None]:
#Exercise 3.3 a)
#Joining the topic distribution to the users.
joinUserLDA = explodUsersDF.join(ldaTransformed, "paper_id")
joinUserLDA.show(5, truncate=True)

In [None]:
#Exercise 3.3 a)
#Grouping the topic distribution with respect to the users.
grpUserTopicDistDF = joinUserLDA.groupBy("user_hash_id")\
                                .agg(Func.collect_list("topicDistribution").alias("topic_distribution_ls"))
grpUserTopicDistDF.first()

b) Calculating the LDA based user profiles as the summation of the paper topics vectors.

In [None]:
#Exercise 3.4 b)
#Function to sum a list of dense vectors.
def addDenseVec(dense_vec_list):
    result = DenseVector(np.zeros(40))
    for dense_vector in dense_vec_list:
        result = np.add(result, dense_vector).tolist() 
    return DenseVector(result)

#Creating a user defined function.
udf_addDenseVec = Func.udf(addDenseVec, VectorUDT())

In [None]:
#Exercise 3.4 b)
#Summation of the paper topics vectors.
userLDAProfile = grpUserTopicDistDF.withColumn("features",\
                                               Func.lit(udf_addDenseVec(grpUserTopicDistDF.topic_distribution_ls)))

userLDAProfile.select("user_hash_id", "features").first()

c) Applying the K-means algorithm to cluster the users using their LDA profiles in 50 clusters.

In [None]:
#Exercise 3.4 c)
#Initializing the K-means object.
UsrLDAKmeans = KMeans(featuresCol="features", k=50, maxIter=10)
#Creating the model.
userLDAClustModel = UsrLDAKmeans.fit(userLDAProfile.select("features"))

In [None]:
#Exercise 3.4 d)
#Getting a list of centroids.
topicCentroids_list = userLDAClustModel.clusterCenters()
topicCentroids_list[0:1]

In [None]:
#Exercise 3.4 d)
#Creating a dataframe with the predictions of the KMeans algo.
userTopicTransformed = userLDAClustModel.transform(userLDAProfile)
userTopicTransformed.show(5)

In [None]:
#Exercise 3.4 d)
#Grouping the user features with respect to the prediction.
grpUserTopicProfDF = userTopicTransformed.groupBy("prediction")\
                                            .agg(Func.collect_list("features").alias("paper_topic_pts"))
grpUserTopicProfDF.show(5)

In [None]:
#Exercise 3.4 d)
#Creating a user defined function.
udf_calcTopicInterClust = Func.udf(lambda x, y: interClusterDist(x, y, topicCentroids_list), FloatType())

In [None]:
#Exercise 3.4 d)
#Creating a dataframe containing the inter-cluster distance values.
topicInterClustDist = grpUserTopicProfDF.withColumn("inter_cluster_dist",\
                                                udf_calcTopicInterClust(grpUserTopicProfDF.prediction,\
                                                                        grpUserTopicProfDF.paper_topic_pts))
topicInterClustDist.show(10)

In [None]:
#Exercise 3.4 d)
#Creating a list of inter-cluster distances.
topicInterClustList = topicInterClustDist.select("prediction", "inter_cluster_dist")\
                                    .orderBy("prediction", ascending=True)\
                                    .rdd.map(lambda x: x[1]).collect()

In [None]:
#Exercise 3.4 d)
#Davies-Bouldin Index for a cluster of size 50
calcDBIndex(topicCentroids_list, topicInterClustList)