# Project 3 - BD Analytics

In [30]:
from pyspark.context import SparkContext # for RDDs
from pyspark.sql import SparkSession # for DFs

spark = (SparkSession.builder
                    .appName('BDM_project3')
                    .getOrCreate()
        ) # for DFs

## Importing the data

In [32]:
# Reading in all the files
files = ["dblp-ref-0.json", "dblp-ref-1.json", "dblp-ref-2.json", "dblp-ref-3.json"]

papers_df = (spark.read
             .option("inferSchema", True) # Letting Spark itself define the schema
             .json(files) 
            )

papers_df.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)



In [None]:
# Reading in only one file (for testing purposes
'''
papers_df = (spark.read
             .option("inferSchema", True) # Letting Spark itself define the schema
             .json("dblp-ref-3.json")
            )

papers_df.printSchema()
'''

## Exploratory data analysis

In [33]:
# Subset of the dataframe
papers_df.show(5)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|The purpose of th...|[Makoto Satoh, Ry...|00127ee2-cb05-48c...|         0|[51c7e02e-f5ed-43...|Preliminary Desig...|international con...|2013|
|This paper descri...|[Gareth Beale, Gr...|001c58d3-26ad-46b...|        50|[10482dd3-4642-41...|A methodology for...|visual analytics ...|2011|
|This article appl...|[Altaf Hossain, F...|001c8744-73c4-4b0...|        50|[2d84c0f2-e656-4c...|Comparison of GAR...|pattern recogniti...|2009|
|                NULL|[Jea-Bum Park, By...|00338203-9eb3-40c...|         0|[8c78e4b0-632b-42...|Development of Re...|                   

In [34]:
# Shape of the dataframe
print((papers_df.count(), len(papers_df.columns)))

(3079007, 8)


In [35]:
# First row
papers_df.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 abstract   | The purpose of this study is to develop a learning tool for high school students studying the scientific aspects of information and communication net- works. More specifically, we focus on the basic principles of network proto- cols as the aim to develop our learning tool. Our tool gives students hands-on experience to help understand the basic principles of network protocols. 
 authors    | [Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Kazunori Itoh, Masami Hashimoto, Makoto Otani, Michio Shimizu, Masahiko Sugimoto]                                                                       

In [39]:
# Number on NaN values in each column 
# https://stackoverflow.com/questions/44627386/how-to-find-count-of-null-and-nan-values-for-each-column-in-a-pyspark-dataframe

Dict_Null = {col:papers_df.filter(papers_df[col].isNull()).count() for col in papers_df.columns}
Dict_Null

{'abstract': 530475,
 'authors': 4,
 'id': 0,
 'n_citation': 0,
 'references': 362865,
 'title': 0,
 'venue': 0,
 'year': 0}

In [40]:
# Summary statistics (for numeric columns)
papers_df.describe("n_citation", "year").toPandas()

Unnamed: 0,summary,n_citation,year
0,count,3079007.0,3079007.0
1,mean,35.220902713114974,2007.7665994263732
2,stddev,157.70065110545153,7.8165384986224415
3,min,0.0,1936.0
4,max,73362.0,2018.0


## Data Preprocessing

In [41]:
# Only keeping the English documents

# !pip install langdetect

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from langdetect import detect, LangDetectException
# Only keeping the English documents
def detect_language(text):
    if text:
        try:
            return detect(text)
        except:
            return 'unknown'
    return 'unknown'
detect_language_udf = udf(detect_language, StringType())
papers_df = papers_df.withColumn("language", detect_language_udf(papers_df.abstract))
papers_df = papers_df.filter(papers_df.language == 'en')


In [42]:
# Removing stopwords (with Gensim)
# !pip install gensim
from gensim.parsing.preprocessing import remove_stopwords #!pip install gensim

def remove_stop_words(text):
    if text is not None:
        return remove_stopwords(text)
    return text

In [43]:
# Remove custom stopwords
custom_stop_words = [ 'doi',
'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure','rights',
'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier',
'PMC', 'CZI', 'www']
def remove_custom_stop_words(text):
    if text is not None:
        words = text.split()
        filtered_words = [word for word in words if word not in custom_stop_words]
        return ' '.join(filtered_words)
    return text


In [44]:
# Remove punctuation
import re

def remove_punctuation(text):
    if text is not None:
        return re.sub(r'[!()\[\]{};:\'"\,<>./?@#$%^&*_~]', '', text)
    return text

In [45]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import lower
from pyspark.sql.functions import regexp_extract, col

# Create a user-defined function (UDF)
remove_stop_words_udf = F.udf(remove_stop_words, StringType()) # Default return type is string
custom_stop_words_udf = F.udf(remove_custom_stop_words, StringType()) # Default return type is string
remove_punctuation_udf = F.udf(remove_punctuation, StringType()) # Default return type is string

# Apply the UDF 
# Remove stop words
papers_df = papers_df.withColumn("abstract", remove_stop_words_udf(papers_df["abstract"]))
papers_df = papers_df.withColumn("title", remove_stop_words_udf(papers_df["title"]))

# Convert into a lowercase
papers_df = papers_df.withColumn('abstract', lower(papers_df['abstract']))
papers_df = papers_df.withColumn('title', lower(papers_df['title']))

# Remove custom stop words
papers_df = papers_df.withColumn("abstract", custom_stop_words_udf(papers_df["abstract"]))
papers_df = papers_df.withColumn("title", custom_stop_words_udf(papers_df["title"]))

# Remove punctuation
papers_df = papers_df.withColumn('abstract', remove_punctuation_udf(papers_df['abstract']))
papers_df = papers_df.withColumn('title', remove_punctuation_udf(papers_df['title']))

In [46]:
papers_df.show(n=1, truncate=False, vertical=True)


-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 abstract   | the purpose study develop learning tool high school students studying scientific aspects information communication net- works more specifically focus basic principles network proto- cols aim develop learning tool our tool gives students hands-on experience help understand basic principles network protocols 
 authors    | [Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Kazunori Itoh, Masami Hashimoto, Makoto Otani, Michio Shimizu, Masahiko Sugimoto]                                                                                                                                                                                       
 id         | 00127ee2-cb05-48c

## Vectorization

In [13]:
from pyspark.ml.feature import Tokenizer, Word2Vec

# Converting data for ML algorithms
# Tokenize text
tokenizer = Tokenizer(inputCol="abstract", outputCol="abstract_words")
papers_df = tokenizer.transform(papers_df)
tokenizer = Tokenizer(inputCol="title", outputCol="title_words")
papers_df = tokenizer.transform(papers_df)

# Train Word2Vec model on abstract
word2Vec_abstract = Word2Vec(vectorSize=3, minCount=0, inputCol="abstract_words", outputCol="abstract_word_vectors")
model_abstract = word2Vec_abstract.fit(papers_df)
papers_df = model_abstract.transform(papers_df)

# Train Word2Vec model on title
word2Vec_title = Word2Vec(vectorSize=3, minCount=0, inputCol="title_words", outputCol="title_word_vectors")
model_title = word2Vec_title.fit(papers_df)
papers_df = model_title.transform(papers_df)


In [14]:
# Show the results
papers_df.select("title_words", "title_word_vectors").show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+
|title_words                                                                                                                     |title_word_vectors                                              |
+--------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+
|[a, heterogeneous, system, real-time, detection, adaboost]                                                                      |[0.1047421395778656,-0.19825784070417285,-0.2943623377941549]   |
|[a, novel, conformal, jigsaw, ebg, structure, design]                                                                           |[-0.01776855013200215,0.11805542784609964,-0.18724185600876808] |
|[a, source-seeking,

In [16]:
#papers_df.toPandas()

Unnamed: 0,abstract,authors,id,n_citation,references,title,venue,year,language,abstract_words,title_words,abstract_word_vectors,title_word_vectors
0,adaboost algorithm based haar-like features ac...,"[Zheng Xu, Runbin Shi, Zhihao Sun, Yaqi Li, Yu...",001eef4f-1d00-4ae6-8b4f-7e66344bbc6e,0,"[0a11984c-ab6e-4b75-9291-e1b700c98d52, 1f4152a...",a heterogeneous system real-time detection ada...,high performance computing and communications,2016,en,"[adaboost, algorithm, based, haar-like, featur...","[a, heterogeneous, system, real-time, detectio...","[0.11266263969710294, 0.14881521901033357, 0.0...","[0.1047421395778656, -0.19825784070417285, -0...."
1,in paper kind novel jigsaw ebg structure desig...,"[Yufei Liang, Yan Zhang, Tao Dong, Shan-wei Lu]",002e0b7e-d62f-4140-b015-1fe29a9acbaa,0,[],a novel conformal jigsaw ebg structure design,international conference on conceptual structures,2016,en,"[in, paper, kind, novel, jigsaw, ebg, structur...","[a, novel, conformal, jigsaw, ebg, structure, ...","[0.26782341763426204, 0.08140046438692432, 0.0...","[-0.01776855013200215, 0.11805542784609964, -0..."
2,this paper studies problem autonomous underwat...,"[Xiaodong Ai, Keyou You, Shiji Song]",00352759-f0a7-4678-82ae-fed68c700da6,0,"[1862a08a-08c6-4ab1-a214-8932bbd0d2d9, 7bcea2f...",a source-seeking strategy autonomous underwate...,"international conference on control, automatio...",2016,en,"[this, paper, studies, problem, autonomous, un...","[a, source-seeking, strategy, autonomous, unde...","[0.15594720700207879, 0.2469724685748053, 0.06...","[0.1360605876478884, -0.14277052051491207, -0...."
3,this paper presents methodology analyze lingui...,"[Vincent Buntinx, Cyril Bornet, Frédéric Kaplan]",01522369-3b88-4256-99d4-4e0c1de9f1ba,0,"[426b57a8-2e7d-498d-9a57-c09983ae0699, 6499d57...",studying linguistic changes 200 years newspape...,Frontiers in Digital Humanities,2017,en,"[this, paper, presents, methodology, analyze, ...","[studying, linguistic, changes, 200, years, ne...","[-0.037486696774049574, 0.17322031537848478, 0...","[0.04104703077529039, 0.03502231393940747, -0...."
4,boneh durfee eurocrypt 1999 proposed polynomia...,"[Atsushi Takayasu, Noboru Kunihiro]",01537b60-9ae2-4684-a1fa-e688e7757e6f,0,,small secret exponent attacks rsa unbalanced p...,international symposium on information theory ...,2016,en,"[boneh, durfee, eurocrypt, 1999, proposed, pol...","[small, secret, exponent, attacks, rsa, unbala...","[0.18371008007281173, 0.13022235570673457, -0....","[0.10223487787880003, -0.09928686544299126, -0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
44845,we analyse problem aggregating judgments multi...,"[Irem Bozbay, Franz Dietrich, Hans Peters]",fab4bc37-adaf-46a8-928f-cb232d14a574,0,,judgment aggregation search truth,,2014,en,"[we, analyse, problem, aggregating, judgments,...","[judgment, aggregation, search, truth]","[0.020626525186220712, 0.05914261931967404, 0....","[0.07361198449507356, 0.067861866671592, -0.19..."
44846,the utilization icts creating new jobs elimina...,"[Marios Pappas, Yannis Papagerasimou, Athanasi...",fb60d6a0-ac9f-40cf-a4b7-9dfbcd0951e7,0,,ict-based innovation employability women,,2017,en,"[the, utilization, icts, creating, new, jobs, ...","[ict-based, innovation, employability, women]","[-0.1855058832864823, -0.0411357720350397, 0.4...","[-0.05307137384079397, 0.08278964925557375, 0...."
44847,in infinite horizon inventory sales model sell...,"[Anita van den Berg, Jean-Jacques Herings, Han...",fc0809cd-e8b5-4f50-a929-e09312c59fe0,0,"[35a8f1b8-4e51-4928-9603-1f6a1872d02b, 460f12f...",the economic order decision continuous dynamic...,Operations Research Letters,2017,en,"[in, infinite, horizon, inventory, sales, mode...","[the, economic, order, decision, continuous, d...","[0.3582092604690021, -0.023328542408923948, 0....","[0.19857570487591955, 0.10304184188134968, -0...."
44848,infrared imaging technology study deep-space b...,"[Michele Dei, Stepan Sutula, Jose Cisneros, Er...",fc27af72-9f8c-4bda-8f19-11100096ae59,0,,a robust 966-db-sndr 50-khz-bandwidth switched...,Sensors,2017,en,"[infrared, imaging, technology, study, deep-sp...","[a, robust, 966-db-sndr, 50-khz-bandwidth, swi...","[0.21958948684480498, -0.06015576256782208, 0....","[-0.004775028083134782, -0.016148306598717518,..."


## Clustering

### Elbow method for number of clusters

In [29]:
# Calculate cost and plot
import numpy as np
import pandas as pd
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import concat

n=15
cost = np.zeros(n)


#papers_df_vectors = papers_df.select("abstract_word_vectors", "title_word_vectors")
#papers_df_vectors.select(concat(papers_df_vectors.abstract_word_vectors, papers_df_vectors.title_word_vectors).alias("features")).collect()

# Dataset has to have one column with the name 'features' that the model uses. 
# But what info and in which form should it be?


for k in range(2,n):
    kmeans = KMeans(k=k, seed=3)
    model = kmeans.fit(papers_df_vectors)
    #cost[k] = model.computeCost(papers_df_vectors)
    clusterdData = model.transform(papers_df_vectors)
    evaluator = ClusteringEvaluator()
    cost[k] = evaluator.evaluate(clusterdData)

# Plot the cost
df_cost = pd.DataFrame(cost[2:])
df_cost.columns = ["cost"]
new_col = range(2,n)
df_cost.insert(0, 'cluster', new_col)

import pylab as pl
pl.plot(df_cost.cluster, df_cost.cost)
pl.xlabel('Number of Clusters')
pl.ylabel('Score')
pl.title('Elbow Curve')
pl.show()

AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "concat(abstract_word_vectors, title_word_vectors)" due to data type mismatch: Parameter 1 requires the ("STRING" or "BINARY" or "ARRAY") type, however "abstract_word_vectors" has the type "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>".;
'Project [concat(abstract_word_vectors#495, title_word_vectors#521) AS features#598]
+- Project [abstract_word_vectors#495, title_word_vectors#521]
   +- Project [abstract#379, authors#9, id#10, n_citation#11L, references#12, title#390, venue#14, year#15L, language#304, abstract_words#454, title_words#471, abstract_word_vectors#495, UDF(title_words#471) AS title_word_vectors#521]
      +- Project [abstract#379, authors#9, id#10, n_citation#11L, references#12, title#390, venue#14, year#15L, language#304, abstract_words#454, title_words#471, UDF(abstract_words#454) AS abstract_word_vectors#495]
         +- Project [abstract#379, authors#9, id#10, n_citation#11L, references#12, title#390, venue#14, year#15L, language#304, abstract_words#454, UDF(title#390) AS title_words#471]
            +- Project [abstract#379, authors#9, id#10, n_citation#11L, references#12, title#390, venue#14, year#15L, language#304, UDF(abstract#379) AS abstract_words#454]
               +- Project [abstract#379, authors#9, id#10, n_citation#11L, references#12, remove_punctuation(title#368)#389 AS title#390, venue#14, year#15L, language#304]
                  +- Project [remove_punctuation(abstract#357)#378 AS abstract#379, authors#9, id#10, n_citation#11L, references#12, title#368, venue#14, year#15L, language#304]
                     +- Project [abstract#357, authors#9, id#10, n_citation#11L, references#12, remove_custom_stop_words(title#346)#367 AS title#368, venue#14, year#15L, language#304]
                        +- Project [remove_custom_stop_words(abstract#336)#356 AS abstract#357, authors#9, id#10, n_citation#11L, references#12, title#346, venue#14, year#15L, language#304]
                           +- Project [abstract#336, authors#9, id#10, n_citation#11L, references#12, lower(title#326) AS title#346, venue#14, year#15L, language#304]
                              +- Project [lower(abstract#315) AS abstract#336, authors#9, id#10, n_citation#11L, references#12, title#326, venue#14, year#15L, language#304]
                                 +- Project [abstract#315, authors#9, id#10, n_citation#11L, references#12, remove_stop_words(title#13)#325 AS title#326, venue#14, year#15L, language#304]
                                    +- Project [remove_stop_words(abstract#8)#314 AS abstract#315, authors#9, id#10, n_citation#11L, references#12, title#13, venue#14, year#15L, language#304]
                                       +- Filter (language#304 = en)
                                          +- Project [abstract#8, authors#9, id#10, n_citation#11L, references#12, title#13, venue#14, year#15L, detect_language(abstract#8)#303 AS language#304]
                                             +- Relation [abstract#8,authors#9,id#10,n_citation#11L,references#12,title#13,venue#14,year#15L] json


### KMeans clustering

In [17]:
# K-means (from practise session)

from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=3, seed=42)
model = kmeans.fit(transformed_data) # What shape should transformed_data be?

In [None]:
# Create the clusters using the model
clusterdData = model.transform(transformed_data)

In [None]:
# Evaluating: the silhouette value

from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(clusterdData)
print('Silhouette with squared euclidean distance = ', silhouette)

## Search engine

In [None]:
# Recommender function

# Some ideas:
# https://asdkazmi.medium.com/ai-movies-recommendation-system-with-clustering-based-k-means-algorithm-f04467e02fcd
# https://www.activestate.com/blog/exploring-k-means-clustering-in-big-data-using-python/