# Project 3 - BD Analytics

In [1]:
from pyspark.context import SparkContext # for RDDs
from pyspark.sql import SparkSession # for DFs

spark = (SparkSession.builder
                    .appName('BDM_project3')
                    .getOrCreate()
        ) # for DFs

## Importing the data

In [2]:
# Reading in all the files
#files = ["dblp-ref-0.json", "dblp-ref-1.json", "dblp-ref-2.json", "dblp-ref-3.json"]
files = ["dblp-ref-3.json"] # testing with one file
 
papers_df = (spark.read
             .option("inferSchema", True) # Letting Spark itself define the schema
             .json(files) 
            )

papers_df.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)



## Exploratory data analysis

In [3]:
# Subset of the dataframe
papers_df.show(5)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|AdaBoost algorith...|[Zheng Xu, Runbin...|001eef4f-1d00-4ae...|         0|[0a11984c-ab6e-4b...|A Heterogeneous S...|high performance ...|2016|
|In this paper, a ...|[Yufei Liang, Yan...|002e0b7e-d62f-414...|         0|                  []|A novel conformal...|international con...|2016|
|This paper studie...|[Xiaodong Ai, Key...|00352759-f0a7-467...|         0|[1862a08a-08c6-4a...|A source-seeking ...|international con...|2016|
|                NULL|[Francine Berman,...|00f77fa9-ae49-493...|         0|                  []|Social and ethica...|Communications of..

In [4]:
# Shape of the dataframe
print((papers_df.count(), len(papers_df.columns)))

(79007, 8)


In [5]:
# First row
papers_df.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Number of non-NaN values in each column
# https://stackoverflow.com/questions/33900726/count-number-of-non-nan-entries-in-each-column-of-spark-dataframe-in-pyspark

from pyspark.sql.functions import col, count, isnan, lit, sum

def count_not_null(c, nan_as_null=False):
    pred = col(c).isNotNull() & (~isnan(c) if nan_as_null else lit(True))
    return sum(pred.cast("integer")).alias(c)

papers_df.agg(*[count_not_null(c) for c in papers_df.columns]).show()

+--------+-------+-----+----------+----------+-----+-----+-----+
|abstract|authors|   id|n_citation|references|title|venue| year|
+--------+-------+-----+----------+----------+-----+-----+-----+
|   44970|  79007|79007|     79007|     49546|79007|79007|79007|
+--------+-------+-----+----------+----------+-----+-----+-----+



In [7]:
# Summary statistics (for numeric columns)
papers_df.describe("n_citation", "year").toPandas()

Unnamed: 0,summary,n_citation,year
0,count,79007.0,79007.0
1,mean,7.607566418165479,2014.668105357753
2,stddev,51.0728507952884,5.48549815947471
3,min,0.0,1955.0
4,max,7091.0,2018.0


## Data Preprocessing

In [8]:
# Only keeping the English documents

# !pip install langdetect

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from langdetect import detect, LangDetectException
# Only keeping the English documents
def detect_language(text):
    if text:
        try:
            return detect(text)
        except:
            return 'unknown'
    return 'unknown'
detect_language_udf = udf(detect_language, StringType())
papers_df = papers_df.withColumn("language", detect_language_udf(papers_df.abstract))
papers_df = papers_df.filter(papers_df.language == 'en')


In [9]:
# Removing stopwords (with Gensim)
# !pip install gensim
from gensim.parsing.preprocessing import remove_stopwords #!pip install gensim

def remove_stop_words(text):
    if text is not None:
        return remove_stopwords(text)
    return text

In [10]:
# Remove custom stopwords
custom_stop_words = [ 'doi',
'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure','rights',
'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier',
'PMC', 'CZI', 'www']
def remove_custom_stop_words(text):
    if text is not None:
        words = text.split()
        filtered_words = [word for word in words if word not in custom_stop_words]
        return ' '.join(filtered_words)
    return text


In [11]:
# Remove punctuation
import re

def remove_punctuation(text):
    if text is not None:
        return re.sub(r'[!()\[\]{};:\'"\,<>./?@#$%^&*_~]', '', text)
    return text

In [12]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import lower
from pyspark.sql.functions import regexp_extract, col

# Create a user-defined function (UDF)
remove_stop_words_udf = F.udf(remove_stop_words, StringType()) # Default return type is string
custom_stop_words_udf = F.udf(remove_custom_stop_words, StringType()) # Default return type is string
remove_punctuation_udf = F.udf(remove_punctuation, StringType()) # Default return type is string

# Apply the UDF 
# Remove stop words
papers_df = papers_df.withColumn("abstract", remove_stop_words_udf(papers_df["abstract"]))
papers_df = papers_df.withColumn("title", remove_stop_words_udf(papers_df["title"]))

# Convert into a lowercase
papers_df = papers_df.withColumn('abstract', lower(papers_df['abstract']))
papers_df = papers_df.withColumn('title', lower(papers_df['title']))

# Remove custom stop words
papers_df = papers_df.withColumn("abstract", custom_stop_words_udf(papers_df["abstract"]))
papers_df = papers_df.withColumn("title", custom_stop_words_udf(papers_df["title"]))

# Remove punctuation
papers_df = papers_df.withColumn('abstract', remove_punctuation_udf(papers_df['abstract']))
papers_df = papers_df.withColumn('title', remove_punctuation_udf(papers_df['title']))

In [13]:
papers_df.show(n=1, truncate=False, vertical=True)


-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 abstract   | ada

## Vectorization

In [14]:
from pyspark.sql.functions import concat_ws, col

# Combine title and abstract into one column
papers_df = papers_df.fillna({'title': '', 'abstract': ''})
papers_df = papers_df.withColumn("text", concat_ws(" ", col("title"), col("abstract")))

In [15]:
papers_df.show()

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------+--------------------+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|language|                text|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------+--------------------+
|adaboost algorith...|[Zheng Xu, Runbin...|001eef4f-1d00-4ae...|         0|[0a11984c-ab6e-4b...|a heterogeneous s...|high performance ...|2016|      en|a heterogeneous s...|
|in paper kind nov...|[Yufei Liang, Yan...|002e0b7e-d62f-414...|         0|                  []|a novel conformal...|international con...|2016|      en|a novel conformal...|
|this paper studie...|[Xiaodong Ai, Key...|00352759-f0a7-467...|         0|[1862a08a-08c6-4a...|a source-seeking ...|international

In [16]:
from pyspark.ml.feature import Tokenizer, Word2Vec
from pyspark.sql.functions import size

# Tokenize the combined text
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized_data = tokenizer.transform(papers_df)

# Filter out rows where 'words' column is null or empty
filtered_data = tokenized_data.filter(size(col("words")) > 0)

In [17]:
# Vectorize using Word2Vec
word2vec = Word2Vec(vectorSize=3, minCount=1, inputCol="words", outputCol="features1", numPartitions=4)
word2vec_model = word2vec.fit(filtered_data)
result = word2vec_model.transform(filtered_data)

## Clustering

### Elbow method for number of clusters

In [18]:
from pyspark.ml.feature import PCA

pca = PCA(k=3, inputCol="features1", outputCol="features")
pca_model = pca.fit(result)
pca_result = pca_model.transform(result)

In [19]:
pca_result = pca_result.dropna()

In [20]:
#pca_result.select("features").show(n=5, truncate=False, vertical=True)

In [21]:
# Repartition data if necessary
pca_result = pca_result.repartition(50)

In [None]:
cost = []

for k in range(2, 21):
    kmeans = KMeans(k=k, seed=3)
    model = kmeans.fit(pca_result)
    evaluator = ClusteringEvaluator()  
    cost.append(model.summary.trainingCost)


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(int(range(2, 21)), cost, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Cost')
plt.title('Elbow Curve')
plt.show()

### KMeans clustering

In [22]:
# K-means (from practise session)

from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=6, seed=42)
model = kmeans.fit(pca_result) # What shape should transformed_data be?

In [23]:
# Create the clusters using the model
clusters = model.transform(pca_result)

In [24]:
# Evaluating: the silhouette value

from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(clusters)
print('Silhouette with squared euclidean distance = ', silhouette)

Silhouette with squared euclidean distance =  0.47008049439904004


## Search engine

In [25]:
clusters.show(5)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------+--------------------+--------------------+--------------------+--------------------+----------+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|language|                text|               words|           features1|            features|prediction|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------+--------------------+--------------------+--------------------+--------------------+----------+
|a number improvem...|[Rostislav V. Lap...|2c4fa78a-7e5f-400...|         0|[9ab4d920-d1c9-42...|an improved param...|arXiv: Optimizati...|2017|      en|an improved param...|[an, improved, pa...|[0.24362396326790...|[-0.1854054560534...|         5|
|prostat

In [26]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import Normalizer
from pyspark.sql.functions import lit
from pyspark.sql.types import FloatType
from numpy.linalg import norm
import numpy as np
import random

from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity(v1, v2):
    return random.random()
    #return float(v1.dot(v2) / (v1.norm(2) * v2.norm(2)))
    #return np.dot(v1,v2)/(norm(v1)*norm(v2))

# Create a user-defined function (UDF)
cosine_similarity_udf = F.udf(cosine_similarity, FloatType())

In [27]:
def recommend_papers(title, cluster_nr, vector, top_papers):

    # Only wanted cluster
    cluster = clusters.filter((clusters.prediction==cluster_nr)) 
    # Finding the score for every row (article)
    cluster = cluster.withColumn("score", cosine_similarity_udf(cluster["features"], lit(vector)))
    # Sorting based on score
    #cluster = cluster.sort(cluster.score.desc()).collect()
    cluster = cluster.sort(F.col("score").desc())
    # Returning top papers ( +1 because we return this paper itself)
    top6 = cluster.limit(top_papers)
    
    return top6

In [28]:
title = "extending soft sets optimality decision based multiple decisions same data"
cluster_nr = 0
return_paper_amount = 5
vector = [-0.10243023044169974,0.11760577260862987,-0.1057083581270069,-0.1793964755532844,-0.0763572113374742,0.06533546890329417,0.05405455779683669,0.0014953876143246753,0.05579619871718542,-0.06045485277158717,-0.031985408328361255,-0.0065214477254662735,0.055045270135136444,-0.039381974801982335,0.1150758126529062,0.04839263885073337,0.00924491386608666,-0.047626787554568506,0.062001876267498854,0.23482869677353602] 

In [29]:
top_papers = recommend_papers(title, cluster_nr, vector, return_paper_amount)
top_papers.show(10)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------+--------------------+--------------------+--------------------+--------------------+----------+----------+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|language|                text|               words|           features1|            features|prediction|     score|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------+--------------------+--------------------+--------------------+--------------------+----------+----------+
|the paper based c...|[Heng Wang, Xin W...|31bd5651-63b3-41a...|         0|[cf97d226-bed1-4c...|functional resear...|biomedical engine...|2016|      en|functional resear...|[functional, rese...|[0.22399424616190...|[0.20781