# Project 3 - BD Analytics

In [1]:
from pyspark.context import SparkContext # for RDDs
from pyspark.sql import SparkSession # for DFs

spark = (SparkSession.builder
                    .appName('BDM_project3')
                    .getOrCreate()
        ) # for DFs

## Importing the data

In [2]:
papers_df = (spark.read
             .option("inferSchema", True) # Letting Spark itself define the schema
             .json("dblp-ref-3.json")
            )

papers_df.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)



## Exploratory data analysis

In [3]:
# Subset of the dataframe
papers_df.show(5)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|AdaBoost algorith...|[Zheng Xu, Runbin...|001eef4f-1d00-4ae...|         0|[0a11984c-ab6e-4b...|A Heterogeneous S...|high performance ...|2016|
|In this paper, a ...|[Yufei Liang, Yan...|002e0b7e-d62f-414...|         0|                  []|A novel conformal...|international con...|2016|
|This paper studie...|[Xiaodong Ai, Key...|00352759-f0a7-467...|         0|[1862a08a-08c6-4a...|A source-seeking ...|international con...|2016|
|                NULL|[Francine Berman,...|00f77fa9-ae49-493...|         0|                  []|Social and ethica...|Communications of..

In [4]:
# First row
papers_df.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
# Summary statistics (for numeric columns)
papers_df.describe("n_citation", "year").toPandas()

Unnamed: 0,summary,n_citation,year
0,count,79007.0,79007.0
1,mean,7.607566418165479,2014.668105357753
2,stddev,51.072850795288375,5.485498159474597
3,min,0.0,1955.0
4,max,7091.0,2018.0


In [6]:
# Extracting features

# Add more??

## Data Preprocessing

In [7]:
!pip install langdetect



In [8]:
# Only keeping the English documents

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from langdetect import detect, LangDetectException
# Only keeping the English documents
def detect_language(text):
    if text:
        try:
            return detect(text)
        except:
            return 'unknown'
    return 'unknown'
detect_language_udf = udf(detect_language, StringType())
papers_df = papers_df.withColumn("language", detect_language_udf(papers_df.abstract))
papers_df = papers_df.filter(papers_df.language == 'en')


In [9]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Removing stopwords (with Gensim)
from gensim.parsing.preprocessing import remove_stopwords #!pip install gensim

def remove_stop_words(text):
    if text is not None:
        return remove_stopwords(text)
    return text

In [11]:
# Remove custom stopwords
custom_stop_words = [ 'doi',
'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure','rights',
'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier',
'PMC', 'CZI', 'www']
def remove_custom_stop_words(text):
    if text is not None:
        words = text.split()
        filtered_words = [word for word in words if word not in custom_stop_words]
        return ' '.join(filtered_words)
    return text


In [12]:
# Remove punctuation
import re

def remove_punctuation(text):
    if text is not None:
        return re.sub(r'[!()\[\]{};:\'"\,<>./?@#$%^&*_~]', '', text)
    return text

In [13]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import lower
from pyspark.sql.functions import regexp_extract, col

# Create a user-defined function (UDF)
remove_stop_words_udf = F.udf(remove_stop_words, StringType()) # Default return type is string
custom_stop_words_udf = F.udf(remove_custom_stop_words, StringType()) # Default return type is string
remove_punctuation_udf = F.udf(remove_punctuation, StringType()) # Default return type is string

# Apply the UDF 
# Remove stop words
papers_df = papers_df.withColumn("abstract", remove_stop_words_udf(papers_df["abstract"]))
papers_df = papers_df.withColumn("title", remove_stop_words_udf(papers_df["title"]))

# Convert into a lowercase
papers_df = papers_df.withColumn('abstract', lower(papers_df['abstract']))
papers_df = papers_df.withColumn('title', lower(papers_df['title']))

# Remove custom stop words
papers_df = papers_df.withColumn("abstract", custom_stop_words_udf(papers_df["abstract"]))
papers_df = papers_df.withColumn("title", custom_stop_words_udf(papers_df["title"]))

# Remove punctuation
papers_df = papers_df.withColumn('abstract', remove_punctuation_udf(papers_df['abstract']))
papers_df = papers_df.withColumn('title', remove_punctuation_udf(papers_df['title']))

In [14]:
papers_df.show(n=1, truncate=False, vertical=True)


-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 abstract   | ada

## Vectorization

In [15]:
from pyspark.ml.feature import Tokenizer, Word2Vec

# Converting data for ML algorithms
# Tokenize text
tokenizer = Tokenizer(inputCol="abstract", outputCol="abstract_words")
papers_df = tokenizer.transform(papers_df)
tokenizer = Tokenizer(inputCol="title", outputCol="title_words")
papers_df = tokenizer.transform(papers_df)

# Train Word2Vec model on abstract
word2Vec_abstract = Word2Vec(vectorSize=3, minCount=0, inputCol="abstract_words", outputCol="abstract_word_vectors")
model_abstract = word2Vec_abstract.fit(papers_df)
papers_df = model_abstract.transform(papers_df)

# Train Word2Vec model on title
word2Vec_title = Word2Vec(vectorSize=3, minCount=0, inputCol="title_words", outputCol="title_word_vectors")
model_title = word2Vec_title.fit(papers_df)
papers_df = model_title.transform(papers_df)


In [16]:
# Show the results
papers_df.select("title_words", "title_word_vectors").show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------+
|title_words                                                                                                                     |title_word_vectors                                               |
+--------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------+
|[a, heterogeneous, system, real-time, detection, adaboost]                                                                      |[0.13081834806750217,0.2661234525342782,0.11660287280877431]     |
|[a, novel, conformal, jigsaw, ebg, structure, design]                                                                           |[0.21808177831449677,0.07638901409726323,-0.18523061594792772]   |
|[a, source-see

## Clustering

In [17]:
# K-means and elbow method

## Search engine

In [None]:
# Recommender function