In [2]:
import os

import pyspark
import pyspark.sql.functions as F
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import types

In [3]:
# Path to the GCP service-account JSON key used by the GCS connector.
# Read it from the standard GOOGLE_APPLICATION_CREDENTIALS environment
# variable so the notebook is not tied to one machine's home directory;
# the previous hard-coded location is kept as the fallback.
credentials_location = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/de-zoom/agile-polymer-376104-a02a7ed99393.json",
)

# Spark configuration: local master using all cores, the GCS connector jar
# on the classpath, and service-account authentication pointed at the key file.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("test")
    .set("spark.jars", "./lib/gcs-connector-hadoop3-latest.jar")
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
)

In [4]:
# Reuse the running SparkContext if this cell is executed again; calling the
# SparkContext constructor a second time in the same kernel raises because
# only one context may be active per JVM.
sc = SparkContext.getOrCreate(conf=conf)

# Hadoop configuration of the underlying JVM context (accessed through the
# private _jsc handle, which is the only route PySpark exposes here).
hadoop_conf = sc._jsc.hadoopConfiguration()

# Register the GCS connector classes for the gs:// scheme and point the
# connector at the service-account key file.
hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

23/04/06 15:14:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Build (or fetch) the SparkSession on top of the context created above,
# carrying over the same configuration.
spark = (
    SparkSession.builder
    .config(conf=sc.getConf())
    .getOrCreate()
)

In [6]:
# Load the three processed parquet tables from the pwc-de-datalake bucket.
datasets_df = spark.read.parquet(
    "gs://pwc-de-datalake/data/processed_data/datasets.parquet"
)

links_papers_code_df = spark.read.parquet(
    "gs://pwc-de-datalake/data/processed_data/links_between_papers_and_code.parquet"
)

papers_df = spark.read.parquet(
    "gs://pwc-de-datalake/data/processed_data/papers_with_abstracts.parquet"
)


                                                                                

In [7]:
# Display the Python type of the object returned by spark.read.parquet.
type(papers_df)

pyspark.sql.dataframe.DataFrame

In [6]:
# Print column names, types and nullability of the datasets table as loaded.
datasets_df.printSchema()

root
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- introduced_date: timestamp (nullable = true)
 |-- tasks: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- variants: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- num_papers: long (nullable = true)



In [7]:
# Narrow num_papers from long to a 32-bit integer column.
datasets_df = datasets_df.withColumn(
    "num_papers",
    F.col("num_papers").cast(types.IntegerType()),
)

In [8]:
# Preview the first 10 rows of datasets_df (show() truncates long cell values).
datasets_df.show(10)

[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+-------------+--------------------+--------------------+-------------------+--------------------+------------------+--------------------+----------+
|                 url|         name|           full_name|         description|    introduced_date|               tasks|         languages|            variants|num_papers|
+--------------------+-------------+--------------------+--------------------+-------------------+--------------------+------------------+--------------------+----------+
|https://paperswit...|        MNIST|                    |The **MNIST** dat...|1998-11-01 00:00:00|[Density Estimati...|         [English]|[Mnist-Full, Rota...|      6014|
|https://paperswit...|       CelebA|CelebFaces Attrib...|CelebFaces Attrib...|2015-01-01 00:00:00|[Multi-Task Learn...|                []|[Celeba 256X256, ...|      2475|
|https://paperswit...|     JFT-300M|            JFT-300M|**JFT-300M** is a...|2017-07-10 00:00:00|[Image Classifica...|                []|       

                                                                                

In [9]:
# Print column names, types and nullability of the paper-to-code links table.
links_papers_code_df.printSchema()

root
 |-- paper_url: string (nullable = true)
 |-- paper_title: string (nullable = true)
 |-- paper_arxiv_id: string (nullable = true)
 |-- is_official: boolean (nullable = true)
 |-- mentioned_in_paper: boolean (nullable = true)
 |-- mentioned_in_github: boolean (nullable = true)
 |-- framework: string (nullable = true)



In [10]:
# Preview the first 10 rows of the paper-to-code links table.
links_papers_code_df.show(10)

+--------------------+--------------------+----------------+-----------+------------------+-------------------+---------+
|           paper_url|         paper_title|  paper_arxiv_id|is_official|mentioned_in_paper|mentioned_in_github|framework|
+--------------------+--------------------+----------------+-----------+------------------+-------------------+---------+
|https://paperswit...|Automatic Post-Ed...|            null|      false|             false|              false|       tf|
|https://paperswit...|Plug and Play Lan...|      1912.02164|      false|             false|              false|  pytorch|
|https://paperswit...|AttnGAN: Fine-Gra...|      1711.10485|      false|             false|              false|  pytorch|
|https://paperswit...|The Measurement C...|quant-ph/0412135|      false|             false|               true|     none|
|https://paperswit...|mudirac: a Dirac ...|      2004.11876|       true|              true|               true|     none|
|https://paperswit...|Ac

In [11]:
# Print column names, types and nullability of the papers table.
papers_df.printSchema()

root
 |-- paper_url: string (nullable = true)
 |-- arxiv_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- proceeding: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tasks: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date: timestamp (nullable = true)



In [12]:
# Preview the first 10 rows of the papers table.
papers_df.show(10)



+--------------------+----------+--------------------+--------------------+---------------+--------------------+--------------------+-------------------+
|           paper_url|  arxiv_id|               title|            abstract|     proceeding|             authors|               tasks|               date|
+--------------------+----------+--------------------+--------------------+---------------+--------------------+--------------------+-------------------+
|https://paperswit...|1805.10616|Dynamic Network M...|Can evolving netw...|NeurIPS 2018 12|[Elahe Ghalebi, J...|                  []|2018-05-27 00:00:00|
|https://paperswit...|1806.06827|PAC-Bayes bounds ...|PAC-Bayes bounds ...|NeurIPS 2018 12|[Csaba Szepesvari...|                  []|2018-06-18 00:00:00|
|https://paperswit...|1806.06820|Automated Bridge ...|This paper invest...|           null|[Tu A. Hoang, Bil...|[Big-Bench Machin...|2018-06-18 00:00:00|
|https://paperswit...|1802.06093|Gradient descent ...|We analyze algori...| 

                                                                                

In [61]:
# Pair every paper with its code links (matched on paper_url), keep only
# links that name an actual framework, and project the reporting columns.
# A paper with several code links yields several rows.
papers_framework_df = (
    papers_df
    .join(links_papers_code_df, on="paper_url", how="inner")
    .where(F.col("framework").isNotNull() & (F.col("framework") != "none"))
    .select("title", "date", "framework", "is_official")
)


In [14]:
# Re-print the datasets schema; num_papers is reported as integer after the cast.
datasets_df.printSchema()

root
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- introduced_date: timestamp (nullable = true)
 |-- tasks: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- variants: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- num_papers: integer (nullable = true)



In [15]:
# One row per (dataset, language) pair: explode() emits a row for each
# element of the languages array and produces no row for null or empty arrays.
languages_datasets_df = datasets_df.select(
    F.col("name"),
    F.col("introduced_date"),
    F.explode(F.col("languages")).alias("language"),
)

In [16]:
# Print the papers schema again for reference (proceeding is a string column).
papers_df.printSchema()

root
 |-- paper_url: string (nullable = true)
 |-- arxiv_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- proceeding: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tasks: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date: timestamp (nullable = true)



In [26]:
def clean_proceeding_name(proceeding_name: str):
    """Strip trailing non-alphabetic characters from a proceeding label.

    Everything after the last alphabetic character is removed, so a value
    such as "NeurIPS 2018 12" becomes "NeurIPS". Falsy inputs (None or the
    empty string) are returned unchanged, and a value containing no
    alphabetic character at all yields the empty string.
    """
    if not proceeding_name:
        return proceeding_name
    # Scan from the right for the last alphabetic character and cut after it.
    for end in range(len(proceeding_name), 0, -1):
        if proceeding_name[end - 1].isalpha():
            return proceeding_name[:end]
    return proceeding_name[:0]

# Expose the plain-Python cleaner to Spark as a string-returning UDF.
proceeding_udf = F.udf(clean_proceeding_name, returnType=types.StringType())

# Replace proceeding with its cleaned form, drop papers that have no
# proceeding, and keep only the columns needed downstream.
proceedings_papers_df = (
    papers_df
    .withColumn("proceeding", proceeding_udf(F.col("proceeding")))
    .where(F.col("proceeding").isNotNull())
    .select("title", "proceeding", "date")
)

In [18]:
# Print both schemas side by side; each table has a tasks array column.
papers_df.printSchema()
datasets_df.printSchema()

root
 |-- paper_url: string (nullable = true)
 |-- arxiv_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- proceeding: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tasks: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date: timestamp (nullable = true)

root
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- introduced_date: timestamp (nullable = true)
 |-- tasks: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- variants: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- num_papers: integer (nullable = true)



In [19]:
# One row per (paper date, task): rows with a null tasks array are removed
# first, then each array element becomes its own row.
papers_task_df = (
    papers_df
    .where(F.col("tasks").isNotNull())
    .select("date", F.explode("tasks").alias("task"))
)

# Same reshaping for datasets, keeping the dataset name and introduction date.
datasets_task_df = (
    datasets_df
    .where(F.col("tasks").isNotNull())
    .select("name", "introduced_date", F.explode("tasks").alias("task"))
)

In [None]:
# The five derived DataFrames built in the cells above. A notebook cell only
# displays the value of its last expression, so running this cell would
# render datasets_task_df alone; the other lines have no visible effect.
papers_framework_df
languages_datasets_df
proceedings_papers_df
papers_task_df
datasets_task_df

In [70]:
# Preview the paper/framework rows; a title repeats once per matching code link.
papers_framework_df.show(10)

+--------------------+-------------------+---------+-----------+
|               title|               date|framework|is_official|
+--------------------+-------------------+---------+-----------+
|Temporal coherenc...|2018-06-18 00:00:00|  pytorch|       true|
|Scaling Neural Ma...|2018-06-01 00:00:00|  pytorch|      false|
|Scaling Neural Ma...|2018-06-01 00:00:00|  pytorch|      false|
|Scaling Neural Ma...|2018-06-01 00:00:00|  pytorch|      false|
|Scaling Neural Ma...|2018-06-01 00:00:00|  pytorch|      false|
|Scaling Neural Ma...|2018-06-01 00:00:00|  pytorch|       true|
|Consistent Indivi...|2018-02-12 00:00:00|       tf|      false|
|Consistent Indivi...|2018-02-12 00:00:00|       tf|      false|
|Consistent Indivi...|2018-02-12 00:00:00|       tf|      false|
|Closing the Gener...|2018-06-18 00:00:00|  pytorch|       true|
+--------------------+-------------------+---------+-----------+
only showing top 10 rows



In [21]:
# Preview the exploded dataset/language rows.
languages_datasets_df.show(10)

+------------+-------------------+-----------+
|        name|    introduced_date|   language|
+------------+-------------------+-----------+
|       MNIST|1998-11-01 00:00:00|    English|
|        GLUE|2019-01-01 00:00:00|    English|
|    MultiNLI|2018-01-01 00:00:00|    English|
|    ImageNet|2009-01-01 00:00:00|    Chinese|
|    ImageNet|2009-01-01 00:00:00|    English|
|WikiText-103|2016-09-26 00:00:00|    English|
|  OpenAI Gym|2016-01-01 00:00:00|Azerbaijani|
|  WikiText-2|2016-09-26 00:00:00|    English|
|   WikiLarge|2017-01-01 00:00:00|    English|
|   Flickr30k|2014-01-01 00:00:00|    English|
+------------+-------------------+-----------+
only showing top 10 rows



In [27]:
# Preview papers with their cleaned proceeding names.
proceedings_papers_df.show(10)



+--------------------+----------+-------------------+
|               title|proceeding|               date|
+--------------------+----------+-------------------+
|Dynamic Network M...|   NeurIPS|2018-05-27 00:00:00|
|PAC-Bayes bounds ...|   NeurIPS|2018-06-18 00:00:00|
|Gradient descent ...|      ICML|2018-02-16 00:00:00|
|Scaling Neural Ma...|        WS|2018-06-01 00:00:00|
|BinGAN: Learning ...|   NeurIPS|2018-06-18 00:00:00|
|A Memory Network ...|      CVPR|2018-05-08 00:00:00|
|    Surface Networks|      CVPR|2017-05-30 00:00:00|
|Extracting Automa...|      ICML|2017-11-27 00:00:00|
|Tree Edit Distanc...|      ICML|2018-06-13 00:00:00|
|Banach Wasserstei...|   NeurIPS|2018-06-18 00:00:00|
+--------------------+----------+-------------------+
only showing top 10 rows



                                                                                

In [41]:
# Preview the exploded paper date/task rows.
papers_task_df.show(10)

+-------------------+--------------------+
|               date|                task|
+-------------------+--------------------+
|2018-06-18 00:00:00|Big-Bench Machine...|
|2018-06-18 00:00:00|Self-Supervised L...|
|2018-06-01 00:00:00|         Translation|
|2018-06-01 00:00:00|  Question Answering|
|2018-06-01 00:00:00| Machine Translation|
|2018-06-18 00:00:00|    Causal Inference|
|2018-06-18 00:00:00|          Regression|
|2018-02-12 00:00:00|Big-Bench Machine...|
|2018-06-18 00:00:00|Dimensionality Re...|
|2018-06-18 00:00:00|           Retrieval|
+-------------------+--------------------+
only showing top 10 rows



In [42]:
# Preview the exploded dataset/task rows.
datasets_task_df.show(10)

+-----+-------------------+--------------------+
| name|    introduced_date|                task|
+-----+-------------------+--------------------+
|MNIST|1998-11-01 00:00:00|  Density Estimation|
|MNIST|1998-11-01 00:00:00|Structured Predic...|
|MNIST|1998-11-01 00:00:00|Clustering Algori...|
|MNIST|1998-11-01 00:00:00|General Classific...|
|MNIST|1998-11-01 00:00:00|                 Nmt|
|MNIST|1998-11-01 00:00:00|Personalized Fede...|
|MNIST|1998-11-01 00:00:00|Sequential Image ...|
|MNIST|1998-11-01 00:00:00|Automatic Speech ...|
|MNIST|1998-11-01 00:00:00|Unsupervised Imag...|
|MNIST|1998-11-01 00:00:00|                 Sts|
+-----+-------------------+--------------------+
only showing top 10 rows

