# Exploratory Analysis -- Metadata


#### Imports


In [78]:
import sys
!{sys.executable} -m pip install umap-learn

Collecting umap-learn
  Downloading umap-learn-0.5.1.tar.gz (80 kB)
[K     |████████████████████████████████| 80 kB 11.8 MB/s eta 0:00:01
Collecting numba>=0.49
  Downloading numba-0.53.1-cp37-cp37m-manylinux2014_x86_64.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 55.5 MB/s eta 0:00:01
[?25hCollecting pynndescent>=0.5
  Downloading pynndescent-0.5.2.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 94.5 MB/s eta 0:00:01
Collecting llvmlite<0.37,>=0.36.0rc1
  Downloading llvmlite-0.36.0-cp37-cp37m-manylinux2010_x86_64.whl (25.3 MB)
[K     |████████████████████████████████| 25.3 MB 78.8 MB/s eta 0:00:01
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25ldone
[?25h  Created wheel for umap-learn: filename=umap_learn-0.5.1-py3-none-any.whl size=76566 sha256=ca9ead313617f18dee2695f6a1987c8057b00d47d98e30094d14fd17e9de15bc
  Stored in directory: /home/hadoop/.cache/pip/wheels/01/e7/bb/347dc

In [79]:
import numpy as np
import umap

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkContext

import time 
from time import sleep

sc = SparkContext()
sc

In [2]:
spark = SparkSession \
 .builder \
 .appName("Python Spark SQL basic example") \
 .config("spark.some.config.option", "some-value") \
 .getOrCreate()

In [3]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType,ArrayType

### Read in metadata 

In [9]:
data = spark.read.csv('s3://ai2-semanticscholar-cord-19/2021-05-03/metadata.csv/',header = True)

In [10]:
data.cache()

DataFrame[cord_uid: string, sha: string, source_x: string, title: string, doi: string, pmcid: string, pubmed_id: string, license: string, abstract: string, publish_time: string, authors: string, journal: string, mag_id: string, who_covidence_id: string, arxiv_id: string, pdf_json_files: string, pmc_json_files: string, url: string, s2_id: string]

In [11]:
data.columns

['cord_uid',
 'sha',
 'source_x',
 'title',
 'doi',
 'pmcid',
 'pubmed_id',
 'license',
 'abstract',
 'publish_time',
 'authors',
 'journal',
 'mag_id',
 'who_covidence_id',
 'arxiv_id',
 'pdf_json_files',
 'pmc_json_files',
 'url',
 's2_id']

In [12]:
data[['cord_uid','title','abstract','authors','journal','pmc_json_files','url']].show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|cord_uid|               title|            abstract|             authors|             journal|      pmc_json_files|                 url|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|ug7v899j|Clinical features...|OBJECTIVE: This r...|Madani, Tariq A; ...|      BMC Infect Dis|document_parses/p...|https://www.ncbi....|
|02tnwd4m|Nitric oxide: a p...|Inflammatory dise...|Vliet, Albert van...|          Respir Res|document_parses/p...|https://www.ncbi....|
|ejv2xln0|Surfactant protei...|Surfactant protei...|     Crouch, Erika C|          Respir Res|document_parses/p...|https://www.ncbi....|
|2b73a28n|Role of endotheli...|Endothelin-1 (ET-...|Fagan, Karen A; M...|          Respir Res|document_parses/p...|https://www.ncbi....|
|9785vg6d|Gene expression i...|Respirator

## Load embeddings from S3 bucket

In [16]:
embedding = spark.read.csv('s3://hids511final/cord_19_embeddings_2021-05-03.csv',header = False)

In [23]:
embedding.cache()

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string, _c23: string, _c24: string, _c25: string, _c26: string, _c27: string, _c28: string, _c29: string, _c30: string, _c31: string, _c32: string, _c33: string, _c34: string, _c35: string, _c36: string, _c37: string, _c38: string, _c39: string, _c40: string, _c41: string, _c42: string, _c43: string, _c44: string, _c45: string, _c46: string, _c47: string, _c48: string, _c49: string, _c50: string, _c51: string, _c52: string, _c53: string, _c54: string, _c55: string, _c56: string, _c57: string, _c58: string, _c59: string, _c60: string, _c61: string, _c62: string, _c63: string, _c64: string, _c65: string, _c66: string, _c67: string, _c68: string, _c69: string, _c70: string, _c71: 

In [22]:
embedding.columns

['_c0',
 '_c1',
 '_c2',
 '_c3',
 '_c4',
 '_c5',
 '_c6',
 '_c7',
 '_c8',
 '_c9',
 '_c10',
 '_c11',
 '_c12',
 '_c13',
 '_c14',
 '_c15',
 '_c16',
 '_c17',
 '_c18',
 '_c19',
 '_c20',
 '_c21',
 '_c22',
 '_c23',
 '_c24',
 '_c25',
 '_c26',
 '_c27',
 '_c28',
 '_c29',
 '_c30',
 '_c31',
 '_c32',
 '_c33',
 '_c34',
 '_c35',
 '_c36',
 '_c37',
 '_c38',
 '_c39',
 '_c40',
 '_c41',
 '_c42',
 '_c43',
 '_c44',
 '_c45',
 '_c46',
 '_c47',
 '_c48',
 '_c49',
 '_c50',
 '_c51',
 '_c52',
 '_c53',
 '_c54',
 '_c55',
 '_c56',
 '_c57',
 '_c58',
 '_c59',
 '_c60',
 '_c61',
 '_c62',
 '_c63',
 '_c64',
 '_c65',
 '_c66',
 '_c67',
 '_c68',
 '_c69',
 '_c70',
 '_c71',
 '_c72',
 '_c73',
 '_c74',
 '_c75',
 '_c76',
 '_c77',
 '_c78',
 '_c79',
 '_c80',
 '_c81',
 '_c82',
 '_c83',
 '_c84',
 '_c85',
 '_c86',
 '_c87',
 '_c88',
 '_c89',
 '_c90',
 '_c91',
 '_c92',
 '_c93',
 '_c94',
 '_c95',
 '_c96',
 '_c97',
 '_c98',
 '_c99',
 '_c100',
 '_c101',
 '_c102',
 '_c103',
 '_c104',
 '_c105',
 '_c106',
 '_c107',
 '_c108',
 '_c109',
 '_c110',


In [24]:
embedding.count()

552928

In [25]:
## Vectorize data 
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model

In [31]:
embed_cols = [i for i in embedding.columns if i != "_c0"]

In [53]:
from pyspark.sql.functions import col

embedding_float = embedding.select(*(c if c =="_c0" else col(c).cast("float").alias(c) for c in embedding.columns))

In [58]:
embedding_float[['_c0','_c1']].show()

+--------+-----------+
|     _c0|        _c1|
+--------+-----------+
|ug7v899j| -2.9399836|
|02tnwd4m|  4.6884656|
|ejv2xln0|  0.5599198|
|2b73a28n|  2.3525403|
|9785vg6d| -1.2909216|
|zjufx4fo|  2.7210763|
|5yhe786e|  -3.562534|
|8zchiykl|-0.00439921|
|8qnrcgnk|   2.569323|
|jg13scgo|  -4.686873|
|5tkvsudh| -0.0718144|
|6lvn10f4|  -5.229452|
|tvxpckxo| -3.9141045|
|mcuixluu|  -2.864266|
|6iu1dtyl|  2.9879527|
|t35n7bk9| -3.9089847|
|eiqypt0m| -2.7213426|
|sgmk96vr| -5.2066107|
|di0fcy0j|  1.9947275|
|4k8f7ou1| -0.0629611|
+--------+-----------+
only showing top 20 rows



In [59]:
assembler = VectorAssembler(
    inputCols= embed_cols,
    outputCol="EMBEDDING")

output = assembler.transform(embedding_float)

In [63]:
output[['_c0','EMBEDDING']].show()

+--------+--------------------+
|     _c0|           EMBEDDING|
+--------+--------------------+
|ug7v899j|[-2.9399836063385...|
|02tnwd4m|[4.68846559524536...|
|ejv2xln0|[0.55991977453231...|
|2b73a28n|[2.35254025459289...|
|9785vg6d|[-1.2909215688705...|
|zjufx4fo|[2.72107625007629...|
|5yhe786e|[-3.5625340938568...|
|8zchiykl|[-0.0043992102146...|
|8qnrcgnk|[2.56932306289672...|
|jg13scgo|[-4.6868729591369...|
|5tkvsudh|[-0.0718144029378...|
|6lvn10f4|[-5.2294521331787...|
|tvxpckxo|[-3.9141044616699...|
|mcuixluu|[-2.8642659187316...|
|6iu1dtyl|[2.98795270919799...|
|t35n7bk9|[-3.9089846611022...|
|eiqypt0m|[-2.7213425636291...|
|sgmk96vr|[-5.2066106796264...|
|di0fcy0j|[1.99472749233245...|
|4k8f7ou1|[-0.0629611015319...|
+--------+--------------------+
only showing top 20 rows



In [77]:
output.take(1)[0]['EMBEDDING'].shape

(768,)

### Explore Metadata

##### Number of documents

In [181]:
data.count()

536817

##### Number of unique cord_uids

In [182]:
data[['cord_uid']].distinct().count()

508313

#### Number of unique Journals 


In [183]:
data[['journal']].distinct().count()

45848

#### Number of Abtracts per Journal

In [184]:
data[['journal']].groupBy("journal").count().orderBy(F.col("count").desc()).show()

+--------------------+-----+
|             journal|count|
+--------------------+-----+
|                null|33977|
|             bioRxiv| 5070|
|            PLoS One| 4854|
|                 BMJ| 4583|
|             Sci Rep| 2457|
|Int J Environ Res...| 2290|
|              Nature| 2205|
|              Lancet| 2187|
| Journal of virology| 1869|
|                JAMA| 1744|
|    Int J Infect Dis| 1611|
|              Cureus| 1590|
|             Viruses| 1518|
|             Science| 1517|
|         J Med Virol| 1295|
|    Emerg Infect Dis| 1286|
|       Front Immunol| 1278|
|     Clin Infect Dis| 1245|
|Int. j. environ. ...| 1239|
|               Chest| 1120|
+--------------------+-----+
only showing top 20 rows



##### Number of unique authors 

In [185]:
## Total rows vs dropping rows with missing authors
data.count(),data.na.drop(subset='authors').count()

(536817, 522012)

In [189]:
counter = (data.na.drop(subset='authors')
           .select("authors")
           .rdd
           # join all strings in the list and then split to get each word
           .map(lambda x: " ".join(x).split(";")) 
           #.map(lambda x: x.split(";")) 
           .flatMap(lambda x: x)
           # make a tuple for each word so later it can be grouped by to get its frequency count
           .map(lambda x: (x, 1))
           .reduceByKey(lambda a,b: a+b)
           
           #collectAsMap()
          )

In [190]:
spark.createDataFrame(counter).toDF("authors", "count").orderBy(F.col('count').desc()).show()

+-------------------+-----+
|            authors|count|
+-------------------+-----+
|         Anonymous,| 1621|
|              O039,| 1232|
|              D039,|  918|
|               2020|  832|
|          Wang, Wei|  458|
|  Mahase, Elisabeth|  415|
|    Yuen, Kwok-Yung|  371|
|         Zhang, Wei|  359|
|            Li, Yan|  343|
|              O039,|  336|
|  Wiwanitkit, Viroj|  314|
|  Iacobucci, Gareth|  309|
| Drosten, Christian|  296|
|            Li, Wei|  275|
|         Wang, Jing|  264|
|        Rimmer, Abi|  258|
|          Liu, Yang|  249|
|           Liu, Wei|  245|
|         Wang, Ying|  239|
|             Li, Li|  237|
+-------------------+-----+
only showing top 20 rows



In [191]:
data.filter(data.authors.contains("Li, Li;"))[['authors']].count()

239

In [192]:
data.filter(data.authors.contains("Wang, Ying;"))[['authors']].count()

258

In [193]:
data.filter(data.authors.contains("2020"))[['authors']].show()

+--------------------+
|             authors|
+--------------------+
|          2020-02-20|
|          2020-05-15|
|          2020-06-10|
|          2020-06-19|
|          2020-06-30|
|          2020-07-21|
|          2020-09-10|
|          2020-09-24|
| 2020: e1-e3. doi...|
|          2020-11-13|
|          2020-04-06|
|          2020-06-09|
|          2020-12-23|
|              2020."|
|          2020-09-25|
|          2020-07-03|
|          2020-10-02|
|          2020-07-01|
|          2020-09-22|
|          2020-07-01|
+--------------------+
only showing top 20 rows



In [None]:
#### create edgelist form authors 
# ['Source','Target','cord_id','year','journal']


In [220]:
def split_authors(df_authors):
    split_authors_lambda = lambda x: x.split(';')
    try:
        if df_authors== None:
            print('here')
            return None
        res = split_authors_lambda(df_authors)
        return res
    except:
        return None
    #df_authors.apply(lambda x: x.split(';'))

In [221]:
split_authors_udf = F.udf(split_authors, ArrayType(StringType(),False))


In [222]:
data.na.drop(subset='authors').withColumn("authors_split", split_authors_udf(data['authors']))[
    ['authors','authors_split']].show()

+--------------------+--------------------+
|             authors|       authors_split|
+--------------------+--------------------+
|Madani, Tariq A; ...|[Madani, Tariq A,...|
|Vliet, Albert van...|[Vliet, Albert va...|
|     Crouch, Erika C|   [Crouch, Erika C]|
|Fagan, Karen A; M...|[Fagan, Karen A, ...|
|Domachowske, Jose...|[Domachowske, Jos...|
|Pasternak, Alexan...|[Pasternak, Alexa...|
|Alvarez, Gonzalo;...|[Alvarez, Gonzalo...|
|Ball, Jonathan; V...|[Ball, Jonathan, ...|
|Slebos, Dirk-Jan;...|[Slebos, Dirk-Jan...|
|Tsui, Fu-Chiang; ...|[Tsui, Fu-Chiang,...|
|Ivanov, Ivaylo P....|[Ivanov, Ivaylo P...|
|Shi, Stephanie T....|[Shi, Stephanie T...|
|Pridgeon, Julia W...|[Pridgeon, Julia ...|
|Ploubidou, Aspasi...|[Ploubidou, Aspas...|
|       Barry, John M|     [Barry, John M]|
|Shieh, Biehuoy; L...|[Shieh, Biehuoy, ...|
|Verheij, Joanne; ...|[Verheij, Joanne,...|
|Porco, Travis C; ...|[Porco, Travis C,...|
|Kremer, Ted M; Ri...|[Kremer, Ted M,  ...|
|Bratlie, Marit S;...|[Bratlie, 

In [232]:
data_dropna_split = data.na.drop(subset='authors').withColumn("authors_split", split_authors_udf(data['authors']))

In [233]:
data_dropna_split[['authors_split']].take(10)

[Row(authors_split=['Madani, Tariq A', ' Al-Ghamdi, Aisha A']),
 Row(authors_split=['Vliet, Albert van der', ' Eiserich, Jason P', ' Cross, Carroll E']),
 Row(authors_split=['Crouch, Erika C']),
 Row(authors_split=['Fagan, Karen A', ' McMurtry, Ivan F', ' Rodman, David M']),
 Row(authors_split=['Domachowske, Joseph B', ' Bonville, Cynthia A', ' Rosenberg, Helene F']),
 Row(authors_split=['Pasternak, Alexander O.', ' van den Born, Erwin', ' Spaan, Willy J.M.', ' Snijder, Eric J.']),
 Row(authors_split=['Alvarez, Gonzalo', ' Hébert, Paul C', ' Szick, Sharyn']),
 Row(authors_split=['Ball, Jonathan', ' Venn, Richard']),
 Row(authors_split=['Slebos, Dirk-Jan', ' Ryter, Stefan W', ' Choi, Augustine MK']),
 Row(authors_split=['Tsui, Fu-Chiang', ' Espino, Jeremy U.', ' Dato, Virginia M.', ' Gesteland, Per H.', ' Hutman, Judith', ' Wagner, Michael M.'])]

In [234]:
data_dropna_split.cache()

DataFrame[cord_uid: string, sha: string, source_x: string, title: string, doi: string, pmcid: string, pubmed_id: string, license: string, abstract: string, publish_time: string, authors: string, journal: string, mag_id: string, who_covidence_id: string, arxiv_id: string, pdf_json_files: string, pmc_json_files: string, url: string, s2_id: string, authors_split: array<string>]

In [261]:
import itertools

combinations_udf = F.udf(
    lambda x: list(itertools.combinations(x, 2)),
    "array<struct<item1:string,item2:string>>"
)

edgelist = data_dropna_split.withColumn("edgelist", 
                                        F.explode(combinations_udf(F.col("authors_split")))) \
    .selectExpr("*") #.selectExpr('edgelist.*') 

edgelist[['edgelist.*','cord_uid','journal']].show()

+--------------------+--------------------+--------+----------------+
|               item1|               item2|cord_uid|         journal|
+--------------------+--------------------+--------+----------------+
|     Madani, Tariq A|  Al-Ghamdi, Aisha A|ug7v899j|  BMC Infect Dis|
|Vliet, Albert van...|   Eiserich, Jason P|02tnwd4m|      Respir Res|
|Vliet, Albert van...|    Cross, Carroll E|02tnwd4m|      Respir Res|
|   Eiserich, Jason P|    Cross, Carroll E|02tnwd4m|      Respir Res|
|      Fagan, Karen A|    McMurtry, Ivan F|2b73a28n|      Respir Res|
|      Fagan, Karen A|     Rodman, David M|2b73a28n|      Respir Res|
|    McMurtry, Ivan F|     Rodman, David M|2b73a28n|      Respir Res|
|Domachowske, Jose...| Bonville, Cynthia A|9785vg6d|      Respir Res|
|Domachowske, Jose...| Rosenberg, Helene F|9785vg6d|      Respir Res|
| Bonville, Cynthia A| Rosenberg, Helene F|9785vg6d|      Respir Res|
|Pasternak, Alexan...| van den Born, Erwin|zjufx4fo|The EMBO Journal|
|Pasternak, Alexan..

In [247]:
edgelist[['authors_split','edgelist','cord_uid']].take(10)

[Row(authors_split=['Madani, Tariq A', ' Al-Ghamdi, Aisha A'], edgelist=Row(item1='Madani, Tariq A', item2=' Al-Ghamdi, Aisha A'), cord_uid='ug7v899j'),
 Row(authors_split=['Vliet, Albert van der', ' Eiserich, Jason P', ' Cross, Carroll E'], edgelist=Row(item1='Vliet, Albert van der', item2=' Eiserich, Jason P'), cord_uid='02tnwd4m'),
 Row(authors_split=['Vliet, Albert van der', ' Eiserich, Jason P', ' Cross, Carroll E'], edgelist=Row(item1='Vliet, Albert van der', item2=' Cross, Carroll E'), cord_uid='02tnwd4m'),
 Row(authors_split=['Vliet, Albert van der', ' Eiserich, Jason P', ' Cross, Carroll E'], edgelist=Row(item1=' Eiserich, Jason P', item2=' Cross, Carroll E'), cord_uid='02tnwd4m'),
 Row(authors_split=['Fagan, Karen A', ' McMurtry, Ivan F', ' Rodman, David M'], edgelist=Row(item1='Fagan, Karen A', item2=' McMurtry, Ivan F'), cord_uid='2b73a28n'),
 Row(authors_split=['Fagan, Karen A', ' McMurtry, Ivan F', ' Rodman, David M'], edgelist=Row(item1='Fagan, Karen A', item2=' Rodman, 

In [172]:
sc.stop()

In [173]:
spark.stop()