# MAG analysis notebook

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from MAG import MicrosoftAcademicGraph
import os
from sparkhpc import sparkjob
import findspark

# set environment variables
os.environ["SPARK_LOCAL_DIRS"] = "/home/laal/MAG/TMP"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.242.b08-0.el7_7.x86_64"
os.environ['SPARK_HOME'] = "/home/laal/MAG/spark-3.0.2-bin-hadoop2.7"

In [2]:
sj = sparkjob.sparkjob(jobid=36483, memory_per_executor=5500)

In [3]:
job = sj.start_spark()

['NAME STATE JOBID', 'hebbian_weights_submit PENDING 36481', 'roberta_base RUNNING 36230', 'simple RUNNING 35764', 'sparkcluster RUNNING 36483', 'hebbian_weights_submit RUNNING 36236', 'hebbian_weights_submit RUNNING 36235', 'jupyter RUNNING 36477', 'bikenwgrowth RUNNING 36387_46', 'bikenwgrowth RUNNING 36387_47', 'bikenwgrowth RUNNING 36387_48', 'bikenwgrowth RUNNING 36387_49', 'bikenwgrowth RUNNING 36387_50', 'bikenwgrowth RUNNING 36387_51', 'bikenwgrowth RUNNING 36387_52', 'bikenwgrowth RUNNING 36387_53', 'bikenwgrowth RUNNING 36387_54', 'bikenwgrowth RUNNING 36387_55', 'bikenwgrowth RUNNING 36387_57', 'bikenwgrowth RUNNING 36387_58', 'bikenwgrowth RUNNING 36387_18', 'bikenwgrowth RUNNING 36387_19', 'bikenwgrowth RUNNING 36387_22', 'bikenwgrowth RUNNING 36387_23', 'bikenwgrowth RUNNING 36387_24', 'bikenwgrowth RUNNING 36387_25', 'bikenwgrowth RUNNING 36387_26', 'bikenwgrowth RUNNING 36387_28', 'bikenwgrowth RUNNING 36387_29', 'bikenwgrowth RUNNING 36387_30', 'bikenwgrowth RUNNING 36

In [4]:
spark = SparkSession.builder.config(conf=job.getConf()).getOrCreate()

In [5]:
mag = MicrosoftAcademicGraph(spark=spark, data_folderpath="/home/laal/MAG/DATA/")

In [6]:
def author_to_field_of_study(mag):
    author_affiliations = mag.getDataframe('PaperAuthorAffiliations')
    authors = mag.getDataframe('WosToMag')
    paper_root_field = mag.getDataframe('PaperRootField')

    query = """
    SELECT paa.AuthorId, 
           prf.AncestorId,
           COUNT(*) AS NumPapersInField
    FROM PaperAuthorAffiliations AS paa
    INNER JOIN WosToMag AS wtm ON paa.AuthorId = wtm.MAG 
    INNER JOIN PaperRootField AS prf ON paa.PaperId = prf.PaperId
    WHERE prf.fieldRank = 1
    GROUP BY paa.AuthorId, 
             prf.AncestorId
    ORDER BY COUNT(*) DESC
    LIMIT 1000
    """
    author_to_field = mag.query_sql(query)
    return author_to_field

In [7]:
author_to_field = author_to_field_of_study(mag)

In [8]:
sj

['NAME STATE JOBID', 'hebbian_weights_submit PENDING 36481', 'roberta_base RUNNING 36230', 'simple RUNNING 35764', 'sparkcluster RUNNING 36483', 'hebbian_weights_submit RUNNING 36236', 'hebbian_weights_submit RUNNING 36235', 'jupyter RUNNING 36477', 'bikenwgrowth RUNNING 36387_46', 'bikenwgrowth RUNNING 36387_47', 'bikenwgrowth RUNNING 36387_48', 'bikenwgrowth RUNNING 36387_49', 'bikenwgrowth RUNNING 36387_50', 'bikenwgrowth RUNNING 36387_51', 'bikenwgrowth RUNNING 36387_52', 'bikenwgrowth RUNNING 36387_53', 'bikenwgrowth RUNNING 36387_54', 'bikenwgrowth RUNNING 36387_55', 'bikenwgrowth RUNNING 36387_57', 'bikenwgrowth RUNNING 36387_58', 'bikenwgrowth RUNNING 36387_17', 'bikenwgrowth RUNNING 36387_18', 'bikenwgrowth RUNNING 36387_19', 'bikenwgrowth RUNNING 36387_22', 'bikenwgrowth RUNNING 36387_23', 'bikenwgrowth RUNNING 36387_24', 'bikenwgrowth RUNNING 36387_25', 'bikenwgrowth RUNNING 36387_26', 'bikenwgrowth RUNNING 36387_27', 'bikenwgrowth RUNNING 36387_28', 'bikenwgrowth RUNNING 36

In [9]:
job

In [10]:
spark

In [11]:
author_to_field.show(20)

+----------+----------+----------------+
|  AuthorId|AncestorId|NumPapersInField|
+----------+----------+----------------+
|2632342345|  71924100|               1|
|2120530206|  71924100|             468|
|2098520860| 127413603|               1|
|2124939244|  86803240|              57|
|2114367723| 162324750|              52|
|2063428679|  15744967|              11|
|2665439808| 121332964|               1|
|1975701172|  71924100|             130|
|1999107301|  71924100|              17|
|2147188725|  41008148|               4|
|1927734089|  86803240|               7|
|2764227198| 121332964|               5|
|2501383496| 138885662|               1|
|2144881208|  71924100|             306|
|2372880476| 192562407|               2|
|2098297569| 127413603|              10|
|2751618052| 121332964|             133|
|1941268798| 185592680|             109|
|2706617920| 127413603|               1|
|2715597251| 127413603|               1|
+----------+----------+----------------+
only showing top