# Graph Frames

In [0]:
import uuid
from functools import reduce
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, ArrayType, StringType, LongType, StructField, IntegerType
from typing import List
from pyspark.sql.functions import udf

# Here Path indicates input file path, and delta_dir points to file
path = "dbfs:/FileStore/data/split_data/"
delta_dir = "dbfs:/delta/tables/"

# There should be 18 files each with 300 k records. This would change if you change split value.
file_count = len(dbutils.fs.ls(path))
assert file_count == 18, "Data not found. You may want to check the path or run the notebook from start again. If you updated the split value, ignore this assertion error"

In [0]:
import graphframes as gf

spark.conf.set("spark.sql.shuffle.partitions", sc.defaultParallelism)

help(gf)

Help on package graphframes:

NAME
    graphframes

PACKAGE CONTENTS
    examples (package)
    graphframe
    lib (package)
    tests

CLASSES
    builtins.object
        graphframes.graphframe.GraphFrame
    
    class GraphFrame(builtins.object)
     |  GraphFrame(v, e)
     |  
     |  Represents a graph with vertices and edges stored as DataFrames.
     |  
     |  :param v:  :class:`DataFrame` holding vertex information.
     |             Must contain a column named "id" that stores unique
     |             vertex IDs.
     |  :param e:  :class:`DataFrame` holding edge information.
     |             Must contain two columns "src" and "dst" storing source
     |             vertex IDs and destination vertex IDs of edges, respectively.
     |  
     |  >>> localVertices = [(1,"A"), (2,"B"), (3, "C")]
     |  >>> localEdges = [(1,2,"love"), (2,1,"hate"), (2,3,"follow")]
     |  >>> v = sqlContext.createDataFrame(localVertices, ["id", "name"])
     |  >>> e = sqlContext.createData

In [0]:
factTable = spark.read.format('delta').load(f'{delta_dir}FactTable')
authors = spark.read.format('delta').load(f'{delta_dir}Author')
publications = spark.read.format('delta').load(f'{delta_dir}Publication')
# venues = spark.read.format('delta').load(f'{delta_dir}Venue')
domains = spark.read.format('delta').load(f'{delta_dir}FieldOfStudy')
institutions = spark.read.format('delta').load(f'{delta_dir}Organization')

In [0]:
factTable.printSchema()

root
 |-- Publication_ID: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Venue_ID: string (nullable = true)
 |-- year: long (nullable = true)
 |-- Lang_ID: long (nullable = true)
 |-- FOS_ID: long (nullable = true)
 |-- AuthorRank: integer (nullable = true)
 |-- Author_ID: string (nullable = true)
 |-- Type_ID: long (nullable = true)



In [0]:
co_authorship_v = (factTable
                       .select(F.col("Author_ID").alias("id"))
                       .union(factTable.select(F.col("Author_ID").alias("id")))
                   .distinct()
                  )
co_authorship_v.show(5)

+--------------------+
|                  id|
+--------------------+
|53f43c4adabfaefed...|
|53f44859dabfaeee2...|
|53f461dfdabfaedf4...|
|53f449cbdabfaedd7...|
|53f4a18fdabfaedce...|
+--------------------+
only showing top 5 rows



In [0]:
co_authorship_e = (factTable
                       .select(F.col("Author_ID").alias("src"), F.col("Author_ID").alias("dst"), "*")
                   .drop("Author_ID")
                  )
co_authorship_e.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+----+-------+------+----------+-------+
|                 src|                 dst|      Publication_ID|            keywords|            Venue_ID|year|Lang_ID|FOS_ID|AuthorRank|Type_ID|
+--------------------+--------------------+--------------------+--------------------+--------------------+----+-------+------+----------+-------+
|53f449e5dabfaeb22...|53f449e5dabfaeb22...|53e997ccb7602d970...|[Positron emissio...|                null|2013|   null|  null|         3|   null|
|53f47d87dabfaee43...|53f47d87dabfaee43...|53e997d1b7602d970...|[programming lang...|555037907cea80f95...|2011|   null|  null|         5|   null|
|53f42941dabfaec22...|53f42941dabfaec22...|53e997d7b7602d970...|[circuit optimisa...|53a72b9420f7420be...|2004|   null|  null|         1|   null|
|53f43536dabfaee43...|53f43536dabfaee43...|53e997ddb7602d970...|[room square, spe...|555036df7cea80f95...|1993|   null|  nul

In [0]:
co_authorship_graph = gf.GraphFrame(co_authorship_v, co_authorship_e)

display(co_authorship_graph)

GraphFrame(v:[id: string], e:[src: string, dst: string ... 8 more fields])

In [0]:
#display(co_authorship_graph.vertices) # same as our created dataframe
#display(co_authorship_graph.edges) # same as our created edges
#display(co_authorship_graph.degrees) # total edges connected to a vertice
#display(co_authorship_graph.inDegrees) # incoming edges
#display(co_authorship_graph.outDegrees) # outgoing edges
display(co_authorship_graph.triplets) # source / edge / destination combined


src,edge,dst
List(53f43c4adabfaefedbafcab8),"List(53f43c4adabfaefedbafcab8, 53f43c4adabfaefedbafcab8, 53e99860b7602d970209d2d3, List(edge detection, word recognition, convex hull, image recognition, image segmentation, histograms, smoothing, feature extraction, handwriting recognition), 53a72a4920f7420be8bfa51b, 1995, null, null, 3, null)",List(53f43c4adabfaefedbafcab8)
List(53f44859dabfaeee229ff54a),"List(53f44859dabfaeee229ff54a, 53f44859dabfaeee229ff54a, 53e9988cb7602d97020c4dd0, List(quantum physics, channel capacity, quantum information, information theory, relative entropy, game theory, quantum channel), 53e180d020f7dfbc07e8b56e, 2008, null, null, 2, null)",List(53f44859dabfaeee229ff54a)
List(53f461dfdabfaedf436341be),"List(53f461dfdabfaedf436341be, 53f461dfdabfaedf436341be, 53e998c0b7602d97020fb00f, List(cognition, selective attention), 555036b87cea80f95414c3b9, 1993, null, null, 1, null)",List(53f461dfdabfaedf436341be)
List(53f449cbdabfaedd74dfc5c1),"List(53f449cbdabfaedd74dfc5c1, 53f449cbdabfaedd74dfc5c1, 53e998dbb7602d97021136b2, List(Ant clustering, Locally weighted regression, Intrusion detection system), 53a723a920f74fc8a8f32fac, 2008, null, null, 2, null)",List(53f449cbdabfaedd74dfc5c1)
List(53f4a18fdabfaedce562f185),"List(53f4a18fdabfaedce562f185, 53f4a18fdabfaedce562f185, 53e998e8b7602d9702124099, List(software project, hypermedia community, software development environment, software document, scm system, hypermedia technology, versioned hypermedia service, hypermedia service, commercial software configuration management, hypermedia-based software development environment, versioned hypermedia framework, version control, software configuration management, configuration management, software engineering, management system), 53907ba420f770854f5efef3, 2003, null, null, 2, null)",List(53f4a18fdabfaedce562f185)
List(53f434d3dabfaee2a1cd7f5e),"List(53f434d3dabfaee2a1cd7f5e, 53f434d3dabfaee2a1cd7f5e, 53e998f6b7602d970213420c, List(logic programming, parallel algorithms, resource allocation, shared memory systems, storage management, VPIM, concurrent logic programming system, contention, fragmentation, garbage collection, heap allocation, load distribution, load-balancing, parallel copying, shared-memory multiprocessor, symbolic languages), 555036e37cea80f954163f30, 1993, null, null, 2, null)",List(53f434d3dabfaee2a1cd7f5e)
List(548867d8dabfaed7b5fa381a),"List(548867d8dabfaed7b5fa381a, 548867d8dabfaed7b5fa381a, 53e998fdb7602d970213b2d7, List(wireless sensor network, sensor network), 555036b67cea80f95414b7c3, 2003, null, null, 2, null)",List(548867d8dabfaed7b5fa381a)
List(53f456c1dabfaee2a1d757b0),"List(53f456c1dabfaee2a1d757b0, 53f456c1dabfaee2a1d757b0, 53e99905b7602d9702140fe7, List(ad auctions, tac aa game, linear relaxation, ad auction, multiple choice knapsack problem, penalized multiple choice knapsack, penalized knapsack problem, greedy heuristic, greedy algorithm, globalpmckp optimally, knapsack-based approach), 53a72a8420f7420be8c03ee3, 2010, null, null, 2, null)",List(53f456c1dabfaee2a1d757b0)
List(53f438cfdabfaedd74db7556),"List(53f438cfdabfaedd74db7556, 53f438cfdabfaedd74db7556, 53e9990db7602d970214bbc0, List(complete graph, topological dimension, essential hypothesis, interesting by-product, finite presentability, finitely presented mv-algebras, finite automorphism group, theoretic isomorphism invariant, maximal spectral space), 555036e47cea80f954164b04, 2010, null, null, 2, null)",List(53f438cfdabfaedd74db7556)
List(53f42ca7dabfaedf4350bf16),"List(53f42ca7dabfaedf4350bf16, 53f42ca7dabfaedf4350bf16, 53e9990db7602d970214cc0c, List(computer simulation, torque, sagittal plane, manufacturing industry, centrifugal force, coriolis force, robot arm, robot kinematics, nuclear power), 555037377cea80f95417a149, 2007, null, null, 3, null)",List(53f42ca7dabfaedf4350bf16)
