 # Configuration

In [13]:
# automatic download of graphframes, if requried
# see https://spark-packages.org/package/graphframes/graphframes
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 pyspark-shell'

In [14]:
import pyspark
from pyspark.sql import SparkSession 
spark = SparkSession.builder \
            .master("local[4]") \
            .appName("facebook") \
            .getOrCreate() 

In [15]:
# make sure to create this dir first
spark.sparkContext.setCheckpointDir('/home/sarah/spark/_checkpoints')

In [16]:
from graphframes import GraphFrame

# Read Graph from File
## Create Edges DF 
With columns from source to destination containing the IDs of source and destination vertices + other columns for edge attributes (optional)

In [17]:
edgesDF = spark.read.csv("/home/sarah/spark/Documents/facebook/0.edges", header=False, sep=' ')
edgesDF = edgesDF.withColumnRenamed("_c0", "src")
edgesDF = edgesDF.withColumnRenamed("_c1", "dst")
edgesDF.printSchema()

rows = edgesDF.take(5)
for row in rows:
    print(row)

print(edgesDF.count())

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)

Row(src='236', dst='186')
Row(src='122', dst='285')
Row(src='24', dst='346')
Row(src='271', dst='304')
Row(src='176', dst='9')
5038


## Create Vertices DF

In [18]:
verticesDF = edgesDF.select('src').union(edgesDF.select('dst')).distinct()
verticesDF = verticesDF.withColumnRenamed('src','id')

rows = verticesDF.take(5)
for row in rows:
    print(row)


verticesDF.printSchema()
print(verticesDF.count())

Row(id='296')
Row(id='125')
Row(id='7')
Row(id='124')
Row(id='51')
root
 |-- id: string (nullable = true)



333


# Create Graph (object of 2 data frames - verticesDF, edgesDF)

Graph Frames can be queried via

- graph patterns (motif)
- extraction of subgraphs via edge and/or vertex filters and/or motifs

In [19]:
gf = GraphFrame(verticesDF, edgesDF)
print("graph with",gf.vertices.count(),"vertices and",gf.edges.count(),"edges")

graph with 333 vertices and 5038 edges


# Disconnected Social Groups (Connected Components)

1. Find all distinct connected components
2. If there are more than one we 

In [20]:
cc = gf.connectedComponents()
cc = cc.select('component').distinct()
cc.show()

23/07/10 21:47:02 WARN CacheManager: Asked to cache already cached data.
23/07/10 21:47:03 WARN CacheManager: Asked to cache already cached data.
23/07/10 21:47:04 WARN CacheManager: Asked to cache already cached data.        
23/07/10 21:47:05 WARN CacheManager: Asked to cache already cached data.
23/07/10 21:47:05 WARN CacheManager: Asked to cache already cached data.
23/07/10 21:47:06 WARN CacheManager: Asked to cache already cached data.        
23/07/10 21:47:07 WARN CacheManager: Asked to cache already cached data.        
23/07/10 21:47:08 WARN CacheManager: Asked to cache already cached data.        
23/07/10 21:47:08 WARN CacheManager: Asked to cache already cached data.
                                                                                

+------------+
|   component|
+------------+
|           0|
|678604832769|
|128849018881|
| 60129542146|
|377957122048|
+------------+



## Closed relationships or connections between three individuals.

The 'id' column represents the vertex ID in the graph.
The 'count' column represents the number of triangles that the corresponding vertex is involved in.

In [21]:
triangle_count = gf.triangleCount()
triangle_count.show() # Display the triangle count DataFrame

                                                                                

+-----+---+
|count| id|
+-----+---+
|   14|296|
|    1|125|
|   63|  7|
|    0|124|
|   12| 51|
|    1|307|
|  272|169|
|  166|334|
|    0|205|
|  306|272|
|  162|232|
|    9| 54|
|    0|282|
|    0|234|
|    0|155|
|    5|317|
|   33|132|
|    0|154|
|  637|200|
|  124|101|
+-----+---+
only showing top 20 rows



# Person with the Most Direct Friends (Degree)

In [22]:
gf.inDegrees.sort("inDegree", ascending=False).show()

+---+--------+
| id|inDegree|
+---+--------+
| 56|      77|
| 67|      75|
|271|      72|
|322|      71|
| 25|      68|
| 26|      67|
|252|      64|
| 21|      64|
|277|      64|
|122|      62|
|119|      61|
|239|      58|
|200|      56|
|  9|      56|
|203|      56|
|315|      55|
|304|      54|
| 98|      48|
|188|      47|
|285|      46|
+---+--------+
only showing top 20 rows



# Person with the Most Indirect Friends

In [23]:
# Perform breadth-first search (BFS) traversal to find indirect friends for each vertex
bfs_result = gf.bfs(
    fromExpr="id = 1",
    toExpr="",
    maxPathLength=2
)

# Group by vertices and count the number of indirect friends
indirect_friends_count = bfs_result.groupBy("to").count()

# Find the person with the most indirect friends
person_with_most_indirect_friends = indirect_friends_count.orderBy(indirect_friends_count["count"].desc()).first()

# Display the person with the most indirect friends
print("Person with the most indirect friends:", person_with_most_indirect_friends["to"])

ParseException: 
mismatched input '<EOF>' expecting {'(', 'ADD', 'AFTER', 'ALL', 'ALTER', 'ANALYZE', 'AND', 'ANTI', 'ANY', 'ARCHIVE', 'ARRAY', 'AS', 'ASC', 'AT', 'AUTHORIZATION', 'BETWEEN', 'BOTH', 'BUCKET', 'BUCKETS', 'BY', 'CACHE', 'CASCADE', 'CASE', 'CAST', 'CHANGE', 'CHECK', 'CLEAR', 'CLUSTER', 'CLUSTERED', 'CODEGEN', 'COLLATE', 'COLLECTION', 'COLUMN', 'COLUMNS', 'COMMENT', 'COMMIT', 'COMPACT', 'COMPACTIONS', 'COMPUTE', 'CONCATENATE', 'CONSTRAINT', 'COST', 'CREATE', 'CROSS', 'CUBE', 'CURRENT', 'CURRENT_DATE', 'CURRENT_TIME', 'CURRENT_TIMESTAMP', 'CURRENT_USER', 'DAY', 'DATA', 'DATABASE', DATABASES, 'DBPROPERTIES', 'DEFINED', 'DELETE', 'DELIMITED', 'DESC', 'DESCRIBE', 'DFS', 'DIRECTORIES', 'DIRECTORY', 'DISTINCT', 'DISTRIBUTE', 'DIV', 'DROP', 'ELSE', 'END', 'ESCAPE', 'ESCAPED', 'EXCEPT', 'EXCHANGE', 'EXISTS', 'EXPLAIN', 'EXPORT', 'EXTENDED', 'EXTERNAL', 'EXTRACT', 'FALSE', 'FETCH', 'FIELDS', 'FILTER', 'FILEFORMAT', 'FIRST', 'FOLLOWING', 'FOR', 'FOREIGN', 'FORMAT', 'FORMATTED', 'FROM', 'FULL', 'FUNCTION', 'FUNCTIONS', 'GLOBAL', 'GRANT', 'GROUP', 'GROUPING', 'HAVING', 'HOUR', 'IF', 'IGNORE', 'IMPORT', 'IN', 'INDEX', 'INDEXES', 'INNER', 'INPATH', 'INPUTFORMAT', 'INSERT', 'INTERSECT', 'INTERVAL', 'INTO', 'IS', 'ITEMS', 'JOIN', 'KEYS', 'LAST', 'LATERAL', 'LAZY', 'LEADING', 'LEFT', 'LIKE', 'LIMIT', 'LINES', 'LIST', 'LOAD', 'LOCAL', 'LOCATION', 'LOCK', 'LOCKS', 'LOGICAL', 'MACRO', 'MAP', 'MATCHED', 'MERGE', 'MINUTE', 'MONTH', 'MSCK', 'NAMESPACE', 'NAMESPACES', 'NATURAL', 'NO', NOT, 'NULL', 'NULLS', 'OF', 'ON', 'ONLY', 'OPTION', 'OPTIONS', 'OR', 'ORDER', 'OUT', 'OUTER', 'OUTPUTFORMAT', 'OVER', 'OVERLAPS', 'OVERLAY', 'OVERWRITE', 'PARTITION', 'PARTITIONED', 'PARTITIONS', 'PERCENT', 'PIVOT', 'PLACING', 'POSITION', 'PRECEDING', 'PRIMARY', 'PRINCIPALS', 'PROPERTIES', 'PURGE', 'QUERY', 'RANGE', 'RECORDREADER', 'RECORDWRITER', 'RECOVER', 'REDUCE', 'REFERENCES', 'REFRESH', 'RENAME', 'REPAIR', 'REPLACE', 'RESET', 'RESPECT', 'RESTRICT', 'REVOKE', 'RIGHT', RLIKE, 'ROLE', 'ROLES', 'ROLLBACK', 'ROLLUP', 'ROW', 'ROWS', 'SECOND', 'SCHEMA', 'SELECT', 'SEMI', 'SEPARATED', 'SERDE', 'SERDEPROPERTIES', 'SESSION_USER', 'SET', 'MINUS', 'SETS', 'SHOW', 'SKEWED', 'SOME', 'SORT', 'SORTED', 'START', 'STATISTICS', 'STORED', 'STRATIFY', 'STRUCT', 'SUBSTR', 'SUBSTRING', 'SYNC', 'TABLE', 'TABLES', 'TABLESAMPLE', 'TBLPROPERTIES', TEMPORARY, 'TERMINATED', 'THEN', 'TIME', 'TO', 'TOUCH', 'TRAILING', 'TRANSACTION', 'TRANSACTIONS', 'TRANSFORM', 'TRIM', 'TRUE', 'TRUNCATE', 'TRY_CAST', 'TYPE', 'UNARCHIVE', 'UNBOUNDED', 'UNCACHE', 'UNION', 'UNIQUE', 'UNKNOWN', 'UNLOCK', 'UNSET', 'UPDATE', 'USE', 'USER', 'USING', 'VALUES', 'VIEW', 'VIEWS', 'WHEN', 'WHERE', 'WINDOW', 'WITH', 'YEAR', 'ZONE', '+', '-', '*', '~', STRING, BIGINT_LITERAL, SMALLINT_LITERAL, TINYINT_LITERAL, INTEGER_VALUE, EXPONENT_VALUE, DECIMAL_VALUE, FLOAT_LITERAL, DOUBLE_LITERAL, BIGDECIMAL_LITERAL, IDENTIFIER, BACKQUOTED_IDENTIFIER}(line 1, pos 0)

== SQL ==

^^^


# Most Important Persons (PageRank)

In [None]:
pr = gf.pageRank(resetProbability=0.15, tol=0.01).vertices
pr.sort("pagerank", ascending=False).show()

23/07/08 13:08:46 WARN BlockManager: Block rdd_667_0 already exists on this machine; not re-adding it


+---+------------------+
| id|          pagerank|
+---+------------------+
| 25|3.2143889388762528|
| 56| 3.202448130965981|
|322| 3.158174546526567|
|119| 3.125720572775718|
| 67| 3.096315776598929|
|271| 2.951859801155131|
|277|2.9137075632197584|
| 21| 2.779733501281871|
| 26| 2.741748133688712|
|252|2.6524457151983247|
|122| 2.650562461337814|
|312| 2.519876920108743|
|239|2.5008420829290903|
|315|2.4405387176853828|
|203|2.3880828720498006|
|  9| 2.375254010034537|
|200| 2.340494903196381|
|304| 2.313954622142133|
| 41| 2.311232654696034|
|115|2.1752748697865454|
+---+------------------+
only showing top 20 rows



# Groups of 4 Friends (Motif)

In [None]:
# nodes a,b,c,d
friendPattern = '(a)-[ab]->(b); (a)-[ac]->(c); (a)-[ad]->(d); (b)-[bc]->(c); (b)-[bd]->(d); (c)-[cd]->(d)'

In [None]:
fourFriends = gf.find(friendPattern)

In [None]:
fourFriends = fourFriends.select('a','b','c','d')
fourFriends.show()

+-----+-----+-----+-----+
|    a|    b|    c|    d|
+-----+-----+-----+-----+
|{296}| {68}|{227}|{263}|
|{296}| {68}|{227}|{175}|
|{296}| {68}|{227}| {99}|
|{296}| {68}|{263}|{227}|
|{296}| {68}|{263}|{175}|
|{296}| {68}|{263}| {99}|
|{296}| {68}|{175}|{227}|
|{296}| {68}|{175}|{263}|
|{296}| {68}|{175}| {99}|
|{296}| {68}| {99}|{227}|
|{296}| {68}| {99}|{263}|
|{296}| {68}| {99}|{175}|
|{296}|{227}| {68}|{263}|
|{296}|{227}| {68}|{175}|
|{296}|{227}| {68}| {99}|
|{296}|{227}|{263}| {68}|
|{296}|{227}|{263}|{175}|
|{296}|{227}|{263}| {99}|
|{296}|{227}|{263}|{102}|
|{296}|{227}|{175}| {68}|
+-----+-----+-----+-----+
only showing top 20 rows



In [None]:
# Find all friends involved in a triangle
triangle_friends = gf.find("(a)-[]->(b); (b)-[]->(c); (c)-[]->(a)")

# Show the DataFrame of friends involved in triangles
threeFriends = triangle_friends.select('a','b','c')
threeFriends.show()

+-----+-----+-----+
|    a|    b|    c|
+-----+-----+-----+
|{296}| {68}| {99}|
|{296}| {68}|{263}|
|{296}| {68}|{227}|
|{296}| {68}|{175}|
|{296}|{227}|{175}|
|{296}|{227}| {99}|
|{296}|{227}|{263}|
|{296}|{227}|{102}|
|{296}|{227}| {68}|
|{296}|{263}| {68}|
|{296}|{263}|{227}|
|{296}|{263}| {99}|
|{296}|{263}|{102}|
|{296}|{263}|{175}|
|{296}|{175}|{263}|
|{296}|{175}|{227}|
|{296}|{175}|{102}|
|{296}|{175}| {68}|
|{296}|{175}| {99}|
|{296}| {99}|{175}|
+-----+-----+-----+
only showing top 20 rows



# A and B have common neighbor C
 The variables e and e2 represent specific edge labels or types. This pattern specifies that there should be two different types of edges connecting the vertices 'a', 'b', and 'c'.

In [None]:
# Find all friends involved in a triangle
common_neighbors = gf.find("(a)-[e]->(b); (b)-[e2]->(c)")

# Show the DataFrame of friends involved in triangles
threeFriends = common_neighbors.select('a','b','c')
threeFriends.show()

+-----+-----+-----+
|    a|    b|    c|
+-----+-----+-----+
|{296}| {68}|{296}|
|{296}| {68}| {99}|
|{296}| {68}|{263}|
|{296}| {68}|{143}|
|{296}| {68}|{227}|
|{296}| {68}|{225}|
|{296}| {68}|{175}|
|{296}| {68}|{177}|
|{296}|{227}|{131}|
|{296}|{227}|{175}|
|{296}|{227}| {19}|
|{296}|{227}| {99}|
|{296}|{227}|{278}|
|{296}|{227}|{263}|
|{296}|{227}|{225}|
|{296}|{227}|{143}|
|{296}|{227}|{296}|
|{296}|{227}|{102}|
|{296}|{227}| {86}|
|{296}|{227}| {23}|
+-----+-----+-----+
only showing top 20 rows



# Common neighbor
a and c have common neigbor v

In [None]:

# Create a DataFrame representing the graph
vertices = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], ['id'])
edges = spark.createDataFrame([(1, 2), (2, 3), (3, 4), (4, 5), (5, 1)], ['src', 'dst'])
graph = GraphFrame(vertices, edges)

# Find motif pattern (a)-[e]->(b); (b)-[e2]->(c)
motif_result = graph.find("(a)-[e]->(b); (b)-[e2]->(c)")
motif_result.show()

+---+------+---+------+---+
|  a|     e|  b|    e2|  c|
+---+------+---+------+---+
|{4}|{4, 5}|{5}|{5, 1}|{1}|
|{5}|{5, 1}|{1}|{1, 2}|{2}|
|{1}|{1, 2}|{2}|{2, 3}|{3}|
|{2}|{2, 3}|{3}|{3, 4}|{4}|
|{3}|{3, 4}|{4}|{4, 5}|{5}|
+---+------+---+------+---+



In [None]:
motif_result = graph.find("(a)-[]->(b); (b)-[]->(c)")
motif_result.show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|{4}|{5}|{1}|
|{5}|{1}|{2}|
|{1}|{2}|{3}|
|{2}|{3}|{4}|
|{3}|{4}|{5}|
+---+---+---+



# Clique
A clique motif represents a complete subgraph where every individual is connected to every other individual. It can help identify tightly-knit groups with strong connections.


In [None]:

vertices = spark.createDataFrame([(1,), (2,), (3,), (4,)], ['id'])
edges = spark.createDataFrame([(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)], ['src', 'dst'])
graph = GraphFrame(vertices, edges)

# Find clique motif (a)-[]->(b); (a)-[]->(c); (b)-[]->(c)
clique_motif = graph.find("(a)-[]->(b); (a)-[]->(c); (b)-[]->(c)")
clique_motif.show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|{1}|{2}|{3}|
|{2}|{3}|{4}|
|{1}|{3}|{4}|
|{1}|{2}|{4}|
+---+---+---+



# Star Motiv
A star motif represents a central individual connected to multiple other individuals. It can help identify influential or central figures within a social network. You can find star motifs using the following code


In [None]:
# Create a DataFrame representing the graph
vertices = spark.createDataFrame([(1,), (2,), (3,), (4,)], ['id'])
edges = spark.createDataFrame([(1, 2), (1, 3), (1, 4)], ['src', 'dst'])
graph = GraphFrame(vertices, edges)

# Find star motif (a)-[]->(x); (a)-[]->(y); (a)-[]->(z)
star_motif = graph.find("(a)-[]->(x); (a)-[]->(y); (a)-[]->(z)")
star_motif.show()

+---+---+---+---+
|  a|  x|  y|  z|
+---+---+---+---+
|{1}|{4}|{2}|{2}|
|{1}|{3}|{2}|{2}|
|{1}|{2}|{2}|{2}|
|{1}|{4}|{3}|{2}|
|{1}|{3}|{3}|{2}|
|{1}|{2}|{3}|{2}|
|{1}|{4}|{4}|{2}|
|{1}|{3}|{4}|{2}|
|{1}|{2}|{4}|{2}|
|{1}|{4}|{2}|{3}|
|{1}|{3}|{2}|{3}|
|{1}|{2}|{2}|{3}|
|{1}|{4}|{3}|{3}|
|{1}|{3}|{3}|{3}|
|{1}|{2}|{3}|{3}|
|{1}|{4}|{4}|{3}|
|{1}|{3}|{4}|{3}|
|{1}|{2}|{4}|{3}|
|{1}|{4}|{2}|{4}|
|{1}|{3}|{2}|{4}|
+---+---+---+---+
only showing top 20 rows

