# ***Exercise 57b - GraphFrames***

Input:

- The textual file vertexes.csv
    - It contains the vertexes of a graph
    

- Each vertex is characterized by
    - id (string): user identifier
    - name (string): user name
    - age (integer): user age

- The textual file edges.csv
    - It contains the edges of a graph
    

- Each edge is characterized by
    - src (string): source vertex
    - dst (string): destination vertex
    - linktype (string): “follow” or “friend”

Output:

- For each user, count the number of “neighbors” whose age is less than 35
    - User X is a neighbor of User Y if there is a link from User X to User Y
    
 
- For each user with at least one neighbor whose age is less than 35, store in the output folder his/her id and the number of such neighbors


- Use the CSV format to store the result

In [6]:
from graphframes import GraphFrame
from pyspark.sql.functions import sum
from graphframes.lib import AggregateMessages
from pyspark.sql.types import IntegerType

# Paths of the two input CSV files (graph vertexes and graph edges)
inputPathVertexes = "/data/students/bigdata-01QYD/ex_data/Ex57b/data/vertexes.csv"
inputPathEdges = "/data/students/bigdata-01QYD/ex_data/Ex57b/data/edges.csv"
# Folder in which the final result is stored in CSV format
outputPath = "resOut_ex57b/"

In [3]:
# Load vertexes.csv as a DataFrame: the first line of the file is the header
# and the column types are inferred from the data
vDF = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(inputPathVertexes)
)

# Inspect the inferred schema and the loaded rows
vDF.printSchema()
vDF.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+-----+---+
| id| name|age|
+---+-----+---+
| u1|Alice| 34|
| u2|  Bob| 36|
| u3| John| 30|
| u4|David| 29|
| u5| Paul| 32|
| u6| Adel| 36|
| u7| Eddy| 60|
+---+-----+---+



In [4]:
# Load edges.csv as a DataFrame: the first line of the file is the header
# and the column types are inferred from the data
eDF = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(inputPathEdges)
)

# Inspect the inferred schema and the loaded rows
eDF.printSchema()
eDF.show()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+---+--------+
|src|dst|linktype|
+---+---+--------+
| u1| u2|  friend|
| u1| u4|  friend|
| u1| u5|  friend|
| u2| u1|  friend|
| u2| u3|  follow|
| u3| u2|  follow|
| u4| u1|  friend|
| u4| u5|  friend|
| u5| u1|  friend|
| u5| u4|  friend|
| u5| u6|  follow|
| u6| u3|  follow|
+---+---+--------+



In [7]:
# Add to each node/vertex a new "feature": AgeLess35
def funcAgeLess35(age):
    """Return 1 if ``age`` is strictly less than 35, otherwise 0.

    The function is registered as a Spark SQL UDF and applied to the ``age``
    column, which is nullable. A missing age reaches the function as ``None``;
    it is mapped to 0 (the user is not counted as younger than 35) instead of
    letting the ``None < 35`` comparison raise a ``TypeError``.

    Parameters
    ----------
    age : int or None
        Age of the user, or ``None`` when the age is missing.

    Returns
    -------
    int
        1 when ``age`` is known and lower than 35, 0 in every other case.
    """
    if age is None:
        # Unknown age: do not count the user among those younger than 35
        return 0
    return 1 if age < 35 else 0

# Make the Python function callable from SQL expressions under the name
# "funcAgeLess35", declaring that it returns an integer
spark.udf.register(
    name="funcAgeLess35",
    f=funcAgeLess35,
    returnType=IntegerType(),
)

# Keep every original column and append the 0/1 flag as column AgeLess35
vDFnew = vDF.selectExpr(
    "*",
    "funcAgeLess35(age) as AgeLess35",
)

In [8]:
# Display the vertexes together with the new AgeLess35 column
vDFnew.show()

+---+-----+---+---------+
| id| name|age|AgeLess35|
+---+-----+---+---------+
| u1|Alice| 34|        1|
| u2|  Bob| 36|        0|
| u3| John| 30|        1|
| u4|David| 29|        1|
| u5| Paul| 32|        1|
| u6| Adel| 36|        0|
| u7| Eddy| 60|        0|
+---+-----+---+---------+



In [10]:
# Build the graph: vertexes carry the AgeLess35 flag, edges are the user links
g = GraphFrame(v=vDFnew, e=eDF)

In [11]:
# For each user, sum the values of AgeLess35 of his/her neighbors

# Message definition: the AgeLess35 value of the source vertex of an edge,
# to be sent to the destination vertex of that edge.
# N.B. User X is a neighbor of User Y if there is a link from X to Y, hence
# only incoming edges matter and no message is sent to the source vertexes
msgToDst = AggregateMessages.src['AgeLess35']

In [13]:
# Aggregate messages: every vertex that receives at least one message gets the
# sum of the AgeLess35 flags sent by the sources of its incoming edges, i.e.
# the number of its neighbors younger than 35.
# `sum` is pyspark.sql.functions.sum (imported at the top of the notebook),
# not the Python builtin.
# The aggregate column is aliased directly instead of renaming the
# auto-generated "sum(MSG)" column afterwards: withColumnRenamed silently does
# nothing when the generated name differs, which would leave the result
# without the numNeighborsLess35 column.
aggAgeLess35 = g.aggregateMessages(
    sum(AggregateMessages.msg).alias("numNeighborsLess35"),
    sendToSrc=None,
    sendToDst=msgToDst,
)

In [14]:
# Display the aggregated value (numNeighborsLess35) computed for each user
aggAgeLess35.show()

+---+------------------+
| id|numNeighborsLess35|
+---+------------------+
| u3|                 0|
| u4|                 2|
| u5|                 2|
| u1|                 2|
| u6|                 1|
| u2|                 2|
+---+------------------+



In [15]:
# Keep only the users with at least one neighbor younger than 35
selectedUsersDF = aggAgeLess35.where("numNeighborsLess35>0")

# Display the selected users
selectedUsersDF.show()

+---+------------------+
| id|numNeighborsLess35|
+---+------------------+
| u4|                 2|
| u5|                 2|
| u1|                 2|
| u6|                 1|
| u2|                 2|
+---+------------------+



In [16]:
# Store the selected users in the output folder in CSV format, with a header line
(
    selectedUsersDF.write
    .format("csv")
    .option("header", True)
    .save(outputPath)
)