In [1]:
# findspark.init() is called before the pyspark imports further down;
# presumably it locates the local Spark install and makes `pyspark`
# importable — keep this ordering (TODO confirm against the environment).
import findspark
findspark.init()

# Absolute path of the current working directory. Later cells use it to build
# file:// URIs for the data files and the GraphFrames jar.
import os
PROJECT_HOME = os.path.abspath(os.curdir)
print(PROJECT_HOME)

# NOTE(review): `pd` and `FPGrowth` are not referenced in the cells visible
# here — confirm whether they are still needed.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth
import pyspark.sql.functions as F

/home/noobcoder/0_Project/school/BigData/DemoNov29


In [2]:
# Host of the Spark standalone master; taken from the environment and
# falling back to localhost when SPARK_MASTER_HOST is not set.
SPARK_MASTER_HOST = os.environ.get('SPARK_MASTER_HOST', 'localhost')

# Build (or reuse) the session. Note that `sc` holds a SparkSession here.
# The GraphFrames jar is shipped via `spark.jars` from the project's
# third_party directory.
sc = (
    SparkSession.builder
    .master(f'spark://{SPARK_MASTER_HOST}:7077')
    .appName('Quiz04_6')
    .config('spark.executor.memory', '512m')
    .config('spark.jars', f'file://{PROJECT_HOME}/third_party/graphframes-0.7.0-spark2.4-s_2.11.jar')
    .getOrCreate()
)

# TASK 06: Social network analysis with GraphFrame

## a. READ DATA AND CREATE GRAPHFRAME

In [3]:
# Vertex table: read the comma-separated users file (no header row) and give
# the positional columns the names used by the rest of the notebook.
# The `id` column is the vertex key passed to GraphFrame later on.
users = (
    sc.read
    .csv(f'file://{PROJECT_HOME}/data/users.txt', sep=',', header=None)
    .withColumnRenamed('_c0', 'id')
    .withColumnRenamed('_c1', 'username')
    .withColumnRenamed('_c2', 'name')
)
users.show()

+---+-------------+---------------+
| id|     username|           name|
+---+-------------+---------------+
|  1|  BarackObama|   Barack Obama|
|  2|     ladygaga|Goddess of Love|
|  3|      jeresig|     John Resig|
|  4| justinbieber|  Justin Bieber|
|  6|matei_zaharia|  Matei Zaharia|
|  7|      odersky| Martin Odersky|
|  8|      anonsys|           null|
+---+-------------+---------------+



In [4]:
# Edge table: read the space-separated followers file (no header row) and
# rename its two positional columns to `src` / `dst`.
followEdge = (
    sc.read
    .csv(f'file://{PROJECT_HOME}/data/followers.txt', sep=' ', header=None)
    .withColumnRenamed('_c0', 'src')
    .withColumnRenamed('_c1', 'dst')
)

In [5]:
from graphframes import GraphFrame

In [6]:
# Assemble the graph: `users` supplies the vertices (keyed by `id`) and
# `followEdge` supplies the edges (`src` -> `dst`).
g = GraphFrame(v=users, e=followEdge)

## b. SOME BASIC ANALYSIS

### 1. In Degrees

In [7]:
# In-degree per vertex, aliased so the join condition can address its `id`.
in_degrees = g.inDegrees.alias('ind')

# Attach user details and list vertices by in-degree, highest first.
# Inner join: only vertices that appear in `g.inDegrees` are shown.
(
    in_degrees
    .join(users.alias('u'), F.col('ind.id') == F.col('u.id'))
    .orderBy(F.desc('inDegree'))
    .show()
)

+---+--------+---+-------------+---------------+
| id|inDegree| id|     username|           name|
+---+--------+---+-------------+---------------+
|  7|       2|  7|      odersky| Martin Odersky|
|  3|       2|  3|      jeresig|     John Resig|
|  1|       2|  1|  BarackObama|   Barack Obama|
|  2|       1|  2|     ladygaga|Goddess of Love|
|  6|       1|  6|matei_zaharia|  Matei Zaharia|
+---+--------+---+-------------+---------------+



### 2. Out Degrees

In [8]:
# Out-degree per vertex, aliased so the join condition can address its `id`.
out_degrees = g.outDegrees.alias('oud')

# Attach user details and list vertices by out-degree, highest first.
# Inner join: only vertices that appear in `g.outDegrees` are shown.
(
    out_degrees
    .join(users.alias('u'), F.col('u.id') == F.col('oud.id'))
    .orderBy(F.desc('outDegree'))
    .show()
)

+---+---------+---+-------------+---------------+
| id|outDegree| id|     username|           name|
+---+---------+---+-------------+---------------+
|  7|        2|  7|      odersky| Martin Odersky|
|  6|        2|  6|matei_zaharia|  Matei Zaharia|
|  1|        1|  1|  BarackObama|   Barack Obama|
|  2|        1|  2|     ladygaga|Goddess of Love|
|  4|        1|  4| justinbieber|  Justin Bieber|
|  3|        1|  3|      jeresig|     John Resig|
+---+---------+---+-------------+---------------+



### 3. PAGE RANK

In [9]:
# Run PageRank for a fixed number of iterations; the result is used below
# through its `.vertices` DataFrame, which carries a `pagerank` column.
# NOTE(review): resetProbability=0.01 is far below the commonly used 0.15 —
# confirm this value is intentional.
pageRank = g.pageRank(
    maxIter=20,
    resetProbability=0.01,
)

In [10]:
# Show every vertex with its PageRank score, highest score first.
(
    pageRank.vertices
    .sort(F.desc('pagerank'))
    .show()
)

+---+-------------+---------------+--------------------+
| id|     username|           name|            pagerank|
+---+-------------+---------------+--------------------+
|  2|     ladygaga|Goddess of Love|  2.2122923333429894|
|  7|      odersky| Martin Odersky|  1.5503700407060304|
|  1|  BarackObama|   Barack Obama|  1.2702367847934501|
|  3|      jeresig|     John Resig|  1.1647254575707153|
|  6|matei_zaharia|  Matei Zaharia|  0.7790808744354002|
|  8|      anonsys|           null|0.011647254575707157|
|  4| justinbieber|  Justin Bieber|0.011647254575707157|
+---+-------------+---------------+--------------------+

