In [1]:
import os
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


<h3> Create Spark session, enable Hive support </h3>

In [2]:
from pyspark.sql import SparkSession
sparkSession = SparkSession.builder.enableHiveSupport().master("local [2]").getOrCreate()

In [3]:
graphPath = "/data/graphDFSample"

<h3>Create Spark Dataframe from parquet file</h3>

In [4]:
from pyspark.sql.functions import explode, collect_list, size, col, row_number
from pyspark.sql import Window

reversedGraph = sparkSession.read.parquet(graphPath) \
    .withColumn("friend", explode('friends')) \
    .groupBy("friend") \
    .agg(collect_list("user").alias("users")) \
    .withColumn("users_size", size("users")) \

In [5]:
reversedGraph.printSchema()

root
 |-- friend: integer (nullable = true)
 |-- users: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- users_size: integer (nullable = false)



<h3>inspect original data from the parquet file</h3>

In [6]:
originalGraph = sparkSession.read.parquet(graphPath)

In [7]:
originalGraph.show(3)

+--------+--------------------+
|    user|             friends|
+--------+--------------------+
|22991438|[20699, 175973, 5...|
|37586597|[83616, 139192, 1...|
|56325000|[504270, 645333, ...|
+--------+--------------------+
only showing top 3 rows



In [15]:
sparkSession.read.parquet(graphPath).limit(5).toPandas()

ImportError: No module named pandas

In [9]:
sparkSession.catalog.listDatabases()

[Database(name=u'default', description=u'Default Hive database', locationUri=u'file:/home/jovyan/demos/spark-warehouse')]

In [12]:
sparkSession.catalog.listTables("default")

[]

In [13]:
window = Window.orderBy(col("users_size").desc())
    
top50 = reversedGraph.withColumn("row_number", row_number().over(window)) \
            .filter(col("row_number") < 50) \
            .select(col("friend"), col("users_size")) \
            .orderBy(col("users_size").desc()) \
            .collect()

In [7]:
for val in top50:
    print '%s %s' % val

9606655 244
62922315 241
1288836 240
36402159 239
36079654 239
40342046 235
24319760 234
34854364 234
45353567 233
28229916 231
16364918 230
52511791 229
549319 227
5137947 227
65079230 227
17636074 226
49067109 225
53106903 225
6570168 223
44621704 223
34850500 223
27338193 222
32810368 222
25606717 222
34201873 220
6147442 219
62386165 219
45239367 219
32821462 218
30234171 218
63649194 217
53826156 217
13813472 217
26158314 217
17679500 217
14394422 216
7153815 216
13062446 216
36039499 216
64373911 216
12890141 215
20291955 215
36757249 214
64856469 214
40043869 213
34071175 212
11768267 211
38750752 211
3295906 211
