### Boiler plate code

In [76]:
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.types import (StructField, StructType, 
                               IntegerType, StringType)

### Define schema

In [77]:
graph_fields = [
        StructField(name="hero_graph", dataType=StringType(), nullable=True)
        ]
graph_schema = StructType(graph_fields)

hero_fields = [
        StructField(name="hero_cd", dataType=IntegerType(), nullable=False),
        StructField(name="hero_name", dataType=StringType(), nullable=False)
            ]
hero_schema = StructType(hero_fields)

### Load data

In [78]:
spark = SparkSession.builder.appName("marvel_characters").getOrCreate()

In [79]:
marvel_graph = spark.read.format("csv").schema(graph_schema).load("file:///var/lib/jupyter/data/Marvel-graph.txt")

In [123]:
hero_names = spark.read.format("csv").option("delimiter", " ").schema(hero_schema).load("file:///var/lib/jupyter/data/Marvel-names.txt")

### Creating UDF to split the rows and make a list 

In [95]:
def get_connection_size(line):
    return len(line.split())

get_connection_size_udf = f.udf(get_connection_size)

In [96]:
def get_hero_cd(line):
    return line.split()[0]

get_hero_cd_udf = f.udf(get_hero_cd)

In [97]:
marvel_graph_udf = marvel_graph.withColumn("connections", get_connection_size_udf(marvel_graph.hero_graph)) \
            .withColumn("hero_cd", get_hero_cd_udf(marvel_graph.hero_graph)) \
            .drop(marvel_graph.hero_graph)

### Doing the same thing using functions method

In [98]:
marvel_graph_func = marvel_graph.withColumn("connections", f.size(f.split(marvel_graph.hero_graph," "))-1) \
                                .withColumn("hero_cd", f.split(marvel_graph.hero_graph," ")[0]) \
                                .drop(marvel_graph.hero_graph)

In [114]:
popular_super_hero = marvel_graph_func \
        .groupBy(marvel_graph_func.hero_cd) \
        .agg(f.sum(marvel_graph_func.connections) \
        .alias("total_connections")) \
        .orderBy("total_connections", ascending=False) \
        .first()

In [130]:
best_hero = hero_names.filter(hero_names.hero_cd == int(popular_super_hero[0])).select(hero_names.hero_name).collect()[0][0]

In [152]:
obscure_super_hero = marvel_graph_func \
                .groupBy(marvel_graph_func.hero_cd) \
                .agg(f.sum(marvel_graph_func.connections) \
                .alias("total_connections")) \
                .join(hero_names, marvel_graph_func.hero_cd == hero_names.hero_cd, "inner") \
                .filter(f.col("total_connections")==1)

In [170]:
obscure_heroes = obscure_super_hero.select(obscure_super_hero.hero_name).collect()

In [172]:
print "The most popular hero is {} and the most obscure superheros are {}".format(best_hero, [hero[0].encode("ascii", "ignore") for hero in obscure_heroes])

The most popular hero is CAPTAIN AMERICA and the most obscure superheros are ['BERSERKER II', 'BLARE/', 'MARVEL BOY II/MARTIN', 'MARVEL BOY/MARTIN BU', 'GIURESCU, RADU', 'CLUMSY FOULUP', 'FENRIS', 'RANDAK', 'SHARKSKIN', 'CALLAHAN, DANNY', 'DEATHCHARGE', 'RUNE', 'SEA LEOPARD', 'RED WOLF II', 'ZANTOR', 'JOHNSON, LYNDON BAIN', 'LUNATIK II', 'KULL', 'GERVASE, LADY ALYSSA']


In [140]:
obscure_hero

u'24-HOUR MAN/EMMANUEL'