### Boilerplate code

In [1]:
from os import environ
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.types import (StructField, StructType, 
                               IntegerType, StringType)

### Define schema

In [2]:
# Base URI of the data directory, built from the DATA_LAKE environment
# variable (a KeyError is raised if the variable is not set).
file_path = "file:///" + environ['DATA_LAKE']
# File names are appended to file_path by plain string concatenation when the
# data is loaded, so the base must end with a path separator.
if not file_path.endswith("/"):
    file_path += "/"

# Schema for the graph file: one string column holding the raw line.
graph_fields = [
    StructField(name="hero_graph", dataType=StringType(), nullable=True),
]
graph_schema = StructType(graph_fields)

# Schema for the names file: an integer hero code followed by the hero name.
hero_fields = [
    StructField(name="hero_cd", dataType=IntegerType(), nullable=False),
    StructField(name="hero_name", dataType=StringType(), nullable=False),
]
hero_schema = StructType(hero_fields)

### Load data

In [3]:
# Obtain the session for this notebook via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("marvel_characters")
    .getOrCreate()
)

In [4]:
# Read the graph file with the one-column schema (`hero_graph`, a string).
marvel_graph = (
    spark.read
    .schema(graph_schema)
    .format("csv")
    .load(file_path + "Marvel-graph.txt")
)

In [5]:
# Read the space-delimited names file into (hero_cd, hero_name).
hero_names = (
    spark.read
    .format("csv")
    .schema(hero_schema)
    .option("delimiter", " ")
    .load(file_path + "Marvel-names.txt")
)

### Creating UDFs that split each row to get the connection count and the hero id

In [6]:
def get_connection_size(line):
    """Return the number of connections listed on one graph line.

    A line is a whitespace-separated sequence of ids whose first token is the
    hero's own id; that first token is not counted as a connection.

    Args:
        line: The raw graph line, or None for a null value.

    Returns:
        The number of tokens after the first one (0 for an empty line), or
        None when ``line`` is None.
    """
    if line is None:
        # The source column is nullable; propagate the null instead of raising.
        return None
    tokens = line.split()
    # Exclude the leading hero id; never go below zero for an empty line.
    return max(len(tokens) - 1, 0)

# f.udf returns StringType unless told otherwise; declare the integer return
# type so the resulting column is numeric rather than a string.
get_connection_size_udf = f.udf(get_connection_size, IntegerType())

In [7]:
def get_hero_cd(line):
    """Return the hero id, i.e. the first whitespace-separated token of a line.

    Args:
        line: The raw graph line, or None for a null value.

    Returns:
        The first token as a string, or None when ``line`` is None, empty, or
        contains only whitespace.
    """
    if line is None:
        # The source column is nullable; propagate the null instead of raising.
        return None
    tokens = line.split()
    # A blank line has no tokens; return None rather than raising IndexError.
    return tokens[0] if tokens else None

# Wrap get_hero_cd as a Spark UDF; the string return type (f.udf's default)
# is spelled out explicitly.
get_hero_cd_udf = f.udf(get_hero_cd, returnType=StringType())

In [8]:
# Derive both columns from the raw line with the UDFs, then drop the raw line.
marvel_graph_udf = (
    marvel_graph
    .withColumn("connections", get_connection_size_udf(marvel_graph["hero_graph"]))
    .withColumn("hero_cd", get_hero_cd_udf(marvel_graph["hero_graph"]))
    .drop(marvel_graph["hero_graph"])
)

### Doing the same thing using the built-in `pyspark.sql.functions` methods

In [9]:
# Tokenise each line once. Trimming first and splitting on runs of whitespace
# prevents leading, trailing or repeated spaces from producing empty tokens,
# which would otherwise inflate the connection count.
marvel_graph_func = (
    marvel_graph
    .withColumn("hero_tokens", f.split(f.trim(marvel_graph.hero_graph), r"\s+"))
    # The first token is the hero's own id, so it is not counted as a connection.
    .withColumn("connections", f.size(f.col("hero_tokens")) - 1)
    .withColumn("hero_cd", f.col("hero_tokens")[0])
    # Keep only the derived columns: connections, hero_cd.
    .drop("hero_graph", "hero_tokens")
)

In [10]:
# Sum the connections per hero and take the row with the largest total.
popular_super_hero = (
    marvel_graph_func
    .groupBy(marvel_graph_func.hero_cd)
    .agg(f.sum(marvel_graph_func.connections).alias("total_connections"))
    .orderBy(f.col("total_connections").desc())
    .first()
)

In [11]:
# Look up the name for the top hero's code; the code is converted with int()
# before being compared to the integer hero_cd column.
best_hero = (
    hero_names
    .where(hero_names.hero_cd == int(popular_super_hero[0]))
    .select(hero_names.hero_name)
    .collect()[0][0]
)

In [12]:
# Total the connections per hero, attach the hero names, and keep the heroes
# whose total is exactly 1.
obscure_super_hero = (
    marvel_graph_func
    .groupBy(marvel_graph_func.hero_cd)
    .agg(f.sum(marvel_graph_func.connections).alias("total_connections"))
    .join(hero_names, on=marvel_graph_func.hero_cd == hero_names.hero_cd, how="inner")
    .where(f.col("total_connections") == 1)
)

In [13]:
# Bring the matching hero names back to the driver as a list of Row objects.
obscure_heroes = obscure_super_hero.select("hero_name").collect()

In [14]:
# Non-ASCII characters are still dropped via encode(..., "ignore"), but the
# result is decoded back to str so the list prints plain names instead of
# b'...' byte literals under Python 3.
print("The most popular hero is {} and the most obscure superheros are {}".format(
    best_hero,
    [hero[0].encode("ascii", "ignore").decode("ascii") for hero in obscure_heroes],
))

The most popular hero is CAPTAIN AMERICA and the most obscure superheros are [b'BERSERKER II', b'BLARE/', b'MARVEL BOY II/MARTIN', b'MARVEL BOY/MARTIN BU', b'GIURESCU, RADU', b'CLUMSY FOULUP', b'FENRIS', b'RANDAK', b'SHARKSKIN', b'CALLAHAN, DANNY', b'DEATHCHARGE', b'RUNE', b'SEA LEOPARD', b'RED WOLF II', b'ZANTOR', b'JOHNSON, LYNDON BAIN', b'LUNATIK II', b'KULL', b'GERVASE, LADY ALYSSA']


In [15]:
# Bare expression: the notebook renders the collected Row list as the cell output.
obscure_heroes

[Row(hero_name='BERSERKER II'),
 Row(hero_name='BLARE/'),
 Row(hero_name='MARVEL BOY II/MARTIN'),
 Row(hero_name='MARVEL BOY/MARTIN BU'),
 Row(hero_name='GIURESCU, RADU'),
 Row(hero_name='CLUMSY FOULUP'),
 Row(hero_name='FENRIS'),
 Row(hero_name='RANDAK'),
 Row(hero_name='SHARKSKIN'),
 Row(hero_name='CALLAHAN, DANNY'),
 Row(hero_name='DEATHCHARGE'),
 Row(hero_name='RUNE'),
 Row(hero_name='SEA LEOPARD'),
 Row(hero_name='RED WOLF II'),
 Row(hero_name='ZANTOR'),
 Row(hero_name='JOHNSON, LYNDON BAIN'),
 Row(hero_name='LUNATIK II'),
 Row(hero_name='KULL'),
 Row(hero_name='GERVASE, LADY ALYSSA')]

In [16]:
# Stop the Spark session held in `spark`.
spark.stop()