In [1]:
from pyspark import SparkConf, SparkContext

# Run Spark with the "local" master under the application name "Superhero".
conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("Superhero")
)
# Reuse an already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate(conf=conf)

# Each input file is read as an RDD of text lines.
names = sc.textFile("/FileStore/tables/Marvel_Names")
lines = sc.textFile("/FileStore/tables/Marvel_Graph")

In [2]:
def parseNames(line):
    """Parse one line of the names file into an ``(id, name)`` pair.

    The line is split on double quotes: the text before the first quote is
    converted to the integer ID, and the text between the first pair of
    quotes is the name.

    Args:
        line: One text line, e.g. ``1 "SPIDER-MAN"``.

    Returns:
        A tuple ``(id, name)`` with ``id`` as ``int`` and ``name`` as text.

    Raises:
        ValueError: If the text before the first quote is not an integer.
        IndexError: If the line contains no double quote.
    """
    fields = line.split('"')
    # Keep the name as text: encoding it to UTF-8 produces a bytes object on
    # Python 3, which would later print as b'...' instead of the plain name.
    return (int(fields[0]), fields[1])
  
def countCoOccurences(line):
    """Turn one whitespace-separated line into ``(first_id, number_of_other_tokens)``.

    Args:
        line: Text whose first token is an integer ID, followed by zero or
            more further tokens.

    Returns:
        A tuple of the first token as ``int`` and the count of the tokens
        that follow it.
    """
    tokens = line.split()
    heroId = int(tokens[0])
    # Everything after the leading ID counts as one co-occurrence each.
    return (heroId, len(tokens[1:]))


# (id, number of co-occurring ids) for every line of the graph file.
pairings = lines.map(countCoOccurences)

# (id, name) for every line of the names file.
namesRdd = names.map(parseNames)

In [3]:
# Preview a few (id, name) pairs. take() fetches only the requested number of
# records instead of pulling the entire RDD to the driver as collect() does.
for idAndName in namesRdd.take(10):
  print(idAndName)

In [4]:
# Preview a few (id, count) pairs. take() fetches only the requested number of
# records instead of pulling the entire RDD to the driver as collect() does.
for idAndCount in pairings.take(10):
  print(idAndCount)

In [5]:
# `pairings` holds (Superhero ID, count of friends) pairs, with the Superhero ID as the KEY.
# Sum the counts per key so that every ID ends up with a single total.

totalFriendsByCharacter = pairings.reduceByKey(lambda x, y: x + y)

# Preview a few totals. take() fetches only the requested number of records
# instead of pulling the entire RDD to the driver as collect() does.
for idAndTotal in totalFriendsByCharacter.take(10):
  print(idAndTotal)

In [6]:
# Goal: find the Superhero ID with the largest number of friends.

# METHOD 1: swap every (id, total) pair to (total, id) and call the RDD's max().
# Tuples compare element by element, so the total is compared first.

flipped = totalFriendsByCharacter.map(lambda idTotal: (idTotal[1], idTotal[0]))
mostPopular = flipped.max()

mostPopular

In [7]:
# namesRdd is keyed by Superhero ID, so lookup() on that ID returns the
# matching name(s) as a list.

mostPopularId = mostPopular[1]  # second field of the flipped (total, id) pair
mostPopularName = namesRdd.lookup(mostPopularId)
mostPopularName

In [8]:
# lookup() returns a list of matching names; use the first entry instead of
# printing the list's repr (e.g. "['NAME']").
heroName = mostPopularName[0] if mostPopularName else "<unknown>"
# The name may be UTF-8 bytes if it was encoded during parsing; decode it so
# it prints as plain text rather than b'...'.
if isinstance(heroName, bytes):
    heroName = heroName.decode("utf8")

# A leading space before "is" keeps the name from running into the sentence.
print(heroName + " is the most popular Superhero with " +
      str(mostPopular[0]) + " number of co-appearances.")

In [9]:
# METHOD 2: sort "totalFriendsByCharacter" by co-appearance count, largest
# first, and take the first element.

# Named sortedByFriends rather than `sorted` so the Python builtin sorted()
# is not shadowed for the rest of the notebook session.
sortedByFriends = totalFriendsByCharacter.sortBy(lambda x: x[1], ascending=False)

# first() returns only the top record instead of collecting the whole RDD
# to the driver just to read element 0.
highestElement = sortedByFriends.first()
highestElement

In [10]:
# Resolve the top Superhero's ID to a name via namesRdd.

topHeroId = highestElement[0]  # first field of the (id, total) pair
mostPopularName = namesRdd.lookup(topHeroId)

In [11]:
# lookup() returns a list of matching names; use the first entry instead of
# printing the list's repr (e.g. "['NAME']").
heroName = mostPopularName[0] if mostPopularName else "<unknown>"
# The name may be UTF-8 bytes if it was encoded during parsing; decode it so
# it prints as plain text rather than b'...'.
if isinstance(heroName, bytes):
    heroName = heroName.decode("utf8")

# A leading space before "is" keeps the name from running into the sentence.
print(heroName + " is the most popular Superhero with " +
      str(highestElement[1]) + " number of co-appearances.")