In [30]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as func
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    FloatType,
)

In [31]:
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [32]:
schema = StructType(
    [
        StructField("ID", IntegerType(), True),
        StructField("Name", StringType(), True),
    ]
)
names = spark.read.schema(schema).option("sep", " ").csv("./marvel/Marvel_Names.csv")
names.show(5)

+---+--------------------+
| ID|                Name|
+---+--------------------+
|  1|24-HOUR MAN/EMMANUEL|
|  2|3-D MAN/CHARLES CHAN|
|  3|    4-D MAN/MERCURIO|
|  4|             8-BALL/|
|  5|                   A|
+---+--------------------+
only showing top 5 rows



In [33]:
names.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)



In [34]:
lines = spark.read.text("./marvel/Marvel_Graph.csv")
lines.show(5)

+--------------------+
|               value|
+--------------------+
|5988 748 1722 375...|
|5989 4080 4264 44...|
|5982 217 595 1194...|
|5983 1165 3836 43...|
|5980 2731 3712 15...|
+--------------------+
only showing top 5 rows



In [None]:
# key = id, connections = number of friends of the character
connections = (
    lines.withColumn("id", func.split(func.col("value"), " ")[0])
    .withColumn("connections", func.size(func.split(func.col("value"), " ")) - 1)
    .groupBy("id")
    .agg(func.sum("connections").alias("connections"))
    .orderBy("connections", ascending=False)
)
connections.show(5)

+----+-----------+
|  id|connections|
+----+-----------+
| 859|       1937|
|5306|       1745|
|2664|       1532|
|5716|       1429|
|6306|       1397|
+----+-----------+
only showing top 5 rows



In [None]:
# most popular superhero
mostPopular = connections.orderBy("connections", ascending=False).first()

mostPopularName = (
    names.filter(func.col("ID") == mostPopular["id"]).select("Name").first()[0]
)

print(
    f"Most popular superhero is {mostPopularName} with {mostPopular['connections']} connections"
)

Most popular superhero is CAPTAIN AMERICA with 1937 connections


In [None]:
# top 5 popular superheroes
connections.join(names, "ID").orderBy("connections", ascending=False).show(5)

+----+-----------+--------------------+
|  id|connections|                Name|
+----+-----------+--------------------+
| 859|       1937|     CAPTAIN AMERICA|
|5306|       1745|SPIDER-MAN/PETER PAR|
|2664|       1532|IRON MAN/TONY STARK |
|5716|       1429|THING/BENJAMIN J. GR|
|6306|       1397|    WOLVERINE/LOGAN |
+----+-----------+--------------------+
only showing top 5 rows



In [None]:
# top 5 obscure superheroes
connections.join(names, "ID").orderBy("connections").show(5)

+----+-----------+--------------------+
|  id|connections|                Name|
+----+-----------+--------------------+
|3490|          1|MARVEL BOY II/MARTIN|
|1089|          1|       CLUMSY FOULUP|
| 467|          1|        BERSERKER II|
| 577|          1|              BLARE/|
|3489|          1|MARVEL BOY/MARTIN BU|
+----+-----------+--------------------+
only showing top 5 rows



In [52]:
# superhero degrees of separation
# id, connections_list, distance, color

def converToBFS(line):
    fields = line.split()
    heroID = int(fields[0])
    connections = [int(connection) for connection in fields[1:]]

    color = "WHITE"
    distance = 9999
    if heroID == startCharacterID:
        color = "GRAY"
        distance = 0

    return (heroID, (connections, distance, color))


startCharacterID = 5306  # Spiderman
targetCharacterID = 14

hitCounter = spark.sparkContext.accumulator(0)
degreeSep = spark.sparkContext.textFile("./marvel/Marvel_Graph.csv").map(converToBFS)
degreeSep.cache()
degreeSep.take(5)

[(5988,
  ([748,
    1722,
    3752,
    4655,
    5743,
    1872,
    3413,
    5527,
    6368,
    6085,
    4319,
    4728,
    1636,
    2397,
    3364,
    4001,
    1614,
    1819,
    1585,
    732,
    2660,
    3952,
    2507,
    3891,
    2070,
    2239,
    2602,
    612,
    1352,
    5447,
    4548,
    1596,
    5488,
    1605,
    5517,
    11,
    479,
    2554,
    2043,
    17,
    865,
    4292,
    6312,
    473,
    534,
    1479,
    6375,
    4456],
   9999,
   'WHITE')),
 (5989,
  ([4080,
    4264,
    4446,
    3779,
    2430,
    2297,
    6169,
    3530,
    3272,
    4282,
    6432,
    2548,
    4140,
    185,
    105,
    3878,
    2429,
    1334,
    4595,
    2767,
    3956,
    3877,
    4776,
    4946,
    3407,
    128,
    269,
    5775,
    5121,
    481,
    5516,
    4758,
    4053,
    1044,
    1602,
    3889,
    1535,
    6038,
    533,
    3986],
   9999,
   'WHITE')),
 (5982,
  ([217,
    595,
    1194,
    3308,
    2940,
    1815,
    794

In [None]:
# item based collaborative filtering, cache(), persist()
