In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Build (or reuse) the notebook's Spark session, running locally with master "local[4]".
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("PySpark Finding Super-hero")
    .getOrCreate()
)

In [3]:
# Schema for the names file: two nullable string columns, "id" and "name".
marvel_name_schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in ("id", "name")]
)

In [10]:
# Load the space-separated names file with the explicit schema and preview it.
names = spark.read.csv(
    "data/marvel_names.txt",
    schema=marvel_name_schema,
    sep=" ",
)
names.show(n=5)

+---+--------------------+
| id|                name|
+---+--------------------+
|  1|24-HOUR MAN/EMMANUEL|
|  2|3-D MAN/CHARLES CHAN|
|  3|    4-D MAN/MERCURIO|
|  4|             8-BALL/|
|  5|                   A|
+---+--------------------+
only showing top 5 rows



In [9]:
# Read the graph file as raw text: one row per line, in a single "value" column.
lines = spark.read.format("text").load("data/marvel_graph.txt")
lines.show(n=5)

+--------------------+
|               value|
+--------------------+
|5988 748 1722 375...|
|5989 4080 4264 44...|
|5982 217 595 1194...|
|5983 1165 3836 43...|
|5980 2731 3712 15...|
+--------------------+
only showing top 5 rows



## Spark DataFrame API

In [None]:
# Split each trimmed line on single spaces once and reuse the expression:
# the first token becomes "id", and every remaining token counts as one connection.
tokens = f.split(f.trim(f.col("value")), " ")
id_connections = lines.select(
    "*",
    tokens[0].alias("id"),
    (f.size(tokens) - 1).alias("connections"),
)

In [17]:
# Preview only the derived columns; the raw "value" text is left out.
id_connections.select(f.col("id"), f.col("connections")).show(n=5)

+----+-----------+
|  id|connections|
+----+-----------+
|5988|         48|
|5989|         40|
|5982|         42|
|5983|         14|
|5980|         24|
+----+-----------+
only showing top 5 rows



In [18]:
# Total the per-line counts for each id, in case an id occurs on more than one line.
connections = (
    id_connections
    .groupBy("id")
    .agg(f.sum(f.col("connections")).alias("connections"))
)
connections.show(n=5)

+----+-----------+
|  id|connections|
+----+-----------+
| 691|          6|
|1159|         11|
|3959|        142|
|1572|         35|
|2294|         14|
+----+-----------+
only showing top 5 rows



In [19]:
# Order by total connections, highest first, and keep just the top Row.
mostPopular = connections.orderBy(f.desc("connections")).first()
mostPopular

Row(id='859', connections=1933)

In [20]:
# Resolve the winning id to its name, then report it together with the connection total.
mostPopularName = (
    names
    .where(names["id"] == mostPopular["id"])
    .select("name")
    .first()
)
print(
    mostPopularName["name"]
    + "가 가장 인기있는 수퍼히어로이며 총 "
    + str(mostPopular["connections"])
    + "번 다른 히어로들과 출연했습니다"
)

CAPTAIN AMERICA가 가장 인기있는 수퍼히어로이며 총 1933번 다른 히어로들과 출연했습니다


## Spark SQL

In [21]:
# Register both DataFrames as temporary views so they can be queried by name in SQL.
# The two registrations are independent of each other.
names.createOrReplaceTempView("names")
id_connections.createOrReplaceTempView("id_connections")

In [22]:
# Same ranking as the DataFrame version, expressed in SQL: join ids to names,
# sum connections per (id, name), and sort by that sum in descending order.
# GROUP BY / ORDER BY use ordinal positions from the SELECT list.
top_heroes_query = """
SELECT c.id, n.name, SUM(c.connections)
FROM id_connections c
JOIN names n ON c.id = n.id
GROUP BY 1, 2
ORDER BY 3 DESC"""
spark.sql(top_heroes_query).show()

+----+--------------------+----------------+
|  id|                name|sum(connections)|
+----+--------------------+----------------+
| 859|     CAPTAIN AMERICA|            1933|
|5306|SPIDER-MAN/PETER PAR|            1741|
|2664|IRON MAN/TONY STARK |            1528|
|5716|THING/BENJAMIN J. GR|            1426|
|6306|    WOLVERINE/LOGAN |            1394|
|3805|MR. FANTASTIC/REED R|            1386|
|2557|HUMAN TORCH/JOHNNY S|            1371|
|4898|SCARLET WITCH/WANDA |            1345|
|5736|THOR/DR. DONALD BLAK|            1289|
| 403|BEAST/HENRY &HANK& P|            1280|
|6066|             VISION |            1263|
|2650|INVISIBLE WOMAN/SUE |            1244|
|2399|                HAWK|            1176|
|1289|CYCLOPS/SCOTT SUMMER|            1104|
|5467|STORM/ORORO MUNROE S|            1095|
| 133|ANGEL/WARREN KENNETH|            1094|
|6148|WASP/JANET VAN DYNE |            1093|
| 154|ANT-MAN/DR. HENRY J.|            1092|
|5046|SHE-HULK/JENNIFER WA|            1080|
|1602|DR. 

## UDF를 사용해서 히어로 ID를 룩업해보자

In [24]:
# Collect every (id, name) row to the driver and build a plain dict for lookups.
id2Names = {row["id"]: row["name"] for row in names.collect()}
len(id2Names)

19428

In [25]:
# Print the first ten (id, name) pairs of the lookup dict as a quick sanity check.
# The loop variables are deliberately not called `id`: a top-level loop variable
# stays in the notebook's global namespace and would hide the builtin id() for
# every later cell.
for idx, (hero_id, hero_name) in enumerate(id2Names.items()):
    if idx >= 10:
        break
    print(hero_id, hero_name)

1 24-HOUR MAN/EMMANUEL
2 3-D MAN/CHARLES CHAN
3 4-D MAN/MERCURIO
4 8-BALL/
5 A
6 A'YIN
7 ABBOTT, JACK
8 ABCISSA
9 ABEL
10 ABOMINATION/EMIL BLO


In [28]:
# Wrap the lookup dict in a broadcast variable; its contents are read back through `.value`.
broadcast_id2Names = spark.sparkContext.broadcast(value=id2Names)
broadcast_id2Names

<pyspark.broadcast.Broadcast at 0x1a8227e5390>

In [29]:
# Sanity check on the driver: "859" is the id found as most popular above.
broadcast_id2Names.value.get("859")

'CAPTAIN AMERICA'

In [None]:
def returnName(id: str) -> str:
    """Look up a hero name by id in the broadcast id-to-name dict.

    Args:
        id: Hero id as a string (the dict keys are strings).

    Returns:
        The matching name, or ``None`` when the id is not in the dict
        (the result of ``dict.get``).
    """
    id_to_name = broadcast_id2Names.value
    return id_to_name.get(id)

In [None]:
# Make returnName callable from SQL expressions as "returnName", with a string result type.
spark.udf.register(name="returnName", f=returnName, returnType=StringType())

In [31]:
# Append a "name" column by calling the registered UDF on each row's id.
id_connections.select("*", f.expr("returnName(id)").alias("name")).show()

+--------------------+----+-----------+--------------------+
|               value|  id|connections|                name|
+--------------------+----+-----------+--------------------+
|5988 748 1722 375...|5988|         48|VALKYRIE II | MUTANT|
|5989 4080 4264 44...|5989|         40|         VALKYRIE IV|
|5982 217 595 1194...|5982|         42|VAGABOND/PRISCILLA L|
|5983 1165 3836 43...|5983|         14|              VAGUE/|
|5980 2731 3712 15...|5980|         24|         UTGARD-LOKI|
|5981 3569 5353 40...|5981|         17|              VACUUM|
|5986 2658 3712 26...|5986|        142|             VALINOR|
|5987 2614 5716 17...|5987|         81|              VALKIN|
|5984 590 4898 745...|5984|         41|              VAKUME|
|5985 3233 2254 21...|5985|         19|             VALERIA|
|6294 4898 1127 32...|6294|         13|WOLF SPIRIT/OWAYODAT|
|270 2658 3003 380...| 270|         42|AUGUST PERSONAGE IN |
|271 4935 5716 430...| 271|          9|      AUNTIE FREEZE/|
|272 2717 4363 408...| 2

In [None]:
def returnName(id: str) -> str:
    """Look up a hero name by id directly in the module-level ``id2Names`` dict.

    This redefines the earlier ``returnName``: it reads the plain dict instead
    of going through the broadcast variable.

    Args:
        id: Hero id as a string (the dict keys are strings).

    Returns:
        The matching name, or ``None`` when the id is not in the dict
        (the result of ``dict.get``).
    """
    hero_name = id2Names.get(id)
    return hero_name

In [33]:
# Register again under the same SQL name so "returnName" now points at the redefined function.
spark.udf.register(name="returnName", f=returnName, returnType=StringType())

<function __main__.returnName(id: str) -> str>

In [34]:
# Append a "name" column by calling the registered UDF on each row's id.
id_connections.select("*", f.expr("returnName(id)").alias("name")).show()

+--------------------+----+-----------+--------------------+
|               value|  id|connections|                name|
+--------------------+----+-----------+--------------------+
|5988 748 1722 375...|5988|         48|VALKYRIE II | MUTANT|
|5989 4080 4264 44...|5989|         40|         VALKYRIE IV|
|5982 217 595 1194...|5982|         42|VAGABOND/PRISCILLA L|
|5983 1165 3836 43...|5983|         14|              VAGUE/|
|5980 2731 3712 15...|5980|         24|         UTGARD-LOKI|
|5981 3569 5353 40...|5981|         17|              VACUUM|
|5986 2658 3712 26...|5986|        142|             VALINOR|
|5987 2614 5716 17...|5987|         81|              VALKIN|
|5984 590 4898 745...|5984|         41|              VAKUME|
|5985 3233 2254 21...|5985|         19|             VALERIA|
|6294 4898 1127 32...|6294|         13|WOLF SPIRIT/OWAYODAT|
|270 2658 3003 380...| 270|         42|AUGUST PERSONAGE IN |
|271 4935 5716 430...| 271|          9|      AUNTIE FREEZE/|
|272 2717 4363 408...| 2