In [1]:
from pyspark.sql import (
    functions as f,
    Row,
    SparkSession,
    types as t
)

spark = SparkSession.builder.appName("df_most_popular").getOrCreate()

In [2]:
# https://www.kaggle.com/datasets/csanhueza/the-marvel-universe-social-network
csv_file_path = "file:///home/jovyan/work/hero-network.csv"

df = spark.read\
            .option("header", "true")\
            .option("inferSchema", "true").csv(csv_file_path)
df.printSchema()
df.show(truncate=False)

root
 |-- hero1: string (nullable = true)
 |-- hero2: string (nullable = true)

+--------------------+--------------------+
|hero1               |hero2               |
+--------------------+--------------------+
|LITTLE, ABNER       |PRINCESS ZANDA      |
|LITTLE, ABNER       |BLACK PANTHER/T'CHAL|
|BLACK PANTHER/T'CHAL|PRINCESS ZANDA      |
|LITTLE, ABNER       |PRINCESS ZANDA      |
|LITTLE, ABNER       |BLACK PANTHER/T'CHAL|
|BLACK PANTHER/T'CHAL|PRINCESS ZANDA      |
|STEELE, SIMON/WOLFGA|FORTUNE, DOMINIC    |
|STEELE, SIMON/WOLFGA|ERWIN, CLYTEMNESTRA |
|STEELE, SIMON/WOLFGA|IRON MAN/TONY STARK |
|STEELE, SIMON/WOLFGA|IRON MAN IV/JAMES R.|
|STEELE, SIMON/WOLFGA|RAVEN, SABBATH II/EL|
|RAVEN, SABBATH II/EL|FORTUNE, DOMINIC    |
|RAVEN, SABBATH II/EL|ERWIN, CLYTEMNESTRA |
|RAVEN, SABBATH II/EL|IRON MAN/TONY STARK |
|RAVEN, SABBATH II/EL|IRON MAN IV/JAMES R.|
|IRON MAN IV/JAMES R.|FORTUNE, DOMINIC    |
|IRON MAN IV/JAMES R.|ERWIN, CLYTEMNESTRA |
|IRON MAN IV/JAMES R.|IRON MAN/TONY STAR

In [3]:
data = df.groupBy("hero1").agg(f.collect_set("hero2").alias("connection"))\
        .withColumnRenamed("hero1", "hero")
data.show()

+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|             ABCISSA|[ELSIE DEE, FURY,...|
|ABSORBING MAN | MUTA|[DRAX | MUTANT X-...|
|ABSORBING MAN/CARL C|[SOMMERS, APRIL, ...|
|    ADAMSON, REBECCA|[KABALLA, GOLEM I...|
|   ADVENT/KYLE GROBE|[JUSTICE II/VANCE...|
|      AGAMEMNON III/|[ASTER, LUCIAN, H...|
|            AGAMOTTO|[MUNIPOOR, DORMAM...|
|             AGGAMON|[DR. STRANGE/STEP...|
|              AGINAR|[SIF, REJECT/RAN-...|
|                AGON|[MARISTA, BLACK B...|
|               AINET|[STORM/ORORO MUNR...|
|    AKUTAGAWA, OSAMU|[HUMAN TORCH/JOHN...|
|ALDEN, PROF. MEREDIT|[CABE, BETHANY, S...|
|             ALISTRO|[ENCHANTRESS/AMOR...|
|       ALVAREZ, PAUL|[ATOR, GENERAL, Z...|
|   AMERICAN SAMURAI/|[PAGE, KAREN, DAR...|
|             AMPERE/|[QUICKSILVER/PIET...|
|           ANCESTOR/|[RECORDER II, FOU...|
|ANCIENT ONE/BARON MO|[BLOODSTORM | MUT...|
|    ANDERSSEN, TANYA|[KA-ZAR/KE

In [5]:
data.withColumn("connection", f.concat_ws(",", f.col("connection")))\
    .coalesce(1).write.option("header", True).csv("output")

In [7]:
csv_file_path = "file:///home/jovyan/work/output"
df = spark.read\
        .option("header", "true")\
        .option("inferSchema", "true")\
        .csv(csv_file_path)
df.show()

+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|             ABCISSA|ELSIE DEE,FURY, C...|
|ABSORBING MAN | MUTA|DRAX | MUTANT X-V...|
|ABSORBING MAN/CARL C|SOMMERS, APRIL,HE...|
|    ADAMSON, REBECCA|KABALLA,GOLEM III...|
|   ADVENT/KYLE GROBE|JUSTICE II/VANCE ...|
|      AGAMEMNON III/|ASTER, LUCIAN,HOG...|
|            AGAMOTTO|MUNIPOOR,DORMAMMU...|
|             AGGAMON| DR. STRANGE/STEPHEN|
|              AGINAR|SIF,REJECT/RAN-SA...|
|                AGON|MARISTA,BLACK BOL...|
|               AINET|STORM/ORORO MUNRO...|
|    AKUTAGAWA, OSAMU|HUMAN TORCH/JOHNN...|
|ALDEN, PROF. MEREDIT|CABE, BETHANY,STO...|
|             ALISTRO|ENCHANTRESS/AMORA...|
|       ALVAREZ, PAUL|ATOR, GENERAL,ZAR...|
|   AMERICAN SAMURAI/|PAGE, KAREN,DARED...|
|             AMPERE/|QUICKSILVER/PIETR...|
|           ANCESTOR/|RECORDER II,FOUND...|
|ANCIENT ONE/BARON MO|BLOODSTORM | MUTA...|
|    ANDERSSEN, TANYA|KA-ZAR/KEV

In [11]:
df = df.withColumn(
    "connection_size",
    f.size(
        f.split(f.col("connection"), ",")))\
    .orderBy(f.desc("connection_size"))
df.show()

+--------------------+--------------------+---------------+
|                hero|          connection|connection_size|
+--------------------+--------------------+---------------+
|     CAPTAIN AMERICA|URICH, DORIS,ARMA...|           1795|
|SPIDER-MAN/PETER PAR|MAGMA II/JONATHAN...|           1737|
| IRON MAN/TONY STARK|RED SHIFT,SABRETO...|           1443|
|     WOLVERINE/LOGAN|SABRETOOTH/VICTOR...|           1278|
|THING/BENJAMIN J. GR|CHORD, ANDREW,CAT...|           1262|
| SCARLET WITCH/WANDA|SABRETOOTH/VICTOR...|           1246|
|HUMAN TORCH/JOHNNY S|CAT KING,BUZZ,MAK...|           1202|
|MR. FANTASTIC/REED R|ARMADILLO/ANTONIO...|           1200|
|THOR/DR. DONALD BLAK|PARKER, MAY | TIM...|           1183|
| INVISIBLE WOMAN/SUE|CAPTAIN MARVEL II...|           1143|
|BEAST/HENRY &HANK& P|AMERICAN EAGLE II...|           1140|
|              VISION|PHOSPHORUS,AMERIC...|           1110|
|                HAWK|AMERICAN EAGLE II...|           1086|
|CYCLOPS/SCOTT SUMMER|SABRETOOTH/VICTOR.