In [10]:
import pyspark.sql.functions as F
import os

from pyspark.sql import SparkSession
from pyspark.sql.window import Window

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.4-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
spark = (
    SparkSession
    .builder
    .appName("PySpark Zero to Hero")
    .master("local[*]")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.cores", "4")
    .config("spark.sql.shuffle.partitions", "80")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.initialExecutors", "24")
    .config("spark.dynamicAllocation.maxExecutors", "50")
    .config('spark.dynamicAllocation.shuffleTrackingEnabled', 'true')
    .config('spark.dynamicAllocation.executorIdleTimeout', '60')
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
spark.conf.set('spark.sql.adaptive.enabled', 'true')
spark.conf.set('spark.sql.adaptive.coalescePartitions.enabled', 'true')
spark.conf.set('spark.sql.autoBroadcastJoinThreshold', '-1')

In [5]:
df_path = r"F:\Datasets\CSV datasets\TMDB_movie_dataset_v11.csv"

In [6]:
df = (spark
      .read
      .format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load(df_path))

In [7]:
df.dtypes

[('id', 'int'),
 ('title', 'string'),
 ('vote_average', 'string'),
 ('vote_count', 'string'),
 ('status', 'string'),
 ('release_date', 'string'),
 ('revenue', 'string'),
 ('runtime', 'string'),
 ('adult', 'string'),
 ('backdrop_path', 'string'),
 ('budget', 'string'),
 ('homepage', 'string'),
 ('imdb_id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'string'),
 ('poster_path', 'string'),
 ('tagline', 'string'),
 ('genres', 'string'),
 ('production_companies', 'string'),
 ('production_countries', 'string'),
 ('spoken_languages', 'string'),
 ('keywords', 'string')]

In [9]:
df.show(5)

+------+---------------+------------+----------+--------+------------+----------+-------+-----+--------------------+---------+--------------------+---------+-----------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    id|          title|vote_average|vote_count|  status|release_date|   revenue|runtime|adult|       backdrop_path|   budget|            homepage|  imdb_id|original_language| original_title|            overview|          popularity|         poster_path|             tagline|              genres|production_companies|production_countries|    spoken_languages|            keywords|
+------+---------------+------------+----------+--------+------------+----------+-------+-----+--------------------+---------+--------------------+---------+-----------------+---------------+--------------------+--------------------+-----

In [14]:
windows_spec = Window.partitionBy('original_language').orderBy(F.col('popularity').desc())

ranked_movies = (
    df.select(
        'title',
        'original_language',
        'popularity',
    ).withColumn(
        'rank', F.rank().over(windows_spec)
    )
)

ranked_movies.orderBy('original_language', 'rank').show(10)

+--------------------+-----------------+-------------------------------------+----+
|               title|original_language|                           popularity|rank|
+--------------------+-----------------+-------------------------------------+----+
|"Kami Kouen Yotei...|             NULL|  神公演予定」* 諸般の事情により、...|   1|
|"""I'm Not A Kid ...|             NULL| 『私もう子供じゃないよ…胸は小さい...|   2|
|"""You Missed You...|             NULL|「終電ないの！？じゃあウチおいでよ...|   3|
|"""You Missed The...|             NULL|「終電ないの！？じゃあウチおいで」...|   4|
|"""I Want You To ...|             NULL|「私のおっぱいでもっともっと気持ち...|   5|
|"""I Didn't Want ...|             NULL|「私、したくもないのにヤラされまし...|   6|
|           """Lately|             NULL| 「最近下半身のむくみがひどくて…リ...|   7|
|"""I really like ...|             NULL|「ホントは寝バックが好きなんです」...|   8|
|"""No Matter How ...|             NULL| 「イッても舐め続けます」 射精後も...|   9|
|"""When My Big Si...|             NULL|「お見舞いに来てくれた姉の無防備な...|  10|
+--------------------+-----------------+---------------------------------

In [15]:
df.createOrReplaceTempView('movies')

In [24]:
spark.sql("""
  SELECT
    title,
    original_language,
    popularity,
    RANK() OVER (
    PARTITION BY original_language
    ORDER BY popularity DESC
    ) AS rank
  FROM
    movies
  ORDER BY
    original_language, rank
""").show(5)

+--------------------+-----------------+-------------------------------------+----+
|               title|original_language|                           popularity|rank|
+--------------------+-----------------+-------------------------------------+----+
|"Kami Kouen Yotei...|             NULL|  神公演予定」* 諸般の事情により、...|   1|
|"""I'm Not A Kid ...|             NULL| 『私もう子供じゃないよ…胸は小さい...|   2|
|"""You Missed You...|             NULL|「終電ないの！？じゃあウチおいでよ...|   3|
|"""You Missed The...|             NULL|「終電ないの！？じゃあウチおいで」...|   4|
|"""I Want You To ...|             NULL|「私のおっぱいでもっともっと気持ち...|   5|
+--------------------+-----------------+-------------------------------------+----+
only showing top 5 rows



In [37]:
df = df.withColumn("release_date", F.to_date(df["release_date"]))

In [39]:
df.dtypes

[('id', 'int'),
 ('title', 'string'),
 ('vote_average', 'string'),
 ('vote_count', 'string'),
 ('status', 'string'),
 ('release_date', 'date'),
 ('revenue', 'string'),
 ('runtime', 'string'),
 ('adult', 'string'),
 ('backdrop_path', 'string'),
 ('budget', 'string'),
 ('homepage', 'string'),
 ('imdb_id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'string'),
 ('poster_path', 'string'),
 ('tagline', 'string'),
 ('genres', 'string'),
 ('production_companies', 'string'),
 ('production_countries', 'string'),
 ('spoken_languages', 'string'),
 ('keywords', 'string')]

In [46]:
base_df = df.select(
    'title',
    F.year('release_date').alias('release_year'),
    'revenue'
)

windows_spec = Window.partitionBy('release_year').orderBy(F.col('revenue').desc())

ranked_movies = (base_df
                 .withColumn('rank', F.rank().over(windows_spec))
                 .orderBy('release_year', 'rank')
                 )

ranked_movies.show(5)

+--------------------+------------+--------+----+
|               title|release_year| revenue|rank|
+--------------------+------------+--------+----+
|"So I Asked This ...|        NULL|Released|   1|
|"""I really like ...|        NULL|Released|   1|
|"""You Missed You...|        NULL|Released|   1|
|"Kami Kouen Yotei...|        NULL|Released|   1|
|               """Eh|        NULL|Released|   1|
+--------------------+------------+--------+----+
only showing top 5 rows



In [59]:
spark.sql("""
  SELECT
    title,
    YEAR(release_date) AS release_year,
    revenue,
    RANK() OVER (
    PARTITION BY YEAR(release_date)
    ORDER BY revenue DESC
    ) AS rank
  FROM
    movies
  ORDER BY
    release_year, rank
""").show()

+--------------------+------------+---------+----+
|               title|release_year|  revenue|rank|
+--------------------+------------+---------+----+
|"So I Asked This ...|        NULL| Released|   1|
|"""I really like ...|        NULL| Released|   1|
|"""You Missed You...|        NULL| Released|   1|
|"Kami Kouen Yotei...|        NULL| Released|   1|
|               """Eh|        NULL| Released|   1|
|"""Ghosts and Fir...|        NULL| Released|   1|
|"""I'm Not A Kid ...|        NULL| Released|   1|
|"""When My Big Si...|        NULL| Released|   1|
|    """Masochist Boy|        NULL| Released|   1|
|"""I Want You To ...|        NULL| Released|   1|
|  "Skyworks: ""Light|        NULL| Released|   1|
|"""I Didn't Want ...|        NULL| Released|   1|
|           """Lately|        NULL| Released|   1|
|"""You Missed The...|        NULL| Released|   1|
|"Something in the...|        NULL| Released|   1|
|"S.S. ""Kinau"" L...|        NULL| Released|   1|
|"""No Matter How ...|        N

In [69]:
window_spec = Window.partitionBy('release_date').rowsBetween(Window.unboundedPreceding, 0)

running_total_df = df.select(
    'title',
    'release_date',
    'vote_count'
).withColumn(
    'running_total',
    F.sum('vote_count').over(window_spec)
).orderBy(
    'release_date'
)

running_total_df.show(5)

+----------+------------+----------+-------------+
|     title|release_date|vote_count|running_total|
+----------+------------+----------+-------------+
|    Return|        NULL|       129|        129.0|
|    Return|        NULL|       124|        253.0|
|       Box|        NULL|        44|        297.0|
|      I am|        NULL|        37|        334.0|
|kino-react|        NULL|        37|        371.0|
+----------+------------+----------+-------------+
only showing top 5 rows



In [73]:
spark.sql("""
    SELECT
        title,
        release_date,
        vote_count,
        sum(vote_count) OVER (
            ORDER BY release_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS running_total
    FROM
        movies
    ORDER BY release_date
""").show()

+--------------------+------------+----------+-------------+
|               title|release_date|vote_count|running_total|
+--------------------+------------+----------+-------------+
|              Return|        NULL|       129|        129.0|
|              Return|        NULL|       124|        253.0|
|                 Box|        NULL|        44|        297.0|
|                I am|        NULL|        37|        334.0|
|          kino-react|        NULL|        37|        371.0|
|Hey Qween - Holig...|        NULL|        33|        404.0|
|                   1|        NULL|        31|        435.0|
|                Star|        NULL|        30|        465.0|
|                 Box|        NULL|        28|        493.0|
|              Movies|        NULL|        28|        521.0|
|            S.T.A.R.|        NULL|        24|        545.0|
|Fight Club (Russi...|        NULL|        23|        568.0|
|America's Funnies...|        NULL|        21|        589.0|
|        Emerald City|  