# Imports

In [10]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from schemas import data_frames

# Question 1


> The most successful actor-director collaborations based on average ratings

In [3]:
actor_director_collab = (
    data_frames["title_principals"]
    .join(
        data_frames["title_crew"],
        data_frames["title_principals"].tconst == data_frames["title_crew"].tconst
    )
    .join(
        data_frames["title_ratings"],
        data_frames["title_principals"].tconst == data_frames["title_ratings"].tconst
    )
    .join(
        data_frames["name_basics"].alias("actors"),
        data_frames["title_principals"].nconst == F.col("actors.nconst")
    )
    .join(
        data_frames["name_basics"].alias("directors"),
        data_frames["title_crew"].directors == F.col("directors.nconst")
    )
    .filter("category = 'actor'")
    .groupBy("actors.primaryName", "directors.primaryName")
    .agg(
        F.avg("averageRating").alias("avgRating"),
        F.count(data_frames["title_principals"].tconst).alias("movieCount")
    )
    .orderBy(F.desc("avgRating"))
    .select(
        F.col("actors.primaryName").alias("actor"),
        F.col("directors.primaryName").alias("director"),
        "avgRating",
        "movieCount"
    )
)

actor_director_collab.limit(20).show()

total_records = actor_director_collab.count()
print(f"Total records: {total_records}")

+--------------------+-------------------+---------+----------+
|               actor|           director|avgRating|movieCount|
+--------------------+-------------------+---------+----------+
|         Adam Keller|      Elena Kritter|     10.0|         1|
|       Adrian Maeers|         Joe Martin|     10.0|         1|
|       Adam Chisnall|      Sam Delfavero|     10.0|         1|
|           Alec Zais|   Darius Ratchford|     10.0|         1|
|         Ajay Sharma|        Irfan Shekh|     10.0|         1|
|        Adrian Quinn|      Shannon Crome|     10.0|         1|
|        Aaron Joshua|     David Anderson|     10.0|         2|
|    Aaliyah Sullivan|    Lauren Anderson|     10.0|         1|
|       Aaron Maguire|      Christian Kim|     10.0|         1|
|Adrianna 'Caz' Br...|      Katie O'Grady|     10.0|         1|
|      Adam Norlander|    Arvid Magnusson|     10.0|         1|
|        Aaron Isaacs|    Kristina Harris|     10.0|         1|
|      Aaron Peterson|         Blake Bar

# Question 2

> Average runtime of movies by previous decade and genre

In [4]:
data_frames["title_basics"] = data_frames["title_basics"].withColumn(
    "decade",
    (F.floor(F.col("startYear") / 10) * 10).cast("integer")
)

window_spec = Window.partitionBy("decade", "genre").orderBy(F.desc("avgRuntime"))

runtime_by_decade_genre = (
    data_frames["title_basics"]
    .join(data_frames["title_ratings"], "tconst")
    .withColumn("genre", F.split(F.col("genres"), ",")[0])
    .filter(F.col("runtimeMinutes").isNotNull())
    .filter(F.col("genres").isNotNull())
    .filter(F.col("decade").isNotNull())
    .groupBy("decade", "genre")
    .agg(F.avg("runtimeMinutes").alias("avgRuntime"))
)

ranked_runtime = runtime_by_decade_genre.withColumn("rank", F.row_number().over(window_spec))
ranked_runtime_filtered = ranked_runtime.filter(F.col("rank") <= 20)

ranked_runtime_filtered = ranked_runtime_filtered.orderBy(F.desc("decade"))
ranked_runtime_filtered.show()

total_records = ranked_runtime_filtered.count()
print(f"Total records: {total_records}")

+------+-----------+------------------+----+
|decade|      genre|        avgRuntime|rank|
+------+-----------+------------------+----+
|  2020|     Action|57.063425063041336|   1|
|  2020|     Sci-Fi| 45.30483271375465|   1|
|  2020|      Adult|111.11711711711712|   1|
|  2020|  Adventure|55.345279486493716|   1|
|  2020|  Animation| 23.32125874125874|   1|
|  2020|  Biography| 71.15062862669245|   1|
|  2020|     Comedy| 52.27763157894737|   1|
|  2020|      Crime| 59.88291307991163|   1|
|  2020|Documentary| 66.65007818967896|   1|
|  2020|      Drama| 63.38263827642686|   1|
|  2020|     Family| 72.58669833729216|   1|
|  2020|    Fantasy| 49.90248390064397|   1|
|  2020|  Game-Show| 56.57914292791677|   1|
|  2020|    History|54.423857868020306|   1|
|  2020|     Horror|58.743200150065654|   1|
|  2020|      Music| 66.68802902055623|   1|
|  2020|    Musical| 75.06481481481481|   1|
|  2020|    Mystery| 62.05336426914153|   1|
|  2020|       News| 59.08650519031142|   1|
|  2020| R

# Question 3

> Which countries produce the most internationally recognized films (films with titles in multiple languages)

In [5]:
multi_language_films = (
    data_frames["title_akas"]
    .join(
        data_frames["title_basics"],
        data_frames["title_akas"].titleId == data_frames["title_basics"].tconst
    )
    .groupBy("titleId", "region")
    .agg(F.countDistinct("language").alias("distinctLanguages"))
    .filter("distinctLanguages > 1")
    .join(
        data_frames["title_basics"],
        F.col("titleId") == data_frames["title_basics"].tconst
    )
    .groupBy("region")
    .agg(F.countDistinct("titleId").alias("filmCount"))
    .orderBy(F.desc("filmCount"))
)

multi_language_films.limit(20).show()
total_records = multi_language_films.count()

print(f"Total records: {total_records}")

+------+---------+
|region|filmCount|
+------+---------+
|    CA|    42821|
|    IN|    33185|
|    JP|    25096|
|    US|    12719|
|    FI|     9099|
|    HK|     7506|
|    ES|     7399|
|    BE|     6553|
|    AU|     2267|
|    CN|     1966|
|    PH|     1679|
|  SUHH|     1646|
|   XWW|     1398|
|    GB|     1364|
|    IR|     1169|
|    TR|     1121|
|   XYU|     1030|
|    KZ|     1018|
|    CH|      994|
|    IL|      822|
+------+---------+

Total records: 86


# Question 4

> For TV series that lasted at least 5 seasons, how did ratings trend throughout their run?

In [6]:
tv_series_5_seasons = data_frames["title_episode"] \
    .join(data_frames["title_ratings"], data_frames["title_episode"].tconst == data_frames["title_ratings"].tconst) \
    .groupBy("parentTconst") \
    .agg(F.countDistinct("seasonNumber").alias("seasons")) \
    .filter("seasons >= 5")

tv_ratings_trend = data_frames["title_episode"] \
    .join(data_frames["title_ratings"], data_frames["title_episode"].tconst == data_frames["title_ratings"].tconst) \
    .filter(data_frames["title_episode"].parentTconst.isin([row.parentTconst for row in tv_series_5_seasons.collect()])) \
    .join(data_frames["title_basics"], data_frames["title_episode"].parentTconst == data_frames["title_basics"].tconst) \
    .filter(F.col("primaryTitle").isNotNull()) \
    .filter(F.col("seasonNumber").isNotNull()) \
    .filter(F.col("averageRating").isNotNull()) \
    .groupBy("primaryTitle", "seasonNumber") \
    .agg(F.avg("averageRating").alias("avgRating")) \
    .orderBy("primaryTitle", "seasonNumber")

tv_ratings_trend.limit(20).show(truncate=False)

total_records = tv_ratings_trend.count()
print(f"Total records: {total_records}")

unique_series_count = tv_series_5_seasons.count()
print(f"Amount of unique TV shows with more than 5 seasons: {unique_series_count}")

+------------+------------+------------------+
|primaryTitle|seasonNumber|avgRating         |
+------------+------------+------------------+
|#LikeMe     |1           |8.161538637601412 |
|#LikeMe     |2           |8.092307567596436 |
|#LikeMe     |3           |7.923076959756704 |
|#LikeMe     |4           |8.353846256549541 |
|#LikeMe     |5           |5.900000095367432 |
|'Allo 'Allo!|1           |8.074999988079071 |
|'Allo 'Allo!|2           |8.157142911638532 |
|'Allo 'Allo!|3           |8.200000127156576 |
|'Allo 'Allo!|4           |7.9666666984558105|
|'Allo 'Allo!|5           |7.915384677740244 |
|'Allo 'Allo!|6           |7.8125            |
|'Allo 'Allo!|7           |7.790000009536743 |
|'Allo 'Allo!|8           |7.699999988079071 |
|'Allo 'Allo!|9           |7.766666650772095 |
|07 zglos sie|1           |7.524999976158142 |
|07 zglos sie|2           |7.480000019073486 |
|07 zglos sie|3           |7.759999942779541 |
|07 zglos sie|4           |7.549999952316284 |
|07 zglos sie

# Question 5

> Which genres have the most balanced gender representation in leading roles?

In [7]:
window_spec = Window.orderBy(F.col("genderBalance").asc())

gender_balance_genre = data_frames["title_principals"] \
    .join(data_frames["title_basics"], "tconst") \
    .filter(F.col("ordering") == 1) \
    .filter(F.col("category").isin("actor", "actress")) \
    .withColumn("genre", F.explode(F.split(F.col("genres"), ","))) \
    .withColumn("genre", F.trim(F.col("genre"))) \
    .filter(F.col("genre") != "") \
    .groupBy("genre") \
    .agg(
        F.sum(F.when(F.col("category") == "actor", 1).otherwise(0)).alias("numActors"),
        F.sum(F.when(F.col("category") == "actress", 1).otherwise(0)).alias("numActresses")
    ) \
    .withColumn("genderBalance", F.abs(F.col("numActors") - F.col("numActresses")))

ranked_gender_balance_genre = gender_balance_genre.withColumn("rank", F.row_number().over(window_spec))

top_20_gender_balance_genre = ranked_gender_balance_genre.filter(F.col("rank") <= 20).drop("rank")
top_20_gender_balance_genre.show()

total_records = top_20_gender_balance_genre.count()
print(f"Total records: {total_records}")

+----------+---------+------------+-------------+
|     genre|numActors|numActresses|genderBalance|
+----------+---------+------------+-------------+
| Film-Noir|      651|         212|          439|
|      News|    26715|       19319|         7396|
| Game-Show|    19468|       10535|         8933|
|   Musical|    34095|       23300|        10795|
|       War|    20737|        6575|        14162|
| Biography|    35722|       15651|        20071|
|   Western|    24545|        2905|        21640|
|        \N|   121574|       99437|        22137|
| Talk-Show|    46662|       22410|        24252|
|     Sport|    41247|       11271|        29976|
|     Music|    67734|       37721|        30013|
|Reality-TV|    64427|       32568|        31859|
|  Thriller|    99936|       67058|        32878|
|   History|    61157|       28165|        32992|
|    Horror|   100419|       66913|        33506|
|    Sci-Fi|    64893|       28923|        35970|
|   Mystery|   112821|       70575|        42246|


# Question 6

> What is the correlation between a movie's runtime and its IMDb rating across different genres?

In [8]:
runtime_rating_correlation = data_frames["title_basics"] \
    .join(data_frames["title_ratings"], data_frames["title_basics"].tconst == data_frames["title_ratings"].tconst) \
    .withColumn("genre", F.split(data_frames["title_basics"].genres, ",")[0]) \
    .filter(F.col("runtimeMinutes").isNotNull()) \
    .filter(F.col("averageRating").isNotNull()) \
    .filter(F.col("genre").isNotNull()) \
    .groupBy("genre") \
    .agg(F.corr("runtimeMinutes", "averageRating").alias("correlation")) \
    .orderBy(F.col("correlation"), ascending=False)

runtime_rating_correlation.limit(20).show()

total_records = runtime_rating_correlation.count()
print(f"Total records: {total_records}")

+-----------+--------------------+
|      genre|         correlation|
+-----------+--------------------+
|  Film-Noir| 0.49792428224707524|
|      Adult| 0.18126187018895804|
|      Short| 0.08786138465682988|
|      Sport|  0.0685325456673089|
|      Music| 0.05196345317888905|
|        War|0.048513242386614225|
|    Musical| 0.02523974723010198|
|Documentary|0.011689578979170062|
|         \N|0.009696633072796309|
|  Animation|0.006909303596712093|
|     Comedy|2.702744954132916...|
| Reality-TV|-0.03589961809539111|
|    History|-0.07364957043513691|
|  Biography|-0.08196676267851066|
|   Thriller|-0.08664473665950595|
|  Talk-Show|-0.09157749468273618|
|  Adventure| -0.0946177477323889|
|       News|-0.13014709781339057|
|     Family| -0.1387504520684432|
|  Game-Show|-0.15363750078833976|
+-----------+--------------------+

Total records: 29


# Save

In [9]:
actor_director_collab.write.option("header", "true").csv("data/results/actor_director_collab.csv")
runtime_by_decade_genre.write.option("header", "true").csv("data/results/runtime_by_decade_genre.csv")
multi_language_films.write.option("header", "true").csv("data/results/multi_language_films.csv")
tv_ratings_trend.write.option("header", "true").csv("data/results/tv_ratings_trend.csv")
gender_balance_genre.write.option("header", "true").csv("data/results/gender_balance_genre.csv")
runtime_rating_correlation.write.option("header", "true").csv("data/results/runtime_rating_correlation.csv")