In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, avg, count, sum, explode, split, dense_rank, desc, year, when, max, rank
from schemas import data_frames

# **Question 1**

In [3]:
#1. Find the highest rated comedy movies released after 2010 with over 10,000 votes

top_rated_films = data_frames["title_basics"] \
    .filter("titleType = 'movie' AND startYear > 2010 AND genres='Comedy'") \
    .join(data_frames["title_ratings"], data_frames["title_basics"].tconst == data_frames["title_ratings"].tconst) \
    .filter("numVotes > 10000") \
    .select("primaryTitle", "startYear", "averageRating", "numVotes","titleType") \
    .orderBy(col("averageRating").desc()) \


top_rated_films.show(20)
total_records = top_rated_films.count()
print(f"Загальна кількість записів у відповіді: {total_records}")

+--------------------+---------+-------------+--------+---------+
|        primaryTitle|startYear|averageRating|numVotes|titleType|
+--------------------+---------+-------------+--------+---------+
|     The Book of Sun|     2020|          7.9|   12006|    movie|
|      Between Family|     2017|          7.6|   27388|    movie|
| Two Lottery Tickets|     2016|          7.5|   12202|    movie|
|   We Are the Nobles|     2013|          7.3|   10996|    movie|
|                Bala|     2019|          7.3|   16715|    movie|
|           Booksmart|     2019|          7.1|  136032|    movie|
|         Çalgi Çengi|     2011|          7.0|   14026|    movie|
|         Snack Shack|     2024|          7.0|   12013|    movie|
| The Grand Seduction|     2013|          7.0|   20080|    movie|
| Serial Bad Weddings|     2014|          7.0|   46797|    movie|
|        Theater Camp|     2023|          6.9|   18429|    movie|
|                 Ted|     2012|          6.9|  681043|    movie|
|Everybody

# **Question 2**

In [4]:
# Actors/actresses with the most American films since 2012

top_actors_usa = data_frames["title_principals"] \
    .join(data_frames["title_akas"], data_frames["title_principals"].tconst == data_frames["title_akas"].titleId, "inner") \
    .filter(col("region") == "US") \
    .join(data_frames["name_basics"], data_frames["title_principals"].nconst == data_frames["name_basics"].nconst, "inner") \
    .join(data_frames["title_ratings"], data_frames["title_principals"].tconst == data_frames["title_ratings"].tconst, "inner") \
    .join(data_frames["title_basics"], data_frames["title_principals"].tconst == data_frames["title_basics"].tconst, "inner") \
    .filter(
    (col("startYear") > 2012) &
    (col("titleType") == "movie") &
    ((col("category") == "actor") | (col("category") == "actress"))
)


top_actors_usa = top_actors_usa \
    .groupBy("primaryName") \
    .agg(
        count(data_frames["title_principals"].tconst).alias("film_count"),  # Кількість фільмів
        max(data_frames["title_ratings"].averageRating).alias("best_film_rating")  # Найвищий рейтинг
    ) \
    .filter(col("film_count") > 0) \
    .orderBy(desc("film_count"))

top_actors_usa.show(20)

# Кількість записів
total_records = top_actors_usa.count()
print(f"Загальна кількість записів у відповіді: {total_records}")


+------------------+----------+----------------+
|       primaryName|film_count|best_film_rating|
+------------------+----------+----------------+
|      Eric Roberts|       327|             8.9|
|      Tom Sizemore|       125|             9.0|
|       Danny Trejo|       115|             8.8|
|   Vennela Kishore|       112|             9.0|
|    Michael Madsen|       110|             7.8|
|         Yogi Babu|       110|             9.6|
|     Ryan Reynolds|       108|             8.0|
|      Michael Paré|       101|             8.2|
|            Nassar|       101|             8.9|
|Scarlett Johansson|       100|             8.4|
|      Willem Dafoe|        95|             8.2|
|        Kj Schrock|        95|             8.2|
|              Ajay|        93|             8.8|
|         Dean Cain|        91|             7.4|
|        Simon Hill|        90|             7.3|
| Tanikella Bharani|        88|             9.1|
|   Woody Harrelson|        88|             8.1|
| Samuel L. Jackson|

# **Question 3**

In [23]:
#Cumulative number of films by year, genre and country
movies_by_year_genre_country = data_frames["title_basics"] \
    .filter("titleType = 'movie' AND startYear IS NOT NULL") \
    .join(data_frames["title_akas"], data_frames["title_basics"]["tconst"] == data_frames["title_akas"]["titleId"]) \
    .groupBy("startYear", "genres", "region") \
    .agg(F.countDistinct("tconst").alias("num_movies"))

window_spec = Window.partitionBy("region", "genres") \
    .orderBy("startYear") \
    .rowsBetween(Window.unboundedPreceding, 0)

movies_by_year_genre_country = movies_by_year_genre_country \
    .withColumn("cumulative_total", F.sum("num_movies").over(window_spec)) \
    .orderBy("region", "genres", "startYear")

movies_by_year_genre_country.show(20)
total_records = movies_by_year_genre_country.count()
print(f"Загальна кількість записів у відповіді: {total_records}")


+---------+-------------------+------+----------+----------------+
|startYear|             genres|region|num_movies|cumulative_total|
+---------+-------------------+------+----------+----------------+
|     1975|              Adult|    AD|         1|               1|
|     2016|Crime,Drama,Mystery|    AD|         1|               1|
|     2016|        Documentary|    AD|         1|               1|
|     2020|        Documentary|    AD|         1|               2|
|     2021|        Documentary|    AD|         1|               3|
|     2022|        Documentary|    AD|         1|               4|
|     2023|        Documentary|    AD|         1|               5|
|     2021|    Documentary,War|    AD|         1|               1|
|     2016|              Drama|    AD|         1|               1|
|     1996|                 \N|    AD|         1|               1|
|     1986|             Action|    AE|         1|               1|
|     2001|             Action|    AE|         1|             

# **Question 4**

In [15]:


# Top 5 highest-rated films in each genre

window_spec = Window.partitionBy("genres").orderBy(desc("averageRating"))

ranked_films = data_frames["title_basics"] \
    .join(data_frames["title_ratings"], data_frames["title_basics"].tconst == data_frames["title_ratings"].tconst, "inner") \
    .filter("titleType = 'movie' AND numVotes > 10000") \
    .withColumn("rank", rank().over(window_spec)) \
    .select("primaryTitle", "genres", "averageRating", "rank") \
    .filter(col("rank") <= 5)  # Топ-5 фільмів за рейтингом у кожному жанрі

ranked_films.show(20)
total_records = ranked_films.count()
print(f"Загальна кількість записів у відповіді: {total_records}")


+--------------------+--------------------+-------------+----+
|        primaryTitle|              genres|averageRating|rank|
+--------------------+--------------------+-------------+----+
|Once Upon a Time ...|              Action|          7.2|   1|
|   War of the Arrows|              Action|          7.1|   2|
|           Ong Bak 2|              Action|          6.2|   3|
|          Velayudham|              Action|          6.1|   4|
|            Bairavaa|              Action|          5.8|   5|
|Raiders of the Lo...|    Action,Adventure|          8.4|   1|
|Indiana Jones and...|    Action,Adventure|          8.2|   2|
|Indiana Jones and...|    Action,Adventure|          7.5|   3|
|              Batman|    Action,Adventure|          7.5|   3|
|     Shogun Assassin|    Action,Adventure|          7.3|   5|
|Attack on Titan t...|Action,Adventure,...|          9.2|   1|
|Ramayana: The Leg...|Action,Adventure,...|          9.1|   2|
|Solo Leveling: Re...|Action,Adventure,...|          8.

# **Question 5**

In [17]:


# Find the top 3 lowest rated movies in each country with > 50,000 votes
window_spec = Window.partitionBy("region").orderBy("min_rating")

top_films_by_country = data_frames["title_akas"] \
    .join(data_frames["title_ratings"], data_frames["title_akas"].titleId == data_frames["title_ratings"].tconst, "inner") \
    .filter("region IS NOT NULL AND numVotes > 50000") \
    .join(data_frames["title_basics"], data_frames["title_akas"].titleId == data_frames["title_basics"].tconst, "inner") \
    .groupBy("region", "title", "genres") \
    .agg(F.min("averageRating").alias("min_rating")) \
    .withColumn("row_number", F.row_number().over(window_spec)) \
    .filter(F.col("row_number") <= 3)

top_films_by_country.show(20)
total_records = top_films_by_country.count()
print(f"Загальна кількість записів у відповіді: {total_records}")


+------+--------------------+--------------------+----------+----------+
|region|               title|              genres|min_rating|row_number|
+------+--------------------+--------------------+----------+----------+
|    AE|             Sadak 2|        Action,Drama|       1.2|         1|
|    AE|               Radhe|     Action,Thriller|       1.8|         2|
|    AE|      Disaster Movie|       Comedy,Sci-Fi|       1.9|         3|
|    AF|        Czarna lista| Crime,Drama,Mystery|       7.9|         1|
|    AL|Pesëdhjetë hijet ...|Drama,Romance,Thr...|       4.2|         1|
|    AL| Legjenda e Herkulit|Action,Adventure,...|       4.3|         2|
|    AL|Pesëdhjetë hije t...|Drama,Mystery,Rom...|       4.5|         3|
|    AM|Hing gisher Fredi...|Horror,Mystery,Th...|       5.4|         1|
|    AM|         vat'sunhing|Action,Adventure,...|       5.4|         2|
|    AM|   Asasini davananqe|Action,Adventure,...|       5.6|         3|
|    AR|  La reina Cleopatra|Documentary,Drama...| 

# **Question 6**

In [3]:


# Find the top directors by the number of films in the "Action" genre with the highest rating
top_action_directors = data_frames["title_crew"] \
    .join(data_frames["title_ratings"], data_frames["title_crew"].tconst == data_frames["title_ratings"].tconst, "inner") \
    .join(data_frames["title_basics"], data_frames["title_crew"].tconst == data_frames["title_basics"].tconst, "inner") \
    .join(data_frames["name_basics"], data_frames["title_crew"].directors == data_frames["name_basics"].nconst, "inner") \
    .filter("directors IS NOT NULL AND genres LIKE '%Action%'") \
    .groupBy("primaryName") \
    .agg(
        F.count(data_frames["title_crew"].tconst).alias("num_movies"),
        F.max(data_frames["title_ratings"].averageRating).alias("max_film_rating")
    ) \
    .orderBy(F.desc("num_movies")) \


top_action_directors.show(20)
total_records = top_action_directors.count()
print(f"Загальна кількість записів у відповіді: {total_records}")


+--------------------+----------+---------------+
|         primaryName|num_movies|max_film_rating|
+--------------------+----------+---------------+
|   Luke Lerdwichagul|       789|            9.6|
| Yasuichirô Yamamoto|       482|            9.2|
|      Daisuke Nishio|       469|            9.4|
|          Kevin Dunn|       372|            9.1|
|Paulo Viníccius S...|       311|           10.0|
|         Hayato Date|       297|            9.6|
|      Iginio Straffi|       241|            9.4|
|         James Rolfe|       238|            9.1|
|            Onur Tan|       234|            9.7|
|     Kôichi Sakamoto|       220|            9.6|
|            Dan Riba|       215|            9.7|
|        Ryuta Tasaki|       200|            9.7|
|      Takashi Watabe|       188|            8.9|
|       Hatsuki Tsuji|       185|            9.1|
|         Masato Satô|       173|            9.6|
|  James Whitmore Jr.|       172|            9.1|
|       Ben B. Singer|       169|            9.6|


In [None]:

top_rated_films.write.option("header", "true").csv("/content/top_rated_films.csv")

top_actors_usa.write.option("header", "true").csv("/content/top_actors_usa.csv")

movies_by_year_genre_country.write.option("header", "true").csv("/content/movies_by_year_genre_country.csv")

ranked_films.write.option("header", "true").csv("/content/ranked_films.csv")

top_films_by_country.write.option("header", "true").csv("/content/top_films_by_country.csv")

top_action_directors.write.option("header", "true").csv("/content/top_action_directors.csv")