In [None]:
from schemas import data_frames

Which years were the most successful for dramas over 120 minutes with ratings over 8.0?

In [5]:
# Search for dramas sorted by descending rating + grouped by year
filtered_dramas = data_frames["title_basics"] \
    .join(data_frames["title_ratings"], "tconst") \
    .filter(
        (col("genres").contains("Drama")) &
        (col("runtimeMinutes") > 120) &
        (col("averageRating") > 8.0)
    ) \
    .select("primaryTitle", "startYear", "runtimeMinutes", "averageRating", "genres")

# Grouping by year and calculating the average drama rating
grouped_dramas = filtered_dramas.groupBy("startYear") \
    .agg(avg("averageRating").alias("avg_rating")) \
    .orderBy(col("avg_rating").desc())

grouped_dramas.show(10, truncate=False)

+---------+-----------------+
|startYear|avg_rating       |
+---------+-----------------+
|NULL     |9.25             |
|2015     |8.912766040639674|
|1942     |8.899999618530273|
|2016     |8.880952444530669|
|2023     |8.81970442222257 |
|2025     |8.7629629417702  |
|2024     |8.724218767136335|
|2021     |8.68999998304579 |
|2014     |8.687951765864728|
|1968     |8.676923164954552|
+---------+-----------------+
only showing top 10 rows



Find actors and actresses who played the main role (category = 'actor'/'actress') in movies with a rating > 9.0

In [None]:
high_rated_actors = data_frames["title_principals"] \
    .filter(col("category").isin("actor", "actress")) \
    .join(data_frames["title_ratings"], "tconst") \
    .filter(col("averageRating") > 9.0) \
    .join(data_frames["name_basics"], "nconst") \
    .join(data_frames["title_basics"], "tconst") \
    .select("primaryName", "primaryTitle", "category", "averageRating")

high_rated_actors.show(10, truncate=False)

+---------------+------------+--------+-------------+
|primaryName    |primaryTitle|category|averageRating|
+---------------+------------+--------+-------------+
|Frank Faylen   |Bombalera   |actor   |9.2          |
|Mikhail Rasumny|Bombalera   |actor   |9.2          |
|Muriel Tovar   |Bombalera   |actress |9.2          |
|Johnny Johnston|Bombalera   |actor   |9.2          |
|Billy Daniel   |Bombalera   |actor   |9.2          |
|Chatita Tovar  |Bombalera   |actress |9.2          |
|Lollita Tovar  |Bombalera   |actress |9.2          |
|Tana Tovar     |Bombalera   |actor   |9.2          |
|Olga San Juan  |Bombalera   |actress |9.2          |
|Conrad Tovar   |Bombalera   |actor   |9.2          |
+---------------+------------+--------+-------------+
only showing top 10 rows



Медіанна тривалість фільмів за жанром. (group)

In [None]:
from pyspark.sql.functions import col, split, explode, count, row_number
from pyspark.sql.window import Window

# Select films with a certain duration
movies = data_frames["title_basics"].filter(
    (col("titleType") == "movie") &
    (col("runtimeMinutes").isNotNull()) &
    (col("genres").isNotNull())
)

# Break genres into separate lines
movies_exploded = movies.withColumn("genre", explode(split(col("genres"), ",")))

# Creating a window for calculating the median
window_spec = Window.partitionBy("genre").orderBy("runtimeMinutes")

# Rank movies in each genre
ranked_movies = movies_exploded.withColumn("row_num", row_number().over(window_spec))

# Find the number of movies in each genre
genre_counts = ranked_movies.groupBy("genre").agg(count("*").alias("total"))

# Combining to calculate the median
movies_with_counts = ranked_movies.join(genre_counts, on="genre")

# Select the rows corresponding to the median position
median_movies = movies_with_counts.filter(
    (col("row_num") == (col("total") / 2).cast("int")) |
    (col("row_num") == ((col("total") + 1) / 2).cast("int"))
)

# Group to find the average of the two central values ​​(if there are two)
median_result = median_movies.groupBy("genre").agg(avg("runtimeMinutes").alias("median_runtime"))


median_result.orderBy("median_runtime", ascending=False).show(30, truncate=False)

+-----------+--------------+
|genre      |median_runtime|
+-----------+--------------+
|Romance    |95.0          |
|Action     |94.0          |
|Drama      |93.0          |
|Musical    |93.0          |
|Thriller   |93.0          |
|Mystery    |92.0          |
|Crime      |92.0          |
|War        |92.0          |
|Comedy     |91.0          |
|Adventure  |90.0          |
|Family     |90.0          |
|Fantasy    |90.0          |
|History    |90.0          |
|Reality-TV |90.0          |
|Sci-Fi     |90.0          |
|Sport      |90.0          |
|Biography  |88.0          |
|Horror     |88.0          |
|Music      |87.0          |
|Film-Noir  |82.0          |
|Animation  |81.0          |
|\N         |79.0          |
|Adult      |77.0          |
|Talk-Show  |76.5          |
|News       |75.0          |
|Documentary|74.0          |
|Western    |70.0          |
|Game-Show  |55.0          |
+-----------+--------------+



Найчастіші 20 пар жанрів за весь час (group)

In [None]:
# Select films with valid genres
two_genres_combinations = data_frames["title_basics"].filter(
    (col("titleType") == "movie") &
    (col("genres").isNotNull()) &
    (col("genres") != "\\N")
)

# Convert the genre string to an array, sort, merge back into a string
two_genres = two_genres_combinations.withColumn(
    "genres_array", split(col("genres"), ",")
).filter(
    size(col("genres_array")) == 2  # тільки комбінації з 2 жанрів
).withColumn(
    "sorted_genres", sort_array(col("genres_array"))
).withColumn(
    "genre_combo", concat_ws(",", col("sorted_genres"))
)

# Group by genre
top_two_genre_combinations = two_genres.groupBy("genre_combo") \
    .agg(count("*").alias("count")) \
    .orderBy(col("count").desc())


top_two_genre_combinations.show(20, truncate=False)


+---------------------+-----+
|genre_combo          |count|
+---------------------+-----+
|Comedy,Drama         |14269|
|Drama,Romance        |13820|
|Comedy,Romance       |7792 |
|Crime,Drama          |6178 |
|Drama,Thriller       |5069 |
|Action,Drama         |4509 |
|Horror,Thriller      |4159 |
|Documentary,Music    |3738 |
|Biography,Documentary|3582 |
|Drama,War            |2816 |
|Drama,Family         |2711 |
|Comedy,Horror        |2435 |
|Documentary,History  |2209 |
|Documentary,Drama    |2189 |
|Action,Thriller      |2187 |
|Action,Comedy        |2086 |
|Documentary,Sport    |1890 |
|Drama,History        |1827 |
|Comedy,Musical       |1701 |
|Action,Crime         |1667 |
+---------------------+-----+
only showing top 20 rows



Для кожного режисера — знайти його фільм з найвищим рейтингом серед тих, що мають понад 10,000 голосів

In [None]:
director_movies = data_frames["title_basics"] \
    .join(data_frames["title_ratings"], "tconst") \
    .join(data_frames["title_crew"], "tconst") \
    .filter(col("numVotes") > 10000) \
    .filter(col("directors").isNotNull()) \
    .select("tconst", "primaryTitle", "averageRating", "numVotes", "directors")

# Window by director, sorted by rating
window_spec = Window.partitionBy("directors").orderBy(col("averageRating").desc())

top_director_movies = director_movies \
    .withColumn("rank", rank().over(window_spec)) \
    .filter(col("rank") == 1) \
    .select("primaryTitle", "averageRating", "numVotes")

top_director_movies.show(10, truncate=False)

+--------------------------------------------------+-------------+--------+
|primaryTitle                                      |averageRating|numVotes|
+--------------------------------------------------+-------------+--------+
|Red Dead Redemption II                            |9.7          |65156   |
|Clash! Shanks vs. Eustass Kid                     |9.7          |26977   |
|Kejime o Tsukeru: Shirohige vs Kurohige Kaizokudan|9.7          |49289   |
|Scenes from a Marriage                            |8.5          |17557   |
|One-Eyed Jacks                                    |7.1          |14193   |
|Nights of Cabiria                                 |8.1          |54745   |
|Rear Window                                       |8.5          |544069  |
|Psycho                                            |8.5          |749457  |
|Sherlock Jr.                                      |8.2          |61358   |
|Hello, Dolly!                                     |7.0          |17687   |
+-----------

Тренд популярності фільмів по країнам — як змінювався середній рейтинг у США, Франції та Японії по роках

In [None]:
# Combining movies with titles and ratings
titles_with_region = data_frames["title_akas"] \
    .filter(col("region").isin("US", "FR", "JP")) \
    .join(data_frames["title_ratings"], data_frames["title_akas"].titleId == data_frames["title_ratings"].tconst) \
    .join(data_frames["title_basics"], data_frames["title_akas"].titleId == data_frames["title_basics"].tconst) \
    .filter(col("startYear").isNotNull())

# Window by country and year — average rating
window_spec = Window.partitionBy("region", "startYear")

avg_rating_by_year_country = titles_with_region \
    .withColumn("avg_rating", avg("averageRating").over(window_spec)) \
    .select("region", "startYear", "primaryTitle", "averageRating", "avg_rating") \
    .distinct() \
    .orderBy("region", "startYear")

avg_rating_by_year_country.show(20, truncate=False)


+------+---------+--------------------+-------------+-----------------+
|region|startYear|primaryTitle        |averageRating|avg_rating       |
+------+---------+--------------------+-------------+-----------------+
|FR    |1874     |Passage de Venus    |6.8          |6.800000190734863|
|FR    |1878     |Le singe musicien   |6.1          |5.258064500747189|
|FR    |1878     |Dzing. Boom. Boom!  |5.2          |5.258064500747189|
|FR    |1878     |L'Aquarium          |5.6          |5.258064500747189|
|FR    |1878     |Le Repas des Poulets|5.2          |5.258064500747189|
|FR    |1878     |Le Moulin à Eau     |5.2          |5.258064500747189|
|FR    |1878     |Le Déjeuner de Bébé |4.9          |5.258064500747189|
|FR    |1878     |La Nageuse          |5.1          |5.258064500747189|
|FR    |1878     |La Balançoire       |5.3          |5.258064500747189|
|FR    |1878     |Les Deux Espiègles  |5.3          |5.258064500747189|
|FR    |1878     |Le Jeu de Grâces    |5.2          |5.258064500