# Imports

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from schemas import data_frames

# Question 1
> All Ukrainian movies released after 1990 with an IMDb rating above 5.0

In [None]:
window_spec = Window.orderBy(F.col("startYear").desc())
ukrainian_movies = data_frames["title_akas"] \
    .filter("language = 'uk'") \
    .join(data_frames["title_basics"], data_frames["title_akas"].titleId == data_frames["title_basics"].tconst) \
    .join(data_frames["title_ratings"], data_frames["title_basics"].tconst == data_frames["title_ratings"].tconst) \
    .filter("startYear > 1990 AND averageRating > 5.0") \
    .select("primaryTitle", "startYear", "averageRating", "language")

ordered_movies = ukrainian_movies.orderBy(F.col("startYear").desc())
ordered_movies.show()

total_records = ordered_movies.count()
print(f"Загальна кількість записів у відповіді: {total_records}")


+--------------------+---------+-------------+--------+
|        primaryTitle|startYear|averageRating|language|
+--------------------+---------+-------------+--------+
|     The Glassworker|     2024|          7.4|      uk|
|            I, Robot|     2019|          6.3|      uk|
|Yobi, the Five Ta...|     2007|          6.7|      uk|
|   War of the Worlds|     2005|          6.5|      uk|
| With Fire and Sword|     1999|          7.0|      uk|
|       The Lion King|     1994|          8.5|      uk|
|     Milenkiy ty moy|     1992|          6.2|      uk|
|     Milenkiy ty moy|     1992|          6.2|      uk|
|     Hopeless Wombat|     1991|          8.0|      uk|
|        Myna Mazaylo|     1991|          7.0|      uk|
|Chelovek v zelyon...|     1991|          5.7|      uk|
|              Eneida|     1991|          8.0|      uk|
|    Karpatske zoloto|     1991|          6.1|      uk|
|Doraemon: Nobita'...|     1991|          7.3|      uk|
|    Karpatske zoloto|     1991|          6.1|  

# Question 2
> Top 20 directors with the highest average movie ratings for films with at least 1000 votes

In [None]:
window_spec = Window.orderBy(F.desc("avgRating"))
directors_ratings = data_frames["title_crew"] \
    .join(data_frames["title_ratings"], data_frames["title_crew"].tconst == data_frames["title_ratings"].tconst) \
    .filter("numVotes >= 1000") \
    .join(
        data_frames["name_basics"].alias("directors"),
        data_frames["title_crew"].directors == F.col("directors.nconst")
    ) \
    .groupBy("directors.primaryName") \
    .agg(F.avg("averageRating").alias("avgRating"))

ranked_directors = directors_ratings.withColumn("rank", F.row_number().over(window_spec))

top_20_directors = ranked_directors.filter(F.col("rank") <= 20).drop("rank")

top_20_directors.show()

total_records = top_20_directors.count()
print(f"Загальна кількість записів у відповіді: {total_records}")


+--------------------+-----------------+
|         primaryName|        avgRating|
+--------------------+-----------------+
|            Anh Phan|9.899999618530273|
|Garnepudi Radha K...|9.800000190734863|
|          Tyler Bell|9.800000190734863|
|        Bahri Baykal|9.800000190734863|
|    Tatsuya Ishihara|9.800000190734863|
|     Serkan Ipekören|9.783333460489908|
|       Cevdet Mercan|9.718181899099639|
|   Vinod Kumar Singh|9.699999809265137|
|    Slaheddine Essid|9.699999809265137|
|      Masato Mitsuka|9.699999809265137|
|      Gregory Prange|9.699999809265137|
|     István Zorkóczy|9.699999809265137|
|       Serdar Dönmez|9.699999809265137|
|      George Mastras|9.699999809265137|
|     Richard Jeffery|9.650000095367432|
|    Chizuru Miyawaki|9.614285741533552|
|           Joe Brumm|9.600000381469727|
|    Yasuto Nishikata|9.600000381469727|
|Konrad Tomaszkiewicz|9.600000381469727|
|         Cansu Arica|9.600000381469727|
+--------------------+-----------------+

Загальна кількі

# Question 3
> TV series with the most consistent ratings across all episodes

In [None]:
tv_series_ratings = data_frames["title_episode"] \
    .join(data_frames["title_ratings"], "tconst") \
    .join(
        data_frames["title_basics"],
        data_frames["title_episode"].parentTconst == data_frames["title_basics"].tconst
    ) \
    .groupBy("parentTconst", "primaryTitle") \
    .agg(
        F.stddev("averageRating").alias("rating_stddev"),
        F.count(data_frames["title_episode"].tconst).alias("num_episodes")
    ) \
    .filter(F.col("num_episodes") >= 5) \
    .orderBy(F.desc("rating_stddev")) \
    .select("primaryTitle", "rating_stddev", "num_episodes")

tv_series_ratings.limit(20).show(truncate=False)

total_records = tv_series_ratings.count()

print(f"Загальна кількість записів у відповіді: {total_records}")



+-------------------------------------+------------------+------------+
|primaryTitle                         |rating_stddev     |num_episodes|
+-------------------------------------+------------------+------------+
|Here's Hollywood                     |4.280420430887691 |5           |
|The Doctors                          |4.166190448976482 |8           |
|Loose Women                          |3.7118729279488747|5           |
|Question Time                        |3.7071550177120933|5           |
|Dr. Sommerfeld - Neues vom Bülowbogen|3.7064358803258775|6           |
|The Real                             |3.663624388442638 |8           |
|This Morning                         |3.620082855843896 |5           |
|Come Dine with Me                    |3.559026160304422 |10          |
|Movie Magic                          |3.498571262298427 |5           |
|Tavis Smiley                         |3.4981343714789395|9           |
|Larry King Now                       |3.462369150932518 |5     

# Question 4
> Top most popular genres by decade based on average rating and number of votes

In [None]:
data_frames["title_basics"] = data_frames["title_basics"] \
    .withColumn("decade", (F.col("startYear") / 10).cast("int") * 10)

genre_by_decade = data_frames["title_basics"] \
    .join(data_frames["title_ratings"], data_frames["title_basics"].tconst == data_frames["title_ratings"].tconst) \
    .withColumn("genre", F.split(data_frames["title_basics"].genres, ",")[0]) \
    .groupBy("decade", "genre") \
    .agg(F.avg("averageRating").alias("avgRating"), F.sum("numVotes").alias("totalVotes")) \
    .orderBy(F.desc("totalVotes"))

genre_by_decade.limit(20).show()

total_records = genre_by_decade.count()

print(f"Загальна кількість записів у відповіді: {total_records}")


+------+---------+------------------+----------+
|decade|    genre|         avgRating|totalVotes|
+------+---------+------------------+----------+
|  2010|   Action| 7.165545971000893| 210414789|
|  2000|   Action|7.0290417243177465| 116083763|
|  2010|    Drama|7.0493540098362475|  95420853|
|  2010|   Comedy| 7.080939846052681|  93826909|
|  2000|   Comedy| 6.894717393612873|  85694066|
|  2020|   Action| 7.291785928923785|  81888897|
|  2000|    Drama| 6.898112578915886|  60235951|
|  2010|    Crime| 7.269374991395223|  52720751|
|  2000|Adventure|7.0315394857283495|  48919270|
|  1990|   Action|  6.76407315327134|  48798073|
|  2010|Adventure| 7.180359728240258|  48500923|
|  1990|   Comedy| 6.943198078509985|  44559235|
|  2000|    Crime| 7.157674599078984|  39146299|
|  2020|    Drama| 7.264382981095291|  37594778|
|  1990|    Drama| 6.716693814177882|  34507288|
|  2020|   Comedy| 7.083187245543777|  33743850|
|  2010|Biography| 7.291644996932243|  33609145|
|  1990|    Crime| 6

# Question 5
> Actors who have played the most diverse roles (appeared in the most different genres)

In [None]:
actor_genre_diversity = data_frames["title_principals"] \
    .join(data_frames["title_basics"], data_frames["title_principals"].tconst == data_frames["title_basics"].tconst) \
    .withColumn("genre", F.split(data_frames["title_basics"].genres, ",")) \
    .select("nconst", "genre") \
    .withColumn("genre_count", F.size("genre")) \
    .groupBy("nconst") \
    .agg(F.sum("genre_count").alias("totalGenres")) \
    .orderBy(F.desc("totalGenres"))

actor_genre_diversity = actor_genre_diversity \
    .join(data_frames["name_basics"], actor_genre_diversity.nconst == data_frames["name_basics"].nconst) \
    .select("primaryName", "totalGenres")

actor_genre_diversity = actor_genre_diversity.repartition(4)

actor_genre_diversity.limit(20).show()

total_records = actor_genre_diversity.count()
print(f"Загальна кількість записів у відповіді: {total_records}")


+--------------------+-----------+
|         primaryName|totalGenres|
+--------------------+-----------+
|          Bree Janes|          2|
|       Antony Miller|          1|
|          Aldo Duque|         41|
|         H. Rosinski|          2|
|         Kevin Kelly|          2|
|       Karine Dufour|          5|
|          Kamil Onte|          2|
|Chatchawit Techar...|        424|
|     Colin McInerney|          2|
|        Jospeh Cerda|          1|
|Narashingh Mohapatra|          1|
|               Carly|          3|
|       Mark Lawrence|         10|
|       Fabian Prager|          5|
|       Nevin Martell|          1|
|    Francesca Cullen|          2|
|       Armand Holmes|          4|
|     Karl-Heinz Lotz|         25|
|   Yvan Stefanovitch|          5|
|           Adim Duke|          2|
+--------------------+-----------+

Загальна кількість записів у відповіді: 6638383


# Question 6
> All movies that have different titles in at least 2 languages and have high ratings

In [None]:
multi_language_movies = data_frames["title_akas"] \
    .join(data_frames["title_ratings"], data_frames["title_akas"].titleId == data_frames["title_ratings"].tconst) \
    .join(data_frames["title_basics"], data_frames["title_akas"].titleId == data_frames["title_basics"].tconst) \
    .groupBy("titleId", "primaryTitle") \
    .agg(
        F.countDistinct("language").alias("distinctLanguages"),
        F.avg("averageRating").alias("avgRating")
    ) \
    .filter("distinctLanguages >= 2 AND avgRating > 7.0") \
    .select("primaryTitle", "distinctLanguages", "avgRating")
multi_language_movies.limit(20).show()

total_records = multi_language_movies.count()

print(f"Загальна кількість записів у відповіді: {total_records}")

+--------------------+-----------------+-----------------+
|        primaryTitle|distinctLanguages|        avgRating|
+--------------------+-----------------+-----------------+
| Salaviinanpolttajat|                3|              8.5|
|        Max virtuose|                3|7.699999809265137|
|            Miraklet|                2|7.300000190734863|
| Gertie the Dinosaur|                2|7.099999904632568|
|             The Spy|                2|              7.5|
|    Romeo and Juliet|                2|7.199999809265137|
|      The Gray Ghost|                2|              8.0|
| In Pursuit of Polly|                2|7.699999809265137|
|     Broken Blossoms|                9|7.199999809265137|
|   The Cinema Murder|                2|8.100000381469727|
|       The Dark Star|                2|7.900000095367432|
|    His Bridal Night|                2|7.599999904632568|
|            Theodora|                2|7.400000095367432|
|       The Two Girls|                3|7.30000019073486

# Save

In [None]:
ukrainian_movies.write.option("header", "true").csv("data/results/ukrainian_movies.csv")
directors_ratings.write.option("header", "true").csv("data/results/top_20_directors.csv")
tv_series_ratings.write.option("header", "true").csv("data/results/tv_series_consistent_ratings.csv")
genre_by_decade.write.option("header", "true").csv("data/results/popular_genres_by_decade.csv")
actor_genre_diversity.write.option("header", "true").csv("data/results/actors_diverse_roles.csv")
multi_language_movies.write.option("header", "true").csv("data/results/multi_language_movies.csv")
