In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Obtain the Spark session used by every cell below; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("imdb_query")
    .getOrCreate()
)

24/10/11 19:21:46 WARN Utils: Your hostname, codespaces-cba268 resolves to a loopback address: 127.0.0.1; using 10.0.0.71 instead (on interface eth0)
24/10/11 19:21:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/11 19:21:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/11 19:21:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/10/11 19:21:47 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
from pathlib import Path

# Folder holding the raw IMDb dumps, one gzip-compressed TSV file per table.
data_folder = Path("./data/raw")

# Reader options shared by every dump: tab-separated, first row is the header,
# gzip-compressed, and the literal "\N" marks a null value.
read_options = {
    "sep": "\t",
    "header": True,
    "compression": "gzip",
    "nullValue": r"\N",
}

raw_files = list(data_folder.glob("*.tsv.gz"))
if not raw_files:
    # Fail here with a clear message rather than later with a KeyError on a
    # table name when the data has not been downloaded yet.
    raise FileNotFoundError(f"No '*.tsv.gz' files found in {data_folder.resolve()}")

# Maps table name -> raw Spark DataFrame.
imdb_tables = {}
for file in raw_files:
    # e.g. "title.basics.tsv.gz" -> "title_basics"
    table_name = "_".join(file.name.split(".")[:2])
    print(table_name)
    # glob() already yields the path including data_folder, so use it directly.
    imdb_tables[table_name] = spark.read.options(**read_options).csv(str(file))

title_akas
title_principals
title_ratings
title_basics
name_basics
title_episode
title_crew


In [4]:
from transformation import bronze as xform_bronze


# Map every table name to its bronze-layer cleansing function. Each function is
# exposed as `transformation` on the attribute of `xform_bronze` that is named
# after the table.
bronze_transformation = {
    table: getattr(xform_bronze, table).transformation
    for table in (
        "name_basics",
        "title_akas",
        "title_basics",
        "title_crew",
        "title_episode",
        "title_principals",
        "title_ratings",
    )
}

# Run each raw table through its own cleansing function.
cleansed_imdb_tables = {
    name: raw_sdf.transform(bronze_transformation[name])
    for name, raw_sdf in imdb_tables.items()
}

# The cleansed tables are now available as cleansed_imdb_tables["table_name"].

---

In [5]:
from transformation.tasks import (
    task1,
    task2a,
    task2b,
)

# Task 1

## Retrieve the top 10 movies with a minimum of 500 votes, ranked by

`(numVotes/averageNumberOfVotes)*averageRating`

In [6]:
# Average number of votes over every rated title (all title types, not only
# movies). Collected to the driver as a plain Python number so it can be used
# as a literal in the ranking expression below.
average_number_of_voters = (
    cleansed_imdb_tables["title_ratings"]
    .agg(F.avg("numVotes").alias("averageNumberOfVotes"))
    .head()["averageNumberOfVotes"]
)

print(f"{average_number_of_voters = }")

# Ranking formula given in the assignment PDF:
#   (numVotes / averageNumberOfVotes) * averageRating
ranking_logics = (
    F.col("numVotes")
    * F.col("averageRating")
    / F.lit(average_number_of_voters)
)

# cache() is lazy, so mark the result for caching BEFORE the first action.
# The show() below can then populate the cache, and the Task 2 cells reuse it
# instead of recomputing the join and ranking.
top_10_movies_with_min_500_votes_sdf = task1(
    title_basics_sdf=cleansed_imdb_tables["title_basics"],
    title_ratings_sdf=cleansed_imdb_tables["title_ratings"],
    ranking_logics=ranking_logics,
).cache()

top_10_movies_with_min_500_votes_sdf.show(truncate=False)

                                                                                

average_number_of_voters = 1030.1932654153331


24/10/11 19:22:02 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


+---------+---------------------------------------------+--------+------------------+
|tconst   |primaryTitle                                 |numVotes|ranking           |
+---------+---------------------------------------------+--------+------------------+
|tt0111161|The Shawshank Redemption                     |2949309 |25765.825589335996|
|tt0468569|The Dark Knight                              |2930213 |25598.998639704645|
|tt1375666|Inception                                    |2601001 |20198.159606111418|
|tt0137523|Fight Club                                   |2381226 |18491.48954814791 |
|tt0068646|The Godfather                                |2055854 |17960.402791548488|
|tt0109830|Forrest Gump                                 |2307341 |17917.73312802445 |
|tt0167260|The Lord of the Rings: The Return of the King|2018856 |17637.17994475017 |
|tt0110912|Pulp Fiction                                 |2264831 |17587.620311898736|
|tt0816692|Interstellar                               

                                                                                

DataFrame[tconst: string, primaryTitle: string, numVotes: int, ranking: double]

# Task 2

## For these 10 movies, list the persons who are most often credited

In [7]:
# People credited most often across the top-10 movies selected in Task 1.
most_credited_sdf = task2a(
    top_10_movies_with_min_500_votes_sdf,
    title_principals_sdf=cleansed_imdb_tables["title_principals"],
    name_basics_sdf=cleansed_imdb_tables["name_basics"],
)
most_credited_sdf.show(truncate=False)

[Stage 22:>                                                         (0 + 1) / 1]

+---------+-----+-----------------+
|nconst   |count|primaryName      |
+---------+-----+-----------------+
|nm0634240|9    |Christopher Nolan|
+---------+-----+-----------------+



                                                                                

## For these 10 movies, list the different titles of the 10 movies

In [8]:
# Different titles recorded for the top-10 movies selected in Task 1.
different_titles_sdf = task2b(
    top_10_movies_with_min_500_votes_sdf,
    title_akas_sdf=cleansed_imdb_tables["title_akas"],
)
# Vertical layout: one record per block, values shown untruncated.
different_titles_sdf.show(n=10, truncate=False, vertical=True)



-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                