In [85]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("IMDB Ingestion") \
    .getOrCreate()

# Read a file from MinIO
title_df = spark.read \
    .option("header", True) \
    .option("sep", "\t") \
    .csv("s3a://data/imdb/year=2025/month=09/day=17/title.basics.tsv.gz")

title_rename_columns_df = title_df.withColumnsRenamed({"tconst": "title_id", "titleType": "title_type", "primaryTitle": "primary_title", "originalTitle": "original_title", "isAdult": "is_adult", "startYear": "start_year", "endYear": "end_year", "runtimeMinutes": "runtime_minutes"}) 

set_null_values_df = title_rename_columns_df \
    .withColumn("end_year",when(col("end_year") == "\\N", None).otherwise(col("end_year")))


set_null_values_df.schema
set_null_values_df.show(5)



+---------+----------+--------------------+--------------------+--------+----------+--------+---------------+--------------------+
| title_id|title_type|       primary_title|      original_title|is_adult|start_year|end_year|runtime_minutes|              genres|
+---------+----------+--------------------+--------------------+--------+----------+--------+---------------+--------------------+
|tt0000001|     short|          Carmencita|          Carmencita|       0|      1894|    NULL|              1|   Documentary,Short|
|tt0000002|     short|Le clown et ses c...|Le clown et ses c...|       0|      1892|    NULL|              5|     Animation,Short|
|tt0000003|     short|        Poor Pierrot|      Pauvre Pierrot|       0|      1892|    NULL|              5|Animation,Comedy,...|
|tt0000004|     short|         Un bon bock|         Un bon bock|       0|      1892|    NULL|             12|     Animation,Short|
|tt0000005|     short|    Blacksmith Scene|    Blacksmith Scene|       0|      1893

In [82]:
title_akas_df = spark.read \
    .option("header", True) \
    .option("sep", "\t") \
    .csv("s3a://data/imdb/year=2025/month=09/day=17/title.akas.tsv.gz")

title_akas_rename_columns_df = title_akas_df.withColumnsRenamed({"titleId": "title_id", "isOriginalTitle": "is_original_title"}) 

set_null_values_df = title_akas_rename_columns_df \
    .withColumn("region",when(col("region") == "\\N", None).otherwise(col("region"))) \
    .withColumn("language",when(col("language") == "\\N", None).otherwise(col("language"))) \
    .withColumn("types",when(col("types") == "\\N", None).otherwise(col("types"))) \
    .withColumn("attributes",when(col("attributes") == "\\N", None).otherwise(col("attributes"))) 

set_null_values_df.schema
set_null_values_df.show(5)

+---------+--------+--------------------+------+--------+-----------+-------------+-----------------+
| title_id|ordering|               title|region|language|      types|   attributes|is_original_title|
+---------+--------+--------------------+------+--------+-----------+-------------+-----------------+
|tt0000001|       1|          Carmencita|  NULL|    NULL|   original|         NULL|                1|
|tt0000001|       2|          Carmencita|    DE|    NULL|       NULL|literal title|                0|
|tt0000001|       3|          Carmencita|    US|    NULL|imdbDisplay|         NULL|                0|
|tt0000001|       4|Carmencita - span...|    HU|    NULL|imdbDisplay|         NULL|                0|
|tt0000001|       5|          Καρμενσίτα|    GR|    NULL|imdbDisplay|         NULL|                0|
+---------+--------+--------------------+------+--------+-----------+-------------+-----------------+
only showing top 5 rows



In [76]:
title_crew_df = spark.read \
    .option("header", True) \
    .option("sep", "\t") \
    .csv("s3a://data/imdb/year=2025/month=09/day=17/title.crew.tsv.gz")

title_crew_rename_columns_df = title_crew_df.withColumnsRenamed({"tconst": "title_id", "directors": "director_id"})

set_null_values_df = title_crew_rename_columns_df \
    .withColumn("writers",when(col("writers") == "\\N", None).otherwise(col("writers"))) 

set_null_values_df.schema
set_null_values_df.show(5)




+---------+-----------+---------+
| title_id|director_id|  writers|
+---------+-----------+---------+
|tt0000001|  nm0005690|     NULL|
|tt0000002|  nm0721526|     NULL|
|tt0000003|  nm0721526|nm0721526|
|tt0000004|  nm0721526|     NULL|
|tt0000005|  nm0005690|     NULL|
+---------+-----------+---------+
only showing top 5 rows



In [63]:
title_episode_df = spark.read \
    .option("header", True) \
    .option("sep", "\t") \
    .csv("s3a://data/imdb/year=2025/month=09/day=17/title.episode.tsv.gz")

title_episode_rename_columns_df = title_episode_df.withColumnsRenamed({"tconst": "title_id", "parentTconst": "parent_title_id", "seasonNumber": "season_number", "episodeNumber": "episode_number"})


set_null_values_df = title_episode_rename_columns_df \
    .withColumn("season_number",when(col("season_number") == "\\N", None).otherwise(col("season_number"))) \
    .withColumn("episode_number",when(col("episode_number") == "\\N", None).otherwise(col("episode_number")))

# df.schema
set_null_values_df.show(5)

+--------+---------------+-------------+--------------+
|title_id|parent_title_id|season_number|episode_number|
+--------+---------------+-------------+--------------+
+--------+---------------+-------------+--------------+

+---------+---------------+-------------+--------------+
| title_id|parent_title_id|season_number|episode_number|
+---------+---------------+-------------+--------------+
|tt0031458|     tt32857063|         NULL|          NULL|
|tt0041951|      tt0041038|            1|             9|
|tt0042816|      tt0989125|            1|            17|
|tt0042889|      tt0989125|         NULL|          NULL|
|tt0043426|      tt0040051|            3|            42|
+---------+---------------+-------------+--------------+
only showing top 5 rows



In [60]:
title_principals_df = spark.read \
    .option("header", True) \
    .option("sep", "\t") \
    .csv("s3a://data/imdb/year=2025/month=09/day=17/title.principals.tsv.gz")

title_principals_rename_columns_df = title_principals_df.withColumnsRenamed({"tconst": "title_id", "nconst": "name_id"})

set_null_values_df = title_principals_rename_columns_df \
    .withColumn("job",when(col("job") == "\\N", None).otherwise(col("job"))) \
    .withColumn("characters",when(col("characters") == "\\N", None).otherwise(col("characters")))

# df.schema
set_null_values_df.show(5)

+---------+--------+---------+---------------+--------------------+----------+
| title_id|ordering|  name_id|       category|                 job|characters|
+---------+--------+---------+---------------+--------------------+----------+
|tt0000001|       1|nm1588970|           self|                NULL|  ["Self"]|
|tt0000001|       2|nm0005690|       director|                NULL|      NULL|
|tt0000001|       3|nm0005690|       producer|            producer|      NULL|
|tt0000001|       4|nm0374658|cinematographer|director of photo...|      NULL|
|tt0000002|       1|nm0721526|       director|                NULL|      NULL|
+---------+--------+---------+---------------+--------------------+----------+
only showing top 5 rows



In [56]:
ratings_df = spark.read \
    .option("header", True) \
    .option("sep", "\t") \
    .csv("s3a://data/imdb/year=2025/month=09/day=17/title.ratings.tsv.gz")

ratings_rename_columns_df = ratings_df.withColumnsRenamed({"tconst": "title_id", "averageRating": "average_rating", "numVotes": "num_votes"})

# df.schema
ratings_rename_columns_df.show(5)


+---------+--------------+---------+
| title_id|average_rating|num_votes|
+---------+--------------+---------+
|tt0000001|           5.7|     2178|
|tt0000002|           5.5|      299|
|tt0000003|           6.4|     2245|
|tt0000004|           5.2|      193|
|tt0000005|           6.2|     2989|
+---------+--------------+---------+
only showing top 5 rows



In [47]:
name_basics_df = spark.read \
    .option("header", True) \
    .option("sep", "\t") \
    .csv("s3a://data/imdb/year=2025/month=09/day=17/name.basics.tsv.gz")

# name_basics_df.schema
# name_basics_df.show(5)

from pyspark.sql.functions import col, when


name_rename_columns_df = name_basics_df .withColumnsRenamed({"nconst": "id", "primaryName": "primary_name", "birthYear": "birth_year", "deathYear": "death_year", "primaryProfession": "primary_profession", "knownForTitles": "titles"})
set_null_death_year_df = name_rename_columns_df.withColumn("death_year",when(col("death_year") == "\\N", None).otherwise(col("death_year")))

set_null_death_year_df.show(5)


+---------+---------------+----------+----------+--------------------+--------------------+
|       id|   primary_name|birth_year|death_year|  primary_profession|              titles|
+---------+---------------+----------+----------+--------------------+--------------------+
|nm0000001|   Fred Astaire|      1899|      1987|actor,miscellaneo...|tt0050419,tt00723...|
|nm0000002|  Lauren Bacall|      1924|      2014|actress,soundtrac...|tt0037382,tt00752...|
|nm0000003|Brigitte Bardot|      1934|      NULL|actress,music_dep...|tt0057345,tt00491...|
|nm0000004|   John Belushi|      1949|      1982|actor,writer,musi...|tt0072562,tt00779...|
|nm0000005| Ingmar Bergman|      1918|      2007|writer,director,a...|tt0050986,tt00694...|
+---------+---------------+----------+----------+--------------------+--------------------+
only showing top 5 rows

+---+------------+----------+----------+------------------+------+
| id|primary_name|birth_year|death_year|primary_profession|titles|
+---+--------