In [20]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (getOrCreate returns an existing session
# when one is already running), registered under the name "IMDB Ingestion".
spark = (
    SparkSession.builder
    .appName("IMDB Ingestion")
    .getOrCreate()
)

# Read the title.basics dump of the 2025-09-17 IMDb snapshot from MinIO
# (s3a://data bucket). The file is tab-separated with a header row; no schema
# is passed and schema inference is not enabled, so columns are read as strings.
df = spark.read.csv(
    "s3a://data/imdb/year=2025/month=09/day=17/title.basics.tsv.gz",
    sep="\t",
    header=True,
)

# A bare `df.schema` in the middle of a cell is evaluated and thrown away
# (Jupyter only echoes the last expression), so print the schema explicitly.
df.printSchema()

# NOTE(review): the preview shows the literal marker \N in endYear. If those
# should be real nulls, add nullValue="\\N" to the reader -- confirm that no
# later cell compares against the "\N" string first.
df.show(5)



+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|        Poor Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             5|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|              

In [6]:
# Read the title.akas dump (alternative titles per titleId) of the 2025-09-17
# IMDb snapshot. Tab-separated with a header row; no schema is passed and
# schema inference is not enabled, so columns are read as strings.
df = spark.read.csv(
    "s3a://data/imdb/year=2025/month=09/day=17/title.akas.tsv.gz",
    sep="\t",
    header=True,
)

# A bare `df.schema` in the middle of a cell is evaluated and thrown away
# (Jupyter only echoes the last expression), so print the schema explicitly.
df.printSchema()

# NOTE(review): region, language, types and attributes show the literal marker
# \N in the preview. If those should be real nulls, add nullValue="\\N" to the
# reader -- confirm downstream expectations first.
df.show(5)

+---------+--------+--------------------+------+--------+-----------+-------------+---------------+
|  titleId|ordering|               title|region|language|      types|   attributes|isOriginalTitle|
+---------+--------+--------------------+------+--------+-----------+-------------+---------------+
|tt0000001|       1|          Carmencita|    \N|      \N|   original|           \N|              1|
|tt0000001|       2|          Carmencita|    DE|      \N|         \N|literal title|              0|
|tt0000001|       3|          Carmencita|    US|      \N|imdbDisplay|           \N|              0|
|tt0000001|       4|Carmencita - span...|    HU|      \N|imdbDisplay|           \N|              0|
|tt0000001|       5|          Καρμενσίτα|    GR|      \N|imdbDisplay|           \N|              0|
+---------+--------+--------------------+------+--------+-----------+-------------+---------------+
only showing top 5 rows



In [8]:
# Read the title.crew dump of the 2025-09-17 IMDb snapshot. Tab-separated with
# a header row; no schema is passed, so Spark derives the column names from
# the header line.
df = spark.read.csv(
    "s3a://data/imdb/year=2025/month=09/day=17/title.crew.tsv.gz",
    sep="\t",
    header=True,
)

# Only the schema is inspected in this cell (the row preview is switched off);
# as the cell's last expression it is echoed as a StructType.
df.schema

StructType([StructField('tconst', StringType(), True), StructField('directors', StringType(), True), StructField('writers', StringType(), True)])

In [11]:
# Read the title.episode dump (episode -> parent series mapping) of the
# 2025-09-17 IMDb snapshot. Tab-separated with a header row; no schema is
# passed and schema inference is not enabled, so columns are read as strings.
df = spark.read.csv(
    "s3a://data/imdb/year=2025/month=09/day=17/title.episode.tsv.gz",
    sep="\t",
    header=True,
)

# A bare `df.schema` in the middle of a cell is evaluated and thrown away
# (Jupyter only echoes the last expression), so print the schema explicitly.
df.printSchema()

# NOTE(review): seasonNumber / episodeNumber show the literal marker \N in the
# preview. If those should be real nulls, add nullValue="\\N" to the reader --
# confirm downstream expectations first.
df.show(5)

+---------+------------+------------+-------------+
|   tconst|parentTconst|seasonNumber|episodeNumber|
+---------+------------+------------+-------------+
|tt0031458|  tt32857063|          \N|           \N|
|tt0041951|   tt0041038|           1|            9|
|tt0042816|   tt0989125|           1|           17|
|tt0042889|   tt0989125|          \N|           \N|
|tt0043426|   tt0040051|           3|           42|
+---------+------------+------------+-------------+
only showing top 5 rows



In [13]:
# Read the title.principals dump of the 2025-09-17 IMDb snapshot.
# Tab-separated with a header row; no explicit schema is supplied.
df = spark.read.csv(
    "s3a://data/imdb/year=2025/month=09/day=17/title.principals.tsv.gz",
    sep="\t",
    header=True,
)

# NOTE(review): `characters` values in the preview contain double quotes
# (e.g. ["Self"]) and Spark's CSV reader uses '"' as its default quote
# character -- verify that no rows are mis-split, or disable quoting.
# Schema inspection is skipped in this cell; preview the first five rows only.
df.show(5)

+---------+--------+---------+---------------+--------------------+----------+
|   tconst|ordering|   nconst|       category|                 job|characters|
+---------+--------+---------+---------------+--------------------+----------+
|tt0000001|       1|nm1588970|           self|                  \N|  ["Self"]|
|tt0000001|       2|nm0005690|       director|                  \N|        \N|
|tt0000001|       3|nm0005690|       producer|            producer|        \N|
|tt0000001|       4|nm0374658|cinematographer|director of photo...|        \N|
|tt0000002|       1|nm0721526|       director|                  \N|        \N|
+---------+--------+---------+---------------+--------------------+----------+
only showing top 5 rows



In [15]:
# Read the title.ratings dump of the 2025-09-17 IMDb snapshot.
# Tab-separated with a header row; no schema is passed and schema inference is
# not enabled, so averageRating and numVotes arrive as strings, not numbers.
df = spark.read.csv(
    "s3a://data/imdb/year=2025/month=09/day=17/title.ratings.tsv.gz",
    sep="\t",
    header=True,
)

# Schema inspection is skipped in this cell; preview the first five rows only.
df.show(5)

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    2178|
|tt0000002|          5.5|     299|
|tt0000003|          6.4|    2245|
|tt0000004|          5.2|     193|
|tt0000005|          6.2|    2989|
+---------+-------------+--------+
only showing top 5 rows



In [19]:
# Read the name.basics dump (one row per nconst) of the 2025-09-17 IMDb
# snapshot. Tab-separated with a header row; no explicit schema is supplied.
df = spark.read.csv(
    "s3a://data/imdb/year=2025/month=09/day=17/name.basics.tsv.gz",
    sep="\t",
    header=True,
)

# Schema inspection is skipped in this cell; preview the first five rows only.
df.show(5)


+---------+---------------+---------+---------+--------------------+--------------------+
|   nconst|    primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+---------------+---------+---------+--------------------+--------------------+
|nm0000001|   Fred Astaire|     1899|     1987|actor,miscellaneo...|tt0050419,tt00723...|
|nm0000002|  Lauren Bacall|     1924|     2014|actress,soundtrac...|tt0037382,tt00752...|
|nm0000003|Brigitte Bardot|     1934|       \N|actress,music_dep...|tt0057345,tt00491...|
|nm0000004|   John Belushi|     1949|     1982|actor,writer,musi...|tt0072562,tt00779...|
|nm0000005| Ingmar Bergman|     1918|     2007|writer,director,a...|tt0050986,tt00694...|
+---------+---------------+---------+---------+--------------------+--------------------+
only showing top 5 rows

