In [34]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as f
import pyspark.sql.types as t

In [2]:
spark_session = (SparkSession.builder
                .master('local')
                .appName('Big Data Project')
                .config(conf=SparkConf())
                .getOrCreate())

### Define the schemas

In [3]:
name_basics_schema = t.StructType([
    t.StructField("nconst", t.StringType(), nullable=False),
    t.StructField("primaryName", t.StringType(), nullable=False),
    t.StructField("birthYear", t.IntegerType(), nullable=True),
    t.StructField("deathYear", t.IntegerType(), nullable=True),
    t.StructField("primaryProfession", t.StringType(), nullable=True),
    t.StructField("knownForTitles", t.StringType(), nullable=True)
])

In [4]:
# Define the schema for the 'title.basics.tsv' table
title_basics_schema = t.StructType([
    t.StructField("tconst", t.StringType(), nullable=False),
    t.StructField("titleType", t.StringType(), nullable=True),
    t.StructField("primaryTitle", t.StringType(), nullable=True),
    t.StructField("originalTitle", t.StringType(), nullable=True),
    t.StructField("isAdult", t.IntegerType(), nullable=True),
    t.StructField("startYear", t.IntegerType(), nullable=True),
    t.StructField("endYear", t.StringType(), nullable=True),
    t.StructField("runtimeMinutes", t.IntegerType(), nullable=True),
    t.StructField("genres", t.StringType(), nullable=True)
])


In [5]:
# Define the schema for the 'title.akas.tsv' table
title_akas_schema = t.StructType([
    t.StructField("titleId", t.StringType(), nullable=False),
    t.StructField("ordering", t.IntegerType(), nullable=True),
    t.StructField("title", t.StringType(), nullable=True),
    t.StructField("region", t.StringType(), nullable=True),
    t.StructField("language",t.StringType(), nullable=True),
    t.StructField("types", t.StringType(), nullable=True),
    t.StructField("attributes", t.StringType(), nullable=True),
    t.StructField("isOriginalTitle", t.IntegerType(), nullable=True)
])

In [6]:
# Define the schema for the 'title.crew.tsv' table
title_crew_schema = t.StructType([
    t.StructField("tconst", t.StringType(), nullable=False),
    t.StructField("directors", t.StringType(), nullable=True),
    t.StructField("writers", t.StringType(), nullable=True)
])

In [7]:
# Define the schema for the 'title.principals.tsv' table
title_principals_schema = t.StructType([
    t.StructField("tconst", t.StringType(), nullable=False),
    t.StructField("ordering", t.IntegerType(), nullable=True),
    t.StructField("nconst", t.StringType(), nullable=True),
    t.StructField("category", t.StringType(), nullable=True),
    t.StructField("job", t.StringType(), nullable=True),
    t.StructField("characters", t.StringType(), nullable=True)
])

In [8]:
# Define the schema for the 'title.ratings.tsv' table
title_ratings_schema = t.StructType([
    t.StructField("tconst", t.StringType(), nullable=False),
    t.StructField("averageRating", t.FloatType(), nullable=True),
    t.StructField("numVotes", t.IntegerType(), nullable=True)
])

In [9]:
# Define the schema for the 'title.episode.tsv' table
title_episode_schema = t.StructType([
    t.StructField("tconst", t.StringType(), nullable=False),
    t.StructField("parentTconst", t.StringType(), nullable=True),
    t.StructField("seasonNumber", t.IntegerType(), nullable=True),
    t.StructField("episodeNumber", t.IntegerType(), nullable=True)
])

### Read the datasets

In [10]:
name_basics_df = spark_session.read.csv("imdb/name.basics.tsv", sep=r'\t', header=True, schema=name_basics_schema)
title_basics_df = spark_session.read.csv("imdb/title.basics.tsv", sep=r'\t', header=True, schema=title_basics_schema)
title_akas_df = spark_session.read.csv("imdb/title.akas.tsv", sep=r'\t', header=True, schema=title_akas_schema)
title_crew_df = spark_session.read.csv("imdb/title.crew.tsv", sep=r'\t', header=True, schema=title_crew_schema)
title_principals_df = spark_session.read.csv("imdb/title.principals.tsv", sep=r'\t', header=True, schema=title_principals_schema)
title_ratings_df = spark_session.read.csv("imdb/title.ratings.tsv", sep=r'\t', header=True, schema=title_ratings_schema)
title_episode_df = spark_session.read.csv("imdb/title.episode.tsv", sep=r'\t', header=True, schema=title_episode_schema)

In [11]:
dfs = [name_basics_df, title_basics_df, title_akas_df, title_crew_df, title_principals_df, title_ratings_df, title_episode_df]
df_names = ["name.basics", "title.basics.tsv", "title.akas.tsv", "title.crew.tsv", "title.principals.tsv", "title.ratings.tsv", "title.episode.tsv"]

In [12]:
for df, name in zip(dfs, df_names):
    print(name, ":", df.columns)
    df.show()

name.basics : ['nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']
+---------+-------------------+---------+---------+--------------------+--------------------+
|   nconst|        primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+-------------------+---------+---------+--------------------+--------------------+
|nm0000001|       Fred Astaire|     1899|     1987|soundtrack,actor,...|tt0050419,tt00319...|
|nm0000002|      Lauren Bacall|     1924|     2014|  actress,soundtrack|tt0117057,tt00752...|
|nm0000003|    Brigitte Bardot|     1934|     null|actress,soundtrac...|tt0054452,tt00491...|
|nm0000004|       John Belushi|     1949|     1982|actor,soundtrack,...|tt0072562,tt00804...|
|nm0000005|     Ingmar Bergman|     1918|     2007|writer,director,a...|tt0069467,tt00839...|
|nm0000006|     Ingrid Bergman|     1915|     1982|actress,soundtrac...|tt0038109,tt00387...|
|nm0000007|    Humphrey Bogart|     1899|     195

In [13]:
for df, name in zip(dfs, df_names):
    print(name, df.count())

name.basics 12725345
title.basics.tsv 10039412
title.akas.tsv 36740603
title.crew.tsv 10041816
title.principals.tsv 57395516
title.ratings.tsv 1333579
title.episode.tsv 7639320


### Data Preprocessing

#### Explode columns with multiple values

In [14]:
def explode_and_drop(df, old_col, new_col):
    df = df.withColumn(old_col, f.split(f.col(old_col), ","))
    df = df.select("*", f.explode(f.col(old_col)).alias(new_col))
    df = df.drop(old_col)
    return df

In [15]:
name_basics_df = explode_and_drop(name_basics_df, "primaryProfession", "Profession")

In [16]:
name_basics_df = explode_and_drop(name_basics_df, "knownForTitles", "knownForTitle")

In [17]:
title_basics_df = explode_and_drop(title_basics_df, "genres", "genre")

In [18]:
title_crew_df = explode_and_drop(title_crew_df, "directors", "director")

#### Drop useless columns

In [19]:
title_basics_df = title_basics_df.drop("endYear")

In [20]:
title_akas_df = title_akas_df.drop("language", "isOriginalTitle")

In [21]:
title_crew_df = title_crew_df.drop("writers")

In [22]:
title_principals_df = title_principals_df.drop("characters", "job")

In [23]:
dfs = [name_basics_df, title_basics_df, title_akas_df, title_crew_df, title_principals_df, title_ratings_df, title_episode_df]


### Business Questions

#### Q1: Which directors are associated with the highest number of titles (movies/TV shows) in the database?

In [25]:
directors_with_num_of_firected_films_df = title_crew_df.groupBy('director').agg(f.count("tconst").alias("Titles")).orderBy("Titles", ascending=False)
directors_with_num_of_firected_films_df.show(6)

+---------+-------+
| director| Titles|
+---------+-------+
|       \N|4266256|
|nm1203430|  12790|
|nm1966600|  12368|
|nm8467983|  11585|
|nm1409127|  10365|
|nm1667633|   8984|
+---------+-------+
only showing top 6 rows



#### Q2: Highest ranked films where there are more than 2 million votes Films directed by the director of the best rated film

In [27]:
highest_ranked_films_df_filtered = title_ratings_df.filter(title_ratings_df.numVotes > 2000000).orderBy("averageRating", ascending=False)

In [28]:
highest_ranked_films_df_filtered.show()

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0903747|          9.5| 2009512|
|tt0111161|          9.3| 2773389|
|tt0944947|          9.2| 2185187|
|tt0468569|          9.0| 2749040|
|tt0110912|          8.9| 2128442|
|tt0109830|          8.8| 2156978|
|tt0137523|          8.8| 2209229|
|tt1375666|          8.8| 2439868|
+---------+-------------+--------+



In [29]:
highest_ranked_films_df_joined = highest_ranked_films_df_filtered.join(title_basics_df.dropDuplicates((['tconst'])), on='tconst', how='inner')
highest_ranked_films_df_joined.show()

+---------+-------------+--------+---------+--------------------+--------------------+-------+---------+--------------+------+
|   tconst|averageRating|numVotes|titleType|        primaryTitle|       originalTitle|isAdult|startYear|runtimeMinutes| genre|
+---------+-------------+--------+---------+--------------------+--------------------+-------+---------+--------------+------+
|tt1375666|          8.8| 2439868|    movie|           Inception|           Inception|      0|     2010|           148|Action|
|tt0137523|          8.8| 2209229|    movie|          Fight Club|          Fight Club|      0|     1999|           139| Drama|
|tt0468569|          9.0| 2749040|    movie|     The Dark Knight|     The Dark Knight|      0|     2008|           152|Action|
|tt0111161|          9.3| 2773389|    movie|The Shawshank Red...|The Shawshank Red...|      0|     1994|           142| Drama|
|tt0944947|          9.2| 2185187| tvSeries|     Game of Thrones|     Game of Thrones|      0|     2011|       

#### Q3: Which TV special for adults has the longest duration

In [32]:
tv_special_df = title_basics_df.filter((title_basics_df.titleType == "tvSpecial") & f.col("runtimeMinutes").isNotNull())
tv_special_adults_df = tv_special_df.where(f.col("isAdult") == 1).dropDuplicates(["tconst"]).orderBy("runtimeMinutes", ascending=False)
tv_special_adults_df.show()

+----------+---------+--------------------+--------------------+-------+---------+--------------+-----+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|runtimeMinutes|genre|
+----------+---------+--------------------+--------------------+-------+---------+--------------+-----+
| tt9694594|tvSpecial|Best in Sex: 2019...|Best in Sex: 2019...|      1|     2019|            90|Adult|
| tt1821320|tvSpecial|The 2008 AVN Adul...|The 2008 AVN Adul...|      1|     2008|            87|Adult|
| tt8475784|tvSpecial|Best in Sex: 2018...|Best in Sex: 2018...|      1|     2018|            87|Adult|
| tt1627882|tvSpecial|The 2009 AVN Adul...|The 2009 AVN Adul...|      1|     2009|            80| News|
| tt6741116|tvSpecial|Best in Sex: 2016...|Best in Sex: 2016...|      1|     2016|            80|Adult|
|tt11889164|tvSpecial|Best in Sex: 2020...|Best in Sex: 2020...|      1|     2020|            71|Adult|
|tt14570830|tvSpecial|Majuu Jouka Shouj...|Majuu Jouka Shouj...|

#### Q4: What are the most popular genres for every content type (titleType)

In [37]:
# First, calculate the count of each genre within its corresponding titleType partition
content_genres_window = Window.partitionBy("titleType", "genre").orderBy("titleType")
content_genres_df = title_basics_df.withColumn("Count", f.count(f.col("genre")).over(content_genres_window)).select("titleType", "genre", "Count").distinct()
content_genres_df.show()

+---------+-----------+------+
|titleType|      genre| Count|
+---------+-----------+------+
|    movie|     Action| 54425|
|    movie|      Adult|  9216|
|    movie|  Adventure| 28421|
|    movie|  Animation|  9229|
|    movie|  Biography| 17411|
|    movie|     Comedy|109996|
|    movie|      Crime| 37595|
|    movie|Documentary|124025|
|    movie|      Drama|236457|
|    movie|     Family| 17801|
|    movie|    Fantasy| 15500|
|    movie|  Film-Noir|   882|
|    movie|  Game-Show|    25|
|    movie|    History| 14428|
|    movie|     Horror| 37612|
|    movie|      Music| 13777|
|    movie|    Musical| 10294|
|    movie|    Mystery| 17274|
|    movie|       News|  1449|
|    movie| Reality-TV|   522|
+---------+-----------+------+
only showing top 20 rows



In [43]:
# Then, find the genre with the highest count for each titleType
genre_rank_window = Window.partitionBy("titleType").orderBy(f.col("Count").desc())
content_genres_df_with_rank = content_genres_df.withColumn("Rank", f.max(f.col("Count")).over(genre_rank_window))
content_genres_max_df = content_genres_df_with_rank.where(f.col("Count") == f.col("Rank")).drop("Rank")
content_genres_max_df.show()

+------------+-----------+-------+
|   titleType|      genre|  Count|
+------------+-----------+-------+
|       movie|      Drama| 236457|
|       short|      Short| 887483|
|   tvEpisode|      Drama|2224130|
|tvMiniSeries|      Drama|  11724|
|     tvMovie|Documentary|  45738|
|     tvPilot|         \N|      1|
|    tvSeries|     Comedy|  57246|
|     tvShort|      Short|   9244|
|   tvSpecial|      Music|  11268|
|       video|      Short| 115960|
|   videoGame|     Action|  14566|
+------------+-----------+-------+



#### Q5: How many episodes are there on average per TV show?

#### Q6: Which actors or actresses have appeared in the most episodes?

#### Q7: Actors that lived the most

#### Q8: Actors that worket in yhe film industry the most

#### Q9: What is the average rating of movies in the database?