## 

# 🎬 Movies Data Pipeline
Ce notebook contient un pipeline de traitement des données du fichier `TMDB_all_movies.csv`, dans le cadre d'un projet de data engineering.

## 1. Ingestion

In [448]:
from pyspark.sql import SparkSession
import pyspark
from pyspark.sql.functions import count, col, when, array, split, size, sum as _sum, row_number, mean
from pyspark.sql.window import Window

In [419]:
# Start (or reuse) the Spark session used by every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("Movie recommender")
    .getOrCreate()
)

# Load the raw TMDB export. Types are inferred from the data, and quoted fields
# are allowed to span several physical lines (multiLine=True).
csv_options = dict(header=True, inferSchema=True, sep=",", quote='"', escape='"', multiLine=True)
df = spark.read.csv("../data/TMDB_all_movies.csv", **csv_options)

                                                                                

In [420]:
# Inspect the column types inferred by the CSV reader.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: double (nullable = true)
 |-- status: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: double (nullable = true)
 |-- runtime: double (nullable = true)
 |-- budget: double (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- tagline: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- director: string (nullable = true)
 |-- director_of_photography: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- producers: string (nu

In [422]:
# Target type of each column:
#   vote_average, revenue, runtime, budget, popularity -> float
#   vote_count -> int
#   release_date -> date
#   genres, production_countries, production_companies, spoken_languages,
#   cast, director, writers -> comma-separated strings turned into arrays
#   (candidates for one-hot / count encoding later on)
# NOTE(review): Spark's "float" is 32-bit single precision, so revenue/budget
# values in the billions are not all exactly representable - confirm that this
# precision loss is acceptable.

# completeness_score = number of descriptive columns that are not null (0..11).
cols_to_check = ["title", "original_title", "overview", "release_date", "genres", "production_countries", "production_companies", "spoken_languages", "cast", "director", "writers"]
df = df.withColumn(
    "completeness_score",
    sum(when(col(c).isNotNull(), 1).otherwise(0) for c in cols_to_check),
)

# Numeric columns go through double first, then to their final type; try_cast
# yields null instead of failing when a value cannot be converted.
numeric_targets = {
    "vote_average": "float",
    "vote_count": "int",
    "revenue": "float",
    "runtime": "float",
    "budget": "float",
    "popularity": "float",
}
for column_name, target_type in numeric_targets.items():
    df = df.withColumn(column_name, col(column_name).try_cast("double").try_cast(target_type))

df = df.withColumn("release_date", col("release_date").try_cast("date"))

# Each multi-valued string column becomes "<name>_array"; a null string maps
# to an empty array so that size() == 0 can be used to detect missing values.
multi_valued_cols = ["genres", "production_countries", "production_companies", "spoken_languages", "cast", "director", "writers"]
for column_name in multi_valued_cols:
    df = df.withColumn(
        f"{column_name}_array",
        when(col(column_name).isNotNull(), split(col(column_name), ",\\s*")).otherwise(array()),
    )

# The original string columns are no longer needed once the arrays exist.
df = df.drop(*multi_valued_cols)

df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: float (nullable = true)
 |-- vote_count: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: float (nullable = true)
 |-- runtime: float (nullable = true)
 |-- budget: float (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: float (nullable = true)
 |-- tagline: string (nullable = true)
 |-- director_of_photography: string (nullable = true)
 |-- producers: string (nullable = true)
 |-- music_composer: string (nullable = true)
 |-- imdb_rating: double (nullable = true)
 |-- imdb_votes: double (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- completeness_score: integer (nullable = false)
 |-- genres_array: array (nullable = true)
 |    |-- element: string

In [None]:
# Turn every "<field>_array" column into a "<field>_vector" column of term
# counts using CountVectorizer, fitted and applied through a single Pipeline.
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline


def _array_vectorizer(field):
    """Build the CountVectorizer that maps `<field>_array` to `<field>_vector`."""
    return CountVectorizer(inputCol=f"{field}_array", outputCol=f"{field}_vector")


vectorizer_genres = _array_vectorizer("genres")
vectorizer_production_countries = _array_vectorizer("production_countries")
vectorizer_production_companies = _array_vectorizer("production_companies")
vectorizer_spoken_languages = _array_vectorizer("spoken_languages")
vectorizer_cast = _array_vectorizer("cast")
vectorizer_director = _array_vectorizer("director")
vectorizer_writers = _array_vectorizer("writers")

vectorizer_stages = [
    vectorizer_genres,
    vectorizer_production_countries,
    vectorizer_production_companies,
    vectorizer_spoken_languages,
    vectorizer_cast,
    vectorizer_director,
    vectorizer_writers,
]
pipeline = Pipeline(stages=vectorizer_stages)

# Fit the vocabularies on the full dataset, then append the vector columns.
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)
df.show(5, truncate=True)

25/07/08 15:26:50 WARN MemoryStore: Not enough space to cache rdd_2004_0 in memory! (computed 99.5 MiB so far)
25/07/08 15:26:50 WARN BlockManager: Persisting block rdd_2004_0 to disk instead.
                                                                                

+---+--------------------------------+------------+----------+--------+------------+-----------+-------+---------+---------+-----------------+--------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 2. Exploration

In [None]:
# Preview the first rows without truncating cell contents.
df.show(5, truncate=False)

+---+--------------------------------+------------+----------+--------+------------+-----------+-------+---------+---------+-----------------+--------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Number of null values in each column (one output column per input column).
null_counts = [count(when(col(c).isNull(), c)).alias(c) for c in df.columns]
df.select(null_counts).show()

[Stage 667:>                                                        (0 + 1) / 1]

+---+-----+------------+----------+------+------------+-------+-------+------+-------+-----------------+--------------+--------+----------+-------+-----------------------+---------+--------------+-----------+----------+-----------+------------+--------------------------+--------------------------+----------------------+----------+--------------+-------------+-------------+---------------------------+---------------------------+-----------------------+-----------+---------------+--------------+
| id|title|vote_average|vote_count|status|release_date|revenue|runtime|budget|imdb_id|original_language|original_title|overview|popularity|tagline|director_of_photography|producers|music_composer|imdb_rating|imdb_votes|poster_path|genres_array|production_countries_array|production_companies_array|spoken_languages_array|cast_array|director_array|writers_array|genres_vector|production_countries_vector|production_companies_vector|spoken_languages_vector|cast_vector|director_vector|writers_vector|
+-

                                                                                

In [None]:
# Number of movies without an overview that still have more than 10 votes.
df.where(col("overview").isNull() & (col("vote_count") > 10)).count()

                                                                                

690

In [None]:
# Movies without an overview that still have more than 5 votes.
df.where(col("overview").isNull() & (col("vote_count") > 5)).show(100, truncate=False)

[Stage 612:>                                                        (0 + 1) / 1]

+-----+---------------------------------------------------------------------------------------+------------+----------+--------+------------+--------+-------+---------+---------+-----------------+-----------------------------------------------------------------------+--------+----------+---------------------------+----------------------------------------------------------------------------------------------+-----------------------------------------------------------+---------------------------------------------------------------------+-----------+----------+--------------------------------+------------------------------------------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [None]:
# Movies with no genre at all (empty genres_array), whatever their vote_count.
df.where(size(col("genres_array")) == 0).show(100, truncate=False)

+-----+-----------------------------------------------------------------------------+------------+----------+--------+------------+---------+-------+---------+---------+-----------------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Number of movies with no genre (empty genres_array) and more than 10 votes.
df.where((size(col("genres_array")) == 0) & (col("vote_count") > 10)).count()

                                                                                

589

In [None]:
# Titles carried by more than one row, with the number of rows per title.
rows_per_title = df.groupBy("title").agg(count("*").alias("count"))
duplicate_titles = rows_per_title.filter(col("count") > 1)

duplicate_titles.show(100)

[Stage 617:>                                                        (0 + 1) / 1]

+--------------------+-----+
|               title|count|
+--------------------+-----+
|       Heading South|    2|
|                Nell|    3|
|             Nemesis|   20|
|          Der Tunnel|    6|
|              Deszcz|    2|
|         Deep Rising|    2|
|    Straight to Hell|    2|
|Dance with the Devil|    4|
|              Room 6|    2|
|              Heaven|   30|
|                Silk|   14|
|       Crossing Over|   10|
|   What No One Knows|    2|
|        The Big Bang|    6|
|          Riverworld|    2|
|       Sugar & Spice|    4|
|            Deep Red|    4|
|        Miracle Mile|    2|
|             Larceny|    4|
|     My Name Is Khan|    2|
|        Generation X|    2|
|      A Woman's Face|    3|
|         It's a Gift|    2|
|    La Vie de Bohème|    2|
|             Amateur|   14|
|              Crisis|   17|
|  A Woman Is a Woman|    2|
|       Natural Enemy|    2|
|           Surprise!|   12|
|Diary of a Chambe...|    3|
|         Lesser Evil|    2|
|             

                                                                                

In [443]:
# Rows whose title is duplicated, shown with their descriptive columns and the
# number of rows sharing that title.
# This needs `completeness_score`, which the last deduplication cell of the
# cleaning section drops - run this cell before that one (in notebook order).
duplicate_detail_columns = [
    "title",
    "release_date",
    "completeness_score",
    "original_language",
    "original_title",
    "overview",
    "genres_array",
    "production_countries_array",
    "production_companies_array",
    "spoken_languages_array",
    "cast_array",
    "writers_array",
    "director_array",
    "count",
]
duplicate_titles_full = df.join(duplicate_titles, "title").select(*duplicate_detail_columns)
duplicate_titles_full.show(1000, truncate=False)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `completeness_score` cannot be resolved. Did you mean one of the following? [`id`, `title`, `vote_average`, `vote_count`, `status`, `release_date`, `revenue`, `runtime`, `budget`, `imdb_id`, `original_language`, `original_title`, `overview`, `popularity`, `tagline`, `director_of_photography`, `producers`, `music_composer`, `imdb_rating`, `imdb_votes`, `poster_path`, `genres_array`, `production_countries_array`, `production_companies_array`, `spoken_languages_array`, `cast_array`, `director_array`, `writers_array`, `genres_vector`, `production_countries_vector`, `production_companies_vector`, `spoken_languages_vector`, `cast_vector`, `director_vector`, `writers_vector`]. SQLSTATE: 42703

In [426]:
# Summary statistics (count, mean, stddev, min, max) of completeness_score.
df.select("completeness_score").describe().show()

[Stage 756:>                                                        (0 + 1) / 1]

+-------+------------------+
|summary|completeness_score|
+-------+------------------+
|  count|           1118702|
|   mean| 8.100197371596726|
| stddev|2.5031281958716334|
|    min|                 0|
|    max|                11|
+-------+------------------+



                                                                                

In [434]:
# Show the rows whose completeness_score is below 4.
df.where(col("completeness_score") < 4).show(100, truncate=False)

[Stage 778:>                                                        (0 + 1) / 1]

+-----+-------------------------------------------------------------+------------+----------+--------+------------+-------+-------+------+---------+-----------------+-------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------+-----------------------+---------+--------------+-----------+----------+-

                                                                                

In [None]:
# (title, release_date) pairs that occur on more than one row, restricted to
# rows whose release_date is known.
rows_per_title_and_date = df.groupBy("title", "release_date").agg(count("*").alias("count"))
duplicate_titles_count = (
    rows_per_title_and_date
    .filter(col("count") > 1)
    .filter(col("release_date").isNotNull())
)
duplicate_titles_count.show(100, truncate=False)

[Stage 810:>                                                        (0 + 3) / 3]

+-----+------------+-----+
|title|release_date|count|
+-----+------------+-----+
+-----+------------+-----+



                                                                                

In [None]:
# Movies with no production company (empty array) and more than 100 votes.
from pyspark.sql.functions import expr

no_production_company = size(col("production_companies_array")) == 0
over_100_votes = expr("try_cast(vote_count as double) > 100.0")

df.filter(no_production_company & over_100_votes).show(truncate=False)

+-----+-----------------------------------------+------------+----------+--------+------------+---------+-------+---------+---------+-----------------+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+---------------------------------------------------------------------------+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Movies whose release_date is null.
df.where(col("release_date").isNull()).show(100, truncate=False)

+-----+----------------------------------------------------------------------+------------+----------+---------------+------------+-------+-------+---------+---------+-----------------+----------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Number of rows where release_date and overview are null AND every array
# column (production companies/countries, spoken languages, cast, director,
# writers, genres) is empty.
fully_empty_row = df.release_date.isNull() & df.overview.isNull()
for array_column in (
    "production_companies_array",
    "production_countries_array",
    "spoken_languages_array",
    "cast_array",
    "director_array",
    "writers_array",
    "genres_array",
):
    fully_empty_row = fully_empty_row & (size(col(array_column)) == 0)

df.filter(fully_empty_row).count()

                                                                                

21531

In [None]:
# Summary statistics of the numeric columns.
df.select("popularity", "vote_count", "vote_average", "revenue", "runtime", "budget").describe().show()

[Stage 819:>                                                        (0 + 4) / 4]

+-------+-----------------+-----------------+------------------+--------------------+------------------+-----------------+
|summary|       popularity|       vote_count|      vote_average|             revenue|           runtime|           budget|
+-------+-----------------+-----------------+------------------+--------------------+------------------+-----------------+
|  count|          1107853|          1107853|           1107853|             1107853|           1107853|          1107853|
|   mean|1.015261073489557|22.59034908060907|2.0512829310576777|   782980.4781816722| 44.70439128656961|304286.0185873035|
| stddev|2.587350815976056|360.6852464047417|3.0778827293290805|1.9652703339092076E7|57.654902969895765|5343272.154854029|
|    min|              0.0|                0|               0.0|               -12.0|               0.0|              0.0|
|    max|         658.8665|            37627|              10.0|               5.0E9|           14400.0|            1.0E9|
+-------+-------

                                                                                

In [450]:
# For each numeric column, count the rows whose value is exactly 0
# (output columns are named "<column>_0").
zero_checked_columns = ["popularity", "vote_count", "vote_average", "revenue", "runtime", "budget"]
zero_counts = [
    _sum(when(col(name) == 0, 1).otherwise(0)).alias(f"{name}_0")
    for name in zero_checked_columns
]
df.select(*zero_counts).show()

[Stage 837:>                                                        (0 + 4) / 4]

+------------+------------+--------------+---------+---------+--------+
|popularity_0|vote_count_0|vote_average_0|revenue_0|runtime_0|budget_0|
+------------+------------+--------------+---------+---------+--------+
|        7090|      729538|        729859|  1081807|        0| 1037923|
+------------+------------+--------------+---------+---------+--------+



                                                                                

## 3. Nettoyage

In [None]:
# Keep only the movies whose status is "Released", then verify that no other
# status remains.
df = df.where(col("status") == "Released")
df.select("status").distinct().show()

[Stage 621:>                                                        (0 + 1) / 1]

+--------+
|  status|
+--------+
|Released|
+--------+



                                                                                

In [None]:
# Remove the columns that are not used in the rest of the pipeline.
unused_columns = [
    "status",
    "imdb_id",
    "tagline",
    "director_of_photography",
    "producers",
    "imdb_rating",
    "imdb_votes",
    "music_composer",
]
df = df.drop(*unused_columns)
df.show(5, truncate=False)

+---+--------------------------------+------------+----------+------------+-----------+-------+---------+-----------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Discard rows whose title is null or an empty string.
df = df.where(col("title").isNotNull() & (col("title") != ""))

In [None]:
# Keep only the rows with a completeness_score of at least 4.
df = df.where(col("completeness_score") >= 4)

In [None]:
# Drop the rows that carry no descriptive information at all: release_date and
# overview are null AND every array column (production companies/countries,
# spoken languages, cast, director, writers, genres) is empty.
row_is_fully_empty = df.release_date.isNull() & df.overview.isNull()
for array_column in (
    "production_companies_array",
    "production_countries_array",
    "spoken_languages_array",
    "cast_array",
    "director_array",
    "writers_array",
    "genres_array",
):
    row_is_fully_empty = row_is_fully_empty & (size(col(array_column)) == 0)

df = df.filter(~row_is_fully_empty)

In [None]:
# Deduplicate movies that share the same title AND the same release_date,
# keeping the most complete row (highest completeness_score).
#
# Window partitions group NULL keys together, so rows with an unknown
# release_date are explicitly kept: otherwise every same-titled movie with a
# missing date would be collapsed into a single row even though nothing shows
# they are the same film.
window = Window.partitionBy("title", "release_date").orderBy(
    col("completeness_score").desc(),
    col("id").asc(),  # tie-breaker so the surviving row is deterministic
)

# Keep the most complete row of each (title, release_date) group.
df = (
    df.withColumn("row_num", row_number().over(window))
    .filter(col("release_date").isNull() | (col("row_num") == 1))
    .drop("row_num")
)

In [451]:
# Deduplicate movies that share the same title AND the same overview, keeping
# the most complete row (highest completeness_score).
#
# Window partitions group NULL keys together, so rows without an overview are
# explicitly kept: otherwise every same-titled movie with a missing overview
# would be collapsed into a single row.
#
# NOTE: this cell drops `completeness_score`, so it can only run once per
# pipeline run; re-running it (or running earlier cells that need the score
# afterwards) fails with UNRESOLVED_COLUMN.
window = Window.partitionBy("title", "overview").orderBy(
    col("completeness_score").desc(),
    col("id").asc(),  # tie-breaker so the surviving row is deterministic
)

# Keep the most complete row of each (title, overview) group, then remove the
# helper columns that are no longer needed.
df = (
    df.withColumn("row_num", row_number().over(window))
    .filter(col("overview").isNull() | (col("row_num") == 1))
    .drop("row_num", "completeness_score")
)

{"ts": "2025-07-08 16:32:54.504", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `completeness_score` cannot be resolved. Did you mean one of the following? [`genres_vector`, `writers_vector`, `cast_vector`, `poster_path`, `release_date`]. SQLSTATE: 42703", "context": {"file": "jdk.internal.reflect.GeneratedMethodAccessor471.invoke(Unknown Source)", "line": "", "fragment": "col", "errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o18026.withColumn.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `completeness_score` cannot be resolved. Did you mean one of the following? [`genres_vector`, `writers_vector`, `cast_vector`, `poster_path`, `release_date`]. SQLSTATE: 42703;\n'Project [id#28333, title#28334, vote_average#2836

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `completeness_score` cannot be resolved. Did you mean one of the following? [`genres_vector`, `writers_vector`, `cast_vector`, `poster_path`, `release_date`]. SQLSTATE: 42703;
'Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#30881, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, genres_array#28370, production_countries_array#28371, production_companies_array#28372, spoken_languages_array#28373, ... 11 more fields]
+- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, CASE WHEN (cast(runtime#28367 as double) = cast(0 as double)) THEN 44.70439128656961 ELSE cast(runtime#28367 as double) END AS runtime#30881, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, genres_array#28370, production_countries_array#28371, production_companies_array#28372, spoken_languages_array#28373, ... 10 more fields]
   +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, genres_array#28370, production_countries_array#28371, production_companies_array#28372, spoken_languages_array#28373, ... 10 more fields]
      +- Filter (row_num#29618 = 1)
         +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 12 more fields]
            +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 13 more fields]
               +- Window [row_number() windowspecdefinition(title#28334, release_date#28365, completeness_score#28362 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_num#29618], [title#28334, release_date#28365], [completeness_score#28362 DESC NULLS LAST]
                  +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 11 more fields]
                     +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 11 more fields]
                        +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 10 more fields]
                           +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 9 more fields]
                              +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 8 more fields]
                                 +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 7 more fields]
                                    +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 6 more fields]
                                       +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 5 more fields]
                                          +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, director_of_photography#28354, producers#28356, music_composer#28357, imdb_rating#28358, imdb_votes#28359, poster_path#28360, completeness_score#28362, genres_array#28370, production_countries_array#28371, production_companies_array#28372, ... 4 more fields]
                                             +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 11 more fields]
                                                +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 10 more fields]
                                                   +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 9 more fields]
                                                      +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 8 more fields]
                                                         +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 7 more fields]
                                                            +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 6 more fields]
                                                               +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28369, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 5 more fields]
                                                                  +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, try_cast(try_cast(popularity#28346 as double) as float) AS popularity#28369, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 4 more fields]
                                                                     +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, runtime#28367, try_cast(try_cast(budget#28341 as double) as float) AS budget#28368, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28346, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 4 more fields]
                                                                        +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, revenue#28366, try_cast(try_cast(runtime#28340 as double) as float) AS runtime#28367, budget#28341, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28346, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 4 more fields]
                                                                           +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, release_date#28365, try_cast(try_cast(revenue#28339 as double) as float) AS revenue#28366, runtime#28340, budget#28341, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28346, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 4 more fields]
                                                                              +- Project [id#28333, title#28334, vote_average#28363, vote_count#28364, status#28337, try_cast(release_date#28338 as date) AS release_date#28365, revenue#28339, runtime#28340, budget#28341, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28346, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 4 more fields]
                                                                                 +- Project [id#28333, title#28334, vote_average#28363, try_cast(try_cast(vote_count#28336 as double) as int) AS vote_count#28364, status#28337, release_date#28338, revenue#28339, runtime#28340, budget#28341, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28346, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 4 more fields]
                                                                                    +- Project [id#28333, title#28334, try_cast(try_cast(vote_average#28335 as double) as float) AS vote_average#28363, vote_count#28336, status#28337, release_date#28338, revenue#28339, runtime#28340, budget#28341, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28346, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 4 more fields]
                                                                                       +- Project [id#28333, title#28334, vote_average#28335, vote_count#28336, status#28337, release_date#28338, revenue#28339, runtime#28340, budget#28341, imdb_id#28342, original_language#28343, original_title#28344, overview#28345, popularity#28346, tagline#28347, genres#28348, production_companies#28349, production_countries#28350, spoken_languages#28351, cast#28352, director#28353, director_of_photography#28354, writers#28355, producers#28356, music_composer#28357, ... 4 more fields]
                                                                                          +- Relation [id#28333,title#28334,vote_average#28335,vote_count#28336,status#28337,release_date#28338,revenue#28339,runtime#28340,budget#28341,imdb_id#28342,original_language#28343,original_title#28344,overview#28345,popularity#28346,tagline#28347,genres#28348,production_companies#28349,production_countries#28350,spoken_languages#28351,cast#28352,director#28353,director_of_photography#28354,writers#28355,producers#28356,music_composer#28357,... 3 more fields] csv


In [449]:
# Treat a runtime of 0 as "unknown" and replace it with the average runtime.
# The average is taken over the non-zero runtimes only: including the zero
# placeholders in the mean would drag the imputed value down.
mean_runtime = df.where(col("runtime") != 0).select(mean("runtime")).first()[0]
# The aggregate is None when no row has a non-zero runtime; in that case the
# column is left untouched rather than turning every 0 into a null.
if mean_runtime is not None:
    df = df.withColumn("runtime", when(col("runtime") == 0, mean_runtime).otherwise(col("runtime")))

                                                                                

In [None]:
# Count the null entries of every column and display the counts as a single row
# (one output column per input column, carrying the same name).
df.select(
    *(
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    )
).show()

[Stage 625:>                                                        (0 + 1) / 1]

+---+-----+------------+----------+------------+-------+-------+------+-----------------+--------------+--------+----------+-----------+------------+--------------------------+--------------------------+----------------------+----------+--------------+-------------+-------------+---------------------------+---------------------------+-----------------------+-----------+---------------+--------------+
| id|title|vote_average|vote_count|release_date|revenue|runtime|budget|original_language|original_title|overview|popularity|poster_path|genres_array|production_countries_array|production_companies_array|spoken_languages_array|cast_array|director_array|writers_array|genres_vector|production_countries_vector|production_companies_vector|spoken_languages_vector|cast_vector|director_vector|writers_vector|
+---+-----+------------+----------+------------+-------+-------+------+-----------------+--------------+--------+----------+-----------+------------+--------------------------+----------------

                                                                                

In [None]:
# Copy the identifier/display columns (id, title, original_title, poster_path),
# the `*_array` columns and the vote/budget figures into a side DataFrame,
# then remove them from `df`. "revenue" is removed from `df` as well, but it is
# not copied into the side DataFrame.
side_columns = [
    "id",
    "title",
    "original_title",
    "poster_path",
    "genres_array",
    "production_countries_array",
    "production_companies_array",
    "spoken_languages_array",
    "cast_array",
    "director_array",
    "writers_array",
    "vote_count",
    "vote_average",
    "budget",
]
df_id_title = df.select(*side_columns)
df = df.drop(*side_columns, "revenue")
df_id_title.show(n=5, truncate=False)

+---+--------------------------------+--------------------------------+--------------------------------+-------------------------------+--------------------------+----------------------------------------------+-------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------+-------------------------------------------------------------------------+
|i

In [None]:
# Preview the first five rows of `df`, printing full cell contents (no truncation).
df.show(n=5, truncate=False)

+------------+----------+------------+-----------+-------+---------+-----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 4. Transformation

## 5. Chargement en base