## 

# 🎬 Movies Data Pipeline
Ce notebook contient un pipeline de traitement des données du fichier `TMDB_all_movies.csv`, dans le cadre d'un projet de data engineering.

## 1. Ingestion

In [21]:
from pyspark.sql import SparkSession
import pyspark

In [22]:
spark = SparkSession.builder.appName("Movie recommender").getOrCreate()

df = spark.read.csv("../data/TMDB_all_movies.csv", header=True, inferSchema=True, sep=",", quote='"', escape='"')

                                                                                

In [23]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nullable = true)
 |-- status: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- director: string (nullable = true)
 |-- director_of_photography: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- producers: string (n

In [24]:
# Type de chaque colonne :
    # vote_average: float
    # vote_count: int
    # release_date: date
    # revenue: float
    # runtime: float
    # budget: float
    # popularity: float
    # genres, production_countries, production_companies, spoken, cast, director, writers: string / one-hot encoded ?

df = df.withColumn("vote_average", df["vote_average"].try_cast("double").try_cast("float")) \
    .withColumn("vote_count", df["vote_count"].try_cast("double").try_cast("int")) \
    .withColumn("release_date", df["release_date"].try_cast("date")) \
    .withColumn("revenue", df["revenue"].try_cast("double").try_cast("float")) \
    .withColumn("runtime", df["runtime"].try_cast("double").try_cast("float")) \
    .withColumn("budget", df["budget"].try_cast("double").try_cast("float")) \
    .withColumn("popularity", df["popularity"].try_cast("double").try_cast("float"))

df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: float (nullable = true)
 |-- vote_count: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: float (nullable = true)
 |-- runtime: float (nullable = true)
 |-- budget: float (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: float (nullable = true)
 |-- tagline: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- director: string (nullable = true)
 |-- director_of_photography: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- producers: string (nullabl

## 2. Exploration

In [25]:
df.show(5, truncate=False)

+---+--------------------------------+------------+----------+--------+------------+-----------+-------+---------+---------+-----------------+--------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
# null values by columns
df.select([pyspark.sql.functions.count(pyspark.sql.functions.when(pyspark.sql.functions.col(c).isNull(), c)).alias(c) for c in df.columns]).show()



+---+-----+------------+----------+------+------------+-------+-------+------+-------+-----------------+--------------+--------+----------+-------+------+--------------------+--------------------+----------------+------+--------+-----------------------+-------+---------+--------------+-----------+----------+-----------+
| id|title|vote_average|vote_count|status|release_date|revenue|runtime|budget|imdb_id|original_language|original_title|overview|popularity|tagline|genres|production_companies|production_countries|spoken_languages|  cast|director|director_of_photography|writers|producers|music_composer|imdb_rating|imdb_votes|poster_path|
+---+-----+------------+----------+------+------------+-------+-------+------+-------+-----------------+--------------+--------+----------+-------+------+--------------------+--------------------+----------------+------+--------+-----------------------+-------+---------+--------------+-----------+----------+-----------+
|  0| 4574|        7798|      7977

                                                                                

In [27]:
# overview null but vote_count > 100
df.filter(
    (df["overview"].isNull()) & (df["vote_count"] > 100)
).show(truncate=False)

[Stage 23:>                                                         (0 + 2) / 2]

+-------------------------------------------------------+-------------------------------------------+------------+----------+-------------------------------------------------+------------+--------+-------+---------+---------+-----------------+-------------------------------+--------+----------+--------------+----------------------------------+------------------------------------------------------------------+------------------------+----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [28]:
# get duplicate titles
from pyspark.sql.functions import count, col

duplicate_titles = (
    df.groupBy("title") \
      .agg(count("*").alias("count"))\
      .filter(col("count") > 1)\
)

duplicate_titles.show(100)



+--------------------+-----+
|               title|count|
+--------------------+-----+
|       Heading South|    2|
|                Nell|    3|
|             Nemesis|   20|
|          Der Tunnel|    6|
|              Deszcz|    2|
|         Deep Rising|    2|
|    Straight to Hell|    2|
|Dance with the Devil|    4|
|              Room 6|    2|
|              Heaven|   30|
|                Silk|   14|
|       Crossing Over|   10|
|   What No One Knows|    2|
|        The Big Bang|    6|
|          Riverworld|    2|
|       Sugar & Spice|    4|
|            Deep Red|    4|
|        Miracle Mile|    2|
|             Larceny|    4|
|     My Name Is Khan|    2|
|        Generation X|    2|
|      A Woman's Face|    3|
|         It's a Gift|    2|
|    La Vie de Bohème|    2|
|             Amateur|   14|
|              Crisis|   17|
|  A Woman Is a Woman|    2|
|       Natural Enemy|    2|
|           Surprise!|   12|
|Diary of a Chambe...|    3|
|         Lesser Evil|    2|
|             

                                                                                

In [29]:
# get rows with null production_companies and vote_count > 100
from pyspark.sql.functions import expr

df.filter(
    (col("production_companies").isNull()) &
    (expr("try_cast(vote_count as double) > 100.0"))
).show(truncate=False)

+-----+-----------------------------------------+------------+----------+--------+------------+---------+-------+---------+---------+-----------------+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+---------------------------------------------------------------------------+--------------------------------+--------------------+---------------------------------+----------------------------------+------------------------------------------------------------------------------------------------------

## 3. Nettoyage

In [33]:
# enlever toutes les lignes qui ne sont pas en released
df = df.filter(df["status"] == "Released")
df.select("status").distinct().show()



+--------+
|  status|
+--------+
|Released|
+--------+



                                                                                

In [34]:
# drop columns that are not useful for the calculation
df = df.drop("status", "imdb_id", "tagline", "director_of_photography", "producers", "imdb_rating", "imdb_votes")
df.show(5, truncate=False)

+---+--------------------------------+------------+----------+------------+-----------+-------+---------+-----------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# A faire :

# drop les lignes qui n'ont pas d'overview

# Enlever les données non important pour le calcul et les mettre de coter pour plus tard
  # id, title, original_title, poster_path

# df["genres"]

# supprime les lignes qui n'ont pas d'overview
# df = df.filter(df["overview"].isNotNull() & (df["overview"] != ""))

## 4. Transformation

## 5. Chargement en base