## 

# 🎬 Movies Data Pipeline
Ce notebook contient un pipeline de traitement des données du fichier `TMDB_all_movies.csv`, dans le cadre d'un projet de data engineering.

## 1. Ingestion

In [106]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Movie recommender").getOrCreate()

print(spark.version)

df = spark.read.csv("../data/TMDB_all_movies.csv", header=True, inferSchema=False, sep=",", quote='"', escape='"')

4.0.0


## 2. Exploration

In [107]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nullable = true)
 |-- status: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- director: string (nullable = true)
 |-- director_of_photography: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- producers: string (n

In [None]:
# df.isna().sum()
# df.isnull().sum() avec spark

# Type de chaque colonne :
    # vote_average: float
    # vote_count: int
    # release_date: date
    # revenue: float
    # runtime: float
    # budget: float
    # popularity: float
    # genres, production_countries, production_companies, spoken, cast, director, writers: string / one-hot encoded ?

df = df.withColumn("vote_average", df["vote_average"].try_cast("double")) \
    .withColumn("vote_count", df["vote_count"].try_cast("double")) \
    .withColumn("release_date", df["release_date"].cast("date")) \
    .withColumn("revenue", df["revenue"].try_cast("double")) \
    .withColumn("runtime", df["runtime"].try_cast("double")) \
    .withColumn("budget", df["budget"].try_cast("double")) \
    .withColumn("popularity", df["popularity"].try_cast("double"))



df.printSchema()




# df.select([pyspark.sql.functions.count(pyspark.sql.functions.when(pyspark.sql.functions.col(c).isNull(), c)).alias(c) for c in df.columns]).show()



root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: float (nullable = true)
 |-- vote_count: float (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: float (nullable = true)
 |-- runtime: float (nullable = true)
 |-- budget: float (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: float (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- director: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- music_composer: string (nullable = true)
 |-- poster_path: string (nullable = true)



In [109]:
# A faire :

# enlever toutes les lignes qui ne sont pas en release
# ensuite enlever la colonne "status"
# enlever imdb_id, tagline, director_of_photography, producers, imdb_rating, imdb_votes,
# drop les lignes qui n'ont pas d'overview

# Enlever les données non important pour le calcul et les mettre de coter pour plus tard
  # id, title, original_title, poster_path

# df["status"].unique()
# df["genres"]

# supprime toutes les lignes qui n'ont pas status release
df = df.filter(df["status"] == "Released")
# supprime la colonne status
df = df.drop("status")
# supprime les colonnes qui ne sont pas utiles pour le calcul
df = df.drop("imdb_id", "tagline", "director_of_photography", "producers", "imdb_rating", "imdb_votes")
# supprime les lignes qui n'ont pas d'overview
# df = df.filter(df["overview"].isNotNull() & (df["overview"] != ""))





In [116]:
df_test = df.filter(df["overview"].isNotNull() & (df["overview"] != ""))
df_test.count()

                                                                                

914169

In [None]:
from pyspark.sql.functions import count, col


duplicate_titles = (
    df.groupBy("title") \
      .agg(count("*").alias("count"))\
      .filter(col("count") > 1)\
)

duplicate_titles.show(100)






+--------------------+-----+
|               title|count|
+--------------------+-----+
|       Heading South|    2|
|                Nell|    3|
|             Nemesis|   20|
|          Der Tunnel|    6|
|              Deszcz|    2|
|         Deep Rising|    2|
|    Straight to Hell|    2|
|Dance with the Devil|    4|
|              Room 6|    2|
|              Heaven|   30|
|                Silk|   14|
|       Crossing Over|   10|
|   What No One Knows|    2|
|        The Big Bang|    6|
|          Riverworld|    2|
|       Sugar & Spice|    4|
|            Deep Red|    4|
|        Miracle Mile|    2|
|             Larceny|    4|
|     My Name Is Khan|    2|
|        Generation X|    2|
|      A Woman's Face|    3|
|         It's a Gift|    2|
|    La Vie de Bohème|    2|
|             Amateur|   14|
|              Crisis|   17|
|  A Woman Is a Woman|    2|
|       Natural Enemy|    2|
|           Surprise!|   12|
|Diary of a Chambe...|    3|
|         Lesser Evil|    2|
|             

                                                                                

In [123]:
# Explorer en profondeur productuion_companies vide

# df_null_production = df.filter(col("production_companies").isNull())

# df_null_production = df_null_production.filter(col("vote_count").cast("int") > 100 )

# df_null_production.show(100, truncate=False)
# import IntegerType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import expr

df = df.withColumn("vote_count", col("vote_count").cast("double").cast("int"))

# df.printSchema()
# df_null_production_votes = df.filter(
#     (col("production_companies").isNull()) & (col("vote_count") > 100)
# )



#df_null_production_votes = df.filter(
 #   (col("production_companies").isNull()) &
  #  (expr("try_cast(vote_count as double) > 100.0"))
#)

df_null_production_votes.show(truncate=False)

25/07/07 15:45:43 ERROR Executor: Exception in task 0.0 in stage 97.0 (TID 204)
org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '28.0' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
line 15 in cell [108]

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidInputInCastToNumberError(QueryExecutionErrors.scala:145)
	at org.apache.spark.sql.catalyst.util.UTF8StringUtils$.withException(UTF8StringUtils.scala:51)
	at org.apache.spark.sql.catalyst.util.UTF8StringUtils$.toIntExact(UTF8StringUtils.scala:34)
	at org.apache.spark.sql.catalyst.util.UTF8StringUtils.toIntExact(UTF8StringUtils.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.Buf

NumberFormatException: [CAST_INVALID_INPUT] The value '28.0' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
line 15 in cell [108]


## 3. Nettoyage

## 4. Transformation

## 5. Chargement en base