In [1]:
# Imports nécessaires
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, count, when

# Spark session for the MovieLens analysis.
# getOrCreate() returns the already-running session if there is one,
# otherwise it builds a new session with the settings below.
spark = (
    SparkSession.builder
    .appName("MovieLensAnalysis")
    # Default filesystem: the HDFS namenode
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    # Driver / executor memory sizes
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    # Off-heap memory switch and size
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "2g")
    # Number of partitions used by SQL shuffles
    .config("spark.sql.shuffle.partitions", "20")
    # Memory fraction settings
    .config("spark.memory.fraction", "0.8")
    .config("spark.memory.storageFraction", "0.3")
    .getOrCreate()
)

# Explicit schemas for the CSV files.
# Movies file: integer id, title and pipe-separated genre string (all nullable).
movies_schema = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("title", StringType(), True)
    .add("genres", StringType(), True)
)

# Ratings file: one row per (user, movie) rating with its timestamp (all nullable).
ratings_schema = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("rating", FloatType(), True)
    .add("timestamp", TimestampType(), True)
)

def load_and_profile_csv(path, schema, name):
    """Load a headered CSV file into a persisted DataFrame and print a short profile.

    The profile consists of the total row count, the number of null values
    per column and a five-row preview.

    Args:
        path: Location of the CSV file to read.
        schema: StructType applied to the file when reading it.
        name: Label used for the file in the printed messages.

    Returns:
        The persisted DataFrame.
    """
    df = spark.read.csv(path, header=True, schema=schema).persist()

    print(f"Nombre total de lignes dans {name}: {df.count()}")

    print(f"\nValeurs manquantes dans {name}:")
    # All null counts are computed in one aggregation (a single Spark job)
    # instead of one filter().count() job per column.
    null_counts = df.select(
        [count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]
    ).first()
    for c in df.columns:
        print(f"{c}: {null_counts[c]} valeurs manquantes")

    print(f"\nAperçu de {name}:")
    df.show(5)
    return df


try:
    # 1. Load and profile the movies file
    print("=== Analyse de movies.csv ===")
    movies_df = load_and_profile_csv(
        "hdfs://namenode:9000/datasets/movie.csv",
        movies_schema,
        "movies.csv",
    )

    # 2. Load and profile the ratings file
    print("\n=== Analyse de ratings.csv ===")
    ratings_df = load_and_profile_csv(
        "hdfs://namenode:9000/datasets/rating.csv",
        ratings_schema,
        "ratings.csv",
    )

except Exception as e:
    # Print troubleshooting hints, then re-raise: the following cells need
    # movies_df and ratings_df, so continuing after a failed load would only
    # produce a NameError (or silently reuse DataFrames from an earlier run).
    print("Erreur:", str(e))
    print("\nVérifications à faire:")
    print("1. HDFS est-il en cours d'exécution? → jps")
    print("2. Les fichiers sont-ils présents? → hdfs dfs -ls /datasets")
    print("3. Le hostname est-il correct dans core-site.xml?")
    raise


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/05 02:41:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


=== Analyse de movies.csv ===


                                                                                

Nombre total de lignes dans movies.csv: 27278

Valeurs manquantes dans movies.csv:
movieId: 0 valeurs manquantes
title: 0 valeurs manquantes
genres: 0 valeurs manquantes

Aperçu de movies.csv:
+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows


=== Analyse de ratings.csv ===


                                                                                

Nombre total de lignes dans ratings.csv: 20000263

Valeurs manquantes dans ratings.csv:
userId: 0 valeurs manquantes
movieId: 0 valeurs manquantes
rating: 0 valeurs manquantes
timestamp: 0 valeurs manquantes

Aperçu de ratings.csv:
+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows



In [2]:
# Imports nécessaires
from pyspark.sql.functions import col, count

def report_duplicates(df, name):
    """Print how many fully identical rows a DataFrame contains.

    A row counts as a duplicate when every column matches another row.
    When duplicates exist, up to five duplicated rows are shown together
    with their number of occurrences.

    Args:
        df: DataFrame to inspect.
        name: Label used for the dataset in the printed messages.

    Returns:
        Tuple ``(total, unique, duplicates)`` of row counts.
    """
    print(f"\n=== Analyse des doublons dans {name} ===")
    total = df.count()
    unique = df.distinct().count()
    duplicates = total - unique

    print(f"Nombre total de lignes: {total}")
    print(f"Nombre de lignes uniques: {unique}")
    print(f"Nombre de doublons: {duplicates}")

    if duplicates > 0:
        print(f"\nExemples de doublons dans {name}:")
        # Group on every column; groups with more than one row are duplicates
        df.groupBy(df.columns) \
            .count() \
            .filter(col("count") > 1) \
            .show(5)

    return total, unique, duplicates


# Duplicate analysis for both datasets (result names kept for later use)
total_movies, unique_movies, duplicates_movies = report_duplicates(movies_df, "movies.csv")
total_ratings, unique_ratings, duplicates_ratings = report_duplicates(ratings_df, "ratings.csv")


=== Analyse des doublons dans movies.csv ===
Nombre total de lignes: 27278
Nombre de lignes uniques: 27278
Nombre de doublons: 0

=== Analyse des doublons dans ratings.csv ===




Nombre total de lignes: 20000263
Nombre de lignes uniques: 20000263
Nombre de doublons: 0


                                                                                

In [3]:
# Inner join of the movies with their ratings on the shared movieId key
merged_df = movies_df.join(ratings_df, on="movieId", how="inner")

# Post-join checks: total row count, then number of distinct movies
merged_row_count = merged_df.count()
print(f"Nombre total de lignes après fusion: {merged_row_count}")
merged_movie_count = merged_df.select("movieId").distinct().count()
print(f"Nombre de films uniques: {merged_movie_count}")

                                                                                

Nombre total de lignes après fusion: 20000263




Nombre de films uniques: 26744


                                                                                

In [4]:
# Preview the first five rows of the joined DataFrame
print("\nAperçu des dataframe apres fusion")
merged_df.show(n=5)


Aperçu des dataframe apres fusion
+-------+--------------------+--------------------+------+------+-------------------+
|movieId|               title|              genres|userId|rating|          timestamp|
+-------+--------------------+--------------------+------+------+-------------------+
|      2|      Jumanji (1995)|Adventure|Childre...|     1|   3.5|2005-04-02 23:53:47|
|     29|City of Lost Chil...|Adventure|Drama|F...|     1|   3.5|2005-04-02 23:31:16|
|     32|Twelve Monkeys (a...|Mystery|Sci-Fi|Th...|     1|   3.5|2005-04-02 23:33:39|
|     47|Seven (a.k.a. Se7...|    Mystery|Thriller|     1|   3.5|2005-04-02 23:32:07|
|     50|Usual Suspects, T...|Crime|Mystery|Thr...|     1|   3.5|2005-04-02 23:29:40|
+-------+--------------------+--------------------+------+------+-------------------+
only showing top 5 rows



In [5]:
from pyspark.sql.functions import split, col, to_timestamp, year, count, avg

# Enrich the joined data with three derived columns:
#   genres_array - the genres string split on "|"
#   rating_date  - the rating timestamp
#   rating_year  - the year of that timestamp
rating_ts = to_timestamp(col("timestamp"))
enriched_df = merged_df.select(
    "*",
    split(col("genres"), "\\|").alias("genres_array"),
    rating_ts.alias("rating_date"),
    year(rating_ts).alias("rating_year"),
)

# Per-movie statistics: number of ratings and mean rating
ratings_per_movie = count("rating").alias("num_ratings")
mean_rating_per_movie = avg("rating").alias("avg_rating")
movie_stats = (
    enriched_df
    .groupBy("movieId", "title")
    .agg(ratings_per_movie, mean_rating_per_movie)
)


In [6]:
# Preview of the enriched data, without truncating long values
print("=== Aperçu des données enrichies ===")
preview_columns = [
    "movieId",
    "title",
    "genres_array",
    "rating",
    "rating_date",
    "rating_year",
]
enriched_df.select(*preview_columns).show(5, truncate=False)

# Best-rated movies, restricted to movies with at least 10 ratings
print("\n=== Top 10 films les mieux notés (minimum 10 notes) ===")
frequently_rated = movie_stats.where(col("num_ratings") >= 10)
ranked_movies = frequently_rated.orderBy(col("avg_rating").desc())
ranked_movies.select(
    "title",
    "num_ratings",
    # Two-decimal display of the average
    col("avg_rating").cast("decimal(10,2)").alias("avg_rating"),
).show(10, truncate=False)

# Summary statistics over the per-movie table.
# movie_stats is not persisted, so each separate action on it re-runs the
# join and the group-by; the three figures are therefore computed together
# in a single agg() (one Spark job instead of three).
# NOTE: "Note moyenne globale" is the unweighted mean of the per-movie
# averages (every movie counts once), not the mean over all individual ratings.
print("\n=== Résumé statistique ===")
summary = movie_stats.agg(
    count("*").alias("n_movies"),
    avg("avg_rating").alias("mean_avg_rating"),
    avg("num_ratings").alias("mean_num_ratings"),
).first()

print(f"Nombre total de films: {summary['n_movies']}")
print(f"Note moyenne globale: {summary['mean_avg_rating']:.2f}")
print(f"Nombre moyen de notes par film: {summary['mean_num_ratings']:.2f}")

=== Aperçu des données enrichies ===
+-------+---------------------------------------------------------------+--------------------------------------------+------+-------------------+-----------+
|movieId|title                                                          |genres_array                                |rating|rating_date        |rating_year|
+-------+---------------------------------------------------------------+--------------------------------------------+------+-------------------+-----------+
|2      |Jumanji (1995)                                                 |[Adventure, Children, Fantasy]              |3.5   |2005-04-02 23:53:47|2005       |
|29     |City of Lost Children, The (Cité des enfants perdus, La) (1995)|[Adventure, Drama, Fantasy, Mystery, Sci-Fi]|3.5   |2005-04-02 23:31:16|2005       |
|32     |Twelve Monkeys (a.k.a. 12 Monkeys) (1995)                      |[Mystery, Sci-Fi, Thriller]                 |3.5   |2005-04-02 23:33:39|2005       |
|47     |Seven 

                                                                                

+---------------------------------------------+-----------+----------+
|title                                        |num_ratings|avg_rating|
+---------------------------------------------+-----------+----------+
|Zero Motivation (Efes beyahasei enosh) (2014)|11         |4.50      |
|Shawshank Redemption, The (1994)             |63366      |4.45      |
|Godfather, The (1972)                        |41355      |4.36      |
|Usual Suspects, The (1995)                   |47006      |4.33      |
|Schindler's List (1993)                      |50054      |4.31      |
|Death on the Staircase (Soupçons) (2004)     |21         |4.29      |
|Godfather: Part II, The (1974)               |27398      |4.28      |
|Seven Samurai (Shichinin no samurai) (1954)  |11611      |4.27      |
|Rear Window (1954)                           |17449      |4.27      |
|O Auto da Compadecida (Dog's Will, A) (2000) |13         |4.27      |
+---------------------------------------------+-----------+----------+
only s

                                                                                

Nombre total de films: 26744


                                                                                

Note moyenne globale: 3.13




Nombre moyen de notes par film: 747.84


                                                                                

In [7]:
# Spark SQL functions used in this cell.
# min and max are imported under aliases: importing them by their plain names
# would replace Python's built-in min()/max() for the rest of the session.
from pyspark.sql.functions import col, count, avg, stddev, explode
from pyspark.sql.functions import min as spark_min, max as spark_max

# Distribution of the rating values (number of ratings per value)
print("=== Distribution des Notes ===")
ratings_distribution = enriched_df.groupBy("rating") \
    .count() \
    .orderBy("rating")
ratings_distribution.show()

# Descriptive statistics of the ratings: mean, standard deviation, min, max
print("\n=== Statistiques des Notes ===")
rating_stats = enriched_df.select(
    avg("rating").alias("moyenne"),
    stddev("rating").alias("ecart_type"),
    spark_min("rating").alias("min"),
    spark_max("rating").alias("max")
)
rating_stats.show()

# Ratings per year: number of ratings and mean rating, ordered by year
print("\n=== Distribution par Année ===")
yearly_stats = (
    enriched_df
    .groupBy("rating_year")
    .agg(
        count("rating").alias("nombre_notes"),
        avg("rating").alias("note_moyenne"),
    )
    .orderBy("rating_year")
)
yearly_stats.show(5)

# Ratings per genre: each rating is repeated once per genre of its movie,
# then counted and averaged per genre, most-rated genre first
print("\n=== Distribution par Genre ===")
genre_stats = (
    enriched_df
    .select(explode(col("genres_array")).alias("genre"), "rating")
    .groupBy("genre")
    .agg(
        count("rating").alias("nombre_notes"),
        avg("rating").alias("note_moyenne"),
    )
    .orderBy(col("nombre_notes").desc())
)
genre_stats.show()

=== Distribution des Notes ===


                                                                                

+------+-------+
|rating|  count|
+------+-------+
|   0.5| 239125|
|   1.0| 680732|
|   1.5| 279252|
|   2.0|1430997|
|   2.5| 883398|
|   3.0|4291193|
|   3.5|2200156|
|   4.0|5561926|
|   4.5|1534824|
|   5.0|2898660|
+------+-------+


=== Statistiques des Notes ===


                                                                                

+------------------+-----------------+---+---+
|           moyenne|       ecart_type|min|max|
+------------------+-----------------+---+---+
|3.5255285642993797|1.051988919294243|0.5|5.0|
+------------------+-----------------+---+---+


=== Distribution par Année ===


                                                                                

+-----------+------------+------------------+
|rating_year|nombre_notes|      note_moyenne|
+-----------+------------+------------------+
|       1995|           4|              3.75|
|       1996|     1612609| 3.545408093344388|
|       1997|      700982| 3.588521531223341|
|       1998|      308070|3.5124841756743597|
|       1999|     1198384| 3.616336666711171|
+-----------+------------+------------------+
only showing top 5 rows


=== Distribution par Genre ===




+------------------+------------+------------------+
|             genre|nombre_notes|      note_moyenne|
+------------------+------------+------------------+
|             Drama|     8857853|3.6742955093068264|
|            Comedy|     7502234|3.4260113054324886|
|            Action|     5614208|  3.44386376493354|
|          Thriller|     5313506|  3.50711121809216|
|         Adventure|     4380351|3.5018926565473865|
|           Romance|     3802002| 3.541802581902903|
|             Crime|     3298335|3.6745276025631113|
|            Sci-Fi|     3150141|3.4367726714455005|
|           Fantasy|     2111403|3.5059453358738244|
|          Children|     1669249|3.4081137685270444|
|           Mystery|     1557282| 3.663508921312903|
|            Horror|     1482737|3.2772238097518307|
|         Animation|     1140476|3.6174939235897994|
|               War|     1048618|3.8095307347384844|
|           Musical|      870915| 3.558090628821412|
|              IMAX|      492366| 3.6559459832

                                                                                

In [8]:
# Save the enriched dataset as Parquet, overwriting any existing output
output_path = "hdfs://namenode:9000/datasets/clean_movies_ratings"
enriched_df.write.parquet(output_path, mode="overwrite")

# Read the saved data back and report its row count and first rows
print("=== Vérification de la sauvegarde ===")
saved_df = spark.read.parquet(output_path)
saved_row_count = saved_df.count()
print(f"Nombre de lignes sauvegardées: {saved_row_count}")
print("\n=== Aperçu des données sauvegardées ===")
saved_df.show(5)

                                                                                

=== Vérification de la sauvegarde ===
Nombre de lignes sauvegardées: 20000263

=== Aperçu des données sauvegardées ===


[Stage 111:>                                                        (0 + 1) / 1]

+-------+--------------------+--------------------+------+------+-------------------+--------------------+-------------------+-----------+
|movieId|               title|              genres|userId|rating|          timestamp|        genres_array|        rating_date|rating_year|
+-------+--------------------+--------------------+------+------+-------------------+--------------------+-------------------+-----------+
|      2|      Jumanji (1995)|Adventure|Childre...|     1|   3.5|2005-04-02 23:53:47|[Adventure, Child...|2005-04-02 23:53:47|       2005|
|     29|City of Lost Chil...|Adventure|Drama|F...|     1|   3.5|2005-04-02 23:31:16|[Adventure, Drama...|2005-04-02 23:31:16|       2005|
|     32|Twelve Monkeys (a...|Mystery|Sci-Fi|Th...|     1|   3.5|2005-04-02 23:33:39|[Mystery, Sci-Fi,...|2005-04-02 23:33:39|       2005|
|     47|Seven (a.k.a. Se7...|    Mystery|Thriller|     1|   3.5|2005-04-02 23:32:07| [Mystery, Thriller]|2005-04-02 23:32:07|       2005|
|     50|Usual Suspects, T.

                                                                                