In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf() \
    .set("spark.driver.memory", "4g") \
    .set("spark.executor.memory", "4g") \
    .set("spark.executor.cores", "2") \
    .set("spark.driver.maxResultSize", "2g")


In [2]:
spark = SparkSession.builder \
    .appName("BigDataProject") \
    .config(conf=conf) \
    .getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/30 10:13:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/30 10:13:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
movies_df = spark.read.csv(
    "hdfs://namenode:9000/datasets/movie.csv",
    header=True,        
    inferSchema=True    
)

In [4]:
movies_df

DataFrame[movieId: int, title: string, genres: string]

In [5]:
rating_df = spark.read.csv(
    "hdfs://namenode:9000/datasets/rating.csv",
    header=True,        
    inferSchema=True    
)


                                                                                

In [6]:
movies_df.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [7]:
rating_df.show(5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows



In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

movie_schema = StructType([
    StructField("movieId", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("genres", StringType(), True)
])
print(movie_schema)

StructType([StructField('movieId', IntegerType(), True), StructField('title', StringType(), True), StructField('genres', StringType(), True)])


In [9]:
rating_schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movie", StringType(), True),
    StructField("rating", StringType(), True),
    StructField("timestamp", StringType(), True)
])
print(rating_schema)

StructType([StructField('userId', IntegerType(), True), StructField('movie', StringType(), True), StructField('rating', StringType(), True), StructField('timestamp', StringType(), True)])


In [10]:
movies_df.groupBy("movieId").count().show()
rating_df.groupBy("userId").count().show()


                                                                                

+-------+-----+
|movieId|count|
+-------+-----+
|    148|    1|
|    463|    1|
|    471|    1|
|    496|    1|
|    833|    1|
|   1088|    1|
|   1238|    1|
|   1342|    1|
|   1580|    1|
|   1591|    1|
|   1645|    1|
|   1829|    1|
|   1959|    1|
|   2122|    1|
|   2142|    1|
|   2366|    1|
|   2659|    1|
|   2866|    1|
|   3175|    1|
|   3749|    1|
+-------+-----+
only showing top 20 rows





+------+-----+
|userId|count|
+------+-----+
|   148|  128|
|   463|   80|
|   471|  548|
|   496|  168|
|   833|   47|
|  1088|   60|
|  1238|   97|
|  1342|   25|
|  1580|   42|
|  1591|   50|
|  1645|  108|
|  1829|  288|
|  1959|  226|
|  2122|  115|
|  2142|   29|
|  2366|   42|
|  2659|  101|
|  2866|  940|
|  3175|   22|
|  3749|   44|
+------+-----+
only showing top 20 rows



                                                                                

In [11]:
#vérification des doublouns 

In [12]:
print("Doublons dans movies :", movies_df.count(), "vs", movies_df.dropDuplicates().count())


Doublons dans movies : 27278 vs 27278


In [13]:
print("Doublons dans rating :", rating_df.count(), "vs", rating_df.dropDuplicates().count())


25/04/30 10:13:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:13:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:13:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:13:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:13:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:13:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:13:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:13:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:13:50 WARN RowBasedKeyValueBatch: Calling spill() on

Doublons dans rating : 20000263 vs 20000263


                                                                                

# Vérification des valeurs manquantes 

In [14]:
from pyspark.sql.functions import col, isnan, isnull

# Pour rating
rating_df.select([col(c).isNull().alias(c) for c in rating_df.columns]).summary("").show()

# Pour movies
movies_df.select([col(c).isNull().alias(c) for c in movies_df.columns]).summary("").show()


+-------+
|summary|
+-------+
|       |
+-------+

+-------+
|summary|
+-------+
|       |
+-------+



Fussionner les deux df avec Join

In [15]:
rating_movies = rating_df.join(movies_df, on="movieId", how="inner")
rating_movies.show(5)

+-------+------+------+-------------------+--------------------+--------------------+
|movieId|userId|rating|          timestamp|               title|              genres|
+-------+------+------+-------------------+--------------------+--------------------+
|      2|     1|   3.5|2005-04-02 23:53:47|      Jumanji (1995)|Adventure|Childre...|
|     29|     1|   3.5|2005-04-02 23:31:16|City of Lost Chil...|Adventure|Drama|F...|
|     32|     1|   3.5|2005-04-02 23:33:39|Twelve Monkeys (a...|Mystery|Sci-Fi|Th...|
|     47|     1|   3.5|2005-04-02 23:32:07|Seven (a.k.a. Se7...|    Mystery|Thriller|
|     50|     1|   3.5|2005-04-02 23:29:40|Usual Suspects, T...|Crime|Mystery|Thr...|
+-------+------+------+-------------------+--------------------+--------------------+
only showing top 5 rows



In [16]:
rating_movies.count()

                                                                                

20000263

#Nombre de lignes distinct

In [17]:
rating_movies.select("userId", "movieId").distinct().count()


25/04/30 10:14:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:14:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:14:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:14:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:14:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:14:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:14:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:14:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/30 10:14:26 WARN RowBasedKeyValueBatch: Calling spill() on

20000263

In [18]:
rating_movies.select("movieId").distinct().count()
rating_movies.select("userId").distinct().count()


                                                                                

138493

# filtrage des notes extremes

In [19]:
rating_df = rating_df.filter((col("rating") >= 0.5) & (col("rating") <= 5.0))
rating_df.count()

                                                                                

20000263

# convert datetime

In [20]:
from pyspark.sql.functions import to_timestamp

rating_movies = rating_movies.withColumn("timestamp", to_timestamp("timestamp", "yyyy-MM-dd HH:mm:ss"))

rating_movies.printSchema()
rating_movies.select("timestamp").show(5, truncate=False)


root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------------------+
|timestamp          |
+-------------------+
|2005-04-02 23:53:47|
|2005-04-02 23:31:16|
|2005-04-02 23:33:39|
|2005-04-02 23:32:07|
|2005-04-02 23:29:40|
+-------------------+
only showing top 5 rows



# Importation du modèle "ALS"

In [21]:
!pip install numpy

[0m

In [22]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# Données de train et test 

In [23]:
(training, test) = rating_movies.randomSplit([0.8, 0.2], seed=42)


In [24]:
spark.version

'3.5.1'

In [25]:
from pyspark.ml.recommendation import ALS
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=10,             # nombre de facteurs latents
    maxIter=10,          # nombre d’itérations
    regParam=0.1,        # régularisation
    numItemBlocks=10,    # Nombre de blocs pour paralléliser le calcul 
    nonnegative=True
)
model = als.fit(training)

25/04/30 10:15:54 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

# Evaluation du modèle 

In [27]:
predictions = model.transform(test)

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)
print(f"\nPerformance du modèle:")
print(f"RMSE sur le jeu de test : {rmse:.3f}")





Performance du modèle:
RMSE sur le jeu de test : nan


                                                                                

In [28]:
predictions = model.transform(test)
predictions.select("userId", "movieId", "rating", "prediction").show(10)




+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    31|      1|   3.0|  3.293149|
|  8947|      1|   5.0|  3.692583|
|  8986|      1|   4.0| 3.4103339|
|  9027|      1|   5.0|  3.087177|
| 17486|      1|   3.0|  3.586236|
| 17536|      1|   3.0|  3.508809|
| 17539|      1|   4.0| 3.0576348|
| 26273|      1|   3.0| 3.2686825|
| 26309|      1|   4.5| 3.3468356|
| 35112|      1|   5.0|   4.07642|
+------+-------+------+----------+
only showing top 10 rows



                                                                                

# Suppression des Nan dans la prediction

In [29]:
from pyspark.sql.functions import isnan, col

predictions.filter(col("prediction").isNull() | isnan(col("prediction"))).count()


                                                                                

960

In [30]:
predictions_clean = predictions.na.drop(subset=["prediction"])

In [31]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions_clean)
print(f"la performance RMSE  : {rmse}")




la performance RMSE  : 0.8166866139718709


                                                                                

In [37]:
model.write().overwrite().save("hdfs://namenode:9000/models/als")


                                                                                