In [1]:
import sys
# Compatibility shim: make sure a module named "distutils" is importable.
# Presumably needed on interpreters where distutils is no longer shipped in
# the standard library and a downstream dependency still imports it -- TODO confirm.
try:
    import distutils
except ModuleNotFoundError:
    # Fall back to the copy bundled with setuptools and register it under the
    # top-level name so any later `import distutils` resolves to this module.
    import setuptools._distutils as distutils
    sys.modules["distutils"] = distutils

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, count
from pyspark.sql.functions import explode, split


# Create (or reuse) the Spark session for this notebook. Both the driver and
# the executor memory settings are requested at 4g.
spark = (
    SparkSession.builder
    .appName("MoviesRatings")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [3]:
# Load both CSV files, treating the first line as a header and letting Spark
# infer the column types, then preview the first 10 rows of each.
movies = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("ml-latest-small/movies.csv")
)
ratings = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("ml-latest-small/ratings.csv")
)
movies.show(10)
ratings.show(10)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
+-------+--------------------+--------------------+
only showing top 10 rows

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|

In [4]:
# Print the inferred schema of each DataFrame (movies first, then ratings).
for frame in (movies, ratings):
    frame.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [5]:
# Total number of rows in each DataFrame.
n_movies = movies.count()
print(f"Nombre total de films : {n_movies}")
n_ratings = ratings.count()
print(f"Nombre total de notes : {n_ratings}")

Nombre total de films : 9742
Nombre total de notes : 100836


In [6]:
# Number of distinct users and of distinct movies appearing in the ratings.
n_users = ratings.select("userId").distinct().count()
print(f"Nombre d'utilisateurs uniques : {n_users}")
n_rated_movies = ratings.select("movieId").distinct().count()
print(f"Nombre de films notés : {n_rated_movies}")

Nombre d'utilisateurs uniques : 610
Nombre de films notés : 9724


In [7]:
# Summary statistics (count, mean, stddev, min, max) of the rating column.
rating_summary = ratings.describe("rating")
rating_summary.show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|            100836|
|   mean| 3.501556983616962|
| stddev|1.0425292390606342|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



In [8]:
from pyspark.sql.functions import col
# Number of ratings per rating value, listed from the lowest value to the highest.
(
    ratings
    .groupBy("rating")
    .count()
    .orderBy(col("rating"))
    .show()
)

+------+-----+
|rating|count|
+------+-----+
|   0.5| 1370|
|   1.0| 2811|
|   1.5| 1791|
|   2.0| 7551|
|   2.5| 5550|
|   3.0|20047|
|   3.5|13136|
|   4.0|26818|
|   4.5| 8551|
|   5.0|13211|
+------+-----+



In [9]:
# Per-movie average rating and number of ratings.
movie_stats = ratings.groupBy("movieId").agg(
    avg("rating").alias("avg_rating"),
    count("rating").alias("count_rating"),
)

# Keep only movies with at least 10 ratings, best average first.
best_movies = (
    movie_stats
    .filter(col("count_rating") >= 10)
    .orderBy(col("avg_rating").desc())
)
best_movies.show(10)

+-------+-----------------+------------+
|movieId|       avg_rating|count_rating|
+-------+-----------------+------------+
|   1041|4.590909090909091|          11|
|   3451|4.545454545454546|          11|
|   1178|4.541666666666667|          12|
|   1104|            4.475|          20|
|   2360|4.458333333333333|          12|
|   1217|4.433333333333334|          15|
|    318|4.429022082018927|         317|
|    951|4.392857142857143|          14|
|   1927|             4.35|          10|
|    922|4.333333333333333|          27|
+-------+-----------------+------------+
only showing top 10 rows



In [10]:
# Average number of ratings received per movie: count the ratings of each
# movie, then average those per-movie counts.
avg_ratings_per_movie = (
    ratings
    .groupBy("movieId")
    .count()
    .agg({"count": "avg"})
)
avg_ratings_per_movie.show()

+------------------+
|        avg(count)|
+------------------+
|10.369806663924312|
+------------------+



In [11]:
# The genres column is a '|'-separated list: split it and emit one row per
# (movie, genre) pair in a new "genre" column.
genre_array = split(col("genres"), "\\|")
movies_genres = movies.withColumn("genre", explode(genre_array))

# Number of movies per genre, most frequent genre first.
(
    movies_genres
    .groupBy("genre")
    .count()
    .orderBy(col("count").desc())
    .show()
)


+------------------+-----+
|             genre|count|
+------------------+-----+
|             Drama| 4361|
|            Comedy| 3756|
|          Thriller| 1894|
|            Action| 1828|
|           Romance| 1596|
|         Adventure| 1263|
|             Crime| 1199|
|            Sci-Fi|  980|
|            Horror|  978|
|           Fantasy|  779|
|          Children|  664|
|         Animation|  611|
|           Mystery|  573|
|       Documentary|  440|
|               War|  382|
|           Musical|  334|
|           Western|  167|
|              IMAX|  158|
|         Film-Noir|   87|
|(no genres listed)|   34|
+------------------+-----+



In [12]:
# Discard every movie row that contains a null value, then preview the result.
movies = movies.dropna()
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [13]:
# Discard every rating row that contains a null value, then preview the result.
ratings = ratings.dropna()
ratings.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [14]:
# Attach movie metadata to each rating: inner join on the shared movieId key.
movies_r = movies.join(
    ratings,
    on="movieId",
    how="inner",
)

# Preview the movies table, then the joined table.
movies.show()
movies_r.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import VectorAssembler,StandardScaler
from pyspark.sql import Row


In [16]:
# Pack the rating and timestamp columns into a single vector column named
# "features", as required by the ML feature transformers used below.
ass = VectorAssembler(
    inputCols=["rating", "timestamp"],
    outputCol="features",
)
ratings_vect = ass.transform(ratings)

# Show the assembled vectors without truncating them.
ratings_vect.select("features").show(truncate=False)


+------------------+
|features          |
+------------------+
|[4.0,9.64982703E8]|
|[4.0,9.64981247E8]|
|[4.0,9.64982224E8]|
|[5.0,9.64983815E8]|
|[5.0,9.64982931E8]|
|[3.0,9.649824E8]  |
|[5.0,9.64980868E8]|
|[4.0,9.64982176E8]|
|[5.0,9.64984041E8]|
|[5.0,9.649841E8]  |
|[5.0,9.6498365E8] |
|[5.0,9.64981208E8]|
|[3.0,9.64980985E8]|
|[5.0,9.64981179E8]|
|[4.0,9.64980908E8]|
|[5.0,9.6498168E8] |
|[3.0,9.64982967E8]|
|[3.0,9.6498231E8] |
|[5.0,9.64981179E8]|
|[4.0,9.64982563E8]|
+------------------+
only showing top 20 rows



Standardisation avec StandardScaler

In [17]:
# Standardise the "features" vector (centre on the mean, scale to unit
# standard deviation) into a new "scal_features" column.
scaler_params = dict(
    inputCol="features",
    outputCol="scal_features",
    withMean=True,
    withStd=True,
)
scal = StandardScaler(**scaler_params)
scal_model = scal.fit(ratings_vect)
ratings_scal = scal_model.transform(ratings_vect)

# Second, identically configured scaler; it is not fitted in this cell.
scal_2 = StandardScaler(**scaler_params)

In [18]:
# Projection of the scaled ratings onto the identifier columns and the scaled
# vector; as the last expression of the cell, its repr is displayed.
ratings_scal.select(["userId", "movieId", "scal_features"])

DataFrame[userId: int, movieId: int, scal_features: vector]

In [19]:
# Rebuild the movie/rating join, this time from the scaled ratings so the
# "features" and "scal_features" columns are carried along.
movies_r = movies.join(
    ratings_scal,
    on="movieId",
    how="inner",
)
movies_r.show()

+-------+--------------------+--------------------+------+------+---------+------------------+--------------------+
|movieId|               title|              genres|userId|rating|timestamp|          features|       scal_features|
+-------+--------------------+--------------------+------+------+---------+------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|     1|   4.0|964982703|[4.0,9.64982703E8]|[0.47810938792676...|
|      3|Grumpier Old Men ...|      Comedy|Romance|     1|   4.0|964981247|[4.0,9.64981247E8]|[0.47810938792676...|
|      6|         Heat (1995)|Action|Crime|Thri...|     1|   4.0|964982224|[4.0,9.64982224E8]|[0.47810938792676...|
|     47|Seven (a.k.a. Se7...|    Mystery|Thriller|     1|   5.0|964983815|[5.0,9.64983815E8]|[1.43731509893497...|
|     50|Usual Suspects, T...|Crime|Mystery|Thr...|     1|   5.0|964982931|[5.0,9.64982931E8]|[1.43731509893497...|
|     70|From Dusk Till Da...|Action|Comedy|Hor...|     1|   3.0|9649824

ACP (analyse en composantes principales)

In [20]:
from pyspark.ml.feature import PCA

In [21]:
# Fit a PCA on the standardised feature vectors, keeping k=2 components.
# The input vector is assembled from two columns (rating, timestamp), so k
# equals the number of input features.
acp = PCA(
    k=2,
    inputCol="scal_features",
    outputCol="acp_features",
)
model_acp = acp.fit(movies_r)

In [22]:
# Project every row onto the fitted principal components; the result gains
# an "acp_features" column.
df_acp = model_acp.transform(movies_r)

In [23]:
# Preview the PCA projection next to the user id and genres (long cells truncated).
acp_preview = df_acp.select("userId", "genres", "acp_features")
acp_preview.show(truncate=True)


+------+--------------------+--------------------+
|userId|              genres|        acp_features|
+------+--------------------+--------------------+
|     1|Adventure|Animati...|[-1.1259502197759...|
|     1|      Comedy|Romance|[-1.1259549804461...|
|     1|Action|Crime|Thri...|[-1.1259517859580...|
|     1|    Mystery|Thriller|[-1.8042074466711...|
|     1|Crime|Mystery|Thr...|[-1.8042103370781...|
|     1|Action|Comedy|Hor...|[-0.4476903477006...|
|     1|Adventure|Comedy|...|[-1.8042170824508...|
|     1|    Action|Drama|War|[-1.1259519429031...|
|     1|Action|Drama|Roma...|[-1.8042067077210...|
|     1|          Comedy|War|[-1.8042065148092...|
|     1|Action|Romance|We...|[-1.8042079861702...|
|     1|              Comedy|[-1.8042159707558...|
|     1|              Comedy|[-0.4476949743135...|
|     1|    Adventure|Comedy|[-1.8042160655768...|
|     1|        Comedy|Drama|[-1.1259560888714...|
|     1|Action|Adventure|...|[-1.8042144274616...|
|     1|Comedy|Crime|Dram...|[-

In [24]:
# Proportion of variance explained by each retained principal component.
explained = model_acp.explainedVariance.toArray()
print(f"variance expliquée:{explained}")

variance expliquée:[0.50290121 0.49709879]


In [25]:
# 80/20 train/test split. A fixed seed is passed so that the same rows land in
# each split on every run; without it the split (and every metric computed
# from it) changes each time the cell is executed.
x_train, x_test = movies_r.randomSplit([0.8, 0.2], seed=42)


In [26]:
# 80/20 train/test split (this reassigns x_train / x_test). The seed makes the
# split reproducible across runs.
x_train, x_test = movies_r.randomSplit([0.8, 0.2], seed=42)

# ALS collaborative-filtering recommender trained on (userId, movieId, rating).
# coldStartStrategy="drop" removes rows whose prediction would be NaN (users
# or movies unseen during training) so they do not poison the evaluation.
# The seed fixes the random initialisation of the factors, making the trained
# model reproducible.
als = ALS(
    maxIter=10,
    regParam=0.01,
    rank=10,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)



In [27]:
# Train the ALS model on the training split.
model = als.fit(x_train)

# Score the held-out split; the output gains a "prediction" column.
pred = model.transform(x_test)
pred.show()

+-------+--------------------+--------------------+------+------+----------+-------------------+--------------------+----------+
|movieId|               title|              genres|userId|rating| timestamp|           features|       scal_features|prediction|
+-------+--------------------+--------------------+------+------+----------+-------------------+--------------------+----------+
|    356| Forrest Gump (1994)|Comedy|Drama|Roma...|   148|   4.0|1482548476|[4.0,1.482548476E9]|[0.47810938792676...| 3.5648632|
|   1088|Dirty Dancing (1987)|Drama|Musical|Rom...|   463|   3.5|1145460096|[3.5,1.145460096E9]|[-0.0014934675773...| 2.4859412|
|   1092|Basic Instinct (1...|Crime|Mystery|Thr...|   463|   3.0|1145460035|[3.0,1.145460035E9]|[-0.4810963230814...| 3.3892093|
|   3753| Patriot, The (2000)|    Action|Drama|War|   463|   4.0|1145460023|[4.0,1.145460023E9]|[0.47810938792676...| 3.8307564|
|   4310| Pearl Harbor (2001)|Action|Drama|Roma...|   463|   3.0|1145460333|[3.0,1.145460333E9]|[

In [28]:
# Evaluator configured for the RMSE metric, comparing the "rating" label
# column with the "prediction" column.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest
# of the kernel session; consider renaming (e.g. `evaluator`) together with
# every later use.
eval=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")