In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Reuse the active Spark session if one exists, otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("RecommederSys")
    .getOrCreate()
)

In [8]:
# Location of the ratings CSV. The RATINGS_CSV environment variable overrides
# it so the notebook can run on machines other than the author's; the original
# absolute path is kept as the default, so behaviour is unchanged when the
# variable is not set.
ratings_location = os.environ.get(
    "RATINGS_CSV",
    "/home/datamaking/Documents/Hadoop/ml-latest/ratings.csv",
)

# Read the CSV with an explicit schema (no inference pass over the file), then
# turn the integer epoch-seconds `timestamp` column into a real timestamp.
# The result is cached because several later cells reuse `ratings`.
ratings = (
    spark.read.csv(
        path=ratings_location,
        header=True,
        sep=",",
        quote='"',
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
    )
    .withColumn("timestamp", f.to_timestamp(f.from_unixtime("timestamp")))
    .cache()
)


+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|    307|   3.5|2009-10-28 00:00:21|
|     1|    481|   3.5|2009-10-28 00:04:16|
|     1|   1091|   1.5|2009-10-28 00:04:31|
|     1|   1257|   4.5|2009-10-28 00:04:20|
|     1|   1449|   4.5|2009-10-28 00:01:04|
+------+-------+------+-------------------+
only showing top 5 rows



In [None]:
# Preview the first five rows of the loaded ratings.
ratings.show(n=5)

In [9]:
# Print descriptive statistics (count, mean, stddev, min, quartiles, max) for
# userId, movieId and rating; the timestamp column is not part of the output.
ratings.summary().show()

+-------+------------------+-----------------+------------------+
|summary|            userId|          movieId|            rating|
+-------+------------------+-----------------+------------------+
|  count|          27753444|         27753444|          27753444|
|   mean|141942.01557064414|18487.99983414671|3.5304452124932677|
| stddev| 81707.40009148984| 35102.6252474677|1.0663527502319696|
|    min|                 1|                1|               0.5|
|    25%|             71164|             1099|               3.0|
|    50%|            142014|             2716|               3.5|
|    75%|            212466|             7151|               4.0|
|    max|            283228|           193886|               5.0|
+-------+------------------+-----------------+------------------+



In [10]:
from pyspark.ml.recommendation import ALS

In [14]:
# The ALS cell below only uses userId, movieId and rating, so the timestamp
# column is dropped here. NOTE: this rebinds `ratings`; cells above that showed
# the timestamp column reflect the DataFrame before this step.
ratings = ratings.drop("timestamp")
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [15]:
# Fit an ALS collaborative-filtering model on the (userId, movieId, rating)
# triples, leaving every other hyper-parameter at the library default.
# A fixed seed is set because ALS initialises its factor matrices randomly;
# without it the fitted factors (and hence the predictions shown below) can
# differ from one run of the notebook to the next.
model = (
    ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        seed=42,
    ).fit(ratings)
)

In [16]:
# Sanity check: display the class of the fitted object returned by ALS.fit().
type(model)

pyspark.ml.recommendation.ALSModel

In [18]:
# Score every (userId, movieId) pair in `ratings`, adding a `prediction` column.
# NOTE(review): `ratings` is the same DataFrame the model was fit on, so these
# are in-sample predictions; they do not measure how well the model generalises.
# A held-out split (e.g. via randomSplit) would be needed for evaluation.
predictions = model.transform(ratings)

In [19]:
# Sanity check: display the class of the object returned by model.transform().
type(predictions)

pyspark.sql.dataframe.DataFrame

In [20]:
# Print the first 30 scored rows without truncating cell contents.
predictions.show(n=30, truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|107339|148    |4.0   |3.3795822 |
|93112 |148    |3.0   |2.9064484 |
|106148|148    |2.5   |2.839487  |
|234926|148    |4.0   |2.9622898 |
|253535|148    |4.0   |3.0288193 |
|50155 |148    |3.0   |2.9910944 |
|65991 |148    |4.0   |3.0593572 |
|146376|148    |5.0   |3.672718  |
|207939|148    |3.0   |2.7276554 |
|41788 |148    |3.0   |2.780521  |
|220572|148    |2.0   |2.7654417 |
|244192|148    |3.0   |2.586741  |
|273242|148    |4.0   |3.2985163 |
|52620 |148    |1.0   |2.7473435 |
|98426 |148    |3.0   |2.4786794 |
|102642|148    |4.0   |3.2133152 |
|108082|148    |3.0   |2.9138048 |
|264081|148    |3.0   |2.919073  |
|60382 |148    |4.0   |3.496842  |
|275860|148    |3.0   |2.7759407 |
|8350  |148    |4.0   |2.8493783 |
|245316|148    |1.0   |2.0724003 |
|51571 |148    |3.0   |3.041218  |
|211963|148    |3.0   |2.258726  |
|52772 |148    |3.0   |3.3986502 |
|167692|148    |3.0 