In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('movie')
    .getOrCreate()
)

In [5]:
# Read both CSV files with the first row as the header and with column
# types inferred from the data.
movie = (
    spark.read
    .option("header", True)
    .option("inferschema", True)
    .csv("data/movies.csv")
)

rating = (
    spark.read
    .option("header", True)
    .option("inferschema", True)
    .csv("data/ratings.csv")
)

# Show the inferred schemas: movies first, then ratings.
movie.printSchema()
rating.printSchema()


root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [7]:
# Attach title/genres to every rating. A left join keeps ratings whose
# movieId has no match in `movie`.
rate = rating.join(movie, on="movieId", how="left")

In [8]:
# Preview the joined frame without truncating long titles/genre strings.
rate.show(n=20, truncate=False)

+-------+------+------+---------+-----------------------------------------+-------------------------------------------+
|movieId|userId|rating|timestamp|title                                    |genres                                     |
+-------+------+------+---------+-----------------------------------------+-------------------------------------------+
|1      |1     |4.0   |964982703|Toy Story (1995)                         |Adventure|Animation|Children|Comedy|Fantasy|
|3      |1     |4.0   |964981247|Grumpier Old Men (1995)                  |Comedy|Romance                             |
|6      |1     |4.0   |964982224|Heat (1995)                              |Action|Crime|Thriller                      |
|47     |1     |5.0   |964983815|Seven (a.k.a. Se7en) (1995)              |Mystery|Thriller                           |
|50     |1     |5.0   |964982931|Usual Suspects, The (1995)               |Crime|Mystery|Thriller                     |
|70     |1     |3.0   |964982400|From Du

In [9]:
# Hold out roughly 20% of the joined ratings for the final evaluation.
# A fixed seed makes the split repeatable across re-runs over the same
# input (without it every run produces a different train/test partition,
# so the reported RMSE cannot be reproduced).
train, test = rate.randomSplit([0.8, 0.2], seed=42)

In [10]:
from pyspark.ml.recommendation import ALS

# Explicit-feedback ALS recommender over (userId, movieId, rating).
als = ALS(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    nonnegative=True,
    implicitPrefs=False,
    # Rows whose user or item was not seen during fitting get a NaN
    # prediction; "drop" removes them so the evaluation metric stays finite.
    coldStartStrategy="drop",
    # Pin the seed so the factor initialization is repeatable between runs.
    seed=42,
)

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator

In [12]:
# Hyper-parameter grid: every combination of the rank and regParam values
# below (4 x 4 = 16 candidate settings).
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 50, 100, 150])
    .addGrid(als.regParam, [0.01, 0.05, 0.1, 0.15])
    .build()
)

In [13]:
# RMSE between the true `rating` column and the model's `prediction` column.
regressor = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)

In [14]:
# 5-fold cross-validation over the parameter grid, selecting by the
# evaluator's metric. Every candidate in the grid is fitted once per fold,
# so this search is expensive.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=regressor,
    numFolds=5,
    # Pin the seed so rows are assigned to the same folds on every run.
    seed=42,
)

In [15]:
# Run the cross-validated grid search on the training split. This is the
# slow step of the notebook. The winning model is taken from
# `model.bestModel` and scored on the test split in the cells below.
model = cv.fit(dataset=train)

KeyboardInterrupt: 

In [34]:
# The model refit with the best-scoring parameter combination.
best_model = model.bestModel

In [36]:
# Predict ratings for the held-out test split, then report the RMSE of
# those predictions against the true ratings.
testPrediction = best_model.transform(dataset=test)
Rmse = regressor.evaluate(dataset=testPrediction)
print(Rmse)

0.8651490998420742


In [37]:
# Top-5 recommended items for every user known to the model.
recommendation = best_model.recommendForAllUsers(numItems=5)

In [38]:
# Alias only: `df` refers to the same DataFrame as `recommendation`.
df = recommendation

In [41]:
# Preview the per-user recommendation arrays (default 20 rows, truncated).
df.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[68945, 4.738127...|
|   463|[[170355, 4.77379...|
|   496|[[6818, 4.5608644...|
|   148|[[170355, 4.89657...|
|   540|[[170355, 5.34455...|
|   392|[[68945, 4.673846...|
|   243|[[67618, 5.613619...|
|    31|[[33649, 5.117580...|
|   516|[[4429, 4.7351284...|
|   580|[[170355, 4.72950...|
|   251|[[68945, 5.74595]...|
|   451|[[68945, 5.340492...|
|    85|[[1140, 4.850751]...|
|   137|[[68945, 4.957126...|
|    65|[[68945, 4.836951...|
|   458|[[67618, 5.161386...|
|   481|[[25906, 3.908367...|
|    53|[[68945, 6.789389...|
|   255|[[3525, 3.8541474...|
|   588|[[170355, 4.65545...|
+------+--------------------+
only showing top 20 rows



In [42]:
from pyspark.sql.functions import col,explode

# Expand the `recommendations` array so each recommended item gets its own
# row, stored as a struct in the new `movieid_rating` column.
df2 = df.withColumn(
    'movieid_rating',
    explode(col('recommendations')),
)

In [43]:
# Preview the exploded frame: one row per (user, recommended item).
df2.show()

+------+--------------------+-------------------+
|userId|     recommendations|     movieid_rating|
+------+--------------------+-------------------+
|   471|[[68945, 4.738127...| [68945, 4.7381277]|
|   471|[[68945, 4.738127...|[170355, 4.7381277]|
|   471|[[68945, 4.738127...|  [3379, 4.7381277]|
|   471|[[68945, 4.738127...| [33649, 4.4885697]|
|   471|[[68945, 4.738127...| [171495, 4.488084]|
|   463|[[170355, 4.77379...|[170355, 4.7737927]|
|   463|[[170355, 4.77379...| [68945, 4.7737927]|
|   463|[[170355, 4.77379...|  [3379, 4.7737927]|
|   463|[[170355, 4.77379...|  [33649, 4.572375]|
|   463|[[170355, 4.77379...| [171495, 4.529392]|
|   496|[[6818, 4.5608644...|  [6818, 4.5608644]|
|   496|[[6818, 4.5608644...|[170355, 4.3681946]|
|   496|[[6818, 4.5608644...| [68945, 4.3681946]|
|   496|[[6818, 4.5608644...|  [3379, 4.3681946]|
|   496|[[6818, 4.5608644...| [99764, 4.3654785]|
|   148|[[170355, 4.89657...|[170355, 4.8965735]|
|   148|[[170355, 4.89657...| [68945, 4.8965735]|


In [45]:
# Pull the struct fields out into plain `movieId` and `rating` columns
# next to `userId`, and display the result.
df2.select(
    'userId',
    'movieid_rating.movieId',
    'movieid_rating.rating',
).show()

+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|   471|  68945|4.7381277|
|   471| 170355|4.7381277|
|   471|   3379|4.7381277|
|   471|  33649|4.4885697|
|   471| 171495| 4.488084|
|   463| 170355|4.7737927|
|   463|  68945|4.7737927|
|   463|   3379|4.7737927|
|   463|  33649| 4.572375|
|   463| 171495| 4.529392|
|   496|   6818|4.5608644|
|   496| 170355|4.3681946|
|   496|  68945|4.3681946|
|   496|   3379|4.3681946|
|   496|  99764|4.3654785|
|   148| 170355|4.8965735|
|   148|  68945|4.8965735|
|   148|   3379|4.8965735|
|   148|  33649|4.7929807|
|   148| 171495|4.5731144|
+------+-------+---------+
only showing top 20 rows



In [50]:
# Total number of rows in the ratings DataFrame.
rating.count()

100836

In [51]:
# Shut down the Spark session; Spark-backed objects created above cannot
# be used after this point.
spark.stop()