In [2]:
!pip install pyspark



In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import udf,col,when
import numpy as np
from IPython.display import Image
from IPython.display import display
from IPython.display import clear_output

In [None]:
# Base location of the MovieLens data; the CSV file names are appended to it
# when the tables are read.
# Raw string on purpose: in a normal literal the "\U" of "\Users" starts a
# unicode escape and the whole cell fails with a SyntaxError.
# NOTE(review): the path ends in ".zip" but is used as a folder prefix
# (directory + '/ratings.csv') - confirm it points at the extracted folder.
directory=r"C:\Users\ATHITH.M.S\Downloads\PROJECT\ml-latest-small.zip"

In [None]:
# List the dataset location through the system shell.
# NOTE(review): this repeats the literal path stored in `directory`, and `ls`
# is not a native Windows command - confirm it is available in this environment.
!ls "C:\Users\ATHITH.M.S\Downloads\PROJECT\ml-latest-small.zip"

In [None]:
# Create the Spark session for this notebook (or reuse a running one) under the
# application name 'Bots_Squad'.
spark = (
    SparkSession.builder
    .appName('Bots_Squad')
    .getOrCreate()
)

In [None]:
# Low-level SparkContext behind the session, plus a SQLContext built on top of it.
# NOTE(review): SQLContext is a legacy entry point in Spark 3.x; `spark` offers
# the same DataFrame-creation methods - confirm whether it is still needed.
sc = spark.sparkContext
sqlContext = SQLContext(sparkContext=sc)

In [None]:
# Read the ratings table: the first row supplies the column names and Spark
# infers the column types. Print the resulting schema.
ratings_df = spark.read.csv(
    f"{directory}/ratings.csv",
    header=True,
    inferSchema=True,
)
ratings_df.printSchema()

In [8]:
# Preview the first 7 rating rows.
ratings_df.show(n=7)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
+------+-------+------+---------+
only showing top 7 rows



In [26]:
# Read the movie catalogue (movieId, title, genres) with header and inferred
# types, then print its schema.
movies_df = spark.read.csv(
    f"{directory}/movies.csv",
    header=True,
    inferSchema=True,
)
movies_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [10]:
# Preview the first 7 movies.
movies_df.show(n=7)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 7 rows



In [27]:
# Read the movieId -> imdbId / tmdbId lookup table with header and inferred
# types, then print its schema.
links_df = spark.read.csv(
    f"{directory}/links.csv",
    header=True,
    inferSchema=True,
)
links_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)



In [28]:
# 80/20 train/validation split of the ratings. The seed is fixed so the split -
# and therefore every RMSE computed from it - is the same on each run.
training_df,validation_df=ratings_df.randomSplit([0.8,0.2],seed=42)

In [29]:
# Shared ALS settings: iteration count (maxIter), regularisation strength
# (regParam) and number of latent factors (rank).
iterations, regularization_parameter, rank = 10, 0.1, 4
# Scratch variables; nothing in the visible part of the notebook reads them.
error, err = [], 0

In [30]:
# Baseline model: fit ALS on the training split and report RMSE on the
# validation split.
# NOTE(review): rank is hard-coded to 5 here even though a `rank` setting is
# defined alongside `iterations` - confirm which value is intended.
als = ALS(
    maxIter=iterations,
    regParam=regularization_parameter,
    rank=5,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
)
model = als.fit(training_df)
predictions = model.transform(validation_df)
# Some validation rows come back with a NaN prediction; they must be removed
# before scoring or the RMSE itself becomes NaN. dropna() treats NaN like null
# on numeric columns, which states the intent directly instead of relying on
# Spark's special "NaN = NaN is true" comparison rule.
new_predictions = predictions.dropna(subset=["prediction"])
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(new_predictions)
print("Root Mean Square Error="+str(rmse))

Root Mean Square Error=0.8769362552515131


In [31]:
# Rank sweep: refit ALS for rank 4..9 and print the validation RMSE of each fit
# (one line per rank, in increasing rank order).
# The evaluator does not depend on the rank, so it is built once up front.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
# Note: the loop variable deliberately stays `rank`, and `model` / `predictions`
# are left holding the last (rank 9) fit once the loop finishes.
for rank in range(4,10):
    als = ALS(
        maxIter=iterations,
        regParam=regularization_parameter,
        rank=rank,
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
    )
    model = als.fit(training_df)
    predictions = model.transform(validation_df)
    # Remove rows whose prediction is NaN before scoring; dropna() covers NaN
    # as well as null on numeric columns.
    new_predictions = predictions.dropna(subset=["prediction"])
    rmse = evaluator.evaluate(new_predictions)
    print("Root Mean Square Error="+str(rmse))


Root Mean Square Error=0.8828716662509187
Root Mean Square Error=0.8769362552515114
Root Mean Square Error=0.88231025565326
Root Mean Square Error=0.8818561222345047
Root Mean Square Error=0.8795447046455943
Root Mean Square Error=0.8795777716457857


In [32]:
from pyspark.ml.tuning import *

In [33]:
from pyspark.ml.tuning import ParamGridBuilder

# Grid search over regParam x rank with 5-fold cross-validation on the
# training split, scored by RMSE.
#
# coldStartStrategy="drop" is required here: with random folds some users or
# movies in a held-out fold are missing from the fitted fold, ALS then predicts
# NaN for them, the fold RMSE becomes NaN and the "best" model is chosen
# arbitrarily. Dropping those rows keeps the metric well-defined.
#
# The regParam / rank given to the constructor are only starting values; each
# grid candidate overrides them.
als1 = ALS(
    maxIter=iterations,
    regParam=regularization_parameter,
    rank=rank,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
)
paramGrid = (
    ParamGridBuilder()
    .addGrid(als1.regParam, [0.1, 0.01, 0.18])
    .addGrid(als1.rank, list(range(4, 10)))
    .build()
)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
crossval = CrossValidator(
    estimator=als1,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)
cvModel = crossval.fit(training_df)

In [22]:
# Preview 25 validation rows together with their predicted rating.
predictions.show(25)

+------+-------+------+---------+----------+
|userId|movieId|rating|timestamp|prediction|
+------+-------+------+---------+----------+
|     1|     70|   3.0|964982400| 3.4734728|
|     1|    110|   4.0|964982176|  4.811602|
|     1|    163|   5.0|964983650| 4.1751533|
|     1|    216|   5.0|964981208|   3.99536|
|     1|    349|   4.0|964982563| 3.8920357|
|     1|    954|   5.0|964983219| 4.7979712|
|     1|   1031|   5.0|964982653| 4.3012133|
|     1|   1127|   4.0|964982513| 3.9007716|
|     1|   1213|   5.0|964982951|  4.848526|
|     1|   1258|   3.0|964983414|  4.394889|
|     1|   1291|   5.0|964981909|  4.739691|
|     1|   1396|   3.0|964983017| 4.0567617|
|     1|   1473|   4.0|964980875|       NaN|
|     1|   1617|   5.0|964982951|  4.333606|
|     1|   1967|   4.0|964981710|  4.459849|
|     1|   2048|   5.0|964982791| 3.8751962|
|     1|   2078|   5.0|964982838| 4.4790387|
|     1|   2115|   5.0|964982529|  4.453764|
|     1|   2174|   4.0|964981680|  4.204715|
|     1|  

In [34]:
# Attach title and genres to each prediction via movieId and preview 15 rows.
(
    predictions
    .join(movies_df, "movieId")
    .select("userId", "title", "genres", "prediction")
    .show(15)
)


+------+--------------------+--------------------+----------+
|userId|               title|              genres|prediction|
+------+--------------------+--------------------+----------+
|   580|Men in Black (a.k...|Action|Comedy|Sci-Fi| 3.2481847|
|    34|Men in Black (a.k...|Action|Comedy|Sci-Fi| 3.3858495|
|   368|Men in Black (a.k...|Action|Comedy|Sci-Fi| 2.8494778|
|   368|Children of the C...|     Horror|Thriller| 1.7317998|
|   101| Galaxy Quest (1999)|Adventure|Comedy|...| 3.0401516|
|   385|Hudsucker Proxy, ...|              Comedy| 3.2788544|
|    28|The Devil's Advoc...|Drama|Mystery|Thr...| 2.5680866|
|   587|Men in Black (a.k...|Action|Comedy|Sci-Fi| 3.7186513|
|    27|American Tail: Fi...|Adventure|Animati...| 2.7136788|
|   606|The Devil's Advoc...|Drama|Mystery|Thr...| 3.4824798|
|   230|Men in Black (a.k...|Action|Comedy|Sci-Fi| 2.3911674|
|   232| Galaxy Quest (1999)|Adventure|Comedy|...| 3.1768372|
|   346|Before Sunset (2004)|       Drama|Romance| 3.5740757|
|   599|

In [35]:
# Predictions for user 599 only, enriched with movie metadata and the TMDB id
# taken from the links table.
for_one_user = (
    predictions
    .filter(col("userId") == 599)
    .join(movies_df, "movieId")
    .join(links_df, "movieId")
    .select("userId", "title", "genres", "tmdbId", "prediction")
)
for_one_user.show(5)


+------+--------------------+--------------------+------+----------+
|userId|               title|              genres|tmdbId|prediction|
+------+--------------------+--------------------+------+----------+
|   599|Cutthroat Island ...|Action|Adventure|...|  1408| 2.2488456|
|   599|    Assassins (1995)|Action|Crime|Thri...|  9691| 2.2640226|
|   599|     Clueless (1995)|      Comedy|Romance|  9603|  2.817207|
|   599|  Restoration (1995)|               Drama| 35196| 2.0645475|
|   599|     Bio-Dome (1996)|              Comedy|  9536| 1.9533672|
+------+--------------------+--------------------+------+----------+
only showing top 5 rows



In [36]:
import webbrowser

# Print the titles of the first five rows for the selected user and open each
# movie's TMDB page in the default browser.
link="https://www.themoviedb.org/movie/"
for movie in for_one_user.take(5):
    print(movie.title)
    # tmdbId is a nullable column; without this guard a missing id would open
    # the bogus URL ".../movie/None".
    if movie.tmdbId is not None:
        movieURL=link+str(movie.tmdbId)
        webbrowser.open(movieURL)

Cutthroat Island (1995)
Assassins (1995)
Clueless (1995)
Restoration (1995)
Bio-Dome (1996)


In [37]:
# Top-5 movie recommendations for every user, and top-5 user recommendations
# for every movie, from the current `model`.
userRecommends = model.recommendForAllUsers(numItems=5)
movieRecommends = model.recommendForAllItems(numUsers=5)

In [39]:
# Print the column structure of the per-user recommendation frame.
userRecommends.printSchema()


root
 |-- userId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- movieId: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [40]:
# Show only the recommended movie ids per user; truncation is turned off so the
# whole id list is visible.
(
    userRecommends
    .select("userId", "recommendations.movieId")
    .show(10, truncate=False)
)


+------+-----------------------------------+
|userId|movieId                            |
+------+-----------------------------------+
|1     |[3379, 86377, 8477, 4441, 92535]   |
|2     |[5075, 131724, 67618, 71462, 3379] |
|3     |[6835, 5919, 5181, 4518, 7991]     |
|4     |[89118, 5034, 3067, 7025, 446]     |
|5     |[8477, 148881, 86377, 3266, 6818]  |
|6     |[3086, 5466, 85774, 67618, 940]    |
|7     |[86377, 92535, 8477, 86345, 148881]|
|8     |[3379, 951, 7008, 171495, 6666]    |
|9     |[132333, 5915, 3030, 7025, 26681]  |
|10    |[34332, 157296, 4535, 89118, 71579]|
+------+-----------------------------------+
only showing top 10 rows



In [41]:
# Print the column structure of the per-movie recommendation frame.
movieRecommends.printSchema()


root
 |-- movieId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- userId: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [43]:
# Show only the recommended user ids per movie, untruncated.
(
    movieRecommends
    .select("movieId", "recommendations.userId")
    .show(10, truncate=False)
)


+-------+-------------------------+
|movieId|userId                   |
+-------+-------------------------+
|1      |[53, 43, 201, 1, 452]    |
|3      |[53, 43, 243, 276, 519]  |
|5      |[53, 43, 543, 337, 344]  |
|6      |[53, 93, 276, 452, 243]  |
|9      |[492, 360, 337, 303, 151]|
|12     |[584, 543, 192, 53, 485] |
|13     |[543, 77, 498, 344, 267] |
|15     |[543, 53, 136, 327, 112] |
|16     |[53, 494, 375, 122, 42]  |
|17     |[59, 579, 375, 406, 4]   |
+-------+-------------------------+
only showing top 10 rows



In [44]:
# Pick five distinct user ids from the ratings (no ordering is imposed, so
# which five are returned is up to Spark).
users = (
    ratings_df
    .select("userId")
    .distinct()
    .limit(5)
)


In [45]:
# Display the selected user ids.
users.show()


+------+
|userId|
+------+
|   148|
|   463|
|   471|
|   496|
|   243|
+------+



In [46]:
# Top-10 movie recommendations restricted to the users selected in `users`.
userSubsetRecs = model.recommendForUserSubset(users, numItems=10)
userSubsetRecs.show()


+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[{89904, 5.26302}...|
|   463|[{86377, 5.17106}...|
|   243|[{86377, 6.685316...|
|   496|[{8477, 5.16287},...|
|   148|[{183897, 4.93593...|
+------+--------------------+



In [47]:
# Show just the recommended movie ids for the user subset, untruncated.
(
    userSubsetRecs
    .select("userId", "recommendations.movieId")
    .show(10, truncate=False)
)


+------+------------------------------------------------------------------------+
|userId|movieId                                                                 |
+------+------------------------------------------------------------------------+
|471   |[89904, 8477, 3379, 3266, 7008, 1232, 187, 96004, 215, 171495]          |
|463   |[86377, 80906, 86347, 86345, 55721, 60943, 59018, 92535, 179133, 175431]|
|243   |[86377, 86347, 86345, 92535, 85774, 5075, 80906, 3972, 115713, 115122]  |
|496   |[8477, 148881, 71899, 7767, 3266, 73344, 26131, 7099, 2677, 89759]      |
|148   |[183897, 5075, 93008, 77846, 25906, 5466, 188751, 174909, 98491, 67618] |
+------+------------------------------------------------------------------------+



In [48]:
# Pick five distinct movie ids that appear in the ratings and display them.
movies = (
    ratings_df
    .select("movieId")
    .distinct()
    .limit(5)
)
movies.show()

+-------+
|movieId|
+-------+
|   1580|
|   2366|
|   3175|
|   1088|
|  32460|
+-------+



In [49]:
# Top-10 user recommendations restricted to the movies selected in `movies`,
# shown as untruncated lists of user ids.
movieSubsetRecs = model.recommendForItemSubset(movies, numUsers=10)
(
    movieSubsetRecs
    .select("movieId", "recommendations.userId")
    .show(10, truncate=False)
)

+-------+-------------------------------------------------+
|movieId|userId                                           |
+-------+-------------------------------------------------+
|1580   |[53, 543, 337, 43, 452, 276, 492, 475, 93, 69]   |
|32460  |[53, 327, 112, 543, 99, 55, 371, 548, 258, 261]  |
|3175   |[43, 53, 452, 276, 169, 579, 456, 201, 93, 492]  |
|2366   |[244, 505, 373, 572, 597, 98, 35, 9, 40, 209]    |
|1088   |[554, 367, 393, 579, 12, 543, 224, 413, 558, 488]|
+-------+-------------------------------------------------+



In [52]:
# Score a hand-picked set of (movie, user) pairs with the current model.
movie_ids=[1580,3175,2366,1590]
user_ids=[543,543,543,543]
# Build the input frame through the SparkSession (the legacy SQLContext is not
# needed for this) and materialise the zip so the rows are a plain list rather
# than a one-shot iterator.
new_user_preds = spark.createDataFrame(
    list(zip(movie_ids, user_ids)),
    schema=['movieId', 'userId'],
)
# Note: this reuses the name `new_predictions`, replacing the NaN-filtered
# validation predictions assigned under that name earlier.
new_predictions = model.transform(new_user_preds)
new_predictions.show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|   1580|   543|   4.57226|
|   3175|   543|  4.294949|
|   2366|   543|  3.746213|
|   1590|   543| 2.9072852|
+-------+------+----------+

