<a href="https://colab.research.google.com/github/amruthab91/spark_basics/blob/main/Recommendation_Sys_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#################################################################################
# In this program, we shall be developing a movie Rec System
#################################################################################


!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=3b77f90c8864d736cf1dc44d8dc682c7d43fa38b3ae691aa20616aa5c55e5f72
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
import os

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Rec System") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
movies = spark.read.csv("/content/movies.csv",header=True)
ratings = spark.read.csv("/content/ratings.csv",header=True)

In [6]:
ratings.show()


+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [7]:

ratings.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [8]:
from pyspark.sql import functions as F

ratings = ratings.\
    withColumn('userId', F.col('userId').cast('integer')).\
    withColumn('movieId', F.col('movieId').cast('integer')).\
    withColumn('rating', F.col('rating').cast('float')).\
    drop('timestamp')
ratings.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



In [10]:
# Sparsity shows how sparse is the matrix, e.g. 99% sparsity shows that, 99% of the entries in the matrix are missing.
# Count the total number of ratings in the dataset
numerator = ratings.select("rating").count()
print("numerator count", numerator)

# Count the number of distinct userIds and distinct movieIds
num_users = ratings.select("userId").distinct().count()
num_movies = ratings.select("movieId").distinct().count()
print("num_users", num_users)
print("num_movies", num_movies)

# Set the denominator equal to the number of users multiplied by the number of movies
denominator = num_users * num_movies
print("denominator count", denominator)

# Divide the numerator by the denominator
sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

numerator count 100836
num_users 610
num_movies 9724
denominator count 5931640
The ratings dataframe is  98.30% empty.


In [11]:
# Group data by userId, count ratings
userId_ratings = ratings.groupBy("userId").count().orderBy('count', ascending=False)
userId_ratings.show()

+------+-----+
|userId|count|
+------+-----+
|   414| 2698|
|   599| 2478|
|   474| 2108|
|   448| 1864|
|   274| 1346|
|   610| 1302|
|    68| 1260|
|   380| 1218|
|   606| 1115|
|   288| 1055|
|   249| 1046|
|   387| 1027|
|   182|  977|
|   307|  975|
|   603|  943|
|   298|  939|
|   177|  904|
|   318|  879|
|   232|  862|
|   480|  836|
+------+-----+
only showing top 20 rows



In [12]:

# Group data by movieId, count ratings
movieId_ratings = ratings.groupBy("movieId").count().orderBy('count', ascending=False)
movieId_ratings.show()

+-------+-----+
|movieId|count|
+-------+-----+
|    356|  329|
|    318|  317|
|    296|  307|
|    593|  279|
|   2571|  278|
|    260|  251|
|    480|  238|
|    110|  237|
|    589|  224|
|    527|  220|
|   2959|  218|
|      1|  215|
|   1196|  211|
|     50|  204|
|   2858|  204|
|     47|  203|
|    780|  202|
|    150|  201|
|   1198|  200|
|   4993|  198|
+-------+-----+
only showing top 20 rows



In [None]:
##################################
# ALS MODEL --- Alternative Least Square Method

In [13]:

# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [14]:
# Split the ratings dataframe into training and test data
(train, test) = ratings.randomSplit([0.8, 0.2], seed=42)

# Set the ALS hyperparameters
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank =10,
    maxIter =10,
    regParam =.1,
    coldStartStrategy="drop",
    nonnegative =True,
    implicitPrefs = False
)

# Fit the model to the training_data
als_model = als.fit(train)

# Generate predictions on the test_data
test_predictions = als_model.transform(test)
test_predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   148|   4896|   4.0| 3.5396879|
|   148|   5618|   3.0| 3.5075262|
|   148|   7153|   3.0| 3.4909835|
|   148|  40629|   5.0| 3.3966205|
|   148|  40815|   4.0| 3.4955816|
|   148|  60069|   4.5|  3.702253|
|   148|  68954|   4.0| 3.7125475|
|   148|  69844|   4.0|  3.637459|
|   148|  79132|   1.5| 3.5972383|
|   148|  79702|   4.0|  3.406023|
|   148|  81834|   4.0| 3.7364764|
|   148|  81847|   4.5| 3.4855666|
|   148|  98243|   4.5|  3.537272|
|   148|  98491|   5.0| 3.3859468|
|   148| 108932|   4.0| 2.8276405|
|   463|   1088|   3.5| 3.6450453|
|   463|   1221|   4.5| 3.9429238|
|   463|   2028|   4.5|  4.345107|
|   463|   2167|   3.0| 3.6569993|
|   463|   3448|   3.0| 3.6385453|
+------+-------+------+----------+
only showing top 20 rows



In [15]:
# Import RegressionEvaluator
from pyspark.ml.evaluation import RegressionEvaluator

# Complete the evaluator code
evaluator = RegressionEvaluator(metricName="rmse", labelCol="userId", predictionCol="rating")

# Extract the 3 parameters
print(evaluator.getMetricName())
print(evaluator.getLabelCol())
print(evaluator.getPredictionCol())


# Evaluate the "test_predictions" dataframe
RMSE = evaluator.evaluate(test_predictions)

# Print the RMSE
print ('RMSE = ', RMSE)

rmse
userId
rating
RMSE =  368.0588125025974


In [16]:
# Generate n Recommendations for all users
nrecommendations = als_model.recommendForAllUsers(10)
nrecommendations.limit(10).show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{3266, 5.9061136...|
|     2|[{3925, 5.151428}...|
|     3|[{6835, 4.8548584...|
|     4|[{25825, 5.441375...|
|     5|[{177593, 5.00247...|
|     6|[{67618, 5.124081...|
|     7|[{132333, 5.03759...|
|     8|[{96004, 4.921962...|
|     9|[{8235, 5.0534368...|
|    10|[{32892, 4.699095...|
+------+--------------------+



In [17]:
#Explaining the recommendations

from pyspark.sql.functions import split, explode

nrecommendations = nrecommendations\
    .withColumn("rec_exp", explode("recommendations"))\
    .select('userId', F.col("rec_exp.movieId"), F.col("rec_exp.rating"))

nrecommendations.limit(10).show()

+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|     1|   3266|5.9061136|
|     1|  25771| 5.713218|
|     1| 177593|5.7120905|
|     1|   8235|5.7109423|
|     1|   6201|5.7109423|
|     1|  33649|5.6039824|
|     1|  27523| 5.550094|
|     1|  58301|5.5411677|
|     1|   3925| 5.532083|
|     1| 132333|5.5211186|
+------+-------+---------+



In [18]:
#Explaining the recommendations
nrecommendations.join(movies, on='movieId').filter('userId = 100').show()


+-------+------+---------+--------------------+--------------------+
|movieId|userId|   rating|               title|              genres|
+-------+------+---------+--------------------+--------------------+
|  67618|   100| 5.249183|Strictly Sexual (...|Comedy|Drama|Romance|
|  33649|   100| 5.140385|  Saving Face (2004)|Comedy|Drama|Romance|
| 171495|   100|4.9778523|              Cosmos|  (no genres listed)|
|   6732|   100|4.9600606|Hello, Dolly! (1969)|Comedy|Musical|Ro...|
|   5867|   100|4.9315147|        Thief (1981)|Crime|Drama|Thriller|
|  74282|   100|4.9226737|Anne of Green Gab...|Children|Drama|Ro...|
| 184245|   100| 4.896219|De platte jungle ...|         Documentary|
| 134796|   100| 4.896219|  Bitter Lake (2015)|         Documentary|
| 117531|   100| 4.896219|    Watermark (2014)|         Documentary|
|  86237|   100| 4.896219|  Connections (1978)|         Documentary|
+-------+------+---------+--------------------+--------------------+



In [19]:
ratings.join(movies, on='movieId').filter('userId = 100').sort('rating', ascending=False).limit(10).show()


+-------+------+------+--------------------+--------------------+
|movieId|userId|rating|               title|              genres|
+-------+------+------+--------------------+--------------------+
|   1101|   100|   5.0|      Top Gun (1986)|      Action|Romance|
|   1958|   100|   5.0|Terms of Endearme...|        Comedy|Drama|
|   2423|   100|   5.0|Christmas Vacatio...|              Comedy|
|   4041|   100|   5.0|Officer and a Gen...|       Drama|Romance|
|   5620|   100|   5.0|Sweet Home Alabam...|      Comedy|Romance|
|    368|   100|   4.5|     Maverick (1994)|Adventure|Comedy|...|
|    934|   100|   4.5|Father of the Bri...|              Comedy|
|    539|   100|   4.5|Sleepless in Seat...|Comedy|Drama|Romance|
|     16|   100|   4.5|       Casino (1995)|         Crime|Drama|
|    553|   100|   4.5|    Tombstone (1993)|Action|Drama|Western|
+-------+------+------+--------------------+--------------------+

