In [1]:
pip install findspark -qqq

In [2]:
# install Apache Spark on Google Colab
!apt-get install openjdk-8-jdk-headless -qq > /dev/null 
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz

In [3]:
!tar xzf spark-3.2.0-bin-hadoop3.2.tgz


In [4]:
import os 
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" 
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"
import findspark 
findspark.init()
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [5]:
spark = SparkSession.builder.appName('recommendation').getOrCreate()

In [6]:
!wget --no-check-certificate \
    https://namespace.co.ke/ml/rating.csv \
    -O /tmp/rating.csv


--2021-12-02 05:41:52--  https://namespace.co.ke/ml/rating.csv
Resolving namespace.co.ke (namespace.co.ke)... 109.106.250.14
Connecting to namespace.co.ke (namespace.co.ke)|109.106.250.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 690353377 (658M) [text/csv]
Saving to: ‘/tmp/rating.csv’


2021-12-02 05:42:19 (25.1 MB/s) - ‘/tmp/rating.csv’ saved [690353377/690353377]



In [7]:
data = spark.read.csv('/tmp/rating.csv',inferSchema=True,header=True)

In [8]:
data = data.dropna()

In [9]:
data.describe().show()

+-------+-----------------+------------------+------------------+-------------------+
|summary|           userId|           movieId|            rating|          timestamp|
+-------+-----------------+------------------+------------------+-------------------+
|  count|         20000263|          20000263|          20000263|           20000263|
|   mean|69045.87258292554| 9041.567330339605|3.5255285642993797|               null|
| stddev|40038.62665316267|19789.477445413264| 1.051988919294229|               null|
|    min|                1|                 1|               0.5|1995-01-09 11:46:44|
|    max|           138493|            131262|               5.0|2015-03-31 06:40:02|
+-------+-----------------+------------------+------------------+-------------------+



In [10]:
data = data.drop("timestamp")

In [11]:
# Split the data into a training and testing set
training_size = 0.8
random_state = 0
test_size = 0.2
training, testing = data.randomSplit([training_size, test_size], seed=random_state)

In [12]:
# Recommendation model using ALS on the training data
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

In [13]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(testing)

In [14]:
predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|    924|   3.5| 3.3500874|
|     1|   1009|   3.5| 3.2816954|
|     1|   1136|   3.5| 3.8564465|
|     1|   1193|   3.5|   3.50422|
|     1|   1196|   4.5|  3.891396|
|     1|   1215|   4.0| 3.8538246|
|     1|   1217|   3.5|    3.5861|
|     1|   1222|   3.5| 3.4862847|
|     1|   1278|   4.0| 3.5762112|
|     1|   1333|   4.0| 3.3784351|
|     1|   1525|   3.0| 3.0203524|
|     1|   1584|   3.5| 3.6740332|
|     1|   1750|   3.5| 2.5577292|
|     1|   1967|   4.0|  3.989893|
|     1|   2140|   4.0| 3.7100496|
|     1|   2174|   4.0| 3.7961955|
|     1|   2692|   3.5| 3.7352972|
|     1|   2761|   3.0| 3.7934666|
|     1|   2762|   4.0| 3.8256044|
|     1|   3037|   3.5| 3.7623856|
+------+-------+------+----------+
only showing top 20 rows



In [15]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root mean square error {rmse}")

Root mean square error 0.8129117387821271


In [16]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)



In [17]:
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{96255, 9.363403...|
|     3|[{96255, 10.23001...|
|     5|[{96255, 18.40707...|
|     6|[{96255, 21.61186...|
|    12|[{96255, 13.90436...|
|    13|[{71017, 14.02533...|
|    15|[{96255, 12.99082...|
|    16|[{72292, 12.74122...|
|    19|[{96255, 14.84307...|
|    20|[{120813, 11.1303...|
|    22|[{96255, 11.96911...|
|    26|[{96255, 14.26548...|
|    27|[{109953, 10.966}...|
|    28|[{73533, 13.54639...|
|    31|[{80825, 15.15677...|
|    34|[{96255, 17.17280...|
|    37|[{96255, 19.09056...|
|    40|[{98126, 10.91677...|
|    41|[{96255, 10.15501...|
|    43|[{96009, 10.73423...|
+------+--------------------+
only showing top 20 rows



In [18]:
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|      1|[{98128, 7.062815...|
|      3|[{4727, 5.437759}...|
|      5|[{89129, 5.560975...|
|      6|[{72857, 6.373149...|
|      9|[{35000, 5.352175...|
|     12|[{103796, 6.57376...|
|     13|[{40450, 5.646654...|
|     15|[{21640, 6.509919...|
|     16|[{88994, 6.386674...|
|     17|[{84912, 6.588704...|
|     19|[{107804, 7.64825...|
|     20|[{65918, 5.607259...|
|     22|[{25200, 5.810437...|
|     26|[{133404, 5.72753...|
|     27|[{98642, 6.497676...|
|     28|[{101476, 6.41470...|
|     31|[{103795, 5.70314...|
|     34|[{73969, 8.814581...|
|     35|[{33314, 8.430344...|
|     37|[{71023, 9.008652...|
+-------+--------------------+
only showing top 20 rows



In [19]:
test_user = testing.filter(testing['userId']==23).select(['movieId','userId'])

In [20]:
test_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      1|    23|
|    293|    23|
|    318|    23|
|    356|    23|
|    493|    23|
|   1094|    23|
|   1095|    23|
|   1527|    23|
|   1584|    23|
|   1639|    23|
|   1729|    23|
|   1835|    23|
|   2167|    23|
|   2278|    23|
|   2294|    23|
|   2315|    23|
|   2324|    23|
|   2353|    23|
|   2424|    23|
+-------+------+



In [21]:
recommendation = model.transform(test_user)

In [22]:
recommendation.orderBy('prediction',ascending=False).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|   2324|    23| 4.9912987|
|    318|    23|  4.917674|
|   1639|    23|  4.885573|
|    293|    23|  4.752557|
|   2278|    23|  4.396638|
|   2353|    23| 4.3250475|
|    356|    23| 4.2895617|
|   1584|    23| 4.2772045|
|   1527|    23| 4.1323924|
|   1094|    23| 4.1077952|
|   1095|    23| 3.8918378|
|   1729|    23|  3.744111|
|   2167|    23| 3.6783814|
|    493|    23| 3.5285435|
|   1835|    23| 3.5197358|
|   2294|    23| 3.4749477|
|   2424|    23| 3.4253647|
|      1|    23| 3.3861141|
|   2315|    23| 1.0421133|
+-------+------+----------+

