In [1]:

from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('recommender').getOrCreate()

In [2]:
df=spark.read.csv('movie_ratings_df.csv',inferSchema=True,header=True)


In [3]:
print((df.count(), len(df.columns)))


(100000, 3)


In [4]:
df.printSchema()


root
 |-- userId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: integer (nullable = true)



In [5]:
df.groupBy('userId').count().orderBy('count',ascending=False).show(10,False)
df.groupBy('userId').count().orderBy('count',ascending=True).show(10,False)

+------+-----+
|userId|count|
+------+-----+
|405   |737  |
|655   |685  |
|13    |636  |
|450   |540  |
|276   |518  |
|416   |493  |
|537   |490  |
|303   |484  |
|234   |480  |
|393   |448  |
+------+-----+
only showing top 10 rows

+------+-----+
|userId|count|
+------+-----+
|732   |20   |
|631   |20   |
|572   |20   |
|926   |20   |
|93    |20   |
|596   |20   |
|636   |20   |
|34    |20   |
|300   |20   |
|685   |20   |
+------+-----+
only showing top 10 rows



In [6]:
df.groupBy('title').count().orderBy('count',ascending=False).show(10,False)


+-----------------------------+-----+
|title                        |count|
+-----------------------------+-----+
|Star Wars (1977)             |583  |
|Contact (1997)               |509  |
|Fargo (1996)                 |508  |
|Return of the Jedi (1983)    |507  |
|Liar Liar (1997)             |485  |
|English Patient, The (1996)  |481  |
|Scream (1996)                |478  |
|Toy Story (1995)             |452  |
|Air Force One (1997)         |431  |
|Independence Day (ID4) (1996)|429  |
+-----------------------------+-----+
only showing top 10 rows



In [7]:
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer,IndexToString

In [8]:
stringIndexer = StringIndexer(inputCol="title",outputCol="title_new")
model = stringIndexer.fit(df)
indexed = model.transform(df)

In [9]:
indexed.show(10)


+------+------------+------+---------+
|userId|       title|rating|title_new|
+------+------------+------+---------+
|   196|Kolya (1996)|     3|    287.0|
|    63|Kolya (1996)|     3|    287.0|
|   226|Kolya (1996)|     5|    287.0|
|   154|Kolya (1996)|     3|    287.0|
|   306|Kolya (1996)|     5|    287.0|
|   296|Kolya (1996)|     4|    287.0|
|    34|Kolya (1996)|     5|    287.0|
|   271|Kolya (1996)|     4|    287.0|
|   201|Kolya (1996)|     4|    287.0|
|   209|Kolya (1996)|     4|    287.0|
+------+------------+------+---------+
only showing top 10 rows



In [10]:
indexed.groupBy('title_new').count().orderBy('count',ascending=False).show(10,False)


+---------+-----+
|title_new|count|
+---------+-----+
|0.0      |583  |
|1.0      |509  |
|2.0      |508  |
|3.0      |507  |
|4.0      |485  |
|5.0      |481  |
|6.0      |478  |
|7.0      |452  |
|8.0      |431  |
|9.0      |429  |
+---------+-----+
only showing top 10 rows



In [11]:
train,test=indexed.randomSplit([0.75,0.25])
train.count()
test.count()

25169

In [12]:
from pyspark.ml.recommendation import ALS
rec=ALS(maxIter=10,regParam=0.01,userCol='userId',itemCol='title_new',ratingCol='rating',nonnegative=True,coldStartStrategy="drop")
rec_model=rec.fit(train)

In [13]:
predicted_ratings=rec_model.transform(test)
predicted_ratings.printSchema()
predicted_ratings.orderBy(rand()).show(10)
from pyspark.ml.evaluation import RegressionEvaluator
evaluator=RegressionEvaluator(metricName='rmse',predictionCol='prediction',labelCol='rating')
rmse=evaluator.evaluate(predicted_ratings)
print(rmse)

root
 |-- userId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- title_new: double (nullable = false)
 |-- prediction: float (nullable = false)

+------+--------------------+------+---------+----------+
|userId|               title|rating|title_new|prediction|
+------+--------------------+------+---------+----------+
|   540|Long Kiss Goodnig...|     3|    132.0| 3.5311048|
|   707| Shining, The (1980)|     2|    113.0| 3.1329446|
|   343|  Stand by Me (1986)|     5|     86.0| 4.1342735|
|    62|Twelve Monkeys (1...|     4|     13.0| 3.7924788|
|   394|  Hard Target (1993)|     1|    701.0| 3.0835025|
|   286|  Real Genius (1985)|     4|    284.0| 4.0237465|
|   409|Father of the Bri...|     3|    535.0|  3.266024|
|   637|Conspiracy Theory...|     4|     39.0|  3.451449|
|   311|Much Ado About No...|     5|    148.0| 3.9249356|
|   601|Young Frankenstei...|     4|    117.0| 4.2269635|
+------+--------------------+------+-----

In [14]:
unique_movies=indexed.select('title_new').distinct()
unique_movies.count()

1664

In [15]:
a = unique_movies.alias('a')


In [16]:
user_id=85


In [17]:
watched_movies=indexed.filter(indexed['userId'] == user_id).select('title_new').distinct()
watched_movies.count()
b=watched_movies.alias('b')