In [1]:
#Importing Google Drive to access Dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [39]:
#!pip install pyspark

In [4]:
import os
import pyspark
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('movielens').getOrCreate()


In [6]:
from pyspark.sql.functions import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# **Loading Dataset**

In [7]:
movies = spark.read.csv("/content/drive/My Drive/Projects/MovieLens/movies.csv",inferSchema=True,header=True)

In [8]:
movies.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [9]:
movies.show(10)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
+-------+--------------------+--------------------+
only showing top 10 rows



In [40]:
movies.describe().show()

+-------+------------------+--------------------+------------------+
|summary|           movieId|               title|            genres|
+-------+------------------+--------------------+------------------+
|  count|             62423|               62423|             62423|
|   mean|122220.38764557935|                null|              null|
| stddev| 63264.74484425327|                null|              null|
|    min|                 1|"""BLOW THE NIGHT...|(no genres listed)|
|    max|            209171|     줄탁동시 (2012)|           Western|
+-------+------------------+--------------------+------------------+



#### There are a total of 62k movies in the dataset

In [10]:
ratings = spark.read.csv("/content/drive/My Drive/Projects/MovieLens/ratings.csv",inferSchema=True,header=True)
ratings.show(10)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
+------+-------+------+----------+
only showing top 10 rows



In [11]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [12]:
ratings.describe().show()

+-------+-----------------+------------------+------------------+--------------------+
|summary|           userId|           movieId|            rating|           timestamp|
+-------+-----------------+------------------+------------------+--------------------+
|  count|         25000095|          25000095|          25000095|            25000095|
|   mean|81189.28115381162|21387.981943268616| 3.533854451353085|1.2156014431215513E9|
| stddev|46791.71589745776| 39198.86210105973|1.0607439611423535| 2.268758080595386E8|
|    min|                1|                 1|               0.5|           789652009|
|    max|           162541|            209171|               5.0|          1574327703|
+-------+-----------------+------------------+------------------+--------------------+



#### There are close to 25m user ratings in the dataset

# **Exploratory Data Analysis**

##### Movies can have 3-4 different generes, so first I want to check different types of generes and thier count in the dataset

In [14]:
movies.withColumn("genres",explode(split("genres","[|]"))).groupBy("genres").count().show()

+------------------+-----+
|            genres|count|
+------------------+-----+
|             Crime| 5319|
|           Romance| 7719|
|          Thriller| 8654|
|         Adventure| 4145|
|             Drama|25606|
|               War| 1874|
|       Documentary| 5605|
|           Fantasy| 2731|
|           Mystery| 2925|
|           Musical| 1054|
|         Animation| 2929|
|         Film-Noir|  353|
|(no genres listed)| 5062|
|              IMAX|  195|
|            Horror| 5989|
|           Western| 1399|
|            Comedy|16870|
|          Children| 2935|
|            Action| 7348|
|            Sci-Fi| 3595|
+------------------+-----+



Their are 18 different types fo generes and one with no generes. There are 5062 movies with no genere

Most fetured genere is DRAMA followed by COMEDY

### Most Popular Movies

#### These are the movies with the highest number of ratings

In [15]:
popular = ratings.groupBy("movieId").agg(count("userId")).withColumnRenamed("count(userId)",
                                                                            "num_ratings").sort(desc("num_ratings"))

In [16]:
popular.show(10)

+-------+-----------+
|movieId|num_ratings|
+-------+-----------+
|    356|      81491|
|    318|      81482|
|    296|      79672|
|    593|      74127|
|   2571|      72674|
|    260|      68717|
|    480|      64144|
|    527|      60411|
|    110|      59184|
|   2959|      58773|
+-------+-----------+
only showing top 10 rows



Joining the popular dataset with movies dataset to get the names of most popular movies

In [17]:
mostpopular_movies = popular.join(movies, popular.movieId == movies.movieId).sort(desc("num_ratings"))

In [18]:
mostpopular_movies.show(10,truncate= False)

+-------+-----------+-------+-----------------------------------------+--------------------------------+
|movieId|num_ratings|movieId|title                                    |genres                          |
+-------+-----------+-------+-----------------------------------------+--------------------------------+
|356    |81491      |356    |Forrest Gump (1994)                      |Comedy|Drama|Romance|War        |
|318    |81482      |318    |Shawshank Redemption, The (1994)         |Crime|Drama                     |
|296    |79672      |296    |Pulp Fiction (1994)                      |Comedy|Crime|Drama|Thriller     |
|593    |74127      |593    |Silence of the Lambs, The (1991)         |Crime|Horror|Thriller           |
|2571   |72674      |2571   |Matrix, The (1999)                       |Action|Sci-Fi|Thriller          |
|260    |68717      |260    |Star Wars: Episode IV - A New Hope (1977)|Action|Adventure|Sci-Fi         |
|480    |64144      |480    |Jurassic Park (1993)      

#### These are 10 highly rated movies by users. List includes movies like ***Forest Gump, Pulp Fiction, Jurassic Park and Starwars*** ***!!!***

### Top Rated Movie

#### These are the movies with the highest average rating

In [19]:
top_rated = ratings.groupBy("movieId").agg(count("userId"),
                                           avg(col("rating"))).withColumnRenamed("count(userId)",
                                                                                 "num_ratings").withColumnRenamed("avg(rating)", "avg_rating")

In [20]:
top_rated_movies = top_rated.join(movies, top_rated.movieId == movies.movieId).sort(desc("avg_rating"),
                                                                                    desc("num_ratings"))
top_rated_movies.where("num_ratings > 5000").show(10)

+-------+-----------+------------------+-------+--------------------+--------------------+
|movieId|num_ratings|        avg_rating|movieId|               title|              genres|
+-------+-----------+------------------+-------+--------------------+--------------------+
|    318|      81482| 4.413576004516335|    318|Shawshank Redempt...|         Crime|Drama|
|    858|      52498| 4.324336165187245|    858|Godfather, The (1...|         Crime|Drama|
|     50|      55366| 4.284353213163313|     50|Usual Suspects, T...|Crime|Mystery|Thr...|
|   1221|      34188|4.2617585117585115|   1221|Godfather: Part I...|         Crime|Drama|
|   2019|      13367|  4.25476920775043|   2019|Seven Samurai (Sh...|Action|Adventure|...|
|    527|      60411| 4.247579083279535|    527|Schindler's List ...|           Drama|War|
|   1203|      16569| 4.243014062405697|   1203| 12 Angry Men (1957)|               Drama|
|    904|      20162| 4.237947624243627|    904|  Rear Window (1954)|    Mystery|Thriller|

#### This gives us top rated movie with atleast rated by 5000 users. This liost include movies like ***SHAWSHANK REDEMPTION, GODFATHER, and FIGHT CLUB !!!***

### Top Love Or Hate movies

#### These are the movies with the maximum polarising ratings. Movies with max mixed ratings so movines with max stdev in ratings

In [21]:
ratings_stddev = ratings.groupBy("movieId").agg(count("userId").alias("num_ratings"),
                                                avg(col("rating")).alias("avg_rating"),
                                                stddev(col("rating")).alias("std_rating")).where("num_ratings > 500")

In [22]:
lovehate_movies = ratings_stddev.join(movies, ratings_stddev.movieId == movies.movieId).sort(desc("std_rating"))

In [23]:
lovehate_movies.show(10)

+-------+-----------+------------------+------------------+-------+--------------------+--------------------+
|movieId|num_ratings|        avg_rating|        std_rating|movieId|               title|              genres|
+-------+-----------+------------------+------------------+-------+--------------------+--------------------+
|  74754|        670| 2.403731343283582|1.6649650528666515|  74754|    Room, The (2003)|Comedy|Drama|Romance|
|  62912|        611|2.5106382978723403|1.4888552380190527|  62912|High School Music...|             Musical|
|  98203|       1569|2.5242192479286167|1.4560043846864676|  98203|Twilight Saga: Br...|Adventure|Drama|F...|
|  27899|        616|2.7767857142857144| 1.445519028350731|  27899|What the #$*! Do ...|Comedy|Documentar...|
|  91104|       1896|2.3285864978902953| 1.442346851125679|  91104|Twilight Saga: Br...|Adventure|Drama|F...|
|   1924|       2210| 2.613348416289593| 1.417228312413465|   1924|Plan 9 from Outer...|       Horror|Sci-Fi|
|  78772| 

These are popular movies with divided opinions(which is max stdev in ratings). These include ***Twilight movies. Rightlt so!!!!***

# **Data Modeling**

In [24]:
data = movies.join(ratings, movies.movieId == ratings.movieId).drop(ratings.movieId)

In [25]:
data.show(10,truncate = False)

+-------+------------------------------------------------+---------------------------+------+------+----------+
|movieId|title                                           |genres                     |userId|rating|timestamp |
+-------+------------------------------------------------+---------------------------+------+------+----------+
|296    |Pulp Fiction (1994)                             |Comedy|Crime|Drama|Thriller|1     |5.0   |1147880044|
|306    |Three Colors: Red (Trois couleurs: Rouge) (1994)|Drama                      |1     |3.5   |1147868817|
|307    |Three Colors: Blue (Trois couleurs: Bleu) (1993)|Drama                      |1     |5.0   |1147868828|
|665    |Underground (1995)                              |Comedy|Drama|War           |1     |5.0   |1147878820|
|899    |Singin' in the Rain (1952)                      |Comedy|Musical|Romance     |1     |3.5   |1147868510|
|1088   |Dirty Dancing (1987)                            |Drama|Musical|Romance      |1     |4.0   |1147

In [26]:
# Splitting the ratings dataset to 80/20 ratio for trainings
(training, test) = ratings.randomSplit([0.8, 0.2],seed= 42)

## Recommendation model using ALS

In [28]:

als = ALS(maxIter=5, regParam=0.1,rank = 5,userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop",
          nonnegative = True,
          implicitPrefs = False,
          )


model = als.fit(training)

In [29]:
# Test set Predictions 

predictions = model.transform(test)

In [30]:
predictions.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
| 26480|    148|   2.0| 915406133| 2.3129663|
|151614|    148|   1.0| 878170956| 2.7617154|
| 28229|    148|   1.0| 833850593| 2.3704815|
|  6491|    148|   4.0|1500217059|  2.448495|
| 14831|    148|   3.0| 944148276| 2.8325822|
|145182|    148|   3.0| 944952722| 2.6657481|
| 69123|    148|   4.5|1086929593|  2.391535|
|104825|    148|   4.0| 950909863| 3.1199923|
| 41703|    148|   2.0|1311022737| 2.9890924|
| 29213|    148|   5.0|1366840729| 2.5171993|
| 75209|    148|   2.0|1361853682| 2.5997055|
|115912|    148|   3.0| 947795986| 2.6461463|
|118261|    148|   3.0| 833904940|  3.005985|
|138552|    148|   4.0| 829756906| 3.3669233|
| 70733|    148|   1.0| 837770520| 2.8460937|
|115095|    148|   4.0| 944930740| 3.3398237|
|  7223|    148|   3.0| 839813031|  2.517957|
| 65981|    148|   3.5|1453259300| 3.0118756|
| 74794|    148|   3.0| 989050056|

# **Model Evaluation**

In [31]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.8254095716645771


### RMSE of the model is 0.8 

### Testing for a random user

In [46]:
test.where("userId == 10").show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|    10|    193|   1.0|1227570860|
|    10|    356|   4.5|1227571313|
|    10|    457|   3.5|1227571337|
|    10|   1196|   5.0|1227571380|
|    10|   2094|   1.5|1227570889|
|    10|   2826|   2.0|1227571065|
|    10|   5049|   4.0|1227571186|
|    10|   5508|   3.0|1227571111|
|    10|  52767|   4.5|1227571105|
+------+-------+------+----------+



In [47]:
single_user = test.filter(test['userId']==10).select(['movieId','userId'])

In [48]:
reccomendations = model.transform(single_user)

In [49]:
reccomendations.orderBy('prediction',ascending=False).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|   1196|    10|  4.109928|
|  52767|    10| 3.6272323|
|    356|    10| 3.5729246|
|    457|    10| 3.5412784|
|   5508|    10|  3.503428|
|   5049|    10| 3.3510597|
|   2826|    10| 3.1149595|
|   2094|    10| 2.8638248|
|    193|    10| 2.0066993|
+-------+------+----------+

