# Import Libraries

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import pyspark
import numpy as np
from pyspark.sql.functions import col, explode

In [2]:
ratings_df=pd.read_csv("data/ratings.csv")

In [3]:
movies_df=pd.read_csv("data/updated_movies.csv")

In [4]:
samples_movie_df=movies_df[((movies_df['avg_user_rating']>=4)& 
                                   (movies_df['year']>=1980))]


In [5]:
sample_ratings_df=ratings_df.merge(samples_movie_df,on="movieId")

In [6]:
sample_ratings_df.count()

userId             2508638
movieId            2508638
rating             2508638
timestamp          2508638
title              2508638
genres             2508638
avg_user_rating    2508638
year               2508638
comb               2508638
dtype: int64

In [7]:
sample_ratings_df.to_csv("data/sample_ratings_df.csv")

# Initiate Spark Session

In [8]:
scSpark = SparkSession \
        .builder \
        .master("local[*]") \
        .config("spark.driver.memory", "6g") \
        .getOrCreate()


In [9]:
# Load data into spark dataframe
ratings = scSpark.read\
            .option("inferSchema", "true")\
            .csv("data/sample_ratings_df.csv", header=True, sep=",")

In [10]:
# Check dataframe schema
ratings.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- avg_user_rating: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- comb: string (nullable = true)



In [11]:
#Check for first 20 records
ratings.show(20,False)

+---+------+-------+------+----------+-------------------+---------------------------+-----------------+----+---------------------------------------+
|_c0|userId|movieId|rating|timestamp |title              |genres                     |avg_user_rating  |year|comb                                   |
+---+------+-------+------+----------+-------------------+---------------------------+-----------------+----+---------------------------------------+
|0  |1     |296    |5.0   |1147880044|Pulp Fiction (1994)|Comedy Crime Drama Thriller|4.188912039361382|1994|Comedy Crime Drama Thriller 4.1889 1994|
|1  |3     |296    |5.0   |1439474476|Pulp Fiction (1994)|Comedy Crime Drama Thriller|4.188912039361382|1994|Comedy Crime Drama Thriller 4.1889 1994|
|2  |4     |296    |4.0   |1573938898|Pulp Fiction (1994)|Comedy Crime Drama Thriller|4.188912039361382|1994|Comedy Crime Drama Thriller 4.1889 1994|
|3  |5     |296    |4.0   |830786155 |Pulp Fiction (1994)|Comedy Crime Drama Thriller|4.188912039361

In [12]:
#drop timestamp
ratings = ratings.drop(*['timestamp','avg_user_rating','year','comb'])

In [13]:
ratings.show()

+---+------+-------+------+-------------------+--------------------+-----------------+----+--------------------+
|_c0|userId|movieId|rating|              title|              genres|  avg_user_rating|year|                comb|
+---+------+-------+------+-------------------+--------------------+-----------------+----+--------------------+
|  0|     1|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comedy Crime Dram...|
|  1|     3|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comedy Crime Dram...|
|  2|     4|    296|   4.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comedy Crime Dram...|
|  3|     5|    296|   4.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comedy Crime Dram...|
|  4|     7|    296|   4.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comedy Crime Dram...|
|  5|     8|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comed

In [14]:
#Checking the number of records in dataframe
ratings.count()

2508638

In [15]:
#Reducing the size of dataset initially to train model
sample_ratings_df = ratings.sample(fraction=0.02, seed=42)
sample_ratings_df.count()

50226

In [16]:
sample_ratings_df.show()

+----+------+-------+------+-------------------+--------------------+-----------------+----+--------------------+
| _c0|userId|movieId|rating|              title|              genres|  avg_user_rating|year|                comb|
+----+------+-------+------+-------------------+--------------------+-----------------+----+--------------------+
|  16|    26|    296|   3.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comedy Crime Dram...|
| 105|   217|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comedy Crime Dram...|
| 109|   227|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comedy Crime Dram...|
| 185|   378|    296|   3.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comedy Crime Dram...|
| 201|   413|    296|   3.5|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1994|Comedy Crime Dram...|
| 236|   494|    296|   4.0|Pulp Fiction (1994)|Comedy Crime Dram...|4.188912039361382|1

In [17]:
spark = pyspark.sql.SparkSession \
    .builder \
    .appName("spark_lens") \
    .master("local[2]") \
    .getOrCreate()

In [18]:
# Register temp table.
#checking for nulls 
sample_ratings_df.registerTempTable('rating')
# Query table for number of nulls.
spark.sql('''
    SELECT COUNT(rating) AS nulls
    FROM rating 
    WHERE rating=null
''').show()

+-----+
|nulls|
+-----+
|    0|
+-----+



# Build model

In [19]:
#Split Training and Test data
training, test = sample_ratings_df.randomSplit([0.8, 0.2])

In [20]:
training.count()

40171

In [21]:
#Create Basic Model
als = ALS(nonnegative=True, implicitPrefs=False, coldStartStrategy="drop")\
.setMaxIter(5)\
.setRegParam(0.01)\
.setUserCol("userId")\
.setItemCol("movieId")\
.setRatingCol("rating")\


# Confirm that a model called "als" was created
type(als)

pyspark.ml.recommendation.ALS

In [22]:
als.explainParams()


"alpha: alpha for implicit preference (default: 1.0)\nblockSize: block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data. (default: 4096)\ncheckpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)\ncoldStartStrategy: strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: 'nan', 'drop'. (default: nan, current: drop)\nfinalStorageLevel: StorageLevel for ALS model factors. (default: MEMORY_AND_DISK)\nimplicitPrefs: whether to use implicit preference (default: False, current: False)\nintermediate

In [23]:
# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()
           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  16


In [24]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

CrossValidator_7bd0d7144f46


In [25]:
#Fit cross validator to the 'train' dataset
model = cv.fit(training)

#Extract best model from the cv model above
alsmodel = model.bestModel

In [26]:
# Print best_model
print(type(alsmodel))

# Complete the code below to extract the ALS model parameters
print("**Best Model**")

# # Print "Rank"
print("  Rank:", alsmodel._java_obj.parent().getRank())

# Print "MaxIter"
print("  MaxIter:", alsmodel._java_obj.parent().getMaxIter())

# Print "RegParam"
print("  RegParam:", alsmodel._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 10
  MaxIter: 5
  RegParam: 0.15


In [27]:
# View the predictions
test_predictions = alsmodel.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

1.6255027249120588


In [28]:
test_predictions.show()

+-------+------+-------+------+--------------------+--------------------+-----------------+----+--------------------+----------+
|    _c0|userId|movieId|rating|               title|              genres|  avg_user_rating|year|                comb|prediction|
+-------+------+-------+------+--------------------+--------------------+-----------------+----+--------------------+----------+
|1597692| 29595|  48780|   5.0|Prestige, The (2006)|Drama Mystery Sci...|4.093231050865188|2006|Drama Mystery Sci...| 2.5641563|
|1598476| 35094|  48780|   4.5|Prestige, The (2006)|Drama Mystery Sci...|4.093231050865188|2006|Drama Mystery Sci...| 2.6400785|
|1607676|100007|  48780|   4.5|Prestige, The (2006)|Drama Mystery Sci...|4.093231050865188|2006|Drama Mystery Sci...| 2.3039634|
|1613577|142494|  48780|   4.0|Prestige, The (2006)|Drama Mystery Sci...|4.093231050865188|2006|Drama Mystery Sci...| 2.0484178|
|1600828| 51589|  48780|   4.0|Prestige, The (2006)|Drama Mystery Sci...|4.093231050865188|2006|D

In [29]:
nrecoomendations=alsmodel.recommendForAllUsers(5)
nrecoomendations.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|  1342|[[171773, 5.80972...|
|  1959|[[165575, 9.34316...|
|  2142|[[165575, 5.14053...|
|  3175|[[166291, 5.61728...|
|  3794|[[132253, 8.81997...|
|  4935|[[172197, 7.07680...|
|  5300|[[165575, 6.81612...|
|  5803|[[158894, 9.78420...|
|  6336|[[165575, 6.74957...|
|  6658|[[116975, 9.19224...|
|  7240|[[171773, 5.17630...|
|  7340|[[165575, 5.09727...|
|  7880|[[165575, 5.14053...|
|  7993|[[171773, 5.80972...|
|  9852|[[175397, 6.13809...|
| 10362|[[170683, 5.64290...|
| 10623|[[171773, 8.29960...|
| 11141|[[150696, 6.46160...|
| 15447|[[165575, 7.83186...|
| 15619|[[165575, 5.81849...|
+------+--------------------+
only showing top 20 rows



In [30]:
nrecoomendations = nrecoomendations\
    .withColumn("rec_exp", explode("recommendations"))\
    .select('userId', col("rec_exp.movieId"), col("rec_exp.rating"))

nrecoomendations.limit(10).show()

+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|  1342| 171773| 5.809721|
|  1342| 165575|5.7393336|
|  1342| 148667|5.7349095|
|  1342| 116975| 5.635259|
|  1342| 168716|  5.58637|
|  1959| 165575| 9.343166|
|  1959| 170683| 8.687488|
|  1959| 196631| 8.687488|
|  1959| 111562| 8.687488|
|  1959| 145060| 8.687488|
+------+-------+---------+



In [31]:
# Load data into spark dataframe
movies = scSpark.read\
            .option("inferSchema", "true")\
            .csv("data/movies.csv", header=True, sep=",")

In [32]:
ratings.join(movies, on='movieId').filter('userId = 7340').sort('rating', ascending=False).limit(10).show()

+-------+-------+------+------+--------------------+--------------------+------------------+----+--------------------+--------------------+--------------------+
|movieId|    _c0|userId|rating|               title|              genres|   avg_user_rating|year|                comb|               title|              genres|
+-------+-------+------+------+--------------------+--------------------+------------------+----+--------------------+--------------------+--------------------+
|   4973|  97826|  7340|   5.0|Amelie (Fabuleux ...|      Comedy Romance| 4.101282051282051|2001|Comedy Romance 4....|Amelie (Fabuleux ...|      Comedy|Romance|
|   6016| 182780|  7340|   5.0|City of God (Cida...|Action Adventure ...|  4.18158741329044|2002|Action Adventure ...|City of God (Cida...|Action|Adventure|...|
|    318| 297826|  7340|   5.0|Shawshank Redempt...|         Crime Drama| 4.413576004516335|1994|Crime Drama 4.413...|Shawshank Redempt...|         Crime|Drama|
|  44555|2163096|  7340|   5.0|Liv

In [33]:
nrecoomendations.join(movies, on='movieId').filter('userId=7340').sort('rating', ascending=False).limit(10).show()

+-------+------+---------+--------------------+--------------------+
|movieId|userId|   rating|               title|              genres|
+-------+------+---------+--------------------+--------------------+
| 165575|  7340|5.0972724|In guerra per amo...|  (no genres listed)|
|  99764|  7340| 4.679859|It's Such a Beaut...|Animation|Comedy|...|
| 168716|  7340|4.6706047|A Gathering of Ca...|  (no genres listed)|
| 150696|  7340| 4.443341|     Tomorrow (2015)|         Documentary|
|   5912|  7340|4.3567977|Hit the Bank (Vab...|        Comedy|Crime|
+-------+------+---------+--------------------+--------------------+



In [34]:
import pickle

In [35]:
#Exporting model into a .pickl file for use in front-end
nrecoomendations.toPandas().to_csv("collab_filter_recommendations_updated.csv")

In [36]:
movies.toPandas().to_csv("collab_filter_movies_updated.csv")