# Import Libraries

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import pyspark
import numpy as np
from pyspark.sql.functions import col, explode

In [2]:
ratings_df=pd.read_csv("data/ratings.csv")

In [3]:
movies_df=pd.read_csv("data/updated_movies.csv")

In [4]:
samples_movie_df=movies_df[movies_df["genres"]!="UnKnown"]

In [5]:
samples_movie_df=samples_movie_df[((samples_movie_df['avg_user_rating']>=4)& 
                                   (samples_movie_df['year']>=1980))]


In [6]:
sample_ratings_df=ratings_df.merge(samples_movie_df,on="movieId")

In [7]:
sample_ratings_df.count()

userId             2506487
movieId            2506487
rating             2506487
timestamp          2506487
title              2506487
genres             2506487
avg_user_rating    2506487
year               2506487
comb               2506487
dtype: int64

In [8]:
sample_ratings_df.to_csv("data/sample_ratings_df.csv")

# Initiate Spark Session

In [9]:
scSpark = SparkSession \
        .builder \
        .master("local[*]") \
        .config("spark.driver.memory", "6g") \
        .getOrCreate()


In [10]:
# Load data into spark dataframe
ratings = scSpark.read\
            .option("inferSchema", "true")\
            .csv("data/sample_ratings_df.csv", header=True, sep=",")

In [11]:
# Check dataframe schema
ratings.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- avg_user_rating: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- comb: string (nullable = true)



In [12]:
#Check for first 20 records
ratings.show(20,False)

+---+------+-------+------+----------+-------------------+---------------------------+-----------------+----+---------------------------------------+
|_c0|userId|movieId|rating|timestamp |title              |genres                     |avg_user_rating  |year|comb                                   |
+---+------+-------+------+----------+-------------------+---------------------------+-----------------+----+---------------------------------------+
|0  |1     |296    |5.0   |1147880044|Pulp Fiction (1994)|Comedy Crime Drama Thriller|4.188912039361382|1994|Comedy Crime Drama Thriller 4.1889 1994|
|1  |3     |296    |5.0   |1439474476|Pulp Fiction (1994)|Comedy Crime Drama Thriller|4.188912039361382|1994|Comedy Crime Drama Thriller 4.1889 1994|
|2  |4     |296    |4.0   |1573938898|Pulp Fiction (1994)|Comedy Crime Drama Thriller|4.188912039361382|1994|Comedy Crime Drama Thriller 4.1889 1994|
|3  |5     |296    |4.0   |830786155 |Pulp Fiction (1994)|Comedy Crime Drama Thriller|4.188912039361

In [13]:
#drop timestamp
ratings = ratings.drop(*['timestamp','avg_user_rating','year','comb'])

In [14]:
ratings.show()

+---+------+-------+------+-------------------+--------------------+
|_c0|userId|movieId|rating|              title|              genres|
+---+------+-------+------+-------------------+--------------------+
|  0|     1|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
|  1|     3|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
|  2|     4|    296|   4.0|Pulp Fiction (1994)|Comedy Crime Dram...|
|  3|     5|    296|   4.0|Pulp Fiction (1994)|Comedy Crime Dram...|
|  4|     7|    296|   4.0|Pulp Fiction (1994)|Comedy Crime Dram...|
|  5|     8|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
|  6|    10|    296|   4.5|Pulp Fiction (1994)|Comedy Crime Dram...|
|  7|    12|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
|  8|    13|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
|  9|    14|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 10|    15|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 11|    18|    296|   3.0|Pulp Fi

In [15]:
#Checking the number of records in dataframe
ratings.count()

2506487

In [16]:
#Reducing the size of dataset initially to train model
sample_ratings_df = ratings.sample(fraction=0.02, seed=42)
sample_ratings_df.count()

50174

In [17]:
sample_ratings_df.show()

+----+------+-------+------+-------------------+--------------------+
| _c0|userId|movieId|rating|              title|              genres|
+----+------+-------+------+-------------------+--------------------+
|  16|    26|    296|   3.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 105|   217|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 109|   227|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 185|   378|    296|   3.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 201|   413|    296|   3.5|Pulp Fiction (1994)|Comedy Crime Dram...|
| 236|   494|    296|   4.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 255|   541|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 313|   664|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 332|   705|    296|   3.5|Pulp Fiction (1994)|Comedy Crime Dram...|
| 395|   832|    296|   4.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 495|  1038|    296|   5.0|Pulp Fiction (1994)|Comedy Crime Dram...|
| 595|  1224|    296

In [18]:
spark = pyspark.sql.SparkSession \
    .builder \
    .appName("spark_lens") \
    .master("local[2]") \
    .getOrCreate()

In [19]:
# Register temp table.
#checking for nulls 
sample_ratings_df.registerTempTable('rating')
# Query table for number of nulls.
spark.sql('''
    SELECT COUNT(rating) AS nulls
    FROM rating 
    WHERE rating=null
''').show()

+-----+
|nulls|
+-----+
|    0|
+-----+



# Build model

In [20]:
#Split Training and Test data
training, test = sample_ratings_df.randomSplit([0.8, 0.2])

In [21]:
training.count()

40042

In [22]:
#Create Basic Model
als = ALS(nonnegative=True, implicitPrefs=False, coldStartStrategy="drop")\
.setMaxIter(5)\
.setRegParam(0.01)\
.setUserCol("userId")\
.setItemCol("movieId")\
.setRatingCol("rating")\


# Confirm that a model called "als" was created
type(als)

pyspark.ml.recommendation.ALS

In [23]:
als.explainParams()


"alpha: alpha for implicit preference (default: 1.0)\nblockSize: block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data. (default: 4096)\ncheckpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)\ncoldStartStrategy: strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: 'nan', 'drop'. (default: nan, current: drop)\nfinalStorageLevel: StorageLevel for ALS model factors. (default: MEMORY_AND_DISK)\nimplicitPrefs: whether to use implicit preference (default: False, current: False)\nintermediate

In [24]:
# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()
           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  16


In [25]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

CrossValidator_297e50643a6b


In [26]:
#Fit cross validator to the 'train' dataset
model = cv.fit(training)

#Extract best model from the cv model above
alsmodel = model.bestModel

In [27]:
# Print best_model
print(type(alsmodel))

# Complete the code below to extract the ALS model parameters
print("**Best Model**")

# # Print "Rank"
print("  Rank:", alsmodel._java_obj.parent().getRank())

# Print "MaxIter"
print("  MaxIter:", alsmodel._java_obj.parent().getMaxIter())

# Print "RegParam"
print("  RegParam:", alsmodel._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 10
  MaxIter: 5
  RegParam: 0.15


In [28]:
# View the predictions
test_predictions = alsmodel.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

1.6720604233002658


In [29]:
test_predictions.show()

+-------+------+-------+------+--------------------+--------------------+----------+
|    _c0|userId|movieId|rating|               title|              genres|prediction|
+-------+------+-------+------+--------------------+--------------------+----------+
|2461646| 28951|  32460|   4.0|Knockin' on Heave...|Action Comedy Cri...| 3.6830893|
|2461624| 22101|  32460|   3.5|Knockin' on Heave...|Action Comedy Cri...| 4.1876817|
|2461928| 86618|  32460|   5.0|Knockin' on Heave...|Action Comedy Cri...|  2.223307|
|1609925|116151|  48780|   4.0|Prestige, The (2006)|Drama Mystery Sci...| 3.8660324|
|1598676| 36580|  48780|   5.0|Prestige, The (2006)|Drama Mystery Sci...|  2.055814|
|1604632| 78559|  48780|   5.0|Prestige, The (2006)|Drama Mystery Sci...| 3.6710186|
|1615716|157642|  48780|   4.0|Prestige, The (2006)|Drama Mystery Sci...|  3.859186|
|1595983| 18057|  48780|   3.0|Prestige, The (2006)|Drama Mystery Sci...| 2.4594297|
|1616178|160951|  48780|   4.5|Prestige, The (2006)|Drama Mystery

In [30]:
nrecoomendations=alsmodel.recommendForAllUsers(5)
nrecoomendations.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|  1342|[[118268, 5.91144...|
|  1959|[[179499, 9.02109...|
|  2122|[[134796, 1.80805...|
|  2866|[[181699, 9.54523...|
|  3175|[[181699, 9.75988...|
|  3749|[[142881, 6.04344...|
|  3794|[[137052, 9.67334...|
|  5300|[[179499, 7.21687...|
|  5803|[[128862, 8.04416...|
|  6357|[[195211, 9.16718...|
|  7240|[[188173, 7.51155...|
|  7253|[[181699, 9.75988...|
|  9427|[[139088, 8.84895...|
| 10206|[[128862, 5.07594...|
| 11033|[[179499, 9.02109...|
| 11141|[[142881, 9.06517...|
| 13840|[[118268, 9.10882...|
| 15447|[[181699, 4.48308...|
| 15727|[[188173, 6.66739...|
| 16503|[[128862, 7.57408...|
+------+--------------------+
only showing top 20 rows



In [31]:
nrecoomendations = nrecoomendations\
    .withColumn("rec_exp", explode("recommendations"))\
    .select('userId', col("rec_exp.movieId"), col("rec_exp.rating"))

nrecoomendations.limit(10).show()

+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|  1342| 118268|5.9114494|
|  1342| 188173|5.6055145|
|  1342| 184147| 5.301752|
|  1342| 168476| 5.024436|
|  1342| 181699| 4.473131|
|  1959| 179499| 9.021095|
|  1959| 181699| 8.366581|
|  1959| 139088| 8.357357|
|  1959|  26587| 8.194602|
|  1959| 137052|   8.0789|
+------+-------+---------+



In [32]:
# Load data into spark dataframe
movies = scSpark.read\
            .option("inferSchema", "true")\
            .csv("data/movies.csv", header=True, sep=",")

In [33]:
ratings.join(movies, on='movieId').filter('userId = 7340').sort('rating', ascending=False).limit(10).show()

+-------+-------+------+------+--------------------+--------------------+--------------------+--------------------+
|movieId|    _c0|userId|rating|               title|              genres|               title|              genres|
+-------+-------+------+------+--------------------+--------------------+--------------------+--------------------+
|   4973|  97826|  7340|   5.0|Amelie (Fabuleux ...|      Comedy Romance|Amelie (Fabuleux ...|      Comedy|Romance|
|   6016| 182780|  7340|   5.0|City of God (Cida...|Action Adventure ...|City of God (Cida...|Action|Adventure|...|
|    318| 297826|  7340|   5.0|Shawshank Redempt...|         Crime Drama|Shawshank Redempt...|         Crime|Drama|
|  44555|2163096|  7340|   5.0|Lives of Others, ...|Drama Romance Thr...|Lives of Others, ...|Drama|Romance|Thr...|
|   2931|2405328|  7340|   5.0|Time of the Gypsi...|Comedy Crime Dram...|Time of the Gypsi...|Comedy|Crime|Dram...|
|     50| 973471|  7340|   4.5|Usual Suspects, T...|Crime Mystery Thr...

In [34]:
nrecoomendations.join(movies, on='movieId').filter('userId=7340').sort('rating', ascending=False).limit(10).show()

+-------+------+------+-----+------+
|movieId|userId|rating|title|genres|
+-------+------+------+-----+------+
+-------+------+------+-----+------+



In [35]:
#Exporting model into a .pickl file for use in front-end
nrecoomendations.toPandas().to_csv("collab_filter_recommendations.csv")

In [36]:
movies.toPandas().to_csv("collab_filter_movies.csv")