# Import Libraries

In [30]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import pyspark
import numpy as np
from pyspark.sql.functions import col, explode

In [32]:
ratings_df=pd.read_csv("data/ratings.csv")

In [33]:
movies_df=pd.read_csv("data/updated_movies.csv")

In [35]:
samples_movie_df=movies_df[((movies_df['avg_user_rating']>=4)& 
                                   (movies_df['year']>=1980))]


In [36]:
sample_ratings_df=ratings_df.merge(samples_movie_df,on="movieId")

In [37]:
sample_ratings_df.count()

movieId             2595527
title_x             2595527
genres_x            2595527
userId              2595527
rating              2595527
rating_timestamp    2595527
tag                  108644
tags_timestamp       108644
title_y             2595527
genres_y            2595527
avg_user_rating     2595527
year                2595527
comb                2595527
dtype: int64

# Initiate Spark Session

In [19]:
scSpark = SparkSession \
        .builder \
        .getOrCreate()


In [20]:
# Load data into spark dataframe
ratings = scSpark.read\
            .option("inferSchema", "true")\
            .csv("data/ratings.csv", header=True, sep=",")

In [21]:
# Check dataframe schema
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [22]:
#Check for first 20 records
ratings.show(20,False)

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |296    |5.0   |1147880044|
|1     |306    |3.5   |1147868817|
|1     |307    |5.0   |1147868828|
|1     |665    |5.0   |1147878820|
|1     |899    |3.5   |1147868510|
|1     |1088   |4.0   |1147868495|
|1     |1175   |3.5   |1147868826|
|1     |1217   |3.5   |1147878326|
|1     |1237   |5.0   |1147868839|
|1     |1250   |4.0   |1147868414|
|1     |1260   |3.5   |1147877857|
|1     |1653   |4.0   |1147868097|
|1     |2011   |2.5   |1147868079|
|1     |2012   |2.5   |1147868068|
|1     |2068   |2.5   |1147869044|
|1     |2161   |3.5   |1147868609|
|1     |2351   |4.5   |1147877957|
|1     |2573   |4.0   |1147878923|
|1     |2632   |5.0   |1147878248|
|1     |2692   |5.0   |1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [23]:
#drop timestamp
ratings = ratings.drop(*['timestamp'])

In [24]:
ratings.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    296|   5.0|
|     1|    306|   3.5|
|     1|    307|   5.0|
|     1|    665|   5.0|
|     1|    899|   3.5|
|     1|   1088|   4.0|
|     1|   1175|   3.5|
|     1|   1217|   3.5|
|     1|   1237|   5.0|
|     1|   1250|   4.0|
|     1|   1260|   3.5|
|     1|   1653|   4.0|
|     1|   2011|   2.5|
|     1|   2012|   2.5|
|     1|   2068|   2.5|
|     1|   2161|   3.5|
|     1|   2351|   4.5|
|     1|   2573|   4.0|
|     1|   2632|   5.0|
|     1|   2692|   5.0|
+------+-------+------+
only showing top 20 rows



In [25]:
#Checking the number of records in dataframe
ratings.count()

25000095

In [26]:
#Read movies data csv created in ETL into dataframe
movies_df=pd.read_csv("data/updated_movies.csv")

In [29]:
samples_movie_df=movies_df[((movies_df['avg_user_rating']>=4)& 
                                   (movies_df['year']>=1980))]
#df_inner = df1.join(df2, on=['Roll_No'], how='inner')
samples_movie_df.head()

Unnamed: 0,movieId,title,genres,avg_user_rating,year,comb
27,28,Persuasion (1995),Drama Romance,4.03,1995,Drama Romance 4.03 1995
46,47,Seven (a.k.a. Se7en) (1995),Mystery Thriller,4.079166,1995,Mystery Thriller 4.0792 1995
49,50,"Usual Suspects, The (1995)",Crime Mystery Thriller,4.284353,1995,Crime Mystery Thriller 4.2844 1995
108,110,Braveheart (1995),Action Drama War,4.002273,1995,Action Drama War 4.0023 1995
160,162,Crumb (1994),Documentary,4.008077,1994,Documentary 4.0081 1994


In [28]:
sample_ratings=ratings.join(samples_movie_df,on=['movieId'], how='inner')

AttributeError: 'DataFrame' object has no attribute '_jdf'

In [None]:
sample_ratings.count()

In [9]:
#Reducing the size of dataset initially to train model
sample_ratings_df = ratings.sample(fraction=0.001, seed=42)
sample_ratings_df.count()

25273

In [10]:
sample_ratings_df.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    58|    527|   5.0|
|    59|    249|   3.5|
|    59|   3996|   4.5|
|    62|  54286|   4.0|
|    70|   5445|   4.0|
|    72|   2405|   2.0|
|    72|   3917|   4.0|
|    80|    891|   2.0|
|    80|   3174|   3.0|
|    84|  40629|   3.5|
|    98|   4912|   4.0|
|   101|   1569|   3.0|
|   114|  79132|   5.0|
|   117|   4447|   5.0|
|   119|  79132|   1.0|
|   120|   2058|   5.0|
|   130|      6|   3.0|
|   142|   1247|   4.5|
|   145|   1194|   5.0|
|   146|  69757|   3.0|
+------+-------+------+
only showing top 20 rows



In [11]:
spark = pyspark.sql.SparkSession \
    .builder \
    .appName("spark_lens") \
    .master("local[2]") \
    .getOrCreate()

In [12]:
# Register temp table.
#checking for nulls 
sample_ratings_df.registerTempTable('rating')
# Query table for number of nulls.
spark.sql('''
    SELECT COUNT(rating) AS nulls
    FROM rating 
    WHERE rating=null
''').show()

+-----+
|nulls|
+-----+
|    0|
+-----+



# Build model

In [13]:
#Split Training and Test data
training, test = sample_ratings_df.randomSplit([0.8, 0.2])

In [14]:
training.count()

20276

In [15]:
#Create Basic Model
als = ALS(nonnegative=True, implicitPrefs=False, coldStartStrategy="drop")\
.setMaxIter(5)\
.setRegParam(0.01)\
.setUserCol("userId")\
.setItemCol("movieId")\
.setRatingCol("rating")\


# Confirm that a model called "als" was created
type(als)

pyspark.ml.recommendation.ALS

In [16]:
als.explainParams()


"alpha: alpha for implicit preference (default: 1.0)\nblockSize: block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data. (default: 4096)\ncheckpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)\ncoldStartStrategy: strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: 'nan', 'drop'. (default: nan, current: drop)\nfinalStorageLevel: StorageLevel for ALS model factors. (default: MEMORY_AND_DISK)\nimplicitPrefs: whether to use implicit preference (default: False, current: False)\nintermediate

In [17]:
# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()
           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  16


In [18]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

CrossValidator_6ee5fd121342


In [19]:
#Fit cross validator to the 'train' dataset
model = cv.fit(training)

#Extract best model from the cv model above
alsmodel = model.bestModel

In [20]:
# Print best_model
print(type(alsmodel))

# Complete the code below to extract the ALS model parameters
print("**Best Model**")

# # Print "Rank"
print("  Rank:", alsmodel._java_obj.parent().getRank())

# Print "MaxIter"
print("  MaxIter:", alsmodel._java_obj.parent().getMaxIter())

# Print "RegParam"
print("  RegParam:", alsmodel._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 150
  MaxIter: 5
  RegParam: 0.01


In [21]:
# View the predictions
test_predictions = alsmodel.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

2.0734268767244757


In [22]:
test_predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|154155|    471|   1.0|0.72159994|
|117811|   1088|   1.0| 2.1162229|
| 93792|   1088|   2.5|  2.684103|
| 78266|   1580|   3.0| 2.7079184|
| 63752|   1580|   3.5|  3.931393|
|114620|   1580|   2.0| 2.0368276|
| 84238|   2366|   2.5|  2.642948|
|157814|   3175|   3.5| 1.8547676|
|117149|   6620|   4.5| 1.7417858|
| 23704|  44022|   4.0| 1.0748026|
|107576|  96488|   3.5| 0.5121748|
|120611|   1084|   3.5| 2.3217304|
|  8528|   1127|   2.0| 2.1911337|
|134837|   2387|   3.0| 0.7957061|
| 82177|   2580|   4.0| 1.9651451|
|  5615|  48780|   4.0| 2.2055964|
|148362|  48780|   3.5| 1.0791944|
| 78229|  69481|   2.0| 1.8774968|
|141420| 196889|   3.0| 1.3115584|
| 51061|   1270|   3.0| 3.0732176|
+------+-------+------+----------+
only showing top 20 rows



In [24]:
nrecoomendations=alsmodel.recommendForAllUsers(5)
nrecoomendations.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|  7340|[[661, 1.999686],...|
| 30970|[[2424, 2.9995534...|
| 35820|[[953, 3.9996536]...|
| 56110|[[2959, 4.9993634...|
| 68610|[[5060, 4.9995394...|
| 95080|[[1208, 4.0737495...|
|108460|[[107, 4.999004],...|
|160820|[[2959, 5.089485]...|
| 46521|[[1207, 3.9996793...|
| 47501|[[1387, 4.932577]...|
| 57201|[[37729, 2.999528...|
| 74251|[[1220, 5.6360917...|
| 83861|[[223, 3.0247295]...|
|103011|[[1213, 3.5473666...|
|112971|[[3000, 3.2555804...|
|124861|[[34, 3.9995875],...|
|126191|[[919, 1.4998838]...|
|128131|[[316, 2.9996943]...|
|131811|[[1584, 3.3589978...|
|145011|[[6539, 2.999699]...|
+------+--------------------+
only showing top 20 rows



In [33]:
nrecoomendations = nrecoomendations\
    .withColumn("rec_exp", explode("recommendations"))\
    .select('userId', col("rec_exp.movieId"), col("rec_exp.rating"))

nrecoomendations.limit(10).show()

+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|  7340|    661| 1.999686|
|  7340|    527|1.7850212|
|  7340|   1387|1.7770323|
|  7340|   1291|1.7741535|
|  7340|   1220|1.7616001|
| 30970|   2424|2.9995534|
| 30970|    318|2.7798882|
| 30970|   1193| 2.680576|
| 30970|  68358|2.6383505|
| 30970|    527|2.6167846|
+------+-------+---------+



In [35]:
# Load data into spark dataframe
movies = scSpark.read\
            .option("inferSchema", "true")\
            .csv("data/movies.csv", header=True, sep=",")

In [42]:
ratings.join(movies, on='movieId').filter('userId = 7340').sort('rating', ascending=False).limit(10).show()

+-------+------+------+--------------------+--------------------+
|movieId|userId|rating|               title|              genres|
+-------+------+------+--------------------+--------------------+
|    946|  7340|   5.0|To Be or Not to B...|    Comedy|Drama|War|
|   3996|  7340|   5.0|Crouching Tiger, ...|Action|Drama|Romance|
|   1221|  7340|   5.0|Godfather: Part I...|         Crime|Drama|
|    750|  7340|   5.0|Dr. Strangelove o...|          Comedy|War|
|   1232|  7340|   5.0|      Stalker (1979)|Drama|Mystery|Sci-Fi|
|    904|  7340|   5.0|  Rear Window (1954)|    Mystery|Thriller|
|   1265|  7340|   5.0|Groundhog Day (1993)|Comedy|Fantasy|Ro...|
|    926|  7340|   5.0|All About Eve (1950)|               Drama|
|   2019|  7340|   5.0|Seven Samurai (Sh...|Action|Adventure|...|
|   2599|  7340|   5.0|     Election (1999)|              Comedy|
+-------+------+------+--------------------+--------------------+



In [52]:
nrecoomendations.join(movies, on='movieId').filter('userId=7340').sort('rating', ascending=False).limit(10).show()

+-------+------+---------+--------------------+--------------------+
|movieId|userId|   rating|               title|              genres|
+-------+------+---------+--------------------+--------------------+
|    661|  7340| 1.999686|James and the Gia...|Adventure|Animati...|
|    527|  7340|1.7850212|Schindler's List ...|           Drama|War|
|   1387|  7340|1.7770323|         Jaws (1975)|       Action|Horror|
|   1291|  7340|1.7741535|Indiana Jones and...|    Action|Adventure|
|   1220|  7340|1.7616001|Blues Brothers, T...|Action|Comedy|Mus...|
+-------+------+---------+--------------------+--------------------+



In [45]:
import pickle

In [50]:
#Exporting model into a .pickl file for use in front-end
nrecoomendations.toPandas().to_csv("collab_filter_recommendations.csv")

In [51]:
movies.toPandas().to_csv("collab_filter_movies.csv")