In [1]:
import findspark

In [2]:
findspark.init()

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS

In [6]:
#Create session
appName = "Recommender System in Spark"
spark = SparkSession \
.builder \
.appName(appName) \
.config("spark.some.config.option","some-value") \
.getOrCreate()

In [9]:
#Read file into dataframe using automatically inferred schema
ratings = spark.read.csv('C:/Users/aayushi srivastava/Documents/AayushiSrivastavaJobSearch/PySparkProjects/dataset/ratings.csv',inferSchema=True,header=True)
movies = spark.read.csv('C:/Users/aayushi srivastava/Documents/AayushiSrivastavaJobSearch/PySparkProjects/dataset/movies.csv',inferSchema=True,header=True)
#merge movies and ratings dataframe based on movieId
ratings.join(movies, "movieId").show(3)

+-------+------+------+----------+--------------------+--------------------+
|movieId|userId|rating| timestamp|               title|              genres|
+-------+------+------+----------+--------------------+--------------------+
|     31|     1|   2.5|1260759144|Dangerous Minds (...|               Drama|
|   1029|     1|   3.0|1260759179|        Dumbo (1941)|Animation|Childre...|
|   1061|     1|   3.0|1260759182|     Sleepers (1996)|            Thriller|
+-------+------+------+----------+--------------------+--------------------+
only showing top 3 rows



In [10]:
#data preparation
#use only column data of "userid","movieId" and "rating"
data = ratings.select("userId", "movieId", "rating")
#divide data 70% for training and 30% for testing
splits = data.randomSplit([0.7,0.3])
train = splits[0].withColumnRenamed("rating","label")
test = splits[1].withColumnRenamed("rating","trueLabel")

#calculate number of rows
train_rows = train.count()
test_rows = test.count()

print("Number of training data rows:",train_rows,",Number of testing data rows:",test_rows)

Number of training data rows: 70038 ,Number of testing data rows: 29966


In [11]:
#Define model and train it
#define ALS(Alternating Least Square) as our recommender system
als = ALS(maxIter=19, regParam=0.01, userCol="userId",itemCol = "movieId", ratingCol = "label")
#train our ALS model
model = als.fit(train)
print("Training is done")

Training is done


In [12]:
#Predict Testing Data
prediction = model.transform(test)
print("Testing is done")

Testing is done


In [13]:
prediction.join(movies,"movieId").select(
"userId","title","prediction","trueLabel").show(n=10,truncate=False)

+------+---------------------------+----------+---------+
|userId|title                      |prediction|trueLabel|
+------+---------------------------+----------+---------+
|232   |Guilty as Sin (1993)       |3.776878  |4.0      |
|452   |Guilty as Sin (1993)       |2.7463377 |2.0      |
|311   |Guilty as Sin (1993)       |3.3679936 |3.0      |
|126   |Hudsucker Proxy, The (1994)|4.1146755 |5.0      |
|602   |Hudsucker Proxy, The (1994)|5.495386  |3.0      |
|274   |Hudsucker Proxy, The (1994)|2.8339715 |5.0      |
|440   |Hudsucker Proxy, The (1994)|2.955805  |3.0      |
|86    |Hudsucker Proxy, The (1994)|4.448243  |4.0      |
|299   |Hudsucker Proxy, The (1994)|4.7063327 |4.5      |
|309   |Hudsucker Proxy, The (1994)|4.5271063 |4.0      |
+------+---------------------------+----------+---------+
only showing top 10 rows



In [14]:
#Evaluate the accuracy of our model
#import RegressionEvaluator since we also want to calculate RMSE(Root Mean Square Error)
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
labelCol = "trueLabel", predictionCol = "prediction", metricName = "rmse")
rmse = evaluator.evaluate(prediction)
print("Root Mean Square Error (RMSE)", rmse)

Root Mean Square Error (RMSE) nan


In [15]:
prediction.count()
a = prediction.count()
print("Number of original data rows:",a)
#drop rows with any missing data
cleanPred = prediction.dropna(how="any",subset=["prediction"])
b = cleanPred.count()
print("Number of rows after dropping data wit missing rows:",b)
print("Number of missing data:", a-b)

Number of original data rows: 29966
Number of rows after dropping data wit missing rows: 28797
Number of missing data: 1169


In [16]:
rmse = evaluator.evaluate(cleanPred)
print("Root Mean Square Error: ",rmse)


Root Mean Square Error:  1.2551822384919467
