In [1]:
# Dependencies: the Spark session entry point, the ALS recommender and the
# regression evaluator used to score its predictions.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession  # needed to use Spark DataFrames

# Obtain a session for this notebook (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName("recommender")
    .getOrCreate()
)

In [2]:
# Load the ratings CSV: the first line is treated as the header and the
# column types are inferred from the data.
ratings_path = '/FileStore/tables/ratings.csv'
df = spark.read.csv(ratings_path, header=True, inferSchema=True)

In [3]:
# Explore the data: print the DataFrame's schema (column names and types).
df.printSchema()

In [4]:
# Keep only the columns the recommender needs; this leaves out the
# timestamp column.
rating_columns = ('userId', 'movieId', 'rating')
df = df.select(*rating_columns)

In [5]:
# Peek at the first 5 rows (head returns them as Row objects).
df.head(5)

In [6]:
# Print the leading rows as a formatted table.
df.show()

In [7]:
# Summary statistics for the columns, printed as a table.
df.describe().show()

In [8]:
# Create evaluation data: hold out roughly 20% of the ratings for testing.
# randomSplit samples rows randomly, so a fixed seed is supplied; without it
# the train/test partition (and every metric computed from it) changes on
# each run.
training, test = df.randomSplit([0.8, 0.2], seed=42)

In [9]:
# Create the model: an ALS (alternating least squares) recommender over the
# userId / movieId / rating columns.
# The latent factors are initialised randomly, so a fixed seed is supplied
# to make training (and the resulting RMSE) repeatable between runs.
als = ALS(
    maxIter=5,        # number of ALS iterations
    regParam=0.01,    # regularisation strength
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    seed=42,
)

In [10]:
# Train the model: fit the ALS estimator on the training split.
model = als.fit(training)

In [11]:
# Get predictions: score the held-out test split with the fitted model.
predictions = model.transform(test)

In [12]:
# Inspect the predictions via summary statistics.
# NOTE(review): ALS can yield NaN predictions for users/movies absent from the
# training split (cold start) — presumably why they are checked here; confirm.
predictions.describe().show()

In [13]:
# Discard rows containing a null/NaN value, i.e. rows without a usable
# prediction.
predictions = predictions.dropna()

In [14]:
# Re-check the summary statistics now that incomplete rows have been dropped.
predictions.describe().show()

In [15]:
# Evaluate: root-mean-squared error of the predicted ratings against the
# true ratings in the test split.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)
rmse  # bare last expression so the notebook displays the value