In [1]:
# adapted from: https://github.com/apache/spark/blob/master/examples/src/main/python/als.py

import sys, pyspark

import numpy as np
from numpy.random import rand
from numpy import matrix
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Connect to the standalone Spark master. getOrCreate() returns the already
# running context if there is one, so re-executing this cell in a live kernel
# no longer fails with "Cannot run multiple SparkContexts at once". Note that
# `conf` is only applied when a new context is actually created.
conf = SparkConf().setAppName('Alternating Least Squares').setMaster('spark://sparkmaster:7077')
sc = SparkContext.getOrCreate(conf=conf)

LAMBDA = 0.01   # regularization strength read by update()
np.random.seed(42)  # seed NumPy's global RNG so the rand() draws are repeatable

In [2]:
# List the effective Spark settings as (key, value) pairs via the public
# accessor instead of reaching into the private `_conf` attribute.
sc.getConf().getAll()

[('spark.driver.host', 'jupyterlab'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.port', '43609'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.master', 'spark://sparkmaster:7077'),
 ('spark.app.startTime', '1625020343548'),
 ('spark.app.name', 'Alternating Least Squares'),
 ('spark.app.id', 'app-20210629203224-0003'),
 ('spark.ui.showConsoleProgress', 'true')]

In [3]:
def rmse(R, ms, us):
    """Root-mean-square error of the factorization ``ms @ us.T`` against ``R``.

    Parameters
    ----------
    R : 2-D matrix/array
        Target matrix being approximated.
    ms : 2-D matrix/array
        Left factor; ``ms.shape[0]`` must equal ``R.shape[0]``.
    us : 2-D matrix/array
        Right factor; ``us.shape[0]`` must equal ``R.shape[1]`` and
        ``us.shape[1]`` must equal ``ms.shape[1]``.

    Returns
    -------
    float
        ``sqrt(mean((R - ms @ us.T) ** 2))`` taken over every entry of ``R``.
    """
    # `@` is a true matrix product for both np.matrix and plain 2-D ndarrays.
    diff = R - ms @ us.T
    # Normalise by the number of entries actually compared rather than by
    # notebook-level globals, so the function works before those globals are
    # defined and cannot silently disagree with R's real shape.
    return np.sqrt(np.sum(np.square(diff)) / diff.size)


def update(i, mat, ratings, lam=None):
    """Solve the regularized least-squares problem for row ``i`` of ``ratings``.

    Computes ``x`` such that
    ``(mat.T @ mat + lam * n * I) x = mat.T @ ratings[i, :].T``
    where ``n = mat.shape[0]``.

    Parameters
    ----------
    i : int
        Row of ``ratings`` to fit.
    mat : np.matrix, shape (n, f)
        The factor matrix that is held fixed during this half-step.
    ratings : np.matrix, shape (rows, n)
        Ratings matrix; row ``i`` is the target. It is expected to be an
        ``np.matrix`` so that ``ratings[i, :].T`` is an (n, 1) column.
    lam : float, optional
        Regularization strength. When omitted, the module-level ``LAMBDA``
        is used, which is the original behaviour.

    Returns
    -------
    The (f, 1) solution column as returned by ``np.linalg.solve``.
    """
    if lam is None:
        lam = LAMBDA

    uu, ff = mat.shape

    # Adding lam * uu to every diagonal entry in a single vectorized step;
    # off-diagonal entries get + 0.0 and are therefore unchanged.
    XtX = mat.T @ mat + lam * uu * np.eye(ff)
    Xty = mat.T @ ratings[i, :].T

    return np.linalg.solve(XtX, Xty)

In [4]:
"""
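Parameters of the original script (usage: als [M] [U] [F] [iterations] [partitions]).
In this notebook they are set as constants instead of being read from the command line.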
"""

# Problem size and run settings.
M = 100           # rows of R and of the factor `ms`
U = 500           # columns of R, rows of the factor `us`
F = 50            # number of columns in each factor matrix
ITERATIONS = 4    # number of alternating sweeps
partitions = 20   # number of slices requested from sc.parallelize

# Synthetic M x U target built as a product of two random factors, followed by
# random starting factors. The order of the rand() calls is kept as-is so the
# seeded random stream (and therefore the results) stays the same.
R = matrix(rand(M, F)) * matrix(rand(U, F).T)
ms = matrix(rand(M, F))
us = matrix(rand(U, F))

Rb = sc.broadcast(R)
msb = sc.broadcast(ms)
usb = sc.broadcast(us)

for i in range(ITERATIONS):
    # Re-fit every row of `ms` against the current `us`.
    ms = (
        sc.parallelize(range(M), partitions)
        .map(lambda x: update(x, usb.value, Rb.value))
        .collect()
    )
    # collect() returns a list of (F, 1) columns, so np.array() gives a
    # 3-d array; drop the trailing axis to get the M x F matrix.
    ms = matrix(np.array(ms)[:, :, 0])
    # Release the executor copies of the superseded broadcast before
    # publishing the new one, so stale factors do not pile up per iteration.
    msb.unpersist()
    msb = sc.broadcast(ms)

    # Re-fit every row of `us` against the freshly updated `ms`.
    us = (
        sc.parallelize(range(U), partitions)
        .map(lambda x: update(x, msb.value, Rb.value.T))
        .collect()
    )
    us = matrix(np.array(us)[:, :, 0])
    usb.unpersist()
    usb = sc.broadcast(us)

    error = rmse(R, ms, us)
    print("Iteration %d:" % i)
    print("\nRMSE: %5.4f\n" % error)

Iteration 0:

RMSE: 0.3770

Iteration 1:

RMSE: 0.1220

Iteration 2:

RMSE: 0.0922

Iteration 3:

RMSE: 0.0812



In [5]:
# Shut down the SparkContext created in the first cell; nothing after this
# point in the notebook uses it.
sc.stop()