## Recommendations with Spark ALS

SEE
https://datasciencemadesimpler.wordpress.com/tag/alternating-least-squares/


In [1]:
import pyspark

In [2]:
# Create a SparkContext with master "local[*]", or reuse the one that is
# already running. Constructing a second SparkContext in the same process
# raises a ValueError, which made this cell fail whenever it was re-executed.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster("local[*]"))

In [3]:
sc.version

'2.2.0'

In [4]:
from pyspark.mllib.recommendation import ALS
from pyspark.mllib.recommendation import Rating

In [18]:
# Quick look at what was imported.
print(ALS)
#for i in dir(ALS):
#    print(i)
#?ALS
#?Rating
# NOTE: `?name` is IPython/Jupyter help syntax, not plain Python; the output
# below shows the object's type, string form, file and docstring.
?pyspark.mllib


<class 'pyspark.mllib.recommendation.ALS'>


Type:        module
String form: <module 'pyspark.mllib' from '/home/batman/miniconda3/envs/spark/lib/python3.6/site-packages/pyspark/mllib/__init__.py'>
File:        ~/miniconda3/envs/spark/lib/python3.6/site-packages/pyspark/mllib/__init__.py
Docstring:
RDD-based machine learning APIs for Python (in maintenance mode).

The `pyspark.mllib` package is in maintenance mode as of the Spark 2.0.0 release to encourage
migration to the DataFrame-based APIs under the `pyspark.ml` package.


In [5]:
def expand_user(a, user):
    """Turn one user's row of ratings into a list of ``Rating`` records.

    ``a`` is a sequence of ratings indexed by item id and ``user`` is the
    user id stored on every record. Entries equal to 0 are treated as
    "not rated" and produce no record.
    """
    observed = []
    for item, score in enumerate(a):
        if score == 0:
            continue  # 0 marks a missing rating, not a real score
        observed.append(Rating(user, item, score))
    return observed

In [6]:
def expand_all(a):
    """Expand a users-by-items rating matrix into per-user rating lists.

    ``a`` is a sequence of rows, one per user; the row index is used as the
    user id. Returns a list with one entry per row, each entry being the
    result of ``expand_user`` for that row.
    """
    per_user = []
    for user, items in enumerate(a):
        per_user.append(expand_user(items, user))
    return per_user

### Here we have ratings from eight users for six different movies: Titanic, Dirty Dancing, Die Hard, Terminator 2, Wayne's World, and Zoolander. Or in other words, two romantic films, two action films, and two comedies. Each row is a user, each column is a movie.

### The ratings are constructed so that if a user has seen both movies in one of these pairs, their ratings for the two movies are similar.

### There is no evidence in this data that anyone likes all three film genres.

In [7]:
# pandas is only needed to render the matrix as a table; it was never
# imported in the notebook, so import it here to avoid a NameError.
import pandas as pd

# Rows are users, columns are products (movies); 0 means "not rated".
rawdata = [
    [5,5,0,0,0,0],
    [0,0,5,5,0,0],
    [0,0,0,0,5,5],
    [0,1,5,5,5,0],
    [1,1,5,0,5,5],
    [5,5,0,5,1,1],
    [5,0,0,5,0,1],
    [5,5,5,0,1,0]
    ]
list_of_ratings = expand_all(rawdata)
# User 2 rated only the comedies (products 4 and 5).
# Users 3 and 4 also rate comedies highly: user 4 additionally gives 5 to
# product 2, and user 3 gives 5 to products 2 and 3, so those products are
# plausible recommendations for user 2.
pd.DataFrame(rawdata)


Unnamed: 0,0,1,2,3,4,5
0,5,5,0,0,0,0
1,0,0,5,5,0,0
2,0,0,0,0,5,5
3,0,1,5,5,5,0
4,1,1,5,0,5,5
5,5,5,0,5,1,1
6,5,0,0,5,0,1
7,5,5,5,0,1,0


In [8]:
# Flatten the per-user lists into one list holding every non-zero rating,
# then hand that list to Spark as an RDD.
ratings = []
for user_ratings in list_of_ratings:
    ratings.extend(user_ratings)
ratingsRDD = sc.parallelize(ratings)
ratingsRDD.take(5)

[Rating(user=0, product=0, rating=5.0),
 Rating(user=0, product=1, rating=5.0),
 Rating(user=1, product=2, rating=5.0),
 Rating(user=1, product=3, rating=5.0),
 Rating(user=2, product=4, rating=5.0)]

In [9]:
# Hyper-parameters for the factorization.
rank = 2
numIterations = 5
als_lambda = 0.1
#als_lambda = 0.0

# ALS.train is for explicit ratings. There is also a trainImplicit method
# that one uses when working with implicit ratings (it uses a different
# cost function).
model = ALS.train(
    ratingsRDD,
    rank,
    iterations=numIterations,
    lambda_=als_lambda,
    nonnegative=True,
    seed=4242,
)
print(model, type(model))

# List everything available on the trained model, one name per line.
print(*dir(model), sep="\n")

<pyspark.mllib.recommendation.MatrixFactorizationModel object at 0x7f7d44854198> <class 'pyspark.mllib.recommendation.MatrixFactorizationModel'>
__class__
__del__
__delattr__
__dict__
__dir__
__doc__
__eq__
__format__
__ge__
__getattribute__
__gt__
__hash__
__init__
__init_subclass__
__le__
__lt__
__module__
__ne__
__new__
__reduce__
__reduce_ex__
__repr__
__setattr__
__sizeof__
__str__
__subclasshook__
__weakref__
_java_loader_class
_java_model
_load_java
_sc
call
load
predict
predictAll
productFeatures
rank
recommendProducts
recommendProductsForUsers
recommendUsers
recommendUsersForProducts
save
userFeatures


In [110]:
# here we see the model's vector of features for each user
users = model.userFeatures().collect()
res = sorted(users, key=lambda x: x[0])
print("{:<10} {}".format('user','vec'))
vecs = list()
for u,vec in res:
    print("{:<10} {}".format(u,vec))
    #print(type(vec))
    vecs.append(list(vec))

user       vec
0          array('d', [0.0, 2.1502060890197754])
1          array('d', [1.0264124870300293, 1.3004276752471924])
2          array('d', [1.7977725267410278, 0.3288046419620514])
3          array('d', [1.7840503454208374, 0.5924637913703918])
4          array('d', [1.7749810218811035, 0.48281174898147583])
5          array('d', [0.0, 2.101982355117798])
6          array('d', [0.015697011724114418, 2.0789928436279297])
7          array('d', [0.03723369538784027, 2.14143443107605])


In [111]:
# print(vecs)
# import matplotlib as mpl

# mpl.pyplot.plot(vecs)

In [112]:
# Feature vector the model learned for each product, listed in product-id order.
products = model.productFeatures().collect()
res = sorted(products, key=lambda entry: entry[0])
row_template = "{:<10} {}"
print(row_template.format('product', 'vec'))
for product_id, features in res:
    print(row_template.format(product_id, features))

product    vec
0          array('d', [0.0, 2.311056137084961])
1          array('d', [0.0, 2.25119948387146])
2          array('d', [2.04840087890625, 2.214982032775879])
3          array('d', [1.8011021614074707, 2.3505852222442627])
4          array('d', [2.655724287033081, 0.4831928014755249])
5          array('d', [2.6537837982177734, 0.48782041668891907])


In [113]:
# Recommend 3 items for user 2 (arguments: user id, number of products).
# NOTE: the output below includes products 4 and 5, which user 2 has already
# rated, so already-seen items are not filtered out of the recommendations.
model.recommendProducts(2, 3)

[Rating(user=2, product=4, rating=4.933264197914777),
 Rating(user=2, product=5, rating=4.931297221797547),
 Rating(user=2, product=2, rating=4.410855198089081)]

### Display the original matrix side-by-side with the reconstructed matrix. The values that were originally non-zero should be closely approximated, and the values that were zero (empty) now have predictions.

In [114]:
import sys

# One line per user: the known ratings on the left (0 = not rated) and the
# model's prediction for every product, rounded to a whole number, on the right.
print(" original      reconstructed")
for user, row in enumerate(rawdata):
    columns = range(len(rawdata[0]))
    known = "".join("%d " % row[product] for product in columns)
    rebuilt = "".join("%0.0f " % model.predict(user, product) for product in columns)
    sys.stdout.write(known + "    " + rebuilt)
    print(" ")

 original      reconstructed
5 5 0 0 0 0     5 5 5 5 1 1  
0 0 5 5 0 0     3 3 5 5 3 3  
0 0 0 0 5 5     1 1 4 4 5 5  
0 1 5 5 5 0     1 1 5 5 5 5  
1 1 5 0 5 5     1 1 5 4 5 5  
5 5 0 5 1 1     5 5 5 5 1 1  
5 0 0 5 0 1     5 5 5 5 1 1  
5 5 5 0 1 0     5 5 5 5 1 1  


In [115]:
# For every user print three groups of cells:
#   original    - the known ratings (0 = not rated)
#   errors      - the prediction for a rated cell, shown only when it rounds
#                 to a value different from the known rating ("-" otherwise)
#   predictions - the prediction for each unrated cell ("-" for rated cells)
print(" original         errors        predictions")
for user, row in enumerate(rawdata):
    original_cells = []
    error_cells = []
    predicted_cells = []
    for product in range(len(rawdata[0])):
        rating = row[product]
        original_cells.append("%d " % rating)
        estimate = model.predict(user, product)
        if rating == 0:
            # Unrated cell: nothing to compare against, just show the prediction.
            error_cells.append("- ")
            predicted_cells.append("%0.0f " % estimate)
        else:
            matches = rating == round(estimate, 0)
            error_cells.append("- " if matches else "%0.0f " % estimate)
            predicted_cells.append("- ")
    groups = ("".join(original_cells), "".join(error_cells), "".join(predicted_cells))
    sys.stdout.write("    ".join(groups))
    print(" ")

 original         errors        predictions
5 5 0 0 0 0     - - - - - -     - - 5 5 1 1  
0 0 5 5 0 0     - - - - - -     3 3 - - 3 3  
0 0 0 0 5 5     - - - - - -     1 1 4 4 - -  
0 1 5 5 5 0     - - - - - -     1 - - - - 5  
1 1 5 0 5 5     - - - - - -     - - - 4 - -  
5 5 0 5 1 1     - - - - - -     - - 5 - - -  
5 0 0 5 0 1     - - - - - -     - 5 5 - 1 -  
5 5 5 0 1 0     - - - - - -     - - - 5 - 1  


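### Compute the mean squared error (MSE) of the reconstructed matrix, measured over the known (non-zero) ratings only. This can be used to decide if the rank is sufficiently large: a high MSE suggests the rank is too small to reproduce the observed ratings.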

In [116]:
# Keep only the (user, product) pair of every known rating; these are the
# cells the model will be asked to predict.
evalRDD = ratingsRDD.map(lambda rating: (rating.user, rating.product))
evalRDD.take(5)

[(0, 0), (0, 1), (1, 2), (1, 3), (2, 4)]

In [117]:
# Predict every known cell and key each prediction by (user, product) so it
# can be joined with the actual rating.
predicted_ratings = model.predictAll(evalRDD)
predictions = predicted_ratings.map(lambda r: ((r.user, r.product), r.rating))
predictions.take(5)

[((0, 0), 4.969246978026604),
 ((0, 1), 4.840542837818589),
 ((1, 2), 4.982928176149699),
 ((1, 3), 4.905439824918915),
 ((2, 4), 4.933264197914777)]

In [118]:
# Pair each actual rating with its prediction:
# ((user, product), (actual, predicted)).
actuals = ratingsRDD.map(lambda r: ((r.user, r.product), r.rating))
ratingsAndPreds = actuals.join(predictions)
ratingsAndPreds.take(5)

[((4, 2), (5.0, 4.705292034470233)),
 ((5, 1), (5.0, 4.731981592948102)),
 ((4, 5), (5.0, 4.945941306582549)),
 ((3, 1), (1.0, 1.3337541813455545)),
 ((4, 4), (5.0, 4.9471513704081005))]

In [119]:
def squared_error(entry):
    """Squared difference for one ((user, product), (actual, predicted)) record."""
    actual, predicted = entry[1]
    return (actual - predicted) ** 2

# Mean squared error over all known ratings.
ratingsAndPreds.map(squared_error).mean()

0.0243250136568209

Note that the error above is measured on the same ratings the model was trained on. With a larger dataset we would separate the rating data into training and test sets, and see how well our predicted ratings match the held-out data.

### Questions