## Machine Learning with Spark MLlib

In [1]:
import os
import sys
import random
from os import path as filepath
from pyspark import SparkConf, SparkContext

In [2]:
HDFS = "hdfs://{}".format(os.environ["HDFS"])
USER = filepath.join(HDFS, "user", "ec2-user")
RATINGS = filepath.join(USER, "dating/ratings.dat")
GENDER = filepath.join(USER, "dating/gender.dat")

In [3]:
def parse_rating(line, sep=','):
    """
    Parses a rating line
    Returns: tuple of (random integer, (user_id, profile_id, rating))
    """
    fields = line.strip().split(sep)
    user_id = int(fields[0])     # convert user_id to int
    profile_id = int(fields[1])  # convert profile_id to int
    rating = float(fields[2])    # convert rated_id to int
    return random.randint(1, 10), (user_id, profile_id, rating)

In [4]:
ratings = sc.textFile(RATINGS).map(parse_rating)

In [5]:
def parse_user(line, sep=','):
    """
    Parses a user line
    Returns: tuple of (user_id, gender)
    """
    fields = line.strip().split(sep)
    user_id = int(fields[0])  # convert user_id to int
    gender = fields[1]
    return user_id, gender

In [6]:
users = dict(sc.textFile(GENDER).map(parse_user).collect())

In [9]:
# Create the training (80%) and validation (20%) set, based on last digit of timestamp
num_partitions = 4

training = ratings.filter(lambda x: x[0] > 2) \
                .values() \
                .repartition(num_partitions) \
                .cache()

validation = ratings.filter(lambda x: x[0] <= 2) \
                .values() \
                .repartition(num_partitions) \
                .cache()

num_training = training.count()
num_validation = validation.count()

print "Training: %d and validation: %d\n" % (num_training, num_validation)

Training: 868466 and validation: 217058



In [10]:
# rank is the number of latent factors in the model
# num_iterations is the number of iterations to run.
# reg_lambda specifies the regularization parameter in ALS

rank = 8
num_iterations = 8
reg_lambda = 0.1

In [11]:
from pyspark.mllib.recommendation import ALS

# Train model with training data and configured rank and iterations
model = ALS.train(training, rank, num_iterations, reg_lambda)

# evaluate the trained model on the validation set
print "The model was trained with rank = %d, lambda = %.1f, and %d iterations. \n" %(rank, reg_lambda, num_iterations)

The model was trained with rank = 8, lambda = 0.1, and 8 iterations. 



In [12]:
from math import sqrt
from operator import add

def compute_rmse(model, data, n):
    """
    Compute Root Mean Squared Error (RMSE), or square root of the average value
        of (actual rating - predicted rating)^2
    """
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))
    
    predictions_ratings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \
                        .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \
                        .values()
    
    return sqrt(predictions_ratings.map(lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))

In [13]:
# Print RMSE of model
validation_rmse = compute_rmse(model, validation, num_validation)

print "The model was trained with rank=%d, lambda=%.1f, and %d iterations." %(rank, reg_lambda, num_iterations)
print "Its RMSE on the validation set is %f.\n" % validation_rmse

The model was trained with rank=8, lambda=0.1, and 8 iterations.
Its RMSE on the validation set is 1.757121.



In [14]:
matchseeker = 5
gender_filter = 'F'

# Filter on preferred gender
partners = sc.parallelize([u[0] for u in filter(lambda u: u[1] == gender_filter, users.items())])

# run predictions with trained model
predictions = model.predictAll(partners.map(lambda x: (matchseeker, x))).collect()

# sort the recommendations
recommendations = sorted(predictions, key=lambda x: x[2], reverse=True)[:10]

print "Eligible partners recommended for User ID: %d" % matchseeker
for i in xrange(len(recommendations)):
    print ("%2d: %s" % (i + 1, recommendations[i][1])).encode('ascii', 'ignore')

# clean up
sc.stop()

Eligible partners recommended for User ID: 5
 1: 179536
 2: 54497
 3: 39842
 4: 72128
 5: 52412
 6: 32754
 7: 24360
 8: 82600
 9: 14862
10: 99659
