In [1]:
from pyspark import SparkConf, SparkContext
import pandas as pd
import numpy as np
from math import sqrt
import time

#### Get spark configuration and spark context
	We set memory for both executor and driver to 4G in order to process groupbykey part which is very memory intensive.
    You can change SparkConf() to the cluster by 
    conf = SparkConf().set('spark.executor.memory', '6G').set('spark.driver.memory', '6G').set("spark.hadoop.validateOutputSpecs", "false")
    
#### Import data including user rating and movie details
	3 datasets needed in this application are ratings.dat which contains user rating, movies.dat which contains movie info, movie_list.text which is created manually from movie filtering part. We broadcast movie list in order to have it processed faster by transfer it to every node.

#### Split data
We will split the rating data into train and test for validation. After we get the accuracy, we can use the full dataset to run the whole program again.


In [2]:
# conf = SparkConf().set('spark.executor.memory', '4G').set('spark.driver.memory', '4G').set("spark.hadoop.validateOutputSpecs", "false")
# sc = SparkContext(conf=conf, appName='mvii')

# rating_data = sc.textFile("/data/movie-ratings/ml-10M100K/ratings.dat")
# movie_data = sc.textFile("/data/movie-ratings/ml-10M100K/movies.dat")


conf = SparkConf().setMaster("local[*]").set('spark.hadoop.validateOutputSpecs', 'false')
sc = SparkContext(conf=conf)
rating_data = sc.textFile("full_data/ratings.dat")
movie_data = sc.textFile("full_data/movies.dat")

filtered_movie_list = np.loadtxt('input/movie_list.txt', dtype=int)

filtered_movie_list_bc = sc.broadcast(filtered_movie_list)

rating_data_test, rating_data_train = rating_data.randomSplit(weights=[0.2, 0.8], seed=1)

# Similarity score

1. Extract ratings.dat to the processed format 
2. Filter only relevant movie from movie_list.txt. Movie list from the broadcast.
3. Aggregate the data by user id return tuple of 3 values 1. List of movie id and rating of that user 2. Sum of rating of that user 3. Count number of rating of the user
4. Filter out users who have rated 1 or less movie since making a cosine similarity matrix requires at least 2 rated movies on each user
5. Normalize the score by subtracting rating by its users’ average.
6. Transform the RDD to pairs of movies (movie1, movie2), (rating1, rating2). This will be used to calculate cosine similarity.
7. Group RDD by key and then calculate cosine similarity score for each pair. Save the score to a file as well as persist it for the second part, rating prediction.
8. Now we have cosine similarity score for most popular movies in our dataset. Next step, we will predict user rating from this score.

In [3]:
def _extract_user_rating(line):
    data = line.split('::')
    return (int(data[0]), (int(data[1]), float(data[2])))

def rate_substract_mean(v):
    rating_list = v[0]
    acc_rating = v[1]
    count_rating = v[2]
    user_mean = acc_rating / count_rating
    
    result = [(m_id, rating - user_mean) for m_id, rating in rating_list]
    
    return result

def pair_movie(line):
    user_id = line[0]
    rating_list = line[1]
    
    result = []
    for i in range(len(rating_list)):
        for j in range(i+1, len(rating_list)):
            m_id1, rating1 = rating_list[i]
            m_id2, rating2 = rating_list[j]
            if(m_id1 < m_id2):
                result += [((m_id1, m_id2), (rating1, rating2))]
    
    return result

def _computeCosineSimilarity(ratingPairs):
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1

    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)

    score = 0
    if (denominator):
        score = (numerator / (float(denominator)))

    return (score, numPairs)


In [4]:
user_rating_lists = rating_data_train \
    .map(_extract_user_rating) \
    .filter(lambda x: x[1][0] in filtered_movie_list_bc.value) \
    .aggregateByKey(([], 0, 0), lambda g1,v2: (g1[0]+[v2], g1[1]+v2[1], g1[2]+1), 
                    lambda g1,g2: (g1[0]+g2[0], g1[1]+g2[1], g1[2]+g2[2])) \
    .filter(lambda x: x[1][2] > 1) \
    .mapValues(rate_substract_mean) 
    
moviePairSimilarities = user_rating_lists.flatMap(pair_movie) \
    .groupByKey() \
    .mapValues(_computeCosineSimilarity).persist() 
moviePairSimilarities.saveAsPickleFile("input/movie-sims-obj3")

In [14]:
# moviePairSimilarities.saveAsPickleFile("input/movie-sims-obj")
# moviePairSimilarities.saveAsTextFile("input/movie-sims-text")

In [6]:
moviePairSimilarities.top(5)

[((60069, 63082), (0.018322956138432503, 41)),
 ((59315, 63082), (-0.22889563637379112, 41)),
 ((59315, 60069), (0.1947293137686646, 559)),
 ((58559, 63082), (-0.05210648209847141, 51)),
 ((58559, 60069), (0.21557453084032255, 664))]

# Prediction part

1. Loading similarity score as a dictionary like {(movie_id1, movie_id2): cosine score, ….}. We load it to local variable not RDD since we want to access it quickly to calculate a prediction.
2. User_lists keep user rating data from training set. It is kept as a dictionary to be processed faster. 
Format {user id: [(movie id1, rate1), (movie id2, rate2)]}
3. Test set contains unforeseen user rating. We will predict the rating according to user id and movie id from this set. It has the same format as User_lists in number 2.
4. Predicted rating calculation part. The result is a tuple of (user id, predicted rating, true rating).
5. Code for predicted rating. This is done according to the formula discussed. If the predicted rate is more than 5, change it to 5 and if it is less than 1, change it to 1.

In [7]:
def load_sim_dict(from_file=False):
    if from_file:
        sim_movie = sc.pickleFile("input/movie-sims-obj3/")
        
    else:
        sim_movie = moviePairSimilarities
        
    sim_movie_local = sim_movie.collect()
    sim_dict = {}
    for sm in sim_movie_local:
        key = sm[0]
        value = sm[1]

        sim_dict[key] = value
    
    return sim_dict

def _extract_user_rating(line):
    data = line.split('::')
    return (int(data[0]), [(int(data[1]), float(data[2]))])

def _extract_movie_data(line):
    data = line.split('::')
    return (int(data[0]), data[1])

def rate_movie(user_ratings, predicted_movie):
    
    numerator = 0
    denominator = 0
    
    for movie_id, rating in user_ratings:
        if(predicted_movie < movie_id):
            m1 = predicted_movie
            m2 = movie_id
        else:
            m2 = predicted_movie
            m1 = movie_id
        
        if (m1, m2) in sim_dict:
            sim_score, number_of_record = sim_dict[(m1, m2)]
        else:
            sim_score, number_of_record = (0,0)
        
        #normalize
        n_rating = (2*(rating - 1) - 4) / 4
        numerator += sim_score*n_rating #normalize_rating[rating]
        denominator += sim_score
    
    predicted_rating = numerator/denominator if denominator else 0
    #denormalize
    predicted_rating = 0.5*(predicted_rating+1)*4 + 1
    
    if predicted_rating > 5:
        predicted_rating = 5.0
    elif predicted_rating < 1:
        predicted_rating = 1.0
    
    return predicted_rating

In [8]:
sim_dict = load_sim_dict()

movie_dict = dict(movie_data.map(_extract_movie_data).filter(lambda x: x[0] in filtered_movie_list).collect())

user_lists = rating_data_train.map(_extract_user_rating).filter(lambda x: x[1][0][0] in filtered_movie_list).reduceByKey(lambda v1,v2: v1+v2).collect()
user_lists = dict(user_lists)

test_set = rating_data_test.map(_extract_user_rating).filter(lambda x: x[1][0][0] in filtered_movie_list).reduceByKey(lambda v1,v2: v1 + v2).collect()


In [9]:
predicted_ratings = []

for user_id, movie_rating in test_set:
    
    if user_id in user_lists:
        
        user_ratings = user_lists[user_id]

        for m_id, rating in movie_rating:
            predicted_ratings += [(user_id, m_id,rate_movie(user_ratings, m_id), rating)]

## Result and accuracy

We choose ‘Root mean squared error’ for the accuracy since it is numeric and the result is 1.34. This means the average error is 1.34 out of 5 which is not bad

In [10]:
predicted_ratings = pd.DataFrame(predicted_ratings)
predicted_ratings.columns = ['user', 'movie', 'predicted', 'true']

In [11]:
rmse = sqrt(((predicted_ratings['predicted']-predicted_ratings['true'])**2).sum() / predicted_ratings.count()[0])
print(rmse)

1.3403969536697378


In [12]:
predicted_ratings

Unnamed: 0,user,movie,predicted,true
0,8,50,1.380536,5.0
1,8,527,3.762502,4.0
2,8,1136,3.936062,4.0
3,8,1204,4.221492,2.0
4,8,2571,4.468610,5.0
5,8,4226,3.180856,4.5
6,8,5952,3.029023,3.0
7,16,1175,5.000000,5.0
8,16,2360,1.697077,4.0
9,40,50,2.231777,5.0


In [15]:
# predicted_ratings.to_csv('input/predicted_ratings')