In [1]:
from pyspark import SparkConf, SparkContext
import pandas as pd
import numpy as np
from math import sqrt
import time
# from sklearn.metrics import mean_squared_error

In [2]:
conf = SparkConf().setMaster("local[*]")
sc = SparkContext(conf=conf)
rating_data = sc.textFile("full_data/ratings.dat")
movie_data = sc.textFile("full_data/movies.dat")

filtered_movie_list = np.loadtxt('input/movie_list.txt', dtype=int)

rating_data_test, rating_data_train = rating_data.randomSplit(weights=[0.2, 0.8], seed=1)

# Similarity score

In [3]:
def _extract_user_rating(line):
    data = line.split('::')
    return (int(data[0]), (int(data[1]), float(data[2])))

def rate_substract_mean(v):
    rating_list = v[0]
    acc_rating = v[1]
    count_rating = v[2]
    user_mean = acc_rating / count_rating
    
    result = [(m_id, rating - user_mean) for m_id, rating in rating_list]
    
    return result

def pair_movie(line):
    user_id = line[0]
    rating_list = line[1]
    
    result = []
    for i in range(len(rating_list)):
        for j in range(i+1, len(rating_list)):
            m_id1, rating1 = rating_list[i]
            m_id2, rating2 = rating_list[j]
            if(m_id1 < m_id2):
                result += [((m_id1, m_id2), (rating1, rating2))]
    
    return result

def _computeCosineSimilarity(ratingPairs):
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1

    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)

    score = 0
    if (denominator):
        score = (numerator / (float(denominator)))

    return (score, numPairs)


In [4]:
user_rating_lists = rating_data_train \
    .map(_extract_user_rating) \
    .filter(lambda x: x[1][0] in filtered_movie_list) \
    .aggregateByKey(([], 0, 0), lambda g1,v2: (g1[0]+[v2], g1[1]+v2[1], g1[2]+1), 
                    lambda g1,g2: (g1[0]+g2[0], g1[1]+g2[1], g1[2]+g2[2])) \
    .filter(lambda x: x[1][2] > 1) \
    .mapValues(rate_substract_mean) 
    
# user_rating_lists.saveAsPickleFile("input/movie-rating")

In [5]:
moviePairSimilarities = user_rating_lists.flatMap(pair_movie) \
    .groupByKey() \
    .mapValues(_computeCosineSimilarity).persist() 
moviePairSimilarities.saveAsPickleFile("input/movie-sims-obj3")

In [6]:
# moviePairSimilarities = sc.pickleFile("input/movie-sims-obj3/")
moviePairSimilarities.takeSample(False, 10)

[((1244, 1940), (-0.09462831301543684, 133)),
 ((1161, 2940), (-0.09588332625048708, 61)),
 ((1304, 7075), (-0.0953607369281508, 99)),
 ((1277, 7327), (0.08402766032704813, 29)),
 ((3246, 26082), (-0.31185314850185963, 16)),
 ((3521, 5772), (0.2236223940765554, 59)),
 ((26662, 51255), (0.012086403146876187, 133)),
 ((2511, 3967), (-0.217953105443295, 76)),
 ((2908, 6270), (0.07118940223653676, 64)),
 ((4178, 5840), (0.09688612127555797, 17))]

# Prediction part

In [7]:
def load_sim_dict(from_file=False):
    if from_file:
        sim_movie = sc.pickleFile("input/movie-sims-obj3/")
        
    else:
        sim_movie = moviePairSimilarities
        
    sim_movie_local = sim_movie.collect()
    sim_dict = {}
    for sm in sim_movie_local:
        key = sm[0]
        value = sm[1]

        sim_dict[key] = value
    
    return sim_dict

def _extract_user_rating(line):
    data = line.split('::')
    return (int(data[0]), [(int(data[1]), float(data[2]))])

def _extract_movie_data(line):
    data = line.split('::')
    return (int(data[0]), data[1])

def rate_movie(user_ratings, predicted_movie):
    
    numerator = 0
    denominator = 0
    
#     normalize_rating = {1:-1, 2:-0.5, 3:0, 4:0.5, 5:1}
    

    for movie_id, rating in user_ratings:
        if(predicted_movie < movie_id):
            m1 = predicted_movie
            m2 = movie_id
        else:
            m2 = predicted_movie
            m1 = movie_id
        
        if (m1, m2) in sim_dict:
            sim_score, number_of_record = sim_dict[(m1, m2)]
        else:
            sim_score, number_of_record = (0,0)
        
        #normalize
        n_rating = (2*(rating - 1) - 4) / 4
        numerator += sim_score*n_rating #normalize_rating[rating]
        denominator += sim_score
    
    predicted_rating = numerator/denominator if denominator else 0
    #denormalize
    predicted_rating = 0.5*(predicted_rating+1)*4 + 1
    
    if predicted_rating > 5:
        predicted_rating = 5.0
    elif predicted_rating < 1:
        predicted_rating = 1.0
    
    return predicted_rating

In [8]:
sim_dict = load_sim_dict(True)

movie_dict = dict(movie_data.map(_extract_movie_data).filter(lambda x: x[0] in filtered_movie_list).collect())

# user_lists = rating_data_train.map(_extract_user_rating).filter(lambda x: x[1][0][0] in filtered_movie_list).reduceByKey(lambda v1,v2: v1+v2).persist()
user_lists = rating_data_train.map(_extract_user_rating).filter(lambda x: x[1][0][0] in filtered_movie_list).reduceByKey(lambda v1,v2: v1+v2).collect()
user_lists = dict(user_lists)

test_set = rating_data_test.map(_extract_user_rating).filter(lambda x: x[1][0][0] in filtered_movie_list).reduceByKey(lambda v1,v2: v1 + v2).collect()


In [9]:
predicted_ratings = []

for user_id, movie_rating in test_set:
#     user_data = user_lists.filter(lambda line: line[0]==user_id).collect()
    
    if user_id in user_lists:
        
        user_ratings = user_lists[user_id]

        for m_id, rating in movie_rating:
            predicted_ratings += [(user_id, m_id,rate_movie(user_ratings, m_id), rating)]

## Result and accuracy

In [10]:
predicted_ratings = pd.DataFrame(predicted_ratings)

rmse = sqrt(((predicted_ratings[2]-predicted_ratings[3])**2).sum() / predicted_ratings.count()[0])
print(rmse)

# rmse = sqrt(mean_squared_error(predicted_ratings[2],predicted_ratings[3]))
# predicted_ratings

1.3993434646688374


In [11]:
# moviePairSimilarities.saveAsPickleFile("input/movie-sims-obj-train-full")

In [12]:
predicted_ratings

Unnamed: 0,0,1,2,3
0,8,50,5.000000,5.0
1,8,527,3.568286,4.0
2,8,1036,4.084594,4.0
3,8,1079,3.505156,4.0
4,8,1136,1.000000,4.0
5,8,1204,4.033430,2.0
6,8,1240,3.742597,4.0
7,8,2194,3.595259,2.5
8,8,2571,5.000000,5.0
9,8,2618,2.469546,4.0
