In [1]:
from pyspark import SparkConf, SparkContext
import pandas as pd
import numpy as np
from math import sqrt
import time
# import itertools
# from sklearn.metrics import mean_squared_error

In [2]:
conf = SparkConf().setMaster("local[*]")
sc = SparkContext(conf=conf)
rating_data = sc.textFile("full_data/ratings.dat")
movie_data = sc.textFile("full_data/movies.dat")

filtered_movie_list = np.loadtxt('input/movie_list.txt', dtype=int)

rating_data_test, rating_data_train = rating_data.randomSplit(weights=[0.2, 0.8], seed=1)

# Similarity score

In [3]:
def _extract_user_rating(line):
    data = line.split('::')
    return (int(data[0]), (int(data[1]), float(data[2])))

def rate_substract_mean(v):
    rating_list = v[0]
    acc_rating = v[1]
    count_rating = v[2]
    user_mean = acc_rating / count_rating
    
    result = [(m_id, rating - user_mean) for m_id, rating in rating_list]
    
    return result

def pair_movie(line):
    user_id = line[0]
    rating_list = line[1]
    
    result = []
    for i in range(len(rating_list)):
        for j in range(i+1, len(rating_list)):
            m_id1, rating1 = rating_list[i]
            m_id2, rating2 = rating_list[j]
            if(m_id1 < m_id2):
                result += [((m_id1, m_id2), (rating1, rating2))]
    
    return result

def _computeCosineSimilarity(ratingPairs):
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1

    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)

    score = 0
    if (denominator):
        score = (numerator / (float(denominator)))

    return (score, numPairs)


In [4]:
user_rating_lists = rating_data_train \
    .map(_extract_user_rating) \
    .filter(lambda x: x[1][0] in filtered_movie_list) \
    .aggregateByKey(([], 0, 0), lambda g1,v2: (g1[0]+[v2], g1[1]+v2[1], g1[2]+1), 
                    lambda g1,g2: (g1[0]+g2[0], g1[1]+g2[1], g1[2]+g2[2])) \
    .filter(lambda x: x[1][2] > 1) \
    .mapValues(rate_substract_mean) 
    
user_rating_lists.saveAsPickleFile("input/movie-rating")

In [6]:
moviePairSimilarities = user_rating_lists.flatMap(pair_movie) \
    .groupByKey() \
    .mapValues(_computeCosineSimilarity).persist() 
moviePairSimilarities.saveAsPickleFile("input/movie-sims-obj3")

In [7]:
# moviePairSimilarities = sc.pickleFile("input/movie-sims-obj3/")
moviePairSimilarities.takeSample(False, 10)

[((5147, 27815), (-0.18802116493306567, 18)),
 ((1299, 2859), (0.0550433802853718, 310)),
 ((1235, 27815), (0.3373387077927842, 47)),
 ((1203, 5971), (0.06797354688821527, 229)),
 ((1927, 7116), (-0.04257587165846587, 59)),
 ((1178, 7256), (0.14889483058288241, 90)),
 ((541, 8477), (0.10850753672604761, 118)),
 ((3090, 7762), (0.3199812916717421, 25)),
 ((632, 7820), (0.5928748497510965, 3)),
 ((1148, 8477), (-0.18791721796428662, 76))]

# Prediction part

In [3]:
def load_sim_dict(from_file=False):
    if from_file:
        sim_movie = sc.pickleFile("input/movie-sims-obj3/")
        
    else:
        sim_movie = moviePairSimilarities
        
    sim_movie_local = sim_movie.collect()
    sim_dict = {}
    for sm in sim_movie_local:
        key = sm[0]
        value = sm[1]

        sim_dict[key] = value
    
    return sim_dict

def _extract_user_rating(line):
    data = line.split('::')
    return (int(data[0]), [(int(data[1]), float(data[2]))])

def _extract_movie_data(line):
    data = line.split('::')
    return (int(data[0]), data[1])

def rate_movie(user_ratings, predicted_movie):
    
    numerator = 0
    denominator = 0
    
#     normalize_rating = {1:-1, 2:-0.5, 3:0, 4:0.5, 5:1}
    

    for movie_id, rating in user_ratings:
        if(predicted_movie < movie_id):
            m1 = predicted_movie
            m2 = movie_id
        else:
            m2 = predicted_movie
            m1 = movie_id
        
        if (m1, m2) in sim_dict:
            sim_score, number_of_record = sim_dict[(m1, m2)]
        else:
            sim_score, number_of_record = (0,0)
        
        #normalize
        n_rating = (2*(rating - 1) - 4) / 4
        numerator += sim_score*n_rating #normalize_rating[rating]
        denominator += sim_score
    
    predicted_rating = numerator/denominator if denominator else 0
    #denormalize
    predicted_rating = 0.5*(predicted_rating+1)*4 + 1
    
    if predicted_rating > 5:
        predicted_rating = 5.0
    elif predicted_rating < 1:
        predicted_rating = 1.0
    
    return predicted_rating

In [10]:
sim_dict = load_sim_dict(True)

movie_dict = dict(movie_data.map(_extract_movie_data).filter(lambda x: x[0] in filtered_movie_list).collect())

# user_lists = rating_data_train.map(_extract_user_rating).filter(lambda x: x[1][0][0] in filtered_movie_list).reduceByKey(lambda v1,v2: v1+v2).persist()
user_lists = rating_data_train.map(_extract_user_rating).filter(lambda x: x[1][0][0] in filtered_movie_list).reduceByKey(lambda v1,v2: v1+v2).collect()
user_lists = dict(user_lists)

test_set = rating_data_test.map(_extract_user_rating).filter(lambda x: x[1][0][0] in filtered_movie_list).reduceByKey(lambda v1,v2: v1 + v2).collect()


In [14]:
dict(user_lists)

{8: [(47, 4.0),
  (260, 3.5),
  (290, 4.5),
  (293, 3.0),
  (457, 3.5),
  (541, 4.0),
  (593, 4.0),
  (608, 5.0),
  (678, 4.5),
  (778, 4.0),
  (1089, 4.0),
  (1148, 4.0),
  (1196, 4.0),
  (1197, 5.0),
  (1198, 4.5),
  (1200, 5.0),
  (1206, 2.5),
  (1208, 2.5),
  (1214, 4.0),
  (1234, 4.5),
  (1253, 3.5),
  (1262, 4.0),
  (1291, 4.5),
  (1358, 4.5),
  (1617, 4.0),
  (1704, 3.5),
  (2028, 3.5),
  (2329, 4.0),
  (2762, 4.5),
  (2858, 5.0),
  (2959, 4.5),
  (4011, 4.5),
  (4878, 5.0),
  (4993, 3.5),
  (7153, 4.0)],
 16: [(260, 5.0),
  (541, 3.0),
  (1196, 5.0),
  (1199, 4.0),
  (1200, 4.0),
  (1206, 4.0),
  (1214, 4.0)],
 24: [(260, 5.0), (608, 3.0), (858, 5.0)],
 40: [(290, 3.0),
  (296, 5.0),
  (457, 4.0),
  (527, 4.0),
  (593, 5.0),
  (608, 5.0),
  (1147, 5.0),
  (1193, 5.0),
  (1196, 5.0),
  (1248, 5.0),
  (1617, 4.0)],
 56: [(50, 4.5),
  (110, 3.5),
  (260, 5.0),
  (296, 5.0),
  (356, 4.5),
  (527, 5.0),
  (541, 5.0),
  (750, 5.0),
  (858, 5.0),
  (904, 5.0),
  (908, 5.0),
  (909, 5.

In [16]:
predicted_ratings = []

for user_id, movie_rating in test_set:
#     user_data = user_lists.filter(lambda line: line[0]==user_id).collect()
    
    if user_id in user_lists:
        
        user_ratings = user_lists[user_id]

        for m_id, rating in movie_rating:
            predicted_ratings += [(user_id, m_id,rate_movie(user_ratings, m_id), rating)]

## Result and accuracy

In [30]:
predicted_ratings = pd.DataFrame(predicted_ratings)

rmse = sqrt(((predicted_ratings[2]-predicted_ratings[3])**2).sum() / predicted_ratings.count()[0])
print(rmse)

# rmse = sqrt(mean_squared_error(predicted_ratings[2],predicted_ratings[3]))
# predicted_ratings

1.340396953669738


In [None]:
moviePairSimilarities.saveAsPickleFile("input/movie-sims-obj-train-full")