In [1]:
from pyspark import SparkConf, SparkContext
import pandas as pd
import numpy as np
from math import sqrt

In [2]:
sc = SparkContext.getOrCreate()
rating_data = sc.textFile("input/ml-100k/u.data")

rating_data_test, rating_data_train = rating_data.randomSplit(weights=[0.2, 0.8], seed=1)

In [3]:
def _extract_user_rating(line):
    data = line.split('\t')
    return (int(data[0]), (int(data[1]), float(data[2])))
    
def create_sim_score(rating_data):


    def _filter_movies(line):
        movie1 = line[1][0]
        movie2 = line[1][1]

        return movie1 < movie2

    def _makePairs(line):
        (movie1, rating1) = line[1][0]
        (movie2, rating2) = line[1][1]

        return ((movie1, movie2), (rating1, rating2))

    def _computeCosineSimilarity(ratingPairs):
        numPairs = 0
        sum_xx = sum_yy = sum_xy = 0
        for ratingX, ratingY in ratingPairs:
            sum_xx += ratingX * ratingX
            sum_yy += ratingY * ratingY
            sum_xy += ratingX * ratingY
            numPairs += 1

        numerator = sum_xy
        denominator = sqrt(sum_xx) * sqrt(sum_yy)

        score = 0
        if (denominator):
            score = (numerator / (float(denominator)))

        return (score, numPairs)

    # rating_data = sc.textFile("ml-100k/u.data")
    user_rating_lists = rating_data.map(_extract_user_rating)
    join_lists = user_rating_lists.join(user_rating_lists)

    user_movies_list = join_lists.filter(_filter_movies)

    rating_pairs = user_movies_list.map(_makePairs)
    moviePairRatings= rating_pairs.groupByKey()
    moviePairSimilarities = moviePairRatings.mapValues(_computeCosineSimilarity).persist()
    moviePairSimilarities.sortByKey().saveAsPickleFile("input/movie-sims-obj2")

In [20]:
def load_sim_dict():
    sim_movie = sc.pickleFile("input/movie-sims-obj2/")
    sim_movie_local = sim_movie.collect()
    sim_dict = {}
    for sm in sim_movie_local:
        key = sm[0]
        value = sm[1]

        sim_dict[key] = value
    
    return sim_dict

def _extract_user_rating(line):
    data = line.split('\t')
    return (int(data[0]), [(int(data[1]), float(data[2]))])

def _extract_movie_data(line):
    data = line.split('|')
    return (int(data[0]), data[1])

sim_dict = load_sim_dict()
movie_data = sc.textFile("input/ml-100k/u.item")
movie_dict = dict(movie_data.map(_extract_movie_data).collect())

user_lists = rating_data_train.map(_extract_user_rating).reduceByKey(lambda v1,v2: v1+v2).persist()

In [5]:
user_id = 816
predicted_movie = 660

_, user_ratings = user_lists.filter(lambda line: line[0]==user_id).collect()[0]

def rate_movie(user_ratings, predicted_movie):
    
    numerator = 0
    denominator = 0

    for movie_id, rating in user_ratings:
        if(predicted_movie < movie_id):
            m1 = predicted_movie
            m2 = movie_id
        else:
            m2 = predicted_movie
            m1 = movie_id
        
        if (m1, m2) in sim_dict:
            sim_score, number_of_record = sim_dict[(m1, m2)]
        else:
            sim_score, number_of_record = (0,0)

        numerator += sim_score*rating
        denominator += sim_score
    
    predicted_rating = numerator/denominator if denominator else 0
    
    return predicted_rating

print(rate_movie(user_ratings, predicted_movie))

3.7313299683332


In [6]:
user_id = 816
_, user_ratings = user_lists.filter(lambda line: line[0]==user_id).collect()[0]

predicted_ratings = []
for m_id in movie_dict.keys():
#     print(rate_movie(user_id, m_id))
    predicted_ratings += [(m_id,rate_movie(user_ratings, m_id))]

sorted_m_list = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)

In [7]:
for m_id, rating in sorted_m_list:
    print(movie_dict[m_id], rating)

Substance of Fire, The (1996) 5.0
Grateful Dead (1995) 5.0
I Don't Want to Talk About It (De eso no se habla) (1993) 5.0
Machine, The (1994) 4.5
Gordy (1995) 4.5
Mirage (1995) 4.5
Infinity (1996) 4.333333333333333
Hunted, The (1995) 4.333162217557625
Three Lives and Only One Death (1996) 4.25
New Jersey Drive (1995) 4.25
Walking Dead, The (1995) 4.206629921155735
Awfully Big Adventure, An (1995) 4.197421342120916
Blood For Dracula (Andy Warhol's Dracula) (1974) 4.166666666666667
Wonderful, Horrible Life of Leni Riefenstahl, The (1993) 4.122546813883998
I Like It Like That (1994) 4.112313464289478
Horse Whisperer, The (1998) 4.089429204838683
Bent (1997) 4.089270176975951
Newton Boys, The (1998) 4.08608689889268
Aparajito (1956) 4.080838650429385
Madame Butterfly (1995) 4.077716754302861
Just Cause (1995) 4.074522070270199
Cobb (1994) 4.061708313728197
Nico Icon (1995) 4.060795702940939
Late Bloomers (1996) 4.0188857327641445
Rendezvous in Paris (Rendez-vous de Paris, Les) (1995) 4.0086

Apocalypse Now (1979) 3.7505642872368403
Rebel Without a Cause (1955) 3.7505422807258126
Adventures of Priscilla, Queen of the Desert, The (1994) 3.7505257705736836
True Lies (1994) 3.7505245858528706
Homeward Bound: The Incredible Journey (1993) 3.7504227517258837
Batman Returns (1992) 3.7503587487901715
Emma (1996) 3.7503309596802707
Dances with Wolves (1990) 3.7502675106496604
Manhattan Murder Mystery (1993) 3.7502627572293656
Ready to Wear (Pret-A-Porter) (1994) 3.7502622310047706
Graduate, The (1967) 3.7502588317297825
Searching for Bobby Fischer (1993) 3.7502327355535345
It's a Wonderful Life (1946) 3.7502032161518684
Right Stuff, The (1983) 3.750186200324893
Basic Instinct (1992) 3.750118538016355
Empire Strikes Back, The (1980) 3.750099424112059
Hunchback of Notre Dame, The (1996) 3.7500571519900125
Hot Shots! Part Deux (1993) 3.750009835249598
Big Squeeze, The (1996) 3.75
Jupiter's Wife (1994) 3.75
Innocent Sleep, The (1995) 3.75
Raw Deal (1948) 3.75
Naked in New York (1994) 3

In [15]:
rating_data_test.map(_extract_user_rating).takeSample(False,5)

[(586, [(576, 3.0)]),
 (938, [(255, 1.0)]),
 (707, [(732, 4.0)]),
 (18, [(57, 4.0)]),
 (623, [(227, 4.0)])]

In [18]:
test_set = rating_data_test.map(_extract_user_rating).reduceByKey(lambda v1,v2: v1 + v2).collect()
test_set

[(6,
  [(86, 3.0),
   (301, 2.0),
   (69, 3.0),
   (469, 5.0),
   (275, 4.0),
   (498, 4.0),
   (213, 4.0),
   (28, 2.0),
   (467, 4.0),
   (117, 2.0),
   (458, 1.0),
   (199, 4.0),
   (197, 5.0),
   (193, 3.0),
   (483, 5.0),
   (526, 3.0),
   (187, 4.0),
   (518, 3.0),
   (189, 3.0),
   (534, 4.0),
   (536, 4.0),
   (21, 3.0),
   (200, 3.0),
   (520, 4.0),
   (465, 1.0),
   (493, 5.0),
   (495, 4.0),
   (203, 3.0),
   (274, 4.0),
   (408, 4.0),
   (166, 4.0),
   (308, 3.0),
   (7, 2.0),
   (317, 3.0),
   (127, 5.0),
   (9, 4.0),
   (261, 3.0),
   (306, 4.0),
   (533, 4.0)]),
 (286,
  [(1014, 5.0),
   (357, 4.0),
   (44, 3.0),
   (34, 5.0),
   (721, 3.0),
   (689, 5.0),
   (151, 5.0),
   (158, 3.0),
   (100, 3.0),
   (432, 3.0),
   (642, 3.0),
   (419, 5.0),
   (704, 2.0),
   (683, 5.0),
   (89, 4.0),
   (107, 1.0),
   (1113, 3.0),
   (738, 5.0),
   (354, 4.0),
   (477, 3.0),
   (1411, 2.0),
   (168, 4.0),
   (289, 5.0),
   (554, 4.0),
   (881, 5.0),
   (53, 2.0),
   (161, 2.0),
   (3

In [24]:
predicted_ratings = []

for user_id, movie_rating in test_set:
    _, user_ratings = user_lists.filter(lambda line: line[0]==user_id).collect()[0]
    
    for m_id, rating in movie_rating:
        predicted_ratings += [(user_id, m_id,rate_movie(user_ratings, m_id), rating)]

In [25]:
predicted_ratings

[(6, 86, 3.6837227993268287, 3.0),
 (6, 301, 3.675583290925926, 2.0),
 (6, 69, 3.676287248166154, 3.0),
 (6, 469, 3.6888001255865097, 5.0),
 (6, 275, 3.684540492146498, 4.0),
 (6, 498, 3.6844248141639153, 4.0),
 (6, 213, 3.6832275564537706, 4.0),
 (6, 28, 3.6771847117083953, 2.0),
 (6, 467, 3.6917736269311288, 4.0),
 (6, 117, 3.6741537447539945, 2.0),
 (6, 458, 3.6855508923026457, 1.0),
 (6, 199, 3.6797184120942967, 4.0),
 (6, 197, 3.681771197407057, 5.0),
 (6, 193, 3.6813773889782384, 3.0),
 (6, 483, 3.6816736626746827, 5.0),
 (6, 526, 3.680896608429538, 3.0),
 (6, 187, 3.679425544169338, 4.0),
 (6, 518, 3.6964233718225463, 3.0),
 (6, 189, 3.6801465708491987, 3.0),
 (6, 534, 3.660556735067101, 4.0),
 (6, 536, 3.716880568228155, 4.0),
 (6, 21, 3.6810339707092443, 3.0),
 (6, 200, 3.6819757455036424, 3.0),
 (6, 520, 3.6755677243450307, 4.0),
 (6, 465, 3.6687518213962167, 1.0),
 (6, 493, 3.6881386044129694, 5.0),
 (6, 495, 3.678271776480549, 4.0),
 (6, 203, 3.6811182399938573, 3.0),
 (6, 