In [1]:
from pyspark import SparkConf, SparkContext
import pandas as pd
import numpy as np
from math import sqrt

def load_sim_dict():
    """Load the pickled records under ``movie-sims-obj/`` into a plain dict.

    Every record is indexed positionally: element 0 becomes the key and
    element 1 the value. If a key occurs more than once, the record that
    comes later in the collected list wins.

    Reads the module-level SparkContext ``sc`` and collects the whole RDD to
    the driver.

    NOTE(review): ``pickleFile`` deserializes pickled objects, so it should
    only be pointed at trusted data.

    Returns:
        dict mapping each record's first element to its second element.
    """
    records = sc.pickleFile("movie-sims-obj/").collect()
    return {record[0]: record[1] for record in records}

def _extract_user_rating(line):
    data = line.split('\t')
    return (int(data[0]), [(int(data[1]), float(data[2]))])

def _extract_movie_data(line):
    data = line.split('|')
    return (int(data[0]), data[1])

# Reuse an already-running SparkContext if there is one, otherwise start one.
sc = SparkContext.getOrCreate()
rating_data = sc.textFile("ml-100k/u.data")

# Driver-side lookup tables. Both are fully collected, so they are ordinary
# Python dicts that the later cells read directly on the driver.
sim_dict = load_sim_dict()
movie_data = sc.textFile("ml-100k/u.item")
movie_dict = dict(movie_data.map(_extract_movie_data).collect())

# user_id -> [(movie_id, rating), ...]: each line yields a one-element list and
# reduceByKey concatenates the lists that share a user id.
# The RDD is cached because later cells run several actions against it;
# without persistence every action would re-read and re-shuffle u.data.
user_lists = (
    rating_data
    .map(_extract_user_rating)
    .reduceByKey(lambda v1, v2: v1 + v2)
    .cache()
)

In [2]:
user_id = 816
predicted_movie = 660

# RDD.lookup returns the list of values stored under the key. user_lists holds
# one merged ratings list per user, so the result is either [] (unknown user)
# or a single-element list.
matching_ratings = user_lists.lookup(user_id)
if not matching_ratings:
    # Fail with a readable message instead of a bare IndexError on [0].
    raise ValueError("no ratings found for user_id={!r}".format(user_id))
user_ratings = matching_ratings[0]

def rate_movie(user_ratings, predicted_movie, similarities=None):
    """Predict a rating for ``predicted_movie`` as a similarity-weighted average.

    Args:
        user_ratings: iterable of ``(movie_id, rating)`` pairs for one user.
        predicted_movie: id of the movie to score.
        similarities: optional mapping ``(lower_id, higher_id) -> (sim_score,
            second_value)``. Defaults to the module-level ``sim_dict``; pass a
            mapping explicitly to avoid depending on notebook state.

    Returns:
        ``sum(sim_score * rating) / sum(sim_score)`` over the user's rated
        movies, or 0 when the similarity total is zero (which includes the
        case where no pair is present in the mapping).
    """
    if similarities is None:
        similarities = sim_dict

    numerator = 0
    denominator = 0

    for movie_id, rating in user_ratings:
        # Pairs are looked up with the smaller movie id first.
        pair = (min(predicted_movie, movie_id), max(predicted_movie, movie_id))
        # Missing pairs contribute nothing; the second tuple element is not
        # used by the prediction.
        sim_score, _ = similarities.get(pair, (0, 0))

        numerator += sim_score * rating
        denominator += sim_score

    # NOTE(review): the denominator sums signed scores. If the stored
    # similarities can be negative, the usual normalizer is sum(abs(sim)) --
    # confirm against how the similarity table was produced.
    return numerator / denominator if denominator else 0

# Spot check: predicted rating of `predicted_movie` for the ratings list loaded above.
print(rate_movie(user_ratings, predicted_movie))

3.8558878016108524


In [3]:
user_id = 816

# Fetch this user's ratings inside the cell so it does not rely on a value
# left behind by an earlier cell. lookup() returns [] for an unknown user.
matching_ratings = user_lists.lookup(user_id)
if not matching_ratings:
    raise ValueError("no ratings found for user_id={!r}".format(user_id))
user_ratings = matching_ratings[0]

# Score every movie in movie_dict for this user: [(movie_id, predicted_rating), ...]
predicted_ratings = [
    (m_id, rate_movie(user_ratings, m_id))
    for m_id in movie_dict
]

In [6]:
# Work on a copy so predicted_ratings keeps its original order, then sort the
# copy by predicted rating, highest first (the sort is stable, so ties keep
# their relative order).
sorted_m_list = list(predicted_ratings)
sorted_m_list.sort(key=lambda pair: pair[1], reverse=True)

In [7]:
# Print each movie's title next to its predicted rating, in sorted_m_list order.
for movie_id, predicted in sorted_m_list:
    print(movie_dict[movie_id], predicted)

Substance of Fire, The (1996) 5.0
All Things Fair (1996) 5.0
Mirage (1995) 4.666666666666667
Delta of Venus (1994) 4.597845558072851
Carmen Miranda: Bananas Is My Business (1994) 4.542398920113096
Babyfever (1994) 4.507577497529358
Pushing Hands (1992) 4.5
Killer: A Journal of Murder (1995) 4.5
Shadows (Cienie) (1988) 4.5
The Courtyard (1995) 4.5
Yankee Zulu (1994) 4.5
Hostile Intentions (1994) 4.5
Tigrero: A Film That Was Never Made (1994) 4.5
Eye of Vichy, The (Oeil de Vichy, L') (1993) 4.5
Promise, The (Versprechen, Das) (1994) 4.5
To Cross the Rubicon (1991) 4.5
Daens (1992) 4.5
Man from Down Under, The (1943) 4.5
Careful (1992) 4.5
Vermont Is For Lovers (1992) 4.5
Vie est belle, La (Life is Rosey) (1987) 4.5
Quartier Mozart (1992) 4.5
Touki Bouki (Journey of the Hyena) (1973) 4.5
Wend Kuuni (God's Gift) (1982) 4.5
Spirits of the Dead (Tre passi nel delirio) (1968) 4.5
Pharaoh's Army (1995) 4.5
I, Worst of All (Yo, la peor de todas) (1990) 4.5
Hungarian Fairy Tale, A (1987) 4.5
Dea

Farinelli: il castrato (1994) 3.8232959893994436
Phantom, The (1996) 3.822735691430921
Death and the Maiden (1994) 3.8221673780076317
Double Happiness (1994) 3.822005274448071
Doom Generation, The (1995) 3.821472450352385
8 1/2 (1963) 3.820969713415556
Fast, Cheap & Out of Control (1997) 3.82092866931116
Wings of Desire (1987) 3.8208410767639274
Priest (1994) 3.8207388383503
When the Cats Away (Chacun cherche son chat) (1996) 3.8206763843009925
Wes Craven's New Nightmare (1994) 3.8200107051980527
Critical Care (1997) 3.819953860163389
Paris, Texas (1984) 3.8197962727284063
Marked for Death (1990) 3.8193412071794106
Half Baked (1998) 3.8185401867898605
Angela (1995) 3.8181818181818183
Two Bits (1995) 3.8178285870790094
Keys to Tulsa (1997) 3.8173157349682016
Moll Flanders (1996) 3.816529132477375
Awfully Big Adventure, An (1995) 3.8155571309107503
Great White Hype, The (1996) 3.814684900761545
Kika (1993) 3.8144521945279855
Second Jungle Book: Mowgli & Baloo, The (1997) 3.81418999989338