In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import *
import numpy as np
from itertools import permutations
from pyspark.mllib.recommendation import ALS
import math

In [3]:
sc = SparkContext.getOrCreate()

In [4]:
small_ratings_raw_data = sc.textFile('/Users/gregcattell/rutgers_study/cs550/PJ_data/ml-latest-small/ratings.csv')

small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0]

small_ratings_data = small_ratings_raw_data.filter(lambda line: line!=small_ratings_raw_data_header)

small_ratings_data = small_ratings_data.map(lambda line : line.split(","))\
.map(lambda tokens: (tokens[0],tokens[1],tokens[2])).cache()

In [11]:
small_movies_raw_data = sc.textFile('/Users/gregcattell/rutgers_study/cs550/PJ_data/ml-latest-small/movies.csv')
small_movies_raw_data_header = small_movies_raw_data.take(1)[0]

small_movies_data = small_movies_raw_data.filter(lambda line: line!=small_movies_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1])).cache()

small_movies_titles = small_movies_data.map(lambda x: (int(x[0]),x[1]))


### Spliting data into training set and test set

In [6]:
training_RDD, test_RDD = small_ratings_data.randomSplit([8, 2], seed=0)
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

In [7]:
small_movies_raw_data.take(3)

['movieId,title,genres',
 '1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy',
 '2,Jumanji (1995),Adventure|Children|Fantasy']

In [8]:
small_ratings_raw_data_header

'userId,movieId,rating,timestamp'

In [9]:
training_RDD.count()

80720

### Get mean for all movies

In [10]:
movie_ratings = training_RDD.map(lambda tokens: (int(tokens[1]),(float(tokens[2]),1,(float(tokens[2])**2))))
movie_mean_form = movie_ratings.reduceByKey(lambda x, y : (x[0] + y[0], x[1] + y[1], x[2]+y[2]))\
.map(lambda pair: (pair[0], (pair[1][0]/pair[1][1], np.sqrt(pair[1][2]))))

In [11]:
#movie_mean_form.take(10)

In [12]:
join_tmp = training_RDD.map(lambda line: (int(line[1]), (int(line[0]), float(line[2]))))\
.join(movie_mean_form)
# (moviev, ((user, rating), (movie_mean, movieForm)))
normalized_ratings = join_tmp.map(lambda line: (line[1][0][0], line[0], line[1][0][1] - line[1][1][0], line[1][1][1]))
# (user, movie, normalized_rating, movieForm)

In [13]:
#normalized_ratings.take(10)

### Get pairs for one user: ((movie1, movie2), (mov_vec1, mov_vec2), (mov_form1, mov_form2))

In [14]:
def item_join(line):
    perm = list(permutations(line, 2))
    return perm

In [15]:
movie_pairs = normalized_ratings.map(lambda line: (line[0], (line[1], line[2], line[3]))).groupByKey()\
.map(lambda line: list(line[1])).flatMap(lambda line: item_join(line))\
.map(lambda line: ((line[0][0], line[1][0]),((line[0][1], line[1][1]),(line[0][2], line[1][2])))).cache()

In [16]:
#movie_pairs.take(10)

### Get cosine distances ((movie1, movie2), cosine distance)

In [17]:
cosine_unit = movie_pairs.map(lambda x: (x[0], x[1][0][0]*x[1][0][1]/x[1][1][0]/x[1][1][1]))

In [18]:
#cosine_unit.take(10)

In [19]:
cosine_dist = cosine_unit.reduceByKey(lambda x,y : x + y).cache()

In [20]:
cosine_dist.take(10)

[((4381, 1025), -0.003890556475138052),
 ((1732, 59018), 0.0),
 ((32, 33558), -0.0),
 ((3252, 2058), -0.00028137611368194836),
 ((4688, 55094), 0.009298353633971536),
 ((6333, 44665), 0.004478178624224742),
 ((1921, 41997), 0.0007341377596712991),
 ((1678, 6552), 0.014398442912188473),
 ((5446, 2232), 0.005214660986677659),
 ((49530, 5152), -0.003354876132261581)]

### Get baseline estimation for user-item

In [21]:
ratings = training_RDD.map(lambda x: float(x[2]))
total = ratings.sum()
total_num = ratings.count()
#get the global average at first
global_mean = total/total_num

In [22]:
global_mean

3.502682111000991

In [23]:
#we already have the average list for movie
movie_average = movie_mean_form.map(lambda line: (line[0], line[1][0]))

In [24]:
#movie_average.take(10)

In [25]:
#get the average list for user
user_average = training_RDD.map(lambda x: (int(x[0]), (float(x[2]), 1)))\
.reduceByKey(lambda x,y: (x[0] + y[0], x[1] + y[1])).map(lambda x: (x[0], x[1][0]/x[1][1]))

In [26]:
#user_average.take(10)

In [27]:
#define the function to get baseline
"""
float baseLine(int user, int movie):
"""
movie_ave_dict = {x[0]: x[1] for x in movie_average.collect()}
user_ave_dict = {x[0]: x[1] for x in user_average.collect()}

def baseLine(user, movie):
    if movie not in movie_ave_dict:
        return user_ave_dict[user]
    return user_ave_dict[user] + movie_ave_dict[movie] - global_mean

In [28]:
#baseLine(2, 6)

### Calculate the predictions for test data

In [29]:
#we already have the (user, movie) pairs for test, change it to (movie, user)
#test_for_predict_RDD.take(10)
test_data = test_for_predict_RDD.map(lambda x: (int(x[1]), int(x[0])))
test_data.take(10)

[(70, 1),
 (101, 1),
 (110, 1),
 (151, 1),
 (216, 1),
 (316, 1),
 (333, 1),
 (356, 1),
 (367, 1),
 (500, 1)]

In [30]:
training_data = training_RDD.map(lambda line: ((int(line[0]), int(line[1])), float(line[2])))
#change the cosine_dist((movie1, movie2), similarity) to (movie1, (movie2, similarity))
movie_sim = cosine_dist.map(lambda x: (x[0][0], (x[0][1], x[1])))
#movie_sim join with test_data, we will get (movie, ((movie', sim), user))
#then, we will change to ((user, movie'),(movie, sim))
tmp = movie_sim.join(test_data).map(lambda x: ((x[1][1], x[1][0][0]), (x[0], x[1][0][1])))
#then, join tmp with training data to get ratings: ((user, movie'),(rating, (movie, sim)))
#and rearranged to ((user, movie),(sim * (rating - baseLine(user, movie'), sim)))
predict_unit = training_data.join(tmp)\
.map(lambda x: ((x[0][0], x[1][1][0]),(x[1][1][1] * (x[1][0] - baseLine(x[0][0], x[0][1])),x[1][1][1])))
#reducebykey to get sum
prediction = predict_unit.reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1]))\
.map(lambda x: (x[0], baseLine(x[0][0], x[0][1]) + x[1][0]/x[1][1]) if x[1][1]!=0 else (x[0], baseLine(x[0][0], x[0][1])))

In [31]:
#prediction.take(10)

### Compare with the true value, calculating MSE & RMSE

In [32]:
#rearrange the test_RDD
test_value = test_RDD.map(lambda x: ((int(x[0]), int(x[1])), float(x[2])))
#join with the prediction ((user, movie), (true, prediction))
comparison = prediction.join(test_value).cache()

In [33]:
comparison.take(10)

[((6, 348), (3.457092721075168, 3.0)),
 ((16, 3022), (3.8170876691880404, 3.5)),
 ((21, 8665), (4.058276771157846, 4.0)),
 ((21, 8529), (3.437896285693656, 4.5)),
 ((21, 1721), (3.837722559014498, 3.5)),
 ((22, 5388), (2.0076843860122935, 3.0)),
 ((24, 6350), (4.073748227593957, 4.5)),
 ((28, 8970), (3.4369034428867806, 3.0)),
 ((28, 1994), (2.749099006671379, 2.5)),
 ((51, 2167), (3.4427811230915863, 4.0))]

In [44]:
#Get the unit for MAE_tmp: abs(true-prediction
MAE_tmp = comparison.map(lambda x: abs(x[1][0] - x[1][1])).mean()
#RMSE_tmp: (true-prediction)**2
RMSE_tmp = comparison.map(lambda x: (x[1][0] - x[1][1])**2).mean()
num = comparison.count()

In [45]:
MAE = MAE_tmp/num
RMSE = np.sqrt(RMSE_tmp)/num
print("The MAE for the CF prediction is:", MAE)
print("The RMSE for the CF prediction is:", RMSE)

The MAE for the CF prediction is: 1.0530645571718233
The RMSE for the CF prediction is: 0.09852137544294987


In [36]:
#some of the test_data may not be successfully joined due to 
#the information of movie not appeared in the training_set
part_1 = comparison.map(lambda x: x[0])
part_2 = test_data.map(lambda x: (x[1], x[0])).subtract(part_1)\
.map(lambda x: (x, baseLine(x[0], x[1])))

In [37]:
part_2.take(10)

[((18, 157108), 3.736641221374046),
 ((21, 140301), 3.28978978978979),
 ((89, 26645), 3.442822384428224),
 ((89, 48649), 3.442822384428224),
 ((89, 69469), 3.442822384428224),
 ((89, 76301), 3.442822384428224),
 ((89, 170993), 3.442822384428224),
 ((103, 86815), 3.945),
 ((105, 134849), 4.139966273187184),
 ((105, 140265), 4.139966273187184)]

In [53]:
part2_comparison = part_2.join(test_value)
Total_compar = comparison.union(part2_comparison)
MAE_tmp_2 = Total_compar.map(lambda x: abs(x[1][0] - x[1][1])).mean()
RMSE_tmp_2 = Total_compar.map(lambda x: (x[1][0] - x[1][1])**2).mean()
num_2 = Total_compar.count()

In [54]:
MAE_T = MAE_tmp_2
RMSE_T = np.sqrt(RMSE_tmp_2)
print("The MAE for the CF prediction is:", MAE_T)
print("The RMSE for the CF prediction is:", RMSE_T)

The MAE for the CF prediction is: 1.0426516036926083
The RMSE for the CF prediction is: 13.430305307457385


### Using ALS for recommendation:

In [47]:
training_RDD, validation_RDD, test_RDD = small_ratings_data.randomSplit([6, 2, 2], seed=0)
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

In [51]:
seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print('For rank', rank,'the RMSE is: ', error) 
    if error < min_error:
        min_error = error
        best_rank = rank

print('The best model was trained with rank: ', best_rank)

For rank 4 the RMSE is:  0.9114026007220244
For rank 8 the RMSE is:  0.9180911451666388
For rank 12 the RMSE is:  0.9183376003842972
The best model was trained with rank:  4


In [52]:
model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

print('For testing data the RMSE is', error)

For testing data the RMSE is 0.9086056540214045


### Using large dataset

In [4]:
# Load the complete dataset file
complete_ratings_raw_data = sc.textFile('/Users/gregcattell/rutgers_study/cs550/PJ_data/ml-latest/ratings.csv')
complete_ratings_raw_data_header = complete_ratings_raw_data.take(1)[0]

# Parse
complete_ratings_data = complete_ratings_raw_data.filter(lambda line: line!=complete_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),int(tokens[1]),float(tokens[2]))).cache()
    
complete_ratings_data.count()

27753444

In [5]:
complete_ratings_data.take(10)

[(1, 307, 3.5),
 (1, 481, 3.5),
 (1, 1091, 1.5),
 (1, 1257, 4.5),
 (1, 1449, 4.5),
 (1, 1590, 2.5),
 (1, 1591, 1.5),
 (1, 2134, 4.5),
 (1, 2478, 4.0),
 (1, 2840, 3.0)]

In [6]:
seed = 5
iterations = 10
regularization_parameter = 0.1
best_rank = 4

In [7]:
complete_training_RDD, complete_test_RDD = small_ratings_data.randomSplit([8, 2], seed=0)

complete_model = ALS.train(complete_training_RDD, best_rank, seed=seed, 
                           iterations=iterations, lambda_=regularization_parameter)

In [9]:
test_for_predict_RDD = complete_test_RDD.map(lambda x: (x[0], x[1]))

predictions = complete_model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = complete_test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

In [10]:
print("The RMSE of CF is: ", error)

The RMSE of CF is:  0.8802398422205924


### Making Top-10 Recommendation

In [12]:
small_movies_titles.count()

9742

In [14]:
#At first, we need to build a list of (user, movie(not watched before))
#we first get (user, list(watched movie))
user_watched = complete_training_RDD.map(lambda x: (int(x[0]),int(x[1]))).groupByKey()\
.map(lambda x: (x[0], set(x[1])))
#get a movie set for function:
movie_set = set(small_movies_data.map(lambda x: int(x[0])).collect())

In [15]:
# list((int user, int unwatched)) getUnwatched(int user, set movies):
def getUnwatched(user, movies):
    unwat_mov = list(movie_set - movies)
    unwat_user = [(user, x) for x in unwat_mov]
    return unwat_user

In [16]:
user_unWatched = user_watched.flatMap(lambda x: getUnwatched(x[0], x[1]))

In [21]:
total_recommendations = complete_model.predictAll(user_unWatched).map(lambda r: ((r[0], r[1]), r[2]))

In [22]:
total_recommendations.take(10)

[((312, 81132), 3.594017154327881),
 ((96, 81132), 3.244278589910458),
 ((600, 81132), 3.1654283745497827),
 ((324, 81132), 3.3527077471031346),
 ((180, 81132), 2.8364072517403205),
 ((156, 81132), 3.240139905189947),
 ((216, 81132), 3.2890586614329282),
 ((408, 81132), 3.599622858828204),
 ((456, 81132), 4.013056526198035),
 ((480, 81132), 2.9908068948515423)]

In [77]:
def getTop_Ten(pairs):
    pairs.sort(key =lambda x: x[1], reverse = True)
    return set([x[0] for x in pairs[:10]])

In [78]:
Top_Ten = total_recommendations.map(lambda line: (line[0][0], (line[0][1], line[1]))).groupByKey()\
.map(lambda line: (line[0], list(line[1]))).map(lambda line: (line[0], getTop_Ten(line[1])))

In [79]:
Top_Ten.take(10)

[(312, {3379, 3567, 4642, 6818, 7815, 8477, 25771, 40491, 58301, 99764}),
 (96, {720, 3404, 3566, 3925, 5607, 7842, 58303, 59018, 60943, 94070}),
 (600, {3379, 3567, 5222, 6818, 7815, 40491, 58301, 96004, 99764, 141718}),
 (324, {1194, 3567, 3846, 4634, 7025, 25947, 26258, 26326, 82378, 141718}),
 (180, {3925, 4495, 4617, 6201, 7841, 8235, 51931, 59018, 60943, 112804}),
 (156, {3379, 4495, 6201, 7564, 7815, 7841, 8235, 26326, 58301, 89904}),
 (216, {3096, 3379, 4495, 4642, 6201, 7564, 7815, 7841, 8235, 89904}),
 (408, {3567, 3925, 5867, 7815, 7842, 33649, 59018, 60943, 67618, 130518}),
 (456, {40, 3379, 3567, 3837, 5222, 5480, 33649, 86347, 98279, 130518}),
 (480, {2239, 4495, 6201, 7815, 7841, 8235, 51931, 59018, 60943, 89904})]

In [80]:
def sortTest(pairs):
    pairs.sort(key = lambda x: x[1], reverse = True)
    return [x[0] for x in pairs]

In [81]:
testlist = complete_test_RDD.map(lambda r: (int(r[0]), (int(r[1]), float(r[2])))).groupByKey()\
.map(lambda r: (r[0], list(r[1]))).map(lambda line: (line[0], sortTest(line[1])))

In [82]:
testlist.take(10)

[(2, [60756, 68157, 3578, 86345, 71535]),
 (4,
  [457,
   538,
   898,
   910,
   919,
   1086,
   1103,
   1188,
   1197,
   1449,
   2186,
   2583,
   3083,
   3365,
   3851,
   4034,
   4765,
   4967,
   345,
   800,
   892,
   1073,
   1219,
   1266,
   1466,
   1719,
   1895,
   2406,
   2921,
   3386,
   3897,
   4273,
   937,
   1580,
   3079,
   4027,
   4029,
   4166,
   235,
   450,
   553,
   1199,
   1860,
   2959,
   2973,
   126,
   222,
   2571,
   2628,
   4641]),
 (6,
  [230,
   316,
   318,
   350,
   457,
   709,
   47,
   54,
   207,
   212,
   225,
   252,
   261,
   274,
   317,
   354,
   415,
   440,
   469,
   711,
   802,
   8,
   105,
   267,
   310,
   348,
   374,
   382,
   416,
   472,
   531,
   552,
   592,
   608,
   694,
   795,
   837,
   867,
   88,
   327]),
 (8, [318, 357, 380, 21, 47, 186, 231, 296, 235, 356, 367, 592]),
 (10,
  [7458,
   49286,
   136020,
   106696,
   912,
   6942,
   7153,
   68954,
   95449,
   104374,
   30749,
   72720,
   

In [83]:
comparison_com = testlist.map(lambda line: (line[0], set(line[1]))).join(Top_Ten)\
.map(lambda x: (len(x[1][0] & x[1][1]), len(x[1][1]),len(x[1][0])))\
.map(lambda x: (x[0]/x[1], x[0]/x[2]))

In [89]:
precision = comparison_com.map(lambda x: x[0]).mean()
recall = comparison_com.map(lambda x: x[1]).mean()
print("precision: ", precision)
print("recall: ", recall)

precision:  0.0013157894736842107
recall:  0.00021420595911864591


In [92]:
F_measure = 2 * precision *recall/ (precision + recall)
print("F_measure: ", F_measure)

F_measure:  0.0003684324020398062


In [106]:
def nDCG(test, prediction):
    DCG =0
    IDCG = 0
    j=1
    for i in range(len(test)):
        if test[i] in prediction:
            DCG+=1/np.log2(1+i+1)
            IDCG+=1/np.log2(j+1)
            j+=1
    if IDCG!=0:
        return DCG/IDCG
    else: return 0

In [109]:
comparison_ndcg = testlist.join(Top_Ten).map(lambda x: nDCG(x[1][0], x[1][1]))

In [112]:
comparison_ndcg.take(100)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.15773243839286438,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [107]:
test = [10, 1, 3 ,5 ,6]
prediction_test = set([1, 3, 5])
nDCG(test, prediction_test)

1.0
1.6309297535714575
2.1309297535714578


0.7328286204777911

In [46]:
test_arr = [(12,1),(32,8),(42,3)]
tt = getTop_Ten(test_arr)
print(tt)

[(32, 8), (42, 3), (12, 1)]


In [45]:
unWatched_user = user_watched.flatMap(lambda x: getUnwatched(x[0], x[1]))

In [46]:
unWatched_user.take(10)

[(1, 2),
 (2, 2),
 (3, 2),
 (4, 2),
 (5, 2),
 (6, 2),
 (7, 2),
 (8, 2),
 (9, 2),
 (10, 2)]

In [49]:
#we already have movie_sim: (movie1, (movie2, sim))
sim_unWatched = movie_sim.join(unWatched_user).map(lambda x: ((x[1][1], x[1][0][0]), (x[0], x[1][0][1])))
#then, join tmp with training data to get ratings: ((user, movie'),(rating, (movie, sim)))
#and rearranged to ((user, movie),(sim * (rating - baseLine(user, movie'), sim)))
rate_sim= training_data.join(sim_unWatched)
predict_unit_recom = rate_sim.map(lambda x: ((x[0][0], x[1][1][0]),(x[1][1][1] * (x[1][0] - baseLine(x[0][0], x[0][1])),x[1][1][1])) if x[1][1][1]==0 else ((x[0][0], x[1][1][0]),(0, 0)))
#reducebykey to get sum
prediction_recom1 = predict_unit_recom.reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1]))\
.map(lambda x: (x[0], baseLine(x[0][0], x[0][1]) + x[1][0]/x[1][1]) if x[1][1]!=0 else (x[0], baseLine(x[0][0], x[0][1])))

In [None]:
def chooseTen(pairs):
    pairs.sort(key = lambda pair: pair[1], reverse = True)
    return pairs[:10]

In [None]:
top_ten = prediction_recom1.map(lambda x: (x[0][0], (x[0][1], x[1]))).groupByKey()\
.map(lambda x: (x[0], list(x[1]))).map(lambda x: (x[0], chooseTen(x[1])))

In [48]:
prediction_recom2= prediction_recom1.map(lambda x: x[0]).map(lambda x: (x[1], x[0])).subtract(part_1)\
.map(lambda x: (x, baseLine(x[0], x[1])))
recom_predict = prediction_recom2.union(prediction_recom1)

NameError: name 'part_1' is not defined

In [32]:
movie_sim.take(10)

[(2078, (1500, 0.007539678018083155)),
 (58, (3996, 0.002107636057698439)),
 (106, (296, -0.0)),
 (2874, (3060, 0.0014048498350659184)),
 (171, (3591, 0.015091188412907453)),
 (2763, (3967, 0.001908686110680079)),
 (780, (3510, 0.00465873049470773)),
 (2944, (3386, 0.005182070325553633)),
 (3508, (6870, 0.0010159358663180738)),
 (5632, (4378, 0.0032184760871333566))]

In [31]:
sim_unWatched.take(10)

[((2, 7438), (168456, -0.0)),
 ((4, 7438), (168456, -0.0)),
 ((6, 7438), (168456, -0.0)),
 ((8, 7438), (168456, -0.0)),
 ((10, 7438), (168456, -0.0)),
 ((12, 7438), (168456, -0.0)),
 ((14, 7438), (168456, -0.0)),
 ((16, 7438), (168456, -0.0)),
 ((18, 7438), (168456, -0.0)),
 ((20, 7438), (168456, -0.0))]

In [50]:
rate_sim.take(10)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 41.0 failed 1 times, most recent failure: Lost task 6.0 in stage 41.0 (TID 92, localhost, executor driver): ExecutorLostFailure (executor driver exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 2162218 ms
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [88]:
user_unWatched.take(10)

[[(1, 2),
  (2, 2),
  (3, 2),
  (4, 2),
  (5, 2),
  (6, 2),
  (7, 2),
  (8, 2),
  (9, 2),
  (10, 2),
  (11, 2),
  (12, 2),
  (13, 2),
  (14, 2),
  (15, 2),
  (16, 2),
  (17, 2),
  (18, 2),
  (19, 2),
  (20, 2),
  (21, 2),
  (22, 2),
  (23, 2),
  (24, 2),
  (25, 2),
  (26, 2),
  (27, 2),
  (28, 2),
  (29, 2),
  (30, 2),
  (31, 2),
  (32, 2),
  (32799, 2),
  (34, 2),
  (131098, 2),
  (36, 2),
  (131104, 2),
  (38, 2),
  (39, 2),
  (40, 2),
  (41, 2),
  (42, 2),
  (43, 2),
  (44, 2),
  (45, 2),
  (46, 2),
  (47, 2),
  (48, 2),
  (49, 2),
  (50, 2),
  (65585, 2),
  (52, 2),
  (53, 2),
  (54, 2),
  (55, 2),
  (65588, 2),
  (57, 2),
  (58, 2),
  (98361, 2),
  (60, 2),
  (61, 2),
  (62, 2),
  (63, 2),
  (64, 2),
  (65, 2),
  (66, 2),
  (65596, 2),
  (68, 2),
  (69, 2),
  (70, 2),
  (71, 2),
  (72, 2),
  (73, 2),
  (74, 2),
  (75, 2),
  (76, 2),
  (77, 2),
  (78, 2),
  (79, 2),
  (80, 2),
  (81, 2),
  (82, 2),
  (83, 2),
  (85, 2),
  (86, 2),
  (87, 2),
  (88, 2),
  (89, 2),
  (163925, 2),
  (

In [81]:
mov_list = [1,2,3,4,5,6,7,8]
mov_set = set(mov_list)
getUnwatched(1, mov_set)[:20]

[(1, 9),
 (1, 10),
 (1, 11),
 (1, 12),
 (1, 13),
 (1, 14),
 (1, 15),
 (1, 16),
 (1, 17),
 (1, 18),
 (1, 19),
 (1, 20),
 (1, 21),
 (1, 22),
 (1, 23),
 (1, 24),
 (1, 25),
 (1, 26),
 (1, 27),
 (1, 28)]

In [61]:
#small_movies_data.take(10)
movie_list[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [36]:
MAE.sum()

20357.844019245935

In [58]:
#comparison.take(10)

[((6, 348), (3.457092721075168, 3.0)),
 ((19, 2791), (3.5765813554107435, 2.0)),
 ((19, 351), (2.8360829153172915, 2.0)),
 ((19, 2111), (2.644690308780727, 3.0)),
 ((19, 551), (2.8217478447006257, 5.0)),
 ((21, 8665), (4.058276771157847, 4.0)),
 ((21, 8529), (3.437896285693656, 4.5)),
 ((21, 1721), (3.837722559014498, 3.5)),
 ((24, 6350), (4.073748227593957, 4.5)),
 ((27, 1015), (2.9429447224566188, 4.0))]

In [54]:
#We first need a list like this (user, list of movies the user rated)
user_ratedRDD = training_RDD.map(lambda line: (int(line[0]), (int(line[1]), float(line[2])))).groupByKey()\
.map(lambda line: (line[0], list(line[1])))

In [56]:
user_ratedRDD.take(2)

[(2,
  [(318, 3.0),
   (333, 4.0),
   (1704, 4.5),
   (6874, 4.0),
   (8798, 3.5),
   (46970, 4.0),
   (48516, 4.0),
   (58559, 4.5),
   (74458, 4.0),
   (77455, 3.0),
   (79132, 4.0),
   (80489, 4.5),
   (80906, 5.0),
   (89774, 5.0),
   (91529, 3.5),
   (91658, 2.5),
   (99114, 3.5),
   (106782, 5.0),
   (109487, 3.0),
   (112552, 4.0),
   (114060, 2.0),
   (115713, 3.5),
   (122882, 5.0),
   (131724, 5.0)]),
 (4,
  [(21, 3.0),
   (32, 2.0),
   (45, 3.0),
   (47, 2.0),
   (52, 3.0),
   (58, 3.0),
   (106, 4.0),
   (125, 5.0),
   (162, 5.0),
   (171, 3.0),
   (176, 5.0),
   (190, 2.0),
   (215, 5.0),
   (232, 5.0),
   (247, 3.0),
   (260, 5.0),
   (265, 5.0),
   (296, 1.0),
   (319, 5.0),
   (342, 5.0),
   (348, 4.0),
   (351, 3.0),
   (357, 3.0),
   (368, 4.0),
   (417, 2.0),
   (441, 1.0),
   (475, 5.0),
   (492, 5.0),
   (509, 1.0),
   (539, 1.0),
   (588, 4.0),
   (593, 5.0),
   (595, 3.0),
   (599, 2.0),
   (608, 5.0),
   (648, 3.0),
   (708, 4.0),
   (759, 3.0),
   (899, 4.0),
 

In [61]:
#We also need a list like this (movie, list of movies with the similarity we have calculated)
movie_simRDD = cosine_dist.map(lambda x: (x[0][0],(x[0][1], x[1]))).groupByKey()\
.map(lambda x: (x[0], list(x[1])))

In [87]:
empty = []
predictions = sc.parallelize(empty)
#test for one loop
for i in range(len(test_data)):
    user, movie = test_data[i]
    if movie in movie_ave_dict:
        user_movie_rate = user_ratedRDD.filter(lambda x: x[0] == user).flatMap(lambda x: x[1])
        movie_movie_sim = movie_simRDD.filter(lambda x: x[0] == movie).flatMap(lambda x: x[1])
        Bum = baseLine(user, movie)
        predict = user_movie_rate.join(movie_movie_sim)\
        .map(lambda x: (Bum, (x[1][1]*(x[1][0]-baseLine(user, x[0])), x[1][1])))\
        .reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1])).map(lambda x: ((user, movie), x[0] + x[1][0]/x[1][1]))
    else:
        predict = sc.parallelize([((user, movie), baseLine(user, movie))])     
    predictions = predictions.union(predict)
    
# user, movie = test_data[0]
# user_movie_rate = user_ratedRDD.filter(lambda x: x[0] == user).flatMap(lambda x: x[1])
# movie_movie_sim = movie_simRDD.filter(lambda x: x[0] == movie).flatMap(lambda x: x[1])
# Bum = baseLine(user, movie)
# predict = user_movie_rate.join(movie_movie_sim)\
# .map(lambda x: (Bum, (x[1][1]*(x[1][0]-baseLine(user, x[0])), x[1][1])))\
# .reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1])).map(lambda x: ((user, movie), x[0] + x[1][0]/x[1][1]))
# predictions.append(predict.collect())

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/gregcattell/spark-2.4.0-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1152, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/anaconda3/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [Errno 54] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/gregcattell/spark-2.4.0-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/Users/gregcattell/spark-2.4.0-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving


Py4JError: An error occurred while calling None.org.apache.spark.api.python.PythonFunction

In [None]:
predictions.take(10)

In [77]:
test = movie_simRDD.lookup(movie)

[[(3072, -0.0026872353830549845),
  (2948, 0.0008032733421044164),
  (51540, 0.0020408537533611396),
  (2840, -0.006139166069524992),
  (216, 0.002511763485657834),
  (908, -0.008060039118425445),
  (4000, -0.0),
  (728, 0.00870056575089157),
  (2540, 0.002312758555563365),
  (4728, -0.004136710768179149),
  (4744, 0.005270961483465851),
  (68, 0.0005104061541693555),
  (116668, 0.0),
  (7348, 0.0),
  (7036, -0.0011719268736471897),
  (7820, -0.00021694705791541576),
  (7028, -0.00026367094096858926),
  (6204, -0.0),
  (7748, 0.00011427794975368393),
  (5780, -0.0),
  (4084, -0.002008809082330271),
  (608, -0.0005064892499728672),
  (2412, -0.0017898576994629812),
  (196, 0.005418000474218096),
  (236, -0.0003326910973291352),
  (6708, 0.000713260871419645),
  (4388, 0.0030485263560449376),
  (32, 0.004814083885370248),
  (85780, -0.0019852769886129448),
  (1180, -0.0),
  (2720, -0.0007979051778267757),
  (1468, 0.023600591272495353),
  (1480, 0.0035231227904004405),
  (5500, 0.0010999

### get pairs for movies denominator

In [18]:
movie_module = training_RDD.map(lambda x: (int(x[1]), float(x[2])**2)).reduceByKey(lambda x, y: x+y)\
.map(lambda x: (x[0], np.sqrt(x[1])))

In [19]:
movie_mode_cart = movie_module.cartesian(movie_module).map(lambda x: ((x[0][0], x[1][0]), x[0][1]*x[1][0]))

In [24]:
cosine_frac = movie_numerator.join(movie_mode_cart).cache()#.map(lambda x: (x[0], x[1][0]/x[1[1]]))

In [None]:
cosine_frac.take(10)

In [None]:
recom = cosine_dist.map(lambda x: (x[0], x[1][0]/x[1[1]])).map(lambda x: (x[1], x[0])).sortByKey()

In [None]:
recom.take(10)

In [12]:
movie_vectors = normalized_ratings.groupByKey().map(lambda vector: (int(vector[0]), list(vector[1]))).sortByKey()
movieVecList = movie_vectors.collect()
#.map(lambda vector: vector[1]).collect()

In [44]:
#movieVecList[0:10]
users = training_RDD.map(lambda tokens: tokens[0]).distinct().count()
movies = small_movies_data.count()
simMatrix = np.zeros((movies, users))
for vec in movieVecList:
    for pair in vec[1]:
        simMatrix[vec[0]-1, pair[0]-1] = pair[1]

simMatrix[0:5]

IndexError: index 25749 is out of bounds for axis 0 with size 9742

In [13]:
movie_arr = movie_vectors.map(lambda vector: vector[0]).collect()
movie_set = set(movie_arr)

### Step.3 get item-item similarity matrix

In [176]:
# def CosineSimilarity(movie_1, movie_2):
#     if movie_1 not in movie_set or movie_2 not in movie_set:
#         return 0.0;
#     vector_1 = movie_vectors.filter(lambda movie: movie[0]==movie_1).flatMap(lambda x: x[1])
#     vector_2 = movie_vectors.filter(lambda movie: movie[0]==movie_2).flatMap(lambda x: x[1])
#     cosine = vector_1.join(vector_2).map(lambda x: x[1][0] * x[1][1]).sum()
#     M_vector_1 = vector_1.map(lambda vector: vector[1]**2).sum()
#     M_vector_2 = vector_2.map(lambda vector: vector[1]**2).sum()
#     cosine_dist = cosine/np.sqrt(M_vector_1 * M_vector_2)
#     return cosine_dist

In [20]:
def CosineSimilarity(movie_1, movie_2):
    if movie_1 == movie_2:
        return 1.0;
    if movie_1 not in movie_set or movie_2 not in movie_set:
        return 0.0;
    vector_1 = {x[0]:x[1] for x in movieVecList[int(movie_1)-1]}
    vector_2 = {x[0]:x[1] for x in movieVecList[int(movie_2)-1]}
    numerator = 0
    for x in vector_1:
        if x in vector_2:
            numerator+=vector_1[x]*vector_2[x] 
    m_1 = [x**2 for x in vector_1.values()]
    m_2 = [x**2 for x in vector_2.values()]
    cosine_dist = numerator / np.sqrt(sum(m_1)*sum(m_2))
    return cosine_dist

In [21]:
CosineSimilarity('157', '1')

0.04893018393393601

In [29]:
movies = small_movies_data.map(lambda tokens: (0, tokens[0]))
simMatrix = movies.join(movies)#.take(10000)
# simTest = sc.parallelize(simMatrix)
# sim_test_rdd = simTest.map(lambda line: (line[1], CosineSimilarity(line[1][0], line[1][1])))
# sim_test_rdd.collect()

In [129]:
users = training_RDD.map(lambda tokens: tokens[0]).distinct().map(lambda user: (user, 0))
movies = small_movies_data.map(lambda tokens: (0.0, tokens[0]))

In [30]:
simMatrix.count()

94906564

In [109]:
pairs.take(10)

[(('1', '1'), 0),
 (('1', '2'), 0),
 (('1', '3'), 0),
 (('1', '4'), 0),
 (('1', '5'), 0),
 (('1', '6'), 0),
 (('1', '7'), 0),
 (('1', '8'), 0),
 (('1', '9'), 0),
 (('1', '10'), 0)]

In [136]:
normalized_ratings.take(10)

[('50', ('1', 0.7601156069364166)),
 ('50', ('5', -0.23988439306358345)),
 ('50', ('6', -3.2398843930635834)),
 ('50', ('7', 0.26011560693641655)),
 ('50', ('8', 0.7601156069364166)),
 ('50', ('16', -0.23988439306358345)),
 ('50', ('18', 0.7601156069364166)),
 ('50', ('24', -0.23988439306358345)),
 ('50', ('28', -0.7398843930635834)),
 ('50', ('29', -0.7398843930635834))]

In [113]:
movie_ratings.take(10)

[('1', (4.0, 1)),
 ('3', (4.0, 1)),
 ('6', (4.0, 1)),
 ('47', (5.0, 1)),
 ('50', (5.0, 1)),
 ('157', (5.0, 1)),
 ('163', (5.0, 1)),
 ('223', (3.0, 1)),
 ('231', (5.0, 1)),
 ('235', (4.0, 1))]