In [1]:
%%spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1057,application_1554300167658_0109,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
import csv
from itertools import permutations

import numpy as np
from pyspark import *
from pyspark.mllib.recommendation import ALS

In [3]:
# Read the ratings file as lines of text and remember its first line (the header).
small_ratings_raw_data = sc.textFile('550_finalPJ/ratings.csv')
small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0]

# Drop the header line, split each remaining line on commas and convert the
# first three fields into a (userID, movieID, rating) tuple.
small_ratings_data = (
    small_ratings_raw_data
    .filter(lambda row: row != small_ratings_raw_data_header)
    .map(lambda row: row.split(","))
    .map(lambda fields: (int(fields[0]), int(fields[1]), float(fields[2])))
)

In [4]:
# Read the movies file as lines of text and remember its first line (the header).
small_movies_raw_data = sc.textFile('550_finalPJ/movies.csv')
small_movies_raw_data_header = small_movies_raw_data.take(1)[0]


def _parse_movie_line(line):
    """Parse one movies.csv line into a (movieID, movieName) tuple.

    The line is decoded with the csv module instead of ``str.split(",")`` so
    that a title which is quoted because it contains commas stays in one
    field (and loses its surrounding quotes) instead of being cut at the
    first comma.
    """
    tokens = next(csv.reader([line]))
    return (int(tokens[0]), tokens[1])


# Drop the header line and parse every remaining line.
small_movies_data = (
    small_movies_raw_data
    .filter(lambda line: line != small_movies_raw_data_header)
    .map(_parse_movie_line)
)

#### Checking the data sets:
#### small_ratings_data = (userID, movieID, rating)
#### small_movies_data = (movieID, movieName)

In [5]:
# Preview the first ten parsed (userID, movieID, rating) tuples.
small_ratings_data.take(10)

[(1, 307, 3.5), (1, 481, 3.5), (1, 1091, 1.5), (1, 1257, 4.5), (1, 1449, 4.5), (1, 1590, 2.5), (1, 1591, 1.5), (1, 2134, 4.5), (1, 2478, 4.0), (1, 2840, 3.0)]

In [6]:
# Preview the first ten parsed (movieID, movieName) tuples.
small_movies_data.take(10)

[(1, 'Toy Story (1995)'), (2, 'Jumanji (1995)'), (3, 'Grumpier Old Men (1995)'), (4, 'Waiting to Exhale (1995)'), (5, 'Father of the Bride Part II (1995)'), (6, 'Heat (1995)'), (7, 'Sabrina (1995)'), (8, 'Tom and Huck (1995)'), (9, 'Sudden Death (1995)'), (10, 'GoldenEye (1995)')]

## Split data into training set and test set: 8:2

In [7]:
# Randomly partition the ratings with weights 8:2, using a fixed seed.
training_RDD, test_RDD = small_ratings_data.randomSplit(weights=[8, 2], seed=0)
# Keep only (userID, movieID) from each test rating; these pairs are what the
# model is later asked to predict.
test_user_unwatch = test_RDD.map(lambda rating: (rating[0], rating[1]))

#### Checking the data sets:
#### Number of rows in the training and test sets
#### Test samples for prediction: (userID, unWatchedID), i.e. the held-out (userID, movieID) pairs with the rating removed

In [8]:
# Count the rows of each split, then report both sizes and their ratio.
Total_train, Total_test = training_RDD.count(), test_RDD.count()
print("The total number of training dataset is", Total_train)
print("The total number of test dataset is", Total_test)
split_ratio = Total_train/Total_test
print("Rate of training and test:", split_ratio)

The total number of training dataset is 22205406
The total number of test dataset is 5548038
Rate of training and test: 4.0023889526351475

In [9]:
# Preview the first ten (userID, movieID) pairs taken from the test split.
test_user_unwatch.take(10)

[(1, 1590), (1, 1591), (1, 2134), (1, 2478), (1, 3020), (2, 849), (2, 1186), (2, 1244), (2, 1663), (2, 2707)]

In [10]:
# Three-way 6:2:2 split of the full ratings set; the train and validation
# parts feed the rank search.
# NOTE(review): this uses the same seed as the 8:2 split above — confirm that
# the validation rows never overlap test_RDD, otherwise rank selection leaks
# test data.
pre_train_RDD, pre_valid_RDD, pre_test_RDD = small_ratings_data.randomSplit(
    weights=[6, 2, 2], seed=0
)
# Reduce the held-out parts to (userID, movieID) pairs, the input shape
# passed to predictAll.
validation_for_predict_RDD = pre_valid_RDD.map(lambda rating: (rating[0], rating[1]))
test_for_predict_RDD = pre_test_RDD.map(lambda rating: (rating[0], rating[1]))

In [11]:
# ALS hyper-parameters shared by the rank search and the final model.
seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
# One RMSE slot per candidate rank (sized from `ranks` so that editing the
# candidate list cannot cause an IndexError).
errors = [0] * len(ranks)
err = 0
# `tolerance` and `best_iteration` are not read anywhere in this cell; they
# are kept so that any other cell relying on them still finds them defined.
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1

# The keyed validation ratings do not depend on the rank, so build them once.
valid_ratings_by_pair = pre_valid_RDD.map(lambda r: ((r[0], r[1]), r[2])).cache()

for rank in ranks:
    model = ALS.train(pre_train_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    # Both mean() actions below read this join; caching it avoids evaluating
    # the prediction + join lineage twice per rank.
    rates_and_preds = valid_ratings_by_pair.join(predictions).cache()
    MAE = rates_and_preds.map(lambda r: abs(r[1][0] - r[1][1])).mean()
    RMSE = np.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    rates_and_preds.unpersist()
    errors[err] = RMSE
    err += 1
    print('For rank', rank,'the MAE is: ', MAE,'the RMSE is: ', RMSE) 
    # Keep the rank with the lowest validation RMSE seen so far.
    if RMSE < min_error:
        min_error = RMSE
        best_rank = rank

print('The best model was trained with rank: ', best_rank)

KeyboardInterrupt: 

In [None]:
# `best_rank` keeps its -1 sentinel when the rank search has not completed a
# single rank (e.g. it was interrupted); stop here with a clear message
# instead of handing an invalid rank to ALS.
if best_rank <= 0:
    raise ValueError("best_rank is not set; run the rank-selection cell to completion first")

# Train the final model on the 8:2 training split with the selected rank.
model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations,
                  lambda_=regularization_parameter)
predictions = model.predictAll(test_user_unwatch).map(lambda r: ((r[0], r[1]), r[2]))
# Pair each true test rating with its prediction, keyed by (userID, movieID).
# Cached because both error metrics below read it.
rates_and_preds = test_RDD.map(lambda r: ((r[0], r[1]), r[2])).join(predictions).cache()
MAE = rates_and_preds.map(lambda r: abs(r[1][0] - r[1][1])).mean()
RMSE = np.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
rates_and_preds.unpersist()

print('For testing data the MAE is', MAE)
print('For testing data the RMSE is', RMSE)

In [None]:
def nDCG(test, prediction):
    """Return the DCG/IDCG ratio of the items of `test` found in `prediction`.

    `test` is walked in order. Every item that also occurs in `prediction`
    adds 1/log2(position + 1) to the DCG, where position is the item's
    1-based index in `test`, and adds the next ideal term 1/log2(k + 1) to
    the IDCG, where k is the number of hits so far.

    Returns 0 when no item of `test` occurs in `prediction` (including when
    either sequence is empty).

    NOTE(review): the discount comes from the position in `test`, not from
    the rank in `prediction` — confirm this is the intended nDCG variant.
    """
    dcg = 0
    idcg = 0
    hits = 0
    for position, item in enumerate(test):
        if item not in prediction:
            continue
        hits += 1
        dcg += 1/np.log2(position + 2)
        idcg += 1/np.log2(hits + 1)
    if idcg == 0:
        return 0
    return dcg/idcg

In [None]:
# Running totals over all evaluated users; the summary cell divides them by
# `iterations`.
precision = 0
recall = 0
comparison_ndcg = 0
# Number of users to evaluate (user IDs 1..iterations).
# NOTE: this rebinds the global `iterations` that the ALS training cells also
# read; re-running those cells after this one would train with 2000 iterations.
iterations = 2000

# Loop-invariant inputs, built once instead of once per user.
movie_set = small_movies_data.map(lambda x: x[0]).cache()
# NOTE(review): `testlist` was read in this loop but never defined anywhere in
# the notebook. It is reconstructed here as the user's held-out test movies,
# (userID, [movieID, ...]), which is the shape the join and flatMap below
# require — confirm this matches the original intent.
test_items_by_user = test_RDD.map(lambda r: (r[0], r[1]))\
    .groupByKey().map(lambda r: (r[0], list(r[1]))).cache()

for i in range(iterations):
    index = i+1
    # Movies the user rated in the training split, and the remaining
    # candidates as (userID, movieID) pairs.
    u_w_new = training_RDD.filter(lambda x: x[0] == index).map(lambda x: x[1])
    u_u_new = movie_set.subtract(u_w_new).map(lambda x: (index, x))

    # Ten candidates with the highest predicted rating: (userID, (movieID, rating)).
    u_prediction = model.predictAll(u_u_new).map(lambda r: ((r[0],r[1]), r[2]))\
    .map(lambda x: (x[0][0], (x[0][1], x[1]))).top(10, key = lambda x: x[1][1])

    # (userID, [recommended movieID, ...])
    Top_Ten = sc.parallelize(u_prediction).map(lambda x: (x[0], x[1][0]))\
    .groupByKey().map(lambda x: (x[0], list(x[1])))

    testlist = test_items_by_user.filter(lambda r: r[0] == index)
    true = testlist.map(lambda r: (r[0], set(r[1])))
    recom = Top_Ten.map(lambda r: (r[0], set(r[1])))

    # (precision, recall) for this user; empty when the user has no test
    # items or no recommendations.
    comparison_com = true.join(recom).map(lambda x: (len(x[1][0] & x[1][1]), len(x[1][1]), len(x[1][0])))\
    .map(lambda x: (x[0]/x[1], x[0]/x[2])).collect()

    if len(comparison_com)!=0:
        precision += comparison_com[0][0]
        recall += comparison_com[0][1]
        print("precision for", i+1, comparison_com[0][0])
        print("recall for", i+1, comparison_com[0][1])
    else:
        # Nothing to compare: the user adds 0 to both totals. Printing inside
        # the guard avoids the IndexError that comparison_com[0] would raise.
        print("precision for", i+1, 0)
        print("recall for", i+1, 0)

    test = testlist.flatMap(lambda r: r[1]).collect()
    predic = Top_Ten.flatMap(lambda r: r[1]).collect()

    comparison_ndcg +=nDCG(test, predic)

In [None]:
# Average the per-user totals over the number of evaluated users.
precision_mean = precision/iterations
recall_mean = recall/iterations
# F-measure is the harmonic mean of the *mean* precision and recall. Using the
# summed totals would inflate it by a factor of `iterations`. It is defined as
# 0 when both means are 0, to avoid dividing by zero.
F_measure = (2*precision_mean*recall_mean/(precision_mean+recall_mean)
             if (precision_mean+recall_mean) != 0 else 0)
ndcg_mean = comparison_ndcg/iterations