[Building a Movie Recommendation Service with Apache Spark & Flask - Part 1](https://www.codementor.io/jadianes/building-a-recommender-with-apache-spark-python-example-app-part1-du1083qbw)  
[building-a-web-service-with-apache-spark-flask-example-app-part2](https://www.codementor.io/jadianes/building-a-web-service-with-apache-spark-flask-example-app-part2-du1083854)

In [None]:
import os

datasets_path = os.path.join('..', 'datasets')

complete_dataset_path = os.path.join(datasets_path, 'ml-latest.zip')
small_dataset_path = os.path.join(datasets_path, 'ml-latest-small.zip')

In [None]:
import urllib

small_f = urllib.urlretrieve (small_dataset_url, small_dataset_path)
complete_f = urllib.urlretrieve (complete_dataset_url, complete_dataset_path)b

In [None]:
import zipfile

with zipfile.ZipFile(small_dataset_path, "r") as z:
    z.extractall(datasets_path)

with zipfile.ZipFile(complete_dataset_path, "r") as z:
    z.extractall(datasets_path)

# Or

In [1]:
import os
import sys
from pyspark import SparkContext
from pyspark import SparkConf


conf = SparkConf()
conf.setAppName("spark-ntlk-env")

sc = SparkContext(conf=conf)



In [2]:
# from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
import os

# load in the data
small_ratings_raw_data = sc.textFile("./data/ml-latest-small/ratings.csv")
small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0]

- Now we can parse the raw data into a new RDD.

In [3]:
small_ratings_data = small_ratings_raw_data.filter(
    lambda line: line!=small_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1],tokens[2])).cache()

In [4]:
# (UserID, MovieID, Rating)
small_ratings_data.take(3)

[('1', '1', '4.0'), ('1', '3', '4.0'), ('1', '6', '4.0')]

In [5]:
# (MovieID, Title)
small_movies_raw_data = sc.textFile("./data/ml-latest-small/movies.csv")
small_movies_raw_data_header = small_movies_raw_data.take(1)[0]

small_movies_data = small_movies_raw_data.filter(lambda line: line!=small_movies_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1])).cache()
    
small_movies_data.take(3)

[('1', 'Toy Story (1995)'),
 ('2', 'Jumanji (1995)'),
 ('3', 'Grumpier Old Men (1995)')]

In [7]:
training_RDD, validation_RDD, test_RDD  = small_ratings_data.randomSplit([6,2,2])
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

In [10]:
from pyspark.mllib.recommendation import ALS
import math

seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    ## each row of result is: ((user_id, movie_id), (rating, prediction))
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print ('For rank {} the RMSE is {}'.format(rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print ('The best model was trained with rank %s' % best_rank)

For rank 4 the RMSE is 0.9026612551350627
For rank 8 the RMSE is 0.9117383425445459
For rank 12 the RMSE is 0.9113554954921999
The best model was trained with rank 4


In [11]:
predictions.take(3)

[((474, 3272), 3.208659536074121),
 ((414, 3272), 3.8004430477211533),
 ((187, 52328), 2.8183004496639414)]

In [12]:
rates_and_preds.take(3)

[((140, 4322), (4.0, 3.3464495336698277)),
 ((141, 367), (2.5, 3.0442845441241784)),
 ((272, 122922), (2.0, 3.982320280753517))]

![](https://annefou.github.io/pyspark/slides/images/RDD.png)