# Task 1

In [2]:
sc
import re

Load the file `movie_titles_canonical.txt`

In [3]:
raw_mtc = sc.textFile("movie_titles_canonical.txt")

In [4]:
raw_nmt = sc.textFile("netflix_movie_titles.txt")

Set the file contents to lower case and split using comma. The movie title is then manipulated using the regex functions to all special characters including space is removed, while some common words are also removed. The year is added as a concatenation to the title of the movie as this created a better unique identifier for the movie. 

All the original data is kept as the value so that we can reference to it later on.

In [5]:
data_mtc = (raw_mtc
              .map(lambda x: x.split(","))
              .map(lambda x: (re.sub('\W+','',
                                     re.sub('the |of |is |a ', '', x[0].lower())) + x[1], [x[0], x[1]])))

In [6]:
data_nmt = (raw_nmt
              .map(lambda x: x.split(","))
              .map(lambda x: (re.sub('\W+','', 
                                     re.sub('the |of |is |a ', '', x[2].lower())) + x[1], [x[0], x[1], x[2]])))

The following is a representation of the transformed RDD.

In [7]:
data_mtc.take(5)

[('avatar2009', ['Avatar', '2009']),
 ('amélie2001', ['Amélie', '2001']),
 ('fullmetaljacket1987', ['Full Metal Jacket', '1987']),
 ('etextraterrestrial1982', ['E.T.: The Extra-Terrestrial', '1982']),
 ('independenceday1996', ['Independence Day', '1996'])]

In [8]:
data_nmt.take(5)

[('dinosaurplanet2003', ['1', '2003', 'Dinosaur Planet']),
 ('islemantt2004review2004', ['2', '2004', 'Isle of Man TT 2004 Review']),
 ('character1997', ['3', '1997', 'Character']),
 ('paulabdulsgetupdance1994', ['4', '1994', "Paula Abdul's Get Up & Dance"]),
 ('riseandfallecw2004', ['5', '2004', 'The Rise and Fall of ECW'])]

The two RDDs can be joined on the key that should be unique and is close to to the second file, so that more movies can be matched. Infact the reason the two files where manipulated in the same way so that the final key is unique and can be joined as it would be the same in both files.

In [9]:
data_joined_mtc_nmt = data_mtc.join(data_nmt)

In [62]:
data_joined_mtc_nmt.count()

3728

The final joined RDD is then mapped to get the final desired output which is `ID => TITLE`

In [10]:
data_final_mtc_nmt = data_joined_mtc_nmt.map(lambda x: (int(x[1][1][0]), x[1][0][0]))

Finally the RDD is transformed to a dictionary and broadcasted to all spark clusters.

In [11]:
dic_final_mtc_nmt = data_final_mtc_nmt.collectAsMap()

In [12]:
broadcast_mov = sc.broadcast(dic_final_mtc_nmt)

# Task 2

In [13]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [14]:
raw_mas = sc.textFile("mv_all_simple.txt");

In [15]:
raw_mas_1, raw_mas_2 = raw_mas.randomSplit(weights=[0.8, 0.2], seed=1)

In [15]:
ratings_mas = (raw_mas_1.map(lambda x: x.split(','))
    .map(lambda l: Rating(int(l[1]), int(l[0]), int(l[2]))))

In [16]:
rank = 10
numIterations = 5
regularization_parameter = 0.01
model = ALS.train(ratings_mas, rank, numIterations, lambda_=regularization_parameter)

In [17]:
testdata = ratings_mas.map(lambda p: (p[0], p[1]))

In [18]:
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))

## Task 3

In [19]:
user_preds = predictions.filter(lambda x: x[0][0] == 30878)

In [20]:
user_recommendations = user_preds.map(lambda x: (x[0][1], x[1]))

In [21]:
user_recommendations_movie_titles = user_recommendations.join(data_final_mtc_nmt)

In [22]:
user_top_recommendations_movie_titles = user_recommendations_movie_titles.map(lambda x: (x[0], x[1][1], x[1][0])).takeOrdered(10, key=lambda x: -x[2])

In [23]:
print ('TOP recommended movies:\n%s' %
        '\n'.join(map(str, user_top_recommendations_movie_titles)))

TOP recommended movies:
(5582, 'Star Wars: Episode V: The Empire Strikes Back', 4.316707005074456)
(14240, 'The Lord of the Rings: The Return of the King', 4.309599588582811)
(16265, 'Star Wars: Episode IV - A New Hope', 4.286788548463047)
(9628, 'Star Wars: Episode VI - Return of the Jedi', 4.253354490312434)
(14050, 'Willy Wonka & the Chocolate Factory', 4.249344217912746)
(2452, 'The Lord of the Rings: The Fellowship of the Ring', 4.2296427897135676)
(13673, 'Toy Story', 4.2227653792831745)
(12870, "Schindler's List", 4.211485619896104)
(12676, 'The Secret of NIMH', 4.201740547651723)
(7193, 'The Princess Bride', 4.199720069522107)


In [24]:
user_actual = ratings_mas.filter(lambda x: x[0] == 30878)

In [25]:
user_ratings = user_actual.map(lambda x: (x[1], x[2]))

In [26]:
user_ratings_movie_titles = user_ratings.join(data_final_mtc_nmt)

In [27]:
user_ratings_movie_titles = user_ratings_movie_titles.map(lambda x: (x[0], x[1][1], x[1][0])).takeOrdered(10, key=lambda x: -x[2])

In [28]:
print ('TOP movies:\n%s' %
        '\n'.join(map(str, user_ratings_movie_titles)))

TOP movies:
(2212, 'The Secret Garden', 5)
(4661, 'Deuce Bigalow: Male Gigolo', 5)
(1659, 'Grumpy Old Men', 5)
(7745, 'Apollo 13', 5)
(6797, 'The Breakfast Club', 5)
(2452, 'The Lord of the Rings: The Fellowship of the Ring', 5)
(15646, "Lemony Snicket's A Series of Unfortunate Events", 5)
(2771, 'Top Secret!', 5)
(13673, 'Toy Story', 5)
(9330, 'For a Few Dollars More', 5)


In [29]:
def toCSVLine(data):
  return ','.join(str(d) for d in data)

In [101]:
user_rec_mov_rdd = sc.parallelize(user_top_recommendations_movie_titles)
user_rec_mov_csv = user_rec_mov_rdd.map(toCSVLine)
user_rec_mov_csv.saveAsTextFile('user_rec_mov')

In [102]:
user_rat_mov_rdd = sc.parallelize(user_ratings_movie_titles)
user_rat_mov_csv = user_rat_mov_rdd.map(toCSVLine)
user_rat_mov_csv.saveAsTextFile('user_rat_mov')

## Task 4

In [16]:
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics

In [17]:
ratings_mas_2 = (raw_mas_2.map(lambda x: x.split(','))
    .map(lambda l: Rating(int(l[1]), int(l[0]), float(l[2]) - 2.5)))

In [18]:
rank = 10
numIterations = 10
regularization_parameter = 0.01
model_2 = ALS.train(ratings_mas_2, rank, numIterations, lambda_=regularization_parameter)

In [19]:
testdata_2 = ratings_mas_2.map(lambda p: (p.user, p.product))

In [20]:
predictions_2  = model_2.predictAll(testdata_2).map(lambda r: ((r.user, r.product), r.rating))

In [20]:
ratings_tuple = ratings_mas_2.map(lambda r: ((r.user, r.product), r.rating))

In [21]:
score_labels = predictions_2.join(ratings_tuple).map(lambda tup: tup[1])

In [22]:
metrics = RegressionMetrics(score_labels)

In [23]:
print("RMSE = %s" % metrics.rootMeanSquaredError)

RMSE = 0.7255143677238088


## Task 5

In [38]:
raw_qs = sc.textFile("qualifying_simple.txt")

In [39]:
data_qs = raw_qs.map(lambda x: x.split(',')).map(lambda x: (x[1], x[0]))

In [40]:
data_qs.take(5)

[('1046323', '1'),
 ('1080030', '1'),
 ('1830096', '1'),
 ('368059', '1'),
 ('802003', '1')]

In [29]:
data_qs_csv = data_qs.map(toCSVLine)
data_qs_csv.saveAsTextFile('data_qs')

## Task6

In [21]:
from neo4j.v1 import GraphDatabase, basic_auth
import pandas as pd

In [63]:
sample_predictions_1, sample_predictions_2 = predictions_2.randomSplit(weights=[0.05, 0.95], seed=10)

In [64]:
sample_predictions_1 = sample_predictions_1.map(lambda x: (x[0][1], (x[0][0], x[1])))

In [65]:
sample_predictions_1.take(5)

[(14358, (1393975, 1.342284448695834)),
 (10921, (1393975, 0.06382050785608284)),
 (4520, (1393975, 1.0203518089436434)),
 (7230, (1393975, 2.432663241116531)),
 (14233, (1393975, 0.9258170484198738))]

In [66]:
data_final_mtc_nmt.take(5)

[(11863, 'D3: The mighty Ducks'),
 (482, 'Frida'),
 (15885, 'Alien Nation'),
 (17116, 'Stickmen'),
 (10550, 'Cold Mountain')]

In [67]:
joined_predictions = sample_predictions_1.join(data_final_mtc_nmt)

In [68]:
joined_predictions = joined_predictions.map(lambda x: (str(x[0]) + str(x[1][0][0]), (x[0], x[1][1], x[1][0][0], x[1][0][1])))

In [69]:
joined_predictions.first()

('4100772745', (4100, 'Dinosaur', 772745, 1.0551761940315265))

In [70]:
dict_joined_predictions = joined_predictions.collectAsMap()

In [72]:
dict_joined_predictions['4100772745']

(4100, 'Dinosaur', 772745, 1.0551761940315265)

In [92]:
driver = GraphDatabase.driver("bolt://localhost:7687", auth=basic_auth("neo4j", "1234"))

In [None]:
session = driver.session()
  
for key in dict_joined_predictions:
    raw_data = dict_joined_predictions[key]
    
    movie_result = session.run('MATCH (m:Movie { id: {movie_id}, title: {movie_title}}) RETURN m', {'movie_id': str(raw_data[0]), 'movie_title' : raw_data[1]})
    for movie in movie_result:
        user_result = session.run('MERGE (u:User { id: {user_id}}) RETURN u', {'user_id': str(raw_data[2])})
        for user in user_result:
            user_rated_result = session.run('MATCH (u:User { id: {user_id}}),(m:Movie { id: {movie_id}, title: {movie_title}}) MERGE (u)-[r:RATED{stars:{rating}}]->(m) RETURN r', {'user_id': str(raw_data[2]), 'movie_id': str(raw_data[0]), 'movie_title' : raw_data[1], 'rating' : raw_data[3]})
            for user_rating in user_rated_result:
                print("User: " + str(user['u']['id']) + " rated movie " + movie['m']['id'] + " " + movie['m']['title'] + " " + user_rated_result['r']['stars'])
        
session.close()

In [104]:
joined_predictions.count()

709847

In [59]:
len(dict_joined_predictions)

710926