Set up pyspark and SparkSession

In [None]:
import pyspark
from pyspark.sql.types import IntegerType
from pyspark import SparkContext
import json

# Build (or reuse) a Spark session with master "local", registered under the
# application name "movies".
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")
    .appName("movies")
    .getOrCreate()
)

Load data

Calculate average for each user

In [3]:
# Load the ratings file. The sample output below shows user_id and movie_id as
# strings, so "rating" is cast to an integer explicitly before it is averaged.
df = spark.read.csv(path="/home/jovyan/movielens/ratings.csv", header=True)
df = df.withColumn("rating", df["rating"].cast(IntegerType()))

# One row per user holding the mean of that user's ratings; Spark names the
# aggregate column "avg(rating)".
averages = df \
    .groupBy("user_id") \
    .avg("rating") \
    .select("*")

averages.take(3)

[Row(user_id='296', avg(rating)=3.597938144329897),
 Row(user_id='467', avg(rating)=3.2028985507246377),
 Row(user_id='675', avg(rating)=3.3095238095238093)]

Pair each of a user's ratings with that user's average (the subtraction itself happens in the mapping step below)

In [4]:
# Attach each user's average to every one of that user's ratings, then collect
# the result onto the driver as a plain Python list of Row objects.
joined = (
    averages
    .join(df, on=df["user_id"] == averages["user_id"])
    .select(df["user_id"], "avg(rating)", "rating", "movie_id")
    .collect()
)

joined[:3]

[Row(user_id='1', avg(rating)=4.188679245283019, rating=5, movie_id='1193'),
 Row(user_id='1', avg(rating)=4.188679245283019, rating=3, movie_id='661'),
 Row(user_id='1', avg(rating)=4.188679245283019, rating=3, movie_id='914')]

Convert the collected rows to an RDD

In [5]:
# Obtain the active SparkContext (creating one if none exists); it is the entry
# point for the RDD API used in the cells below.
sc = SparkContext.getOrCreate()
# `joined` is the driver-side list produced by .collect() above; distribute it
# again as an RDD of Row objects.
rdd = sc.parallelize(joined)

Map the whole RDD to a Python dictionary for easy lookup

In [6]:
def seq_op(acc, obj):
    """Fold one rating record into the per-user lookup dictionary.

    Args:
        acc: dict keyed by user id; each value has the form
            {"average": ..., "ratings": {movie_id: {"rating": ..., "diff": ...}}}.
            It is updated in place.
        obj: mapping with the keys "user_id", "movie_id", "average",
            "rating" and "diff".

    Returns:
        The same ``acc`` object, so it can serve as the next accumulator.
    """
    # Read every field first so a record with a missing key raises KeyError
    # before the accumulator is touched.
    user_id, movie_id = obj["user_id"], obj["movie_id"]
    average, rating, diff = obj["average"], obj["rating"], obj["diff"]

    # The first record seen for a user fixes that user's "average"; later
    # records only add to (or overwrite entries in) the "ratings" mapping.
    entry = acc.setdefault(user_id, {"average": average, "ratings": {}})
    entry["ratings"][movie_id] = {"rating": rating, "diff": diff}
    return acc


def combOp(x, y):
    """Merge two partial per-user dictionaries produced by ``seq_op``.

    A plain ``{**x, **y}`` replaces a user's whole entry when the user occurs
    in both inputs, dropping the ratings collected in ``x``. Here the
    "ratings" mappings of such users are merged instead, so no rating is lost
    when one user's records are spread over several partial results.

    Args:
        x: dict of user id -> {"average": ..., "ratings": {...}}.
        y: dict of the same shape.

    Returns:
        A new dict with every user from both inputs. Neither input dict nor
        any "ratings" mapping inside them is modified. For a user present in
        both, the non-"ratings" fields are taken from ``x`` and a movie rated
        in both takes its value from ``y``.
    """
    merged = dict(x)
    for user_id, entry in y.items():
        existing = merged.get(user_id)
        if existing is None:
            merged[user_id] = entry
        else:
            merged[user_id] = {
                **existing,
                "ratings": {**existing["ratings"], **entry["ratings"]},
            }
    return merged


def _row_to_record(row):
    """Turn one joined row into the plain dict that ``seq_op`` consumes."""
    return {
        "user_id": row["user_id"],
        "rating": row["rating"],
        "movie_id": row["movie_id"],
        "average": row["avg(rating)"],
        # How far this rating lies from the user's own average.
        "diff": row["rating"] - row["avg(rating)"],
    }


# Fold all records into one driver-side dict keyed by user id.
mapped = rdd.map(_row_to_record).aggregate({}, seq_op, combOp)


print(json.dumps(mapped['1'], indent=4))



Define function for calculating cosine similarity

In [7]:
import numpy as np


def cosine_similarity(first_list, second_list):
    """Return the cosine similarity of two equal-length numeric sequences.

    Args:
        first_list: sequence of numbers.
        second_list: sequence of numbers with the same length.

    Returns:
        The similarity formatted as a string with 10 decimal places. When
        either vector is empty or has zero length (norm), the similarity is
        undefined; "0.0000000000" is returned in that case instead of
        dividing by zero and yielding "nan".

    Raises:
        ValueError: if the two sequences differ in length (raised by np.dot).
    """
    a = np.asarray(first_list, dtype=float)
    b = np.asarray(second_list, dtype=float)

    # Computed before the zero check so mismatched lengths always raise.
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)

    if norm_product == 0:
        similarity = 0.0
    else:
        similarity = dot_product / norm_product

    return "{0:.10f}".format(similarity)


Define the compare method that compares the user to every other user

In [8]:
def compare(user_id, mapped):
    """Compare one user with every other user in ``mapped``.

    For each other user, the "diff" values of the movies both users rated are
    gathered into two parallel lists and passed to ``cosine_similarity``.

    Args:
        user_id: key of the user in ``mapped``; must be convertible with int().
        mapped: dict of user id -> {"average": ..., "ratings": {movie_id:
            {"rating": ..., "diff": ...}}}.

    Returns:
        A set of ``(smaller_id, bigger_id, similarity)`` tuples, one per other
        user, with the two ids as ints in ascending order. The user is not
        paired with itself.

    Raises:
        KeyError: if ``user_id`` is not a key of ``mapped``.
    """
    user_movies = mapped[user_id]['ratings']
    # Convert once, outside the loop; ids are ordered numerically so that the
    # pair (a, b) gets the same key regardless of which user is being compared.
    user_id_int = int(user_id)

    comparison = set()
    for other_user_id, other_entry in mapped.items():
        if other_user_id == user_id:
            continue  # only *other* users are compared

        other_user_movies = other_entry['ratings']
        user = []
        other = []
        for movie_id in other_user_movies:
            if movie_id in user_movies:  # movie rated by both
                user.append(user_movies[movie_id]['diff'])
                other.append(other_user_movies[movie_id]['diff'])

        other_id_int = int(other_user_id)
        smaller_id = min(user_id_int, other_id_int)
        bigger_id = max(user_id_int, other_id_int)

        similarity = cosine_similarity(user, other)
        comparison.add((smaller_id, bigger_id, similarity))

    return comparison


Define a function for combining sets

In [9]:
def combine_sets(set1, set2):
    """Add every element of ``set2`` to ``set1`` in place and return ``set1``."""
    set1 |= set2  # in-place union: set1 is mutated, set2 is left untouched
    return set1

Perform the comparison

In [10]:
# Parallelizing a dict distributes its keys, i.e. the user ids.
user_ids = sc.parallelize(mapped)
# Each user id becomes the set of (smaller_id, bigger_id, similarity) tuples
# returned by compare().
per_user_sets = user_ids.map(lambda user_id: compare(user_id, mapped))
# Union all per-user sets into one set on the driver.
comparison = per_user_sets.aggregate(set(), combine_sets, combine_sets)



