Set up pyspark and SparkSession

In [58]:
import pyspark
from pyspark.sql.types import IntegerType
from pyspark import SparkContext
import json

# Build a SparkSession for this notebook, or reuse one that already exists
# (getOrCreate), using the "local" master and the application name "movies".
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")
    .appName("movies")
    .getOrCreate()
)

Load data

In [59]:
# Read the ratings CSV (its first row is the header) and keep only 1000 rows.
ratings_sample = spark.read.csv(path="/home/jovyan/movielens/ratings.csv", header=True).limit(1000)
# Cast "rating" to an integer column for the numeric work in the cells below.
df = ratings_sample.withColumn("rating", ratings_sample["rating"].cast(IntegerType()))

Calculate the average rating for each user

In [60]:
# One row per user with that user's mean rating; the aggregate column is
# named "avg(rating)", which later cells reference by that exact name.
# NOTE(review): select("*") looks like a no-op projection — confirm before removing.
averages = (
    df
    .groupBy("user_id")
    .avg("rating")
    .select("*")
)

# Peek at the first three per-user averages.
averages.take(3)

[Row(user_id='1', avg(rating)=4.188679245283019),
 Row(user_id='2', avg(rating)=3.7131782945736433),
 Row(user_id='3', avg(rating)=3.9019607843137254)]

Attach each user's average to each of that user's ratings, so the average can be subtracted from every rating

In [62]:
# Join on the shared column *name* instead of comparing df["user_id"] with
# averages["user_id"]. `averages` is derived from `df`, so both expressions
# refer to the same underlying column and the equality is an ambiguous
# self-join reference. Joining on the name is unambiguous and leaves a single
# "user_id" column in the result, so the select can use plain column names.
joined = (
    averages
    .join(df, on="user_id", how="inner")
    .select("user_id", "avg(rating)", "rating", "movie_id")
    .collect()  # materialize as a local list of Rows; later cells index and parallelize it
)

# Peek at the first three joined rows.
joined[:3]

[Row(user_id='1', avg(rating)=4.188679245283019, rating=5, movie_id='1193'),
 Row(user_id='1', avg(rating)=4.188679245283019, rating=3, movie_id='661'),
 Row(user_id='1', avg(rating)=4.188679245283019, rating=3, movie_id='914')]

Convert the collected rows to an RDD

In [63]:
# Get the active SparkContext, creating one only if none exists yet.
sc = SparkContext.getOrCreate()
# `joined` is the local list of Rows produced by .collect(); turn it into an RDD.
rdd = sc.parallelize(joined)

Map the whole RDD to a Python dictionary for easy lookup

In [89]:
def seq_op(acc, obj):
    """Fold one rating record into the per-user lookup dictionary.

    Args:
        acc: Accumulator dict keyed by user id. Each value has the form
            {"average": ..., "ratings": {movie_id: {"rating": ..., "diff": ...}}}.
            It is modified in place.
        obj: Mapping with the keys "user_id", "movie_id", "average",
            "rating" and "diff".

    Returns:
        The same ``acc`` object, now containing the record.
    """
    user_id = obj["user_id"]
    movie_id = obj["movie_id"]
    average = obj["average"]
    movie_entry = {"rating": obj["rating"], "diff": obj["diff"]}

    if user_id in acc:
        # Known user: only add/overwrite this movie; the stored average is kept.
        acc[user_id]["ratings"][movie_id] = movie_entry
    else:
        # First record for this user: create the entry with its first movie.
        acc[user_id] = {"average": average, "ratings": {movie_id: movie_entry}}
    return acc


def combOp(x, y):
    """Merge two partial per-user dictionaries produced by ``seq_op``.

    A plain ``{**x, **y}`` replaces a user's whole entry when the same user
    appears in both inputs, silently dropping every rating collected in ``x``
    for that user. That happens whenever one user's rows are spread over more
    than one partition. Here the nested "ratings" dicts are merged instead.

    Args:
        x: Partial result, ``{user_id: {"average": ..., "ratings": {...}}}``.
        y: Partial result with the same layout.

    Returns:
        A new dict containing every user from both inputs. For a user present
        in both, the top-level fields of ``y`` win (as before) and "ratings"
        holds the movies from both sides, with ``y`` winning per movie id.
        Neither input is modified.
    """
    merged = dict(x)
    for user_id, entry in y.items():
        if user_id in merged:
            previous = merged[user_id]
            merged[user_id] = {
                **previous,
                **entry,
                "ratings": {**previous["ratings"], **entry["ratings"]},
            }
        else:
            merged[user_id] = entry
    return merged


def _row_to_record(row):
    """Flatten one joined Row into a plain dict and add rating minus average as "diff"."""
    rating = row["rating"]
    average = row["avg(rating)"]
    return {
        "user_id": row["user_id"],
        "rating": rating,
        "movie_id": row["movie_id"],
        "average": average,
        "diff": rating - average,
    }


# Fold every record into one nested dict keyed by user id: seq_op adds a
# record to a partial dict, combOp merges two partial dicts.
mapped = (
    rdd
    .map(_row_to_record)
    .aggregate({}, seq_op, combOp)
)


# Show the entry for user '1' as a sanity check.
print(json.dumps(mapped['1'], indent=4))



{
    "average": 4.188679245283019,
    "ratings": {
        "1193": {
            "rating": 5,
            "diff": 0.8113207547169807
        },
        "661": {
            "rating": 3,
            "diff": -1.1886792452830193
        },
        "914": {
            "rating": 3,
            "diff": -1.1886792452830193
        },
        "3408": {
            "rating": 4,
            "diff": -0.18867924528301927
        },
        "2355": {
            "rating": 5,
            "diff": 0.8113207547169807
        },
        "1197": {
            "rating": 3,
            "diff": -1.1886792452830193
        },
        "1287": {
            "rating": 5,
            "diff": 0.8113207547169807
        },
        "2804": {
            "rating": 5,
            "diff": 0.8113207547169807
        },
        "594": {
            "rating": 4,
            "diff": -0.18867924528301927
        },
        "919": {
            "rating": 4,
            "diff": -0.18867924528301927
        },
        "595

Define function for calculating cosine similarity

In [146]:
import numpy as np


def cosine_similarity(first_list, second_list):
    """Return the cosine similarity of two equal-length numeric sequences.

    Args:
        first_list: Sequence of numbers.
        second_list: Sequence of numbers of the same length.

    Returns:
        The similarity formatted as a string with 10 decimal places.
        When either vector has zero length (including empty inputs, e.g. two
        users with no movie in common) the similarity is undefined; this
        returns "0.0000000000" instead of dividing by zero and yielding "nan".
    """
    a = np.asarray(first_list, dtype=float)
    b = np.asarray(second_list, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # No direction to compare: treat as "no similarity" rather than 0/0.
        return "{0:.10f}".format(0.0)

    similarity = np.dot(a, b) / denominator
    return "{0:.10f}".format(similarity)


Define the compare method that compares the user to every other user

In [147]:
def compare(user_id, mapped):
    """Compare one user against every user in ``mapped`` (including itself).

    Args:
        user_id: Key of the user in ``mapped``; must be convertible with int().
        mapped: Dict ``{user_id: {"ratings": {movie_id: {"diff": ...}}}}``.

    Returns:
        A set of ``(smaller_id, bigger_id, similarity)`` tuples, one per user
        in ``mapped``. The ids are ints, and ``similarity`` is whatever
        ``cosine_similarity`` returns for the "diff" values of the movies
        both users rated.
    """
    user_movies = mapped[user_id]['ratings']
    # Ids are compared as ints so that e.g. 10 sorts after 9.
    this_id = int(user_id)

    comparison = set()
    for other_key, other_entry in mapped.items():
        other_user_movies = other_entry['ratings']

        # Movies rated by both users, in the other user's iteration order.
        shared = [movie_id for movie_id in other_user_movies if movie_id in user_movies]
        user = [user_movies[movie_id]['diff'] for movie_id in shared]
        other = [other_user_movies[movie_id]['diff'] for movie_id in shared]

        other_id = int(other_key)
        similarity = cosine_similarity(user, other)
        comparison.add((min(this_id, other_id), max(this_id, other_id), similarity))

    return comparison


In [148]:
def combine_sets(set1, set2):
    """Add every element of ``set2`` to ``set1`` in place and return ``set1``."""
    for element in set2:
        set1.add(element)
    return set1

In [None]:
import operator

# Iterating the `mapped` dict yields its keys, so each RDD element is one
# user id. Each id is compared against all users, and the resulting sets are
# unioned into one.
all_pairs = (
    sc.parallelize(mapped)
    .map(lambda uid: compare(uid, mapped))
    .aggregate(set(), combine_sets, combine_sets)
)

# Order the tuples by (first id, second id).
stream = sorted(all_pairs, key=operator.itemgetter(0, 1))

for pair in stream:
    print(pair)



(1, 1, '1.0000000000')
(1, 2, '0.4064027716')
(2, 2, '1.0000000000')
(1, 3, '-0.2588836853')
(2, 3, '0.1890310596')
(3, 3, '1.0000000000')
(2, 4, '-0.1158950939')
(1, 4, '0.3128806539')
(3, 4, '0.3323344150')
(4, 4, '1.0000000000')
(4, 5, '-0.1254782857')
(1, 5, '-0.2058648870')
(2, 5, '-0.2313821493')
(3, 5, '-0.4232298737')
(5, 5, '1.0000000000')
(2, 6, '-0.1494087588')
(3, 6, '-0.1801803761')
(5, 6, '-0.6541019536')
(1, 6, '0.2594337896')
(6, 6, '1.0000000000')
(4, 6, '1.0000000000')
(4, 7, '-0.1589797008')
(5, 7, '0.0416290214')
(2, 7, '0.2096508756')
(6, 7, '0.2616426213')
(1, 7, '0.3344098452')
(3, 7, '0.7468360709')
(7, 7, '1.0000000000')
(2, 8, '-0.0098030927')
(3, 8, '-0.5555050916')
(6, 8, '-0.6949899073')
(1, 8, '0.1580612378')
(5, 8, '0.2081621193')
(7, 8, '0.3019626599')
(4, 8, '0.4562714916')
(8, 8, '1.0000000000')
(3, 9, '-0.2173912332')
(6, 9, '-0.2276675682')
(5, 9, '0.0650753693')
(1, 9, '0.3184318199')
(2, 9, '0.3580123072')
(4, 9, '0.3903137001')
(8, 9, '0.405380927