In [1]:
import pyspark
from pyspark.sql.types import IntegerType
from pyspark import SparkContext

spark = pyspark.sql.SparkSession.builder \
    .master("local") \
    .appName("movies") \
    .getOrCreate()

In [13]:
df = spark.read.csv(path="/home/jovyan/movielens/ratings.csv", header=True)
df = df.withColumn("rating", df["rating"].cast(IntegerType()))
df = df.withColumn("user_id", df["user_id"].cast(IntegerType()))
df = df.withColumn("movie_id", df["movie_id"].cast(IntegerType()))


In [14]:
averages = df \
    .groupBy("user_id") \
    .avg("rating") \
    .select("*")

In [15]:
subtracted = averages \
    .join(df, df["user_id"] == averages["user_id"]) \
    .select(df["user_id"], "avg(rating)", "rating", "movie_id") \
    .collect()

In [34]:
sc = SparkContext.getOrCreate()
rdd = sc.parallelize(subtracted)



In [35]:
mapped = rdd.map(lambda row: {"user_id": row["user_id"], 
                              "rating": row["rating"],
                              "movie_id": row["movie_id"],
                              # "average": row["avg(rating)"],
                              "diff": row["rating"] - row["avg(rating)"]
                              })

In [36]:
def common_movie_ids(user_id1, user_id2, rows):
    movies_1 = rows.filter(lambda row: row["user_id"] == user_id1).map(lambda row: row["movie_id"]).collect()
    movies_2 = rows.filter(lambda row: row["user_id"] == user_id2).map(lambda row: row["movie_id"]).collect()
    return set(movies_1).intersection(movies_2)


In [59]:
user1_id = 1
user2_id = 2

movie_ids = common_movie_ids(user1_id, user2_id, mapped)
print(movie_ids)

{3105, 1193, 1962, 2028, 2321, 1207, 1246}


In [60]:
user1_rating_vector = mapped\
    .filter(lambda row: row["movie_id"] in movie_ids and row["user_id"] is user1_id)\
    .map(lambda row: row["diff"])\
    .collect()

user2_rating_vector = mapped\
    .filter(lambda row: row["movie_id"] in movie_ids and row["user_id"] is user2_id)\
    .map(lambda row: row["diff"])\
    .collect()

In [61]:
import numpy as np

a = np.array(user1_rating_vector)
b = np.array(user2_rating_vector)

dot_product = np.dot(a, b)

a_length = np.linalg.norm(a)
b_length = np.linalg.norm(b)

cosine_similarity = dot_product / (a_length * b_length)

print(cosine_similarity)


-0.264426718823
