In [1]:
from pyspark import SparkConf, SparkContext
import pandas as pd
import numpy as np
from math import sqrt

In [2]:
# Reuse the SparkContext that is already running in this kernel, or create one
# if none exists, so re-running this cell does not fail on a second context.
sc = SparkContext.getOrCreate()

In [3]:
# Load the raw input files as RDDs of text lines. The paths are relative to the
# working directory of the Spark driver.
rating_data = sc.textFile("ml-100k/u.data")
# NOTE(review): movie_data is not referenced by any of the cells visible here.
movie_data = sc.textFile("ml-100k/u.item")

In [26]:
def __extract_user_rating(line):
            data = line.split('\t')
            return (int(data[0]), (int(data[1]), float(data[2])))

In [28]:
# Turn every raw line into a (user, (movie, rating)) record keyed by user.
user_rating_lists = rating_data.map(__extract_user_rating)
# Peek at 5 records sampled without replacement. No seed is passed, so the
# sample differs between runs.
user_rating_lists.takeSample(False, 5)

[(634, (547, 4.0)),
 (804, (204, 4.0)),
 (514, (792, 4.0)),
 (296, (480, 5.0)),
 (622, (705, 3.0))]

In [29]:
# Self-join on the key: for each user this yields every ordered combination of
# two of that user's (movie, rating) records, including a record paired with
# itself. Records look like (user, ((movie1, rating1), (movie2, rating2))).
join_lists = user_rating_lists.join(user_rating_lists)
join_lists.takeSample(False, 5)

[(712, ((755, 4.0), (1503, 4.0))),
 (416, ((184, 4.0), (195, 5.0))),
 (727, ((125, 4.0), (210, 3.0))),
 (655, ((631, 4.0), (966, 3.0))),
 (524, ((715, 4.0), (660, 5.0)))]

In [30]:
def filter_movies(line):
    """Decide whether a self-joined record is a canonical movie pair.

    The self-join emits every ordered combination of a user's ratings, so each
    unordered pair appears twice and every rating is also paired with itself.
    Keeping only records whose first movie id is strictly smaller than the
    second removes both kinds of redundancy.

    Only the movie ids are compared. Comparing the whole ``(movie, rating)``
    tuples would fall back to comparing ratings when the ids are equal, letting
    a movie be paired with itself whenever a user has two different ratings
    for it.

    Args:
        line: A joined record ``(user, ((movie1, rating1), (movie2, rating2)))``.

    Returns:
        ``True`` if ``movie1 < movie2``, otherwise ``False``.
    """
    (movie1, _rating1), (movie2, _rating2) = line[1]

    return movie1 < movie2

In [34]:
# Keep only the combinations accepted by filter_movies (first entry ordered
# before the second). This drops records paired with themselves and the
# mirrored duplicate of every pair.
user_movies_list = join_lists.filter(filter_movies)
user_movies_list.takeSample(False, 5)

[(94, ((233, 3.0), (625, 4.0))),
 (406, ((22, 3.0), (131, 2.0))),
 (669, ((174, 3.0), (222, 3.0))),
 (13, ((204, 5.0), (353, 4.0))),
 (391, ((204, 3.0), (435, 5.0)))]

In [38]:
def makePairs(line):
    """Re-key a joined record by its movie pair.

    Args:
        line: A record ``(key, ((movie1, rating1), (movie2, rating2)))``. The
            leading key is discarded.

    Returns:
        ``((movie1, movie2), (rating1, rating2))``.
    """
    joined = line[1]
    movie_a, rating_a = joined[0]
    movie_b, rating_b = joined[1]

    pair_key = (movie_a, movie_b)
    pair_ratings = (rating_a, rating_b)
    return (pair_key, pair_ratings)

In [39]:
# Drop the user key and re-key each record by its movie pair:
# ((movie1, movie2), (rating1, rating2)).
rating_pairs = user_movies_list.map(makePairs)
rating_pairs.takeSample(False, 5)

[((198, 483), (5.0, 5.0)),
 ((208, 288), (5.0, 3.0)),
 ((38, 218), (1.0, 4.0)),
 ((79, 154), (4.0, 2.0)),
 ((39, 248), (3.0, 4.0))]

In [109]:
# Gather all (rating1, rating2) tuples that share the same (movie1, movie2) key.
# The grouped values are iterables, which is why the sample below prints
# ResultIterable objects rather than the ratings themselves.
moviePairRatings= rating_pairs.groupByKey()
moviePairRatings.takeSample(False, 5)

[((55, 177), <pyspark.resultiterable.ResultIterable at 0x7fb575843fd0>),
 ((627, 1145), <pyspark.resultiterable.ResultIterable at 0x7fb575858198>),
 ((342, 878), <pyspark.resultiterable.ResultIterable at 0x7fb575858278>),
 ((569, 1012), <pyspark.resultiterable.ResultIterable at 0x7fb5758584a8>),
 ((98, 384), <pyspark.resultiterable.ResultIterable at 0x7fb575843e48>)]

In [110]:
def computeCosineSimilarity(ratingPairs):
    """Compute the cosine similarity of two rating vectors given as pairs.

    Args:
        ratingPairs: An iterable of ``(ratingX, ratingY)`` tuples. It is
            consumed in a single pass, so a one-shot iterator is fine.

    Returns:
        ``(score, numPairs)`` where ``score`` is
        ``sum(x*y) / (sqrt(sum(x*x)) * sqrt(sum(y*y)))`` and ``numPairs`` is
        the number of tuples seen. ``score`` is ``0`` when the denominator is
        zero, which includes empty input.
    """
    pair_count = 0
    dot = norm_sq_x = norm_sq_y = 0
    for x, y in ratingPairs:
        pair_count += 1
        dot += x * y
        norm_sq_x += x * x
        norm_sq_y += y * y

    magnitude = sqrt(norm_sq_x) * sqrt(norm_sq_y)

    # Guard against division by zero (no pairs, or an all-zero vector).
    if not magnitude:
        return (0, pair_count)

    return (dot / float(magnitude), pair_count)

In [116]:
# Score every movie pair, producing ((movie1, movie2), (score, numPairs)).
# persist() keeps the computed RDD cached so later actions do not redo the
# join/group work.
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).persist()

In [119]:
# Inspect 5 scored pairs sampled without replacement. No seed is passed, so the
# sample varies between runs.
moviePairSimilarities.takeSample(False, 5)

[((577, 1219), (0.8267004114290564, 6)),
 ((25, 1226), (0.9502501268747329, 19)),
 ((583, 1009), (0.978021978021978, 5)),
 ((1005, 1118), (0.9591144340027616, 4)),
 ((32, 1305), (0.9970544855015815, 2))]

# Full pipeline as functions

In [120]:
def _extract_user_rating(line):
    data = line.split('\t')
    return (int(data[0]), (int(data[1]), float(data[2])))

def _filter_movies(line):
    movie1 = line[1][0]
    movie2 = line[1][1]
    
    return movie1 < movie2

def _makePairs(line):
    (movie1, rating1) = line[1][0]
    (movie2, rating2) = line[1][1]
    
    return ((movie1, movie2), (rating1, rating2))

def _computeCosineSimilarity(ratingPairs):
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1

    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)

    score = 0
    if (denominator):
        score = (numerator / (float(denominator)))

    return (score, numPairs)

# End-to-end pipeline: raw rating lines -> per-movie-pair cosine similarity.

# 1. Read the ratings file as an RDD of text lines.
rating_data = sc.textFile("ml-100k/u.data")
# 2. Parse each line into (user, (movie, rating)).
user_rating_lists = rating_data.map(_extract_user_rating)
# 3. Self-join on user: every ordered combination of two ratings by one user.
join_lists = user_rating_lists.join(user_rating_lists)

# 4. Keep one ordering per combination; this drops self-pairs and mirrored duplicates.
user_movies_list = join_lists.filter(_filter_movies)

# 5. Re-key by movie pair: ((movie1, movie2), (rating1, rating2)).
rating_pairs = user_movies_list.map(_makePairs)
# 6. Collect all rating tuples observed for each movie pair.
moviePairRatings= rating_pairs.groupByKey()
# 7. Score each pair as (score, numPairs). persist() caches the result so later
#    actions reuse it.
moviePairSimilarities = moviePairRatings.mapValues(_computeCosineSimilarity).persist()