In [83]:
from pyspark import SparkConf, SparkContext
import pandas as pd
import numpy as np
from math import sqrt

In [2]:
# Build the Spark configuration and hand it to the context; previously `conf`
# was created but never used. If a SparkContext is already running,
# getOrCreate() returns that one.
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Load the two input files under ml-100k/ as RDDs of text lines.
# u.data: one rating per line, tab-separated (the code below reads field 0 as
#         the user id, field 1 as the movie id and field 2 as the rating).
# u.item: one movie per line, pipe-separated (see the sample printed below).
rating_data = sc.textFile("ml-100k/u.data")
movie_data = sc.textFile("ml-100k/u.item")

In [4]:
# Peek at 5 random movie records (without replacement). A fixed seed keeps the
# displayed sample stable across re-runs of the notebook.
movie_data.takeSample(False, 5, seed=42)

['1062|Four Days in September (1997)|23-Jan-1998||http://us.imdb.com/M/title-exact?imdb-title-119815|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0',
 '110|Operation Dumbo Drop (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Operation%20Dumbo%20Drop%20(1995)|0|1|1|0|0|1|0|0|0|0|0|0|0|0|0|0|0|1|0',
 '1457|Love Is All There Is (1996)|11-Oct-1996||http://us.imdb.com/M/title-exact?Love%20Is%20All%20There%20Is%20(1996)|0|0|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0',
 '730|Queen Margot (Reine Margot, La) (1994)|01-Jan-1996||http://us.imdb.com/Title?Reine+Margot,+La+(1994)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|1|0|0|0|0',
 '81|Hudsucker Proxy, The (1994)|01-Jan-1994||http://us.imdb.com/M/title-exact?Hudsucker%20Proxy,%20The%20(1994)|0|0|0|0|0|1|0|0|0|0|0|0|0|0|1|0|0|0|0']

In [96]:
# Peek at 10 random rating records (without replacement). A fixed seed keeps
# the displayed sample stable across re-runs of the notebook.
rating_data.takeSample(False, 10, seed=42)

['640\t55\t5\t874777765',
 '264\t430\t5\t886123531',
 '255\t219\t5\t883216544',
 '268\t21\t3\t875742822',
 '275\t448\t3\t880314383',
 '132\t521\t4\t891278996',
 '130\t815\t3\t874953866',
 '545\t720\t3\t883115664',
 '864\t182\t3\t888886913',
 '664\t481\t5\t878091912']

In [6]:
# Keep only the second tab-separated field (the movie id, as a string) of
# every rating record.
movie_ids = rating_data.map(lambda line: line.split('\t')[1])

In [7]:
# Count how many ratings each movie received: emit (movie_id, 1) for every
# rating, sum the ones per movie id, and bring the result to the driver as a
# list of (movie_id, count) tuples.
movie_count = (
    movie_ids
    .map(lambda movie_id: (movie_id, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

In [8]:
# One entry per distinct movie id, so this is the number of movies that have
# at least one rating.
len(movie_count)

1682

In [13]:
def extract_user_rating(line):
    """Convert one tab-separated rating record into a pair keyed by user id.

    Args:
        line: A rating record whose first three tab-separated fields are the
            user id, the movie id and the rating.

    Returns:
        ``(user_id, (movie_id, rating))`` with every value left as a string.

    Raises:
        IndexError: If the record has fewer than three fields.
    """
    fields = line.split('\t')
    user_id = fields[0]
    movie_and_rating = (fields[1], fields[2])
    return (user_id, movie_and_rating)

In [14]:
def filter_rating_by_movie_id(movie_id):
    """Select the ratings of one movie from the global ``rating_data`` RDD.

    Args:
        movie_id: Movie id to keep. It is compared with ``==`` against the
            second tab-separated field of each record, which is a string.

    Returns:
        An RDD of ``(user_id, (movie_id, rating))`` pairs, as produced by
        ``extract_user_rating``.
    """
    def is_for_movie(record):
        return record.split('\t')[1] == movie_id

    matching_records = rating_data.filter(is_for_movie)
    return matching_records.map(extract_user_rating)

In [108]:
# Ratings of the two movies to compare, keyed by user id.
# NOTE(review): the variable names do not match the ids used here -- `m995`
# holds movie '21' and `m223` holds movie '481'.
m995 = filter_rating_by_movie_id('21')
m223 = filter_rating_by_movie_id('481')

In [109]:
# Show every (user_id, (movie_id, rating)) pair for movie '481'.
m223.collect()

[('151', ('481', '3')),
 ('308', ('481', '4')),
 ('334', ('481', '5')),
 ('6', ('481', '5')),
 ('318', ('481', '4')),
 ('312', ('481', '5')),
 ('7', ('481', '5')),
 ('398', ('481', '3')),
 ('234', ('481', '5')),
 ('345', ('481', '3')),
 ('13', ('481', '3')),
 ('382', ('481', '5')),
 ('82', ('481', '5')),
 ('465', ('481', '4')),
 ('406', ('481', '3')),
 ('85', ('481', '4')),
 ('429', ('481', '3')),
 ('194', ('481', '3')),
 ('202', ('481', '1')),
 ('474', ('481', '4')),
 ('401', ('481', '3')),
 ('553', ('481', '3')),
 ('326', ('481', '1')),
 ('389', ('481', '5')),
 ('539', ('481', '4')),
 ('409', ('481', '3')),
 ('655', ('481', '2')),
 ('664', ('481', '5')),
 ('450', ('481', '5')),
 ('556', ('481', '5')),
 ('716', ('481', '4')),
 ('659', ('481', '5')),
 ('848', ('481', '3')),
 ('271', ('481', '3')),
 ('870', ('481', '4')),
 ('301', ('481', '4')),
 ('747', ('481', '5')),
 ('567', ('481', '5')),
 ('313', ('481', '4')),
 ('643', ('481', '4')),
 ('896', ('481', '4')),
 ('913', ('481', '3')),

In [110]:
# Join the two rating sets on their key (the user id). Only users present in
# both RDDs survive, giving (user_id, ((movie, rating), (movie, rating))).
join_rdd = m995.join(m223)
join_rdd.collect()

[('82', (('21', '1'), ('481', '5'))),
 ('429', (('21', '2'), ('481', '3'))),
 ('6', (('21', '3'), ('481', '5'))),
 ('308', (('21', '3'), ('481', '4'))),
 ('751', (('21', '5'), ('481', '4'))),
 ('870', (('21', '3'), ('481', '4'))),
 ('234', (('21', '3'), ('481', '5'))),
 ('747', (('21', '2'), ('481', '5'))),
 ('301', (('21', '2'), ('481', '4'))),
 ('13', (('21', '3'), ('481', '3'))),
 ('655', (('21', '2'), ('481', '2')))]

In [111]:
def part_cosine_sim(line):
    """Compute one user's contribution to the cosine-similarity sums.

    Args:
        line: A joined record ``(user_id, ((movie1, rating1), (movie2, rating2)))``
            whose ratings can be converted with ``int``.

    Returns:
        ``(rating1 * rating2, rating1 ** 2, rating2 ** 2)`` as integers.
    """
    rated_pair = line[1]
    first_rating = int(rated_pair[0][1])
    second_rating = int(rated_pair[1][1])

    product = first_rating * second_rating
    first_squared = first_rating * first_rating
    second_squared = second_rating * second_rating
    return (product, first_squared, second_squared)

In [112]:
# Per common user: (rating1 * rating2, rating1 ** 2, rating2 ** 2).
p_score = join_rdd.map(part_cosine_sim)
p_score.collect()

[(5, 1, 25),
 (6, 4, 9),
 (15, 9, 25),
 (12, 9, 16),
 (20, 25, 16),
 (12, 9, 16),
 (15, 9, 25),
 (10, 4, 25),
 (8, 4, 16),
 (9, 9, 9),
 (4, 4, 4)]

In [113]:
# Add the per-user triples component-wise to get the three sums the cosine
# formula needs: sum of products, sum of squares for each movie.
score_3_parts = p_score.reduce(
    lambda left, right: (
        left[0] + right[0],
        left[1] + right[1],
        left[2] + right[2],
    )
)
score_3_parts

(116, 87, 186)

In [114]:
# Cosine similarity: dot product divided by the product of the two vector
# norms (square roots of the sums of squared ratings).
score = score_3_parts[0] / (sqrt(score_3_parts[1]) * sqrt(score_3_parts[2]))
score

0.9118888192307179

In [118]:
def cosince_similarity(rating_data, movie_id1, movie_id2):
    """Cosine similarity of two movies over the users who rated both.

    The misspelled function name is kept so existing callers keep working.

    Args:
        rating_data: RDD of tab-separated rating records; field 0 is the user
            id, field 1 the movie id and field 2 a rating parseable by ``int``.
        movie_id1: Id of the first movie. Converted with ``str`` before it is
            matched, so both ``'21'`` and ``21`` work.
        movie_id2: Id of the second movie (same handling as ``movie_id1``).

    Returns:
        ``sum(r1 * r2) / (sqrt(sum(r1 ** 2)) * sqrt(sum(r2 ** 2)))`` taken over
        the users who rated both movies, as a float. Returns ``0.0`` when no
        user rated both movies (or when a norm is zero) instead of raising.
    """

    def _ratings_by_user(movie_id):
        # Movie ids in the records are text, so compare as strings.
        wanted = str(movie_id)
        return (
            rating_data
            .map(lambda line: line.split('\t'))  # split each record only once
            .filter(lambda fields: fields[1] == wanted)
            .map(lambda fields: (fields[0], fields[2]))
        )

    def _partial_sums(joined):
        # joined is (user_id, (rating_for_movie1, rating_for_movie2)).
        first = int(joined[1][0])
        second = int(joined[1][1])
        return (first * second, first * first, second * second)

    def _add_triples(left, right):
        return (left[0] + right[0], left[1] + right[1], left[2] + right[2])

    # The join on user id keeps only users who rated both movies.
    common = _ratings_by_user(movie_id1).join(_ratings_by_user(movie_id2))

    # fold() starts from a neutral element, so unlike reduce() it does not
    # raise when the two movies have no rater in common.
    dot, norm1_sq, norm2_sq = common.map(_partial_sums).fold((0, 0, 0), _add_triples)

    denominator = sqrt(norm1_sq) * sqrt(norm2_sq)
    if denominator == 0:
        return 0.0

    return dot / denominator

In [119]:
# Same pair of movies as in the step-by-step cells above.
cosince_similarity(rating_data, '21', '481')

(116, 87, 186)


0.9118888192307179