In [3]:
import json
import numpy as np

In [7]:
def euclidean_score(dataset, user1, user2):
    if user1 not in dataset:
        raise TypeError('Cannot find ' + user1 + ' in the dataset')
    if user2 not in dataset:
        raise TypeError('Cannot find ' + user2 + ' in the dataset')
    
    # Movies rated by both user1 and user2
    common_movies = {}

    for item in dataset[user1]:
        if item in dataset[user2]:
            common_movies[item] = 1

    # If there are no common movies between the users
    # then the score is 0
    if len(common_movies) == 0:
        return 0
    
    squared_diff = []
    
    for item in dataset[user1]:
        if item in dataset[user2]:
            squared_diff.append(np.square(dataset[user1][item] - dataset[user2][item]))
    
    return 1 / (1 + np.sqrt(np.sum(squared_diff)))

def pearson_score(dataset, user1, user2):
    if user1 not in dataset:
        raise TypeError('Cannot find ' + user1 + ' in the dataset')
    if user2 not in dataset:
        raise TypeError('Cannot find ' + user2 + ' in the dataset')
    
    # Movies rated by both user1 and user2
    common_movies = {}
    
    for item in dataset[user1]:
        if item in dataset[user2]:
            common_movies[item] = 1
    
    num_ratings = len(common_movies)
    
    # If there are no common movies between user1 and user2, then the score is 0
    if num_ratings == 0:
        return 0
    
    # Calculate the sum of ratings of all the common movies
    user1_sum = np.sum([dataset[user1][item] for item in common_movies])
    user2_sum = np.sum([dataset[user2][item] for item in common_movies])
    
    # Calculate the sum of the squares of ratings of all the common movies
    user1_squared_sum = np.sum([dataset[user1][item] for item in common_movies])
    user2_squared_sum = np.sum([dataset[user2][item] for item in common_movies])
    
    # Calculate the sum of products of the ratings of the common movies
    sum_of_products = np.sum([dataset[user1][item] * dataset[user2][item] for item in common_movies])
    
    # Calculate the pearson correlation score
    sxy = sum_of_products - (user1_sum * user2_sum / num_ratings)
    sxx = user1_squared_sum - np.square(user1_sum) / num_ratings
    syy = user2_squared_sum - np.square(user2_sum) / num_ratings
    
    if sxx * syy == 0:
        return 0
    
    return sxy / np.sqrt(sxx * syy)

In [15]:
def find_similar_users(dataset, user, num_users):
    # Compute pearson score between input user
    # and all users in the dataset
    scores = np.array([[x, pearson_score(dataset, user, x)] for x in dataset if x != user])
    
    # Sort the scores in decreasing order
    scores_sorted = np.argsort(scores[:, 1])[::-1]
    
    # Extract the top 'num_users' scores
    top_users = scores_sorted[:num_users]
    return scores[top_users]

In [16]:
ratings_file = 'ratings.json'
user1 = 'David Smith'
user2 = 'Bill Duffy'
with open(ratings_file, 'r') as f:
    data = json.loads(f.read())
    
    print(euclidean_score(data, user1, user2))
    print(pearson_score(data, user1, user2))

    print(find_similar_users(data, user1, 3))

0.585786437627
0.243442127135
[['Chris Duncan' '0.433012701892']
 ['Bill Duffy' '0.243442127135']
 ['Adam Cohen' '0.232897126278']]
