# Collaborative Filtering

In [2]:
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 4.0,
        'The Night Listener': 4.5,
        'You, Me and Dupree': 2.5
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 1.0
    }
}

In [5]:
# Print out the critics.
critics.keys()

dict_keys(['Lisa Rose', 'Gene Seymour', 'Michael Phillips', 'Claudia Puig', 'Mick LaSalle', 'Jack Matthews', 'Toby'])

In [12]:
# Print out the ratings by Lisa Rose.
critics['Lisa Rose']

{'Lady in the Water': 2.5,
 'Snakes on a Plane': 3.5,
 'Just My Luck': 3.0,
 'Superman Returns': 3.5,
 'You, Me and Dupree': 2.5,
 'The Night Listener': 3.0}

## Distance Calculation

In [13]:
# To calculate the euclidean distance.
from math import sqrt

sqrt(pow(5-4,2) + pow(4-1,2))

3.1622776601683795

In [14]:
# Using numpy.
import numpy as np
from numpy.linalg import norm as euclidean

euclidean(np.array([5,4]) - np.array([4,1]))

3.1622776601683795

In [36]:
# The smaller the distance between the two points, the more similar they are.
# To convert it into score (the higher the score, the more similar the users),
# we just need to take the inverse of it. We add 1 to the denominator to avoid
# zero-division.
# A value of 1 means that two users have identical preference.
1 / (1 + sqrt(pow(5-4,2) + pow(4-1,2)))

0.2402530733520421

## Euclidean distance

In [96]:
from math import sqrt


def intersect(a, b):
    """Takes two dict, a and b, and returns the keys that exists in both dict."""
    return a.keys() & b.keys()

def similarity_euclidean(prefs, person1, person2):
    # Find the common similarity between both users.
    similarity_set = intersect(prefs[person1], prefs[person2])
    
    # If they have no ratings in common, return 0.
    if len(similarity_set) == 0: return 0
    
    square_distance = lambda item: pow(prefs[person1][item] - prefs[person2][item], 2)
    
    sum_of_squares = sum([square_distance(item) for item in similarity_set])
    return 1 / (1 + sum_of_squares)

In [97]:
# The similarity between two users can then be calculated.
similarity_euclidean(critics, 'Lisa Rose', 'Gene Seymour')

0.14814814814814814

In [98]:
import unittest

class TestSimilarityDistance(unittest.TestCase):
    def test_intersect_one_key(self):
        a = {'a': 1}
        b = {'a': 2, 'b': 3}
        want = {'a'}
        got = intersect(a, b)
        self.assertEqual(want, got)

    def test_intersect_no_keys(self):
        a = {'a': 1}
        b = {'b': 2}
        want = set()
        got = intersect(a, b)
        self.assertEqual(want, got)
        
if __name__ == '__main__':
    unittest.main(argv=['hello world'], exit=False)

..
----------------------------------------------------------------------
Ran 2 tests in 0.013s

OK


## Pearson Correlation Score

In [74]:
from scipy.stats import pearsonr

def similarity_pearson(prefs, person1, person2):
    # Only compute ratings for similar preferences.
    similarity_set = intersect(prefs[person1], prefs[person2])    
    
    # If there are no ratings in common, return 0.
    if len(similarity_set) == 0: return 0
    
    # Get the ratings for both users.
    rate1 = []
    rate2 = []
    for item in similarity_set:
        rate1.append(prefs[person1][item])
        rate2.append(prefs[person2][item])
    
    # Compute the pearson score.
    return pearsonr(rate1, rate2)

In [76]:
pearson_score, p_value = similarity_pearson(critics, 'Lisa Rose', 'Gene Seymour')
pearson_score

0.39605901719066977

In [91]:
# The raw way of calculating pearson score, in case we need to implement it in another language without using any libraries.
def _pearson(prefs, person1, person2):
    # Get the list of mutually rated items.
    si = {}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item] = 1
    
    # Find the number of elements.
    n = len(si)
    
    # If there are no ratings in common, return 0.
    if n == 0: return 0
    
    rate1 = [prefs[person1][it] for it in si]
    rate2 = [prefs[person2][it] for it in si]
    
    # Add up all preferences.
    sum1 = sum(rate1)
    sum2 = sum(rate2)
    
    # Sum of all squares.
    sum1_square = sum([i * i for i in rate1])
    sum2_square = sum([i * i for i in rate2])
    
    # Sum of product.
    sum_product = sum([a * b for (a, b) in zip(rate1, rate2)])
    
    # Pearson score.
    num = sum_product - (sum1 * sum2/n)
    den = sqrt((sum1_square - sum1 * sum1/n) * (sum2_square - sum2 * sum2/n))
    if den == 0: return 0
    return num /den

In [92]:
_pearson(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

## Ranking the critics

To rank the critics relative to a user, we just need to calculate the similarity score of other users
and return the results in descending order.

In [99]:
def top_matches(prefs, person, n = 5, similarity = similarity_pearson):
    scores = [(similarity(prefs, person, other), other) for other in prefs if other != person]
    scores.sort(reverse=True)
    return scores[:n]

In [101]:
top_matches(critics, 'Toby')

[((0.9912407071619302, 0.08432321632194426), 'Lisa Rose'),
 ((0.9244734516419049, 0.24901011701138978), 'Mick LaSalle'),
 ((0.8934051474415642, 0.2966188313316005), 'Claudia Puig'),
 ((0.6628489803598703, 0.5386942679789539), 'Jack Matthews'),
 ((0.3812464258315117, 0.7509898829886107), 'Gene Seymour')]

In [103]:
top_matches(critics, 'Toby', n = 3, similarity = similarity_euclidean)

[(0.3076923076923077, 'Mick LaSalle'),
 (0.2857142857142857, 'Michael Phillips'),
 (0.23529411764705882, 'Claudia Puig')]

## Recommending items.

For each user (excluding yourself), find the similarity score first. Then, for each items they rate, multiply it with the similarity score. Take the sum of the scores divided by the total similarity.

In [113]:
def get_recommendations(prefs, person, similarity=similarity_pearson):
    totals = {}
    sim_sums = {}
    for other in prefs:
        # Don't compare to yourself.
        if other == person: continue
        sim, _ = similarity(prefs, person, other)
        
        # Ignore scores of zero or lower.
        if sim <= 0: continue
        for item in prefs[other]:
            # Only score movie I haven't seen yet.
            if item not in prefs[person] or prefs[person][item] == 0:
                # Similarity * score.
                totals.setdefault(item, 0)
                totals[item] += prefs[other][item] * sim
                
                # Sum of similarity.
                sim_sums.setdefault(item, 0)
                sim_sums[item] += sim

    # Create the normalized list.
    rankings = [(total/sim_sums[item], item) for item, total in totals.items()]
    rankings.sort(reverse=True)
    return rankings

In [115]:
# What Toby has watched and rated.
critics['Toby'].keys()

dict_keys(['Snakes on a Plane', 'Superman Returns', 'You, Me and Dupree'])

In [116]:
# What is recommended for Toby.
get_recommendations(critics, 'Toby')

[(3.347789526713101, 'The Night Listener'),
 (2.832549918264162, 'Lady in the Water'),
 (2.5309807037655645, 'Just My Luck')]