Collaborative filtering starts with a large group of people and finds a smaller set with tastes similar to yours.

In [6]:
#ratings that a handful of movie critics (plus Toby) gave to a small
#set of movies, keyed first by critic name and then by movie title
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

In [7]:
for critic in critics:
    print(critic)

Lisa Rose
Gene Seymour
Michael Phillips
Claudia Puig
Mick LaSalle
Jack Matthews
Toby


Simplest way to calculate similarity of critics is Euclidean distance.

Takes the items that people have ranked in common and "uses them as axes for a chart".

Can then plot critics on the chart and see how close they are to each other.

In [8]:
import math

#euclidean distance: square each per-axis difference, add them up,
#then take the square root of the total
score = math.sqrt(sum((a - b) ** 2 for a, b in ((5, 4), (4, 1))))

print(score)

3.1622776601683795


In [9]:
#raw distance is small for similar people, but we want a score that
#is large for similar people, so invert it
#the extra 1 in the denominator avoids division by 0 when the
#distance itself is 0

score = 1 / (1 + math.sqrt(sum((a - b) ** 2 for a, b in ((5, 4), (4, 1)))))

print(score)

#value should be between 0 and 1, where 1 means identical preferences

0.2402530733520421


In [34]:
import math

#returns distance similarity score for person 1 and person 2
def sim_distance(prefs, person1, person2):
    """Return a distance-based similarity score for two people.

    prefs maps each person to a dict of {item: rating}. Only items rated
    by both person1 and person2 are compared.

    Returns 0 when the two people share no rated items, otherwise
    1 / (1 + euclidean distance), so identical ratings give 1 and the
    score falls towards 0 as the ratings move apart.
    """
    ratings1 = prefs[person1]
    ratings2 = prefs[person2]

    #items both people have rated, in person1's order
    shared = [item for item in ratings1 if item in ratings2]

    #nothing in common, so nothing to compare
    if not shared:
        return 0

    #euclidean distance across the shared items
    squared_diffs = [(ratings1[item] - ratings2[item]) ** 2 for item in shared]
    distance = math.sqrt(sum(squared_diffs))

    #invert so closer people score higher; the +1 avoids division by 0
    return 1 / (1 + distance)

In [35]:
sim_distance(critics, 'Lisa Rose', 'Gene Seymour')

0.29429805508554946

In [36]:
#another similarity metric is the Pearson correlation coefficient
#it is a measure of how well two sets of data fit on a straight line
#works better when data is un-normalised
#i.e. some critics routinely rank harsher on average

#you now have a critic on the x and one on the y axis
#plot the films, if perfect match there should be a straight line
#between all the films

#finds the items rated by both critics
#calculates sum and sum of squares for the two critics
#then calculates sum of products of their rating
#uses this to calc. the Pearson correlation coefficient

def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient for p1 and p2.

    prefs maps each person to a dict of {item: rating}. Only items rated
    by both p1 and p2 are used.

    Returns 0 when there are no items in common, or when either person's
    shared ratings have no spread (the correlation is undefined there).
    Otherwise returns a value where 1 means the ratings fall on a rising
    straight line and -1 on a falling one.
    """
    ratings1 = prefs[p1]
    ratings2 = prefs[p2]

    #get list of mutually rated items
    shared = [item for item in ratings1 if item in ratings2]

    #find the number of elements
    n = len(shared)

    #if no ratings in common, return 0
    if n == 0:
        return 0

    #add up all preferences
    sum1 = sum([ratings1[it] for it in shared])
    sum2 = sum([ratings2[it] for it in shared])

    #sum up the squares
    sos1 = sum([pow(ratings1[it], 2) for it in shared])
    sos2 = sum([pow(ratings2[it], 2) for it in shared])

    #sum the products
    sop = sum([ratings1[it] * ratings2[it] for it in shared])

    #calculate Pearson score
    num = sop - (sum1 * sum2 / n)
    spread1 = sos1 - pow(sum1, 2) / n
    spread2 = sos2 - pow(sum2, 2) / n

    #a spread of 0 means constant ratings; float rounding can also push
    #it slightly below 0, which would make math.sqrt raise ValueError,
    #so treat both cases as "no correlation"
    if spread1 <= 0 or spread2 <= 0:
        return 0

    den = math.sqrt(spread1 * spread2)
    if den == 0:
        return 0
    return num / den

In [37]:
sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

In [38]:
#can now write a function that scores everyone against a given person
#and finds their closest matches

#returns the best matches for person from the prefs dict
#number of results and similarity function are optional params
def top_matches(prefs, person, n=5, similarity=sim_pearson):
    """Return the n entries of prefs most similar to person.

    Every other key in prefs is scored against person with the given
    similarity function. The result is a list of (score, other) tuples,
    highest score first, cut to at most n entries.
    """
    scores = []
    for other in prefs:
        #don't compare person with themselves
        if other != person:
            scores.append((similarity(prefs, person, other), other))

    #ascending sort, then flip so the best matches come first
    return sorted(scores)[::-1][0:n]

In [39]:
top_matches(critics, 'Toby', n=3)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

In [40]:
#finding similar critics is cool, but we want actual 
#movie recommendations
#could look for similar critics and find movies they like that I haven't seen
#but:
#invasive???
#could turn up reviewers who haven't seen some movies i like
#could turn up a reviewer who liked a movie that got bad reviews from
#  all the other critics returned by top_matches

#need to score the items by producing a weighted score that ranks
#the critics
#take the votes of all the other critics and multiply the score each
#critic gave a movie by how similar that critic is to me

In [41]:
#for each critic, get similarity score
#for each movie, multiply similarity score by the score they gave it
#sum the total of the similarity*score
#divide that by the sum of the similarities of all critics
#this is your weighted score

#people similar to you give more weight to the overall score
#need to divide as don't want popularity/number of reviews to matter

In [86]:
#get recommendations for a person using weighted average
#of every other user's rankings
def get_recommendations(prefs, person, similarity=sim_pearson):
    """Recommend items for person from everyone else's weighted ratings.

    Each other entry in prefs is scored against person with similarity.
    Entries with a negative score are skipped. For every item that person
    has not rated (or has rated 0), the other ratings are averaged,
    weighted by similarity.

    Returns a list of (predicted rating, item) tuples, highest first.
    Items whose similarity weights add up to 0 are left out.
    """
    weighted_totals = {}
    weight_sums = {}

    for other, other_ratings in prefs.items():
        #don't compare me to myself
        if other == person:
            continue

        sim = similarity(prefs, person, other)

        #negative similarity is ignored; a similarity of exactly 0 adds
        #nothing and is filtered out when the averages are built
        if sim < 0:
            continue

        for item, rating in other_ratings.items():
            #skip anything I have already rated (a 0 counts as unrated)
            if item in prefs[person] and prefs[person][item] != 0:
                continue

            #running total of similarity * rating
            weighted_totals[item] = weighted_totals.get(item, 0) + rating * sim
            #running total of the similarities used for this item
            weight_sums[item] = weight_sums.get(item, 0) + sim

    #normalise each total by the weight that went into it
    rankings = []
    for item, total in weighted_totals.items():
        if weight_sums[item] != 0:
            rankings.append((total / weight_sums[item], item))

    #ascending sort, then flip so the best recommendations come first
    return sorted(rankings)[::-1]

In [87]:
get_recommendations(critics, 'Toby')

#you get a ranked list of movies but also a guess of what the rating
#will be for each film

[(3.3477895267131017, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

In [88]:
#now we know how to find similar people and recommend products
#but what if you want to see which products are similar
#good for when site hasn't got a lot of data on you
#think about product suggestions on Amazon

#can determine similarity by looking at who liked an item and
#seeing other things they liked
#this is the same method as finding similar people
#just need to swap people and items

#just need to swap dictionary from:
# {CRITIC: {FILM: SCORE, FILM: SCORE}}
#to
# {FILM: {CRITIC: SCORE, CRITIC: SCORE}}

def transform_prefs(prefs):
    """Swap the two levels of a nested ratings dictionary.

    Turns {person: {item: score}} into {item: {person: score}} and
    returns the new dictionary; prefs itself is left untouched.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for item, score in ratings.items():
            #file the score under the item first, then the person
            flipped.setdefault(item, {})[person] = score
    return flipped

In [89]:
movies = transform_prefs(critics)

In [90]:
top_matches(movies, 'Superman Returns')

#note: negative correlation means people who LIKE superman returns
#tend to DISLIKE the night listener and just my luck

[(0.6579516949597695, 'You, Me and Dupree'),
 (0.4879500364742689, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

In [91]:
#you can also recommend critics for a movie, 
#see who you'll invite to a premiere

get_recommendations(movies, 'Just My Luck')

[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]

In [92]:
#comparing rankings from every single customer is ok with small-med
#sized datasets, but very slow when you have millions of people

#so far we have used 'user-based collaborative filtering'
#an alternative is 'item-based collaborative filtering'
#item-based is better with larger datasets
#can do calculations in advance so faster

#general technique:
#precompute most similar items for each item
#when making recommendation, look at customer's top rated items
#create weighted list of items similar to those
#although you examine all data, comparisons between items will not 
#  change as often as comparisons between users
#therefore don't have to continuously calculate each item's most
#  similar items, can do once in a while

#need a function to build the complete dataset of similar items
#build dataset once and re-use each time you need it

def calculate_sim_items(prefs, n=10):
    """Build a lookup of the most similar items for every item.

    prefs is a {person: {item: rating}} dictionary. It is inverted with
    transform_prefs, then each item is scored against the others with
    top_matches using sim_distance.

    Returns {item: [(similarity, other item), ...]} holding at most n
    matches per item. Prints a progress line after every 100 items.
    """
    #invert preference matrix to be item centric
    item_prefs = transform_prefs(prefs)
    total = len(item_prefs)

    similar_items = {}
    for count, item in enumerate(item_prefs, start=1):
        #status updates for large datasets
        if count % 100 == 0:
            print("%d / %d" % (count, total))

        #find most similar items to this one
        similar_items[item] = top_matches(
            item_prefs, item, n=n, similarity=sim_distance
        )
    return similar_items

#gives items with how they were rated by each user
#loops over every item and passes transformed dictionary to top_matches
#  this gets most similar items along with similarity scores
#creates and returns a dictionary of items along with a list of
#  their most similar items

In [93]:
calculate_sim_items(critics)

#you only need to do this frequently enough to keep item similarities
#up-to-date, i.e. more frequently when user base/number of ratings is 
#small
#as they grow, similarity scores will become more stable

{'Just My Luck': [(0.3483314773547883, 'Lady in the Water'),
  (0.32037724101704074, 'You, Me and Dupree'),
  (0.2989350844248255, 'The Night Listener'),
  (0.2553967929896867, 'Snakes on a Plane'),
  (0.20799159651347807, 'Superman Returns')],
 'Lady in the Water': [(0.4494897427831781, 'You, Me and Dupree'),
  (0.38742588672279304, 'The Night Listener'),
  (0.3483314773547883, 'Snakes on a Plane'),
  (0.3483314773547883, 'Just My Luck'),
  (0.2402530733520421, 'Superman Returns')],
 'Snakes on a Plane': [(0.3483314773547883, 'Lady in the Water'),
  (0.32037724101704074, 'The Night Listener'),
  (0.3090169943749474, 'Superman Returns'),
  (0.2553967929896867, 'Just My Luck'),
  (0.1886378647726465, 'You, Me and Dupree')],
 'Superman Returns': [(0.3090169943749474, 'Snakes on a Plane'),
  (0.252650308587072, 'The Night Listener'),
  (0.2402530733520421, 'Lady in the Water'),
  (0.20799159651347807, 'Just My Luck'),
  (0.1918253663634734, 'You, Me and Dupree')],
 'The Night Listener': [

In [94]:
#now to give recommendations using the item similarity dictionary
#without going through whole dataset
#get items user has ranked
#find similar items
#weight according to how similar they are
#item dictionary can be used to find the similarities

#for each movie you have seen
#  multiply similarity of unseen movie with that movie rating
#  sum the similarity*rating
#  normalise by dividing total of sum from above with sum of similarities

def get_recommended_items(prefs,item_match, user):
    """Recommend items for user from a precomputed item similarity table.

    prefs maps each user to {item: rating}. item_match maps each item to
    a list of (similarity, other item) tuples. Every item the user has
    rated must have an entry in item_match, otherwise KeyError is raised.

    For each item the user has not rated, the user's own ratings are
    averaged, weighted by how similar each rated item is to it.

    Returns a list of (predicted rating, item) tuples, highest first.
    Items whose similarities add up to 0 are left out, because no
    weighted average can be formed for them.
    """
    user_ratings = prefs[user]
    scores = {}
    total_sim = {}

    #loop over items rated by this user
    for (item, rating) in user_ratings.items():

        #loop over items similar to this one
        for (similarity, item2) in item_match[item]:

            #ignore if this user has already rated this item
            if item2 in user_ratings:
                continue

            #weighted sum of rating times similarity
            scores[item2] = scores.get(item2, 0) + similarity * rating

            #sum of all similarities
            total_sim[item2] = total_sim.get(item2, 0) + similarity

    #divide each total score by total weighting to get an average
    #skip items with a total weighting of 0 to avoid dividing by zero
    rankings = [
        (score / total_sim[item], item)
        for item, score in scores.items()
        if total_sim[item] != 0
    ]

    #return rankings from highest to lowest
    rankings.sort(reverse=True)
    return rankings

In [95]:
#this is very similar to get_recommendations
#but instead uses a pre-calculated item similarity list

item_sim = calculate_sim_items(critics)

get_recommended_items(critics, item_sim, 'Toby')

#the values differ from get_recommendations because item_sim is built
#by calculate_sim_items, which scores items with sim_distance and keeps
#only each item's top n matches from 'top_matches' (default n is 10)

[(3.1667425234070894, 'The Night Listener'),
 (2.9366294028444346, 'Just My Luck'),
 (2.868767392626467, 'Lady in the Water')]

In [96]:
#movielens data is the "older" 100k one
#http://files.grouplens.org/datasets/movielens/ml-100k.zip

#u.item contains list of movie IDs and titles
#u.data has data in format of:
# <user ID> <movie ID> <rating (1-5)> <timestamp>

# there are 1682 movies with 943 users
# each user has rated at least 20 movies

def load_movielens(path='data/movielens'):
    """Load MovieLens ratings into a {user: {title: rating}} dictionary.

    Reads two files from the directory path:
      u.item - '|' separated; the first two fields are movie ID and title
      u.data - tab separated: user ID, movie ID, rating, timestamp

    User IDs and titles are kept as strings and ratings are converted to
    float. Blank lines in either file are skipped.

    Raises KeyError if u.data refers to a movie ID missing from u.item,
    and ValueError if a u.data line does not have exactly four fields.
    """
    #get movie titles
    movies = {}
    #read as bytes and decode as latin-1; 'with' closes the file for us
    with open(path+'/u.item', 'rb') as item_file:
        for raw_line in item_file:
            line = raw_line.decode("latin-1")
            if not line.strip():
                continue
            (movie_id, title) = line.split('|')[0:2]
            movies[movie_id] = title

    #load data
    prefs = {}
    with open(path+'/u.data') as data_file:
        for line in data_file:
            if not line.strip():
                continue
            #the timestamp field is read but not used
            (user, movie_id, rating, _timestamp) = line.split('\t')
            prefs.setdefault(user,{})
            prefs[user][movies[movie_id]]=float(rating)
    return prefs

In [97]:
prefs = load_movielens()

In [98]:
prefs['87'] #ratings for user 87

{'2001: A Space Odyssey (1968)': 5.0,
 'Ace Ventura: Pet Detective (1994)': 4.0,
 'Addams Family Values (1993)': 2.0,
 'Addicted to Love (1997)': 4.0,
 'Adventures of Priscilla, Queen of the Desert, The (1994)': 3.0,
 'Adventures of Robin Hood, The (1938)': 5.0,
 'Air Force One (1997)': 3.0,
 'Air Up There, The (1994)': 3.0,
 'Alien (1979)': 4.0,
 'American President, The (1995)': 5.0,
 'Annie Hall (1977)': 4.0,
 'Apocalypse Now (1979)': 4.0,
 'Babe (1995)': 5.0,
 'Baby-Sitters Club, The (1995)': 2.0,
 'Back to the Future (1985)': 5.0,
 'Bad Boys (1995)': 4.0,
 'Bananas (1971)': 5.0,
 'Barcelona (1994)': 3.0,
 'Batman & Robin (1997)': 4.0,
 'Batman (1989)': 3.0,
 'Batman Returns (1992)': 3.0,
 'Big Green, The (1995)': 3.0,
 'Big Squeeze, The (1996)': 2.0,
 'Birdcage, The (1996)': 4.0,
 'Blade Runner (1982)': 4.0,
 'Blues Brothers, The (1980)': 5.0,
 'Boomerang (1992)': 3.0,
 'Boot, Das (1981)': 4.0,
 'Brady Bunch Movie, The (1995)': 2.0,
 'Braveheart (1995)': 4.0,
 'Bridge on the River

In [104]:
get_recommendations(prefs, '87')[:25] #top 25 recommendations for user 87

[(5.0, 'They Made Me a Criminal (1939)'),
 (5.0, 'Star Kid (1997)'),
 (5.0, 'Santa with Muscles (1996)'),
 (5.0, 'Saint of Fort Washington, The (1993)'),
 (5.0, 'Marlene Dietrich: Shadow and Light (1996) '),
 (5.0, 'Great Day in Harlem, A (1994)'),
 (5.0, 'Entertaining Angels: The Dorothy Day Story (1996)'),
 (5.0, 'Boys, Les (1997)'),
 (4.89884443128923, 'Legal Deceit (1997)'),
 (4.815019082242709, 'Letter From Death Row, A (1998)'),
 (4.7321082983941425, 'Hearts and Minds (1996)'),
 (4.696244466490867, 'Pather Panchali (1955)'),
 (4.652397061026758, 'Lamerica (1994)'),
 (4.538723693474813, 'Leading Man, The (1996)'),
 (4.535081339106105, 'Mrs. Dalloway (1997)'),
 (4.532337612572981, 'Innocents, The (1961)'),
 (4.527998574747076, 'Casablanca (1942)'),
 (4.510270149719864, 'Everest (1998)'),
 (4.493967755428438, 'Dangerous Beauty (1998)'),
 (4.485151301801341, 'Wallace & Gromit: The Best of Aardman Animation (1996)'),
 (4.463287461290221, 'Wrong Trousers, The (1993)'),
 (4.450979436941

In [105]:
#runs pretty fast on my laptop, but if it didn't then it would be 
#better to precalculate item similarity

item_sim = calculate_sim_items(prefs,n=50)

100 / 1664
200 / 1664
300 / 1664
400 / 1664
500 / 1664
600 / 1664
700 / 1664
800 / 1664
900 / 1664
1000 / 1664
1100 / 1664
1200 / 1664
1300 / 1664
1400 / 1664
1500 / 1664
1600 / 1664


In [107]:
get_recommended_items(prefs, item_sim, '87')[:25]

[(5.0, "What's Eating Gilbert Grape (1993)"),
 (5.0, 'Vertigo (1958)'),
 (5.0, 'Usual Suspects, The (1995)'),
 (5.0, 'Toy Story (1995)'),
 (5.0, 'Titanic (1997)'),
 (5.0, 'Sword in the Stone, The (1963)'),
 (5.0, 'Stand by Me (1986)'),
 (5.0, 'Sling Blade (1996)'),
 (5.0, 'Silence of the Lambs, The (1991)'),
 (5.0, 'Shining, The (1980)'),
 (5.0, 'Shine (1996)'),
 (5.0, 'Sense and Sensibility (1995)'),
 (5.0, 'Scream (1996)'),
 (5.0, 'Rumble in the Bronx (1995)'),
 (5.0, 'Rock, The (1996)'),
 (5.0, 'Robin Hood: Prince of Thieves (1991)'),
 (5.0, 'Reservoir Dogs (1992)'),
 (5.0, 'Police Story 4: Project S (Chao ji ji hua) (1993)'),
 (5.0, 'House of the Spirits, The (1993)'),
 (5.0, 'Fresh (1994)'),
 (5.0, 'Denise Calls Up (1995)'),
 (5.0, 'Day the Sun Turned Cold, The (Tianguo niezi) (1994)'),
 (5.0, 'Before the Rain (Pred dozhdot) (1994)'),
 (5.0, 'Assignment, The (1997)'),
 (5.0, '1-900 (1994)')]