In [1]:
from math import sqrt
import numpy as np
import os.path
import pandas as pd

In [2]:
# Returns a distance-based similarity score for p1 and p2
def sim_distance(prefs,p1,p2):
    # Get the list of shared_items
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item]=1
                # if they have no ratings in common, return 0
        if len(si)==0: return 0
    # Add up the squares of all the differences
    sum_of_squares=sum([pow(prefs[p1][item]-prefs[p2][item],2)
    for item in prefs[p1] if item in prefs[p2]])
    return 1/(1+sum_of_squares)

In [3]:
def sim_manhattan(prefs,p1,p2):
    # Get the list of shared_items
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item]=1
                # if they have no ratings in common, return 0
        if len(si)==0: return 0
    # Add up the squares of all the differences
    sum_of_abs_distance=sum([abs(prefs[p1][item]-prefs[p2][item])
    for item in prefs[p1] if item in prefs[p2]])
    return 1/(1+sum_of_abs_distance)


In [4]:
def sim_cosine(prefs,p1,p2):
    # Get the list of mutually rated items
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: si[item]=1
    # Find the number of elements
    n=len(si)
    # if they are no ratings in common, return 0
    if n==0: return 0
    # Sum up the squares
    sum1Sq=sum([pow(prefs[p1][it],2) for it in prefs[p1]])
    sum2Sq=sum([pow(prefs[p2][it],2) for it in prefs[p2]])
    # Sum up the products
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    # Calculate cosine similarity
    num=pSum
    den=sqrt(sum1Sq)*sqrt(sum2Sq)
    if pSum==0: return 0
    r=num/den
    return r

In [5]:
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs,p1,p2):
    # Get the list of mutually rated items
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: si[item]=1
    # Find the number of elements
    n=len(si)
    # if they are no ratings in common, return 0
    if n==0: return 0
    # Add up all the preferences
    sum1=sum([prefs[p1][it] for it in si])
    sum2=sum([prefs[p2][it] for it in si])
    # Sum up the squares
    sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq=sum([pow(prefs[p2][it],2) for it in si])
    # Sum up the products
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    # Calculate Pearson score
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: return 0
    r=num/den
    return r

In [6]:
# Returns the Pearson correlation coefficient for p1 and p2
def sim_tanimoto(prefs,p1,p2):
    # Get the list of mutually rated items
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: si[item]=1
    # Find the number of elements
    n=len(si)
    # if they are no ratings in common, return 0
    if n==0: return 0
    # Sum up the squares
    sum1Sq=sum([pow(prefs[p1][it],2) for it in prefs[p1]])
    sum2Sq=sum([pow(prefs[p2][it],2) for it in prefs[p2]])
    # Sum up the products
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    # Calculate Pearson score
    num=pSum
    den=sum1Sq + sum2Sq - pSum
    if den==0: return 0
    r=num/den
    return r

In [7]:
# Returns the best matches for person from the prefs dictionary.
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
    scores=[(similarity(prefs,person,other),other)
    for other in prefs if other!=person]
    # Sort the list so the highest scores appear at the top
    scores.sort( )
    scores.reverse( )
    return scores[0:n]

In [8]:
# Transform the preference from user based to item based & vice versa
def transformPrefs(prefs):
    result={}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item,{})
            # Flip item and person
            result[item][person]=prefs[person][item]
    return result

In [9]:
# Calculate similar users for a given user
def calculateSimilarUsers(prefs,n=10):
    # Create a dictionary of users showing which other users they
    # are most similar to.
    result={}
    c=0
    for user in prefs:
        # Status updates for large datasets
        c+=1
        if c%100==0: print("%d / %d" % (c,len(prefs)))
        # Find the most similar users to this one
        scores=topMatches(prefs,user,n=n,similarity=sim_pearson)
        result[user]=scores
    return result

In [10]:
# Calculate similar items for a given item
def calculateSimilarItems(prefs,n=10):
    # Create a dictionary of items showing which other items they
    # are most similar to.
    result={}
    # Invert the preference matrix to be item-centric
    itemPrefs=transformPrefs(prefs)
    c=0
    for item in itemPrefs:
        # Status updates for large datasets
        c+=1
        if c%1000==0: print("%d / %d" % (c,len(itemPrefs)))
        # Find the most similar items to this one
        scores=topMatches(itemPrefs,item,n=n,similarity=sim_pearson)
        result[item]=scores
    return result

In [11]:
def getRecommendedItems(prefs,itemMatch,user):
    userRatings=prefs[user]
    scores={}
    totalSim={}
    # Loop over items rated by this user
    for (item,rating) in userRatings.items( ):
        # Loop over items similar to this one
        for (similarity,item2) in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings: continue
            # Weighted sum of rating times similarity
            scores.setdefault(item2,0)
            scores[item2]+=similarity*rating
            # Sum of all the similarities
            totalSim.setdefault(item2,0)
            totalSim[item2]+=similarity
    # Divide each total score by total weighting to get an average
    rankings=[(score/totalSim[item],item) for item,score in scores.items( )]
    # Return the rankings from highest to lowest
    rankings.sort( )
    rankings.reverse( )
    return rankings


In [12]:
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_tanimoto):
    totals={}
    simSums={}
    for other in prefs:
        # don't compare me to myself
        if other==person: continue
        sim=similarity(prefs,person,other)
        # ignore scores of zero or lower
        if sim<=0: continue
        for item in prefs[other]:
            # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
                # Similarity * Score
                totals.setdefault(item,0)
                totals[item]+=prefs[other][item]*sim
                # Sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+=sim
    # Create the normalized list
    rankings=[(total/simSums[item],item) for item,total in totals.items( )]
    # Return the sorted list
    rankings.sort( )
    rankings.reverse( )
    return rankings

In [13]:
# Gets recommendations for a person by using a weighted average
# of top 5 similar user's rankings
def getFiveSimilarRecommendations(prefs,usersim,person,similarity=sim_pearson):
    totals={}
    simSums={}
    for (score,other) in dict(usersim[person]).items():
        for item in prefs[other]:
            # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
                # Similarity * Score
                totals.setdefault(item,0)
                totals[item]+=prefs[other][item]*score
                # Sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+=score
    # Create the normalized list
    rankings=[(total/simSums[item],item) for item,total in totals.items( )]
    # Return the sorted list
    rankings.sort( )
    rankings.reverse( )
    return rankings

In [14]:
def loadMovieLens(path='data'):
    # Get movie titles
    data1=pd.read_csv(os.path.join(*[path,'movies.csv']))
    data2=pd.read_csv(os.path.join(*[path,'ratings.csv']))
    movie_list=data1['title'].tolist()
    # Get movie titles
    movies = dict(zip(data1.movieId,data1.title))
    # Load data
    prefs={}
    for row in data2.itertuples():
        prefs.setdefault(row.userId,{})
        prefs[row.userId][movies[row.movieId]]=float(row.rating)
    return prefs,movie_list

In [15]:
prefs,movie_list=loadMovieLens()

my_pref={'Se7en': 5.0, 'Usual Suspects': 4.0,'Zodiac': 5.0,'Shutter Island': 5.0,'Momento':5.0,'Mystic River':4.0,'Identity':5.0,'Gone Girl':4.0,}

# Preprocess the user preferences
for key in list(my_pref): 
    out = [x for x in movie_list if key in x]
    if out:
        my_pref[out[0]]=my_pref[key]
    del my_pref[key]

In [None]:
# Content Based Recommendations

In [16]:
my_id=len(prefs)+1
prefs[my_id]=my_pref

## User User Colloborative Filtering 
# Get recommendations based on all the users
print(getRecommendations(prefs,my_id,similarity=sim_cosine)[0:9])

[(5.000000000000001, 'Zelary (2003)'), (5.000000000000001, 'To the Left of the Father (Lavoura Arcaica) (2001)'), (5.000000000000001, "Taste of Cherry (Ta'm e guilass) (1997)"), (5.000000000000001, 'Side by Side (2012)'), (5.000000000000001, 'Seventh Continent, The (Der siebente Kontinent) (1989)'), (5.000000000000001, "Pervert's Guide to Cinema, The (2006)"), (5.000000000000001, 'Neighbouring Sounds (O som ao redor) (2012)'), (5.000000000000001, 'Holy Motors (2012)'), (5.000000000000001, 'Hands in the Air (2010)')]


In [17]:
# User User Colloborative Filtering 
# Get recommendations based on user similarity(top 5 users)
usersim=calculateSimilarUsers(prefs,5)
print(getFiveSimilarRecommendations(prefs,usersim,my_id,similarity=sim_cosine)[0:9])

100 / 672
200 / 672
300 / 672
400 / 672
500 / 672
600 / 672
[(5.0, 'Wish Upon a Star (1996)'), (5.0, 'The Imitation Game (2014)'), (5.0, 'The Hunger Games (2012)'), (5.0, 'The Fault in Our Stars (2014)'), (5.0, 'The Artist (2011)'), (5.0, "Schindler's List (1993)"), (5.0, 'New World, The (2005)'), (5.0, 'Myth of the American Sleepover, The (2010)'), (5.0, 'It Follows (2014)')]


In [18]:
# Item Item Colloborative Filtering
# Get recommendations based on item similarity
itemsim=calculateSimilarItems(prefs,n=50)
print(getRecommendedItems(prefs,itemsim,my_id)[0:9])

1000 / 9064
2000 / 9064
3000 / 9064
4000 / 9064
5000 / 9064
6000 / 9064
7000 / 9064
8000 / 9064
9000 / 9064
[(5.0, '[REC] (2007)'), (5.0, 'Zathura (2005)'), (5.0, 'Youth in Revolt (2009)'), (5.0, 'Yours, Mine and Ours (1968)'), (5.0, 'You Kill Me (2007)'), (5.0, 'Year of the Horse (1997)'), (5.0, 'Year One (2009)'), (5.0, 'Wrong Turn (2003)'), (5.0, 'Wrong Guy, The (1997)')]
