In [1]:
# Similarity Scores: Euclidean and Pearson correlation

#1 Euclidean distance based dist = sqrt(sum((x1 - x2)**2)

# A dictionary of movie critics and their ratings of a small
# set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 3.5},
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
'The Night Listener': 4.5, 'Superman Returns': 4.0,
'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 2.0},
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}


from math import sqrt
# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
# Get the list of shared_items
    si={}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item]=1
# if they have no ratings in common, return 0
    if len(si)==0: return 0
# Add up the squares of all the differences
    sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2)for item in prefs[person1] if item in prefs[person2]])
    return 1/(1+sum_of_squares)

print(sim_distance(critics, "Gene Seymour", "Lisa Rose"))

0.14814814814814814


In [2]:
# Pearson Correlation based
"""
xxxxxxxxxxx-----UNIQUE FEATURE-----xxxxxxxxxx
It takes care of grade inflation: If one gives 5,4,5,4,5 and two gives 4,3,4,3,4 their patter is similar
but one's grades are inflated, so pearson says they do have similarity taking care of inflation. 
"""

def sim_pearson(prefs,p1,p2):
# Get the list of mutually rated items
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: si[item]=1
    # Find the number of elements
    n=len(si)
    # if they are no ratings in common, return 0
    if n==0: return 0
    # Add up all the preferences
    sum1=sum([prefs[p1][it] for it in si])
    sum2=sum([prefs[p2][it] for it in si])
    # Sum up the squares
    sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq=sum([pow(prefs[p2][it],2) for it in si])
    # Sum up the products
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    # Calculate Pearson score
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: return 0
    r=num/den
    return r
    
print(sim_pearson(critics, "Lisa Rose", "Gene Seymour"))

0.39605901719066977


In [3]:
'''
FINDING MOST SIMILAR PERSONS
'''

# Returns the best matches for person from the prefs dictionary.
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=3,similarity=sim_pearson):
    scores=[(similarity(prefs,person,other),other) for other in prefs if other!=person]
    # Sort the list so the highest scores appear at the top
    scores.sort( )
    scores.reverse( )
    return scores[0:n]

print(topMatches(critics, 'Toby'))

[(0.9912407071619299, 'Lisa Rose'), (0.9244734516419049, 'Mick LaSalle'), (0.8934051474415647, 'Claudia Puig')]


In [4]:
'''
RECOMMENDING AN ITEM BASED ON GIVEN RATINGS 
'''
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
    totals={}
    simSums={}
    for other in prefs:
    # don't compare me to myself
        if other==person: continue
        sim=similarity(prefs,person,other)
    # ignore scores of zero or lower
        if sim<=0: continue
        for item in prefs[other]:
    # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
    # Similarity * Score
                totals.setdefault(item,0)
                totals[item]+=prefs[other][item]*sim
    # Sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+=sim
    # Create the normalized list
    rankings=[(total/simSums[item],item) for item,total in totals.items( )]
    # Return the sorted list
    rankings.sort( )
    rankings.reverse( )
    return rankings

print(getRecommendations(critics, 'Toby'))

[(3.3477895267131013, 'The Night Listener'), (2.832549918264162, 'Lady in the Water'), (2.530980703765565, 'Just My Luck')]


In [9]:
'''
MATCHING PRODUCTS: HOW SIMILAR ARE THEY?

YOU LIKE THIS, YOU MIGHT LIKE "THAT" :)

Transforming into item based dictionary


"""
Prints out

[(0.6579516949597695, 'You, Me and Dupree'), 
(0.4879500364742689, 'Lady in the Water'), 
(0.11180339887498941, 'Snakes on a Plane'), 
(-0.1798471947990544, 'The Night Listener')]

Negative correlation means: those who like superman returns tend to dislike 
the night listener

"""

'''

def transformPrefs(prefs):
    result={}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item,{})
    # Flip item and person
            result[item][person]=prefs[person][item]
    return result

movies = transformPrefs(critics)

'''Now call topMatches to find most similar items'''
print(topMatches(movies, 'Superman Returns', n=4))





[(0.6579516949597695, 'You, Me and Dupree'), (0.4879500364742689, 'Lady in the Water'), (0.11180339887498941, 'Snakes on a Plane'), (-0.1798471947990544, 'The Night Listener')]


In [10]:
'''Getting a person recommended to go with for a given movie
Here we found who will enjoy the movie most. In general case who's more likely to like the item.

'''

print(getRecommendations(movies,'Just My Luck'))

[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]
