In [2]:
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 3.5},
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
'The Night Listener': 4.5, 'Superman Returns': 4.0,
'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 2.0},
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}

In [13]:
from math import sqrt

def sim_distance(prefs, person1, person2):
    #Get list of shared items 
    si ={}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item]=1
        # if they have no ratings in common, return 0
    if len(si) == 0:
        return 0
    # Add up the squares of all the differences 
    sum_of_squares = sum([pow(prefs[person1][item] - prefs[person2][item],2)
                             for item in prefs[person1] if item in prefs[person2]])
    return 1/(1 + sum_of_squares)

In [14]:
sim_distance(critics, 'Lisa Rose', 'Gene Seymour')

0.14814814814814814

In [15]:
def sim_pearson(prefs, p1, p2):
    #Get list of mutual interests
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1
        
    n = len(si)
    if n == 0:
        return 0 
    
    #Add up all preferences
    sum1 = sum([prefs[p1][it] for it in si])       
    sum2 = sum([prefs[p2][it] for it in si])
        
    #Sum up the squares 
    sum1Sq = sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq = sum([pow(prefs[p2][it],2) for it in si])
        
    #Sum up the products
    pSum = sum([prefs[p1][it]*prefs[p2][it] for it in si])
        
    #Calculate Pearson score 
    num = pSum - (sum1*sum2/n)
    den = sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den == 0 : return 0
    r = num/den
    return r

In [16]:
sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

In [18]:
def top_matches(prefs, person, n =5, similarity= cosine_similarity):
    scores = [(similarity(prefs,person,other),other)
                     for other in prefs if other!= person]
    #Sort list in descending order
    scores.sort()
    scores.reverse()
    return scores[0:n]

In [7]:
top_matches(critics, 'Toby', n = 3)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

In [20]:
def getRecommendations(prefs, person, similarity = cosine_similarity):
    totals = {}
    simSums = {}
    for other in prefs:
        if other==person:continue
        sim = similarity(prefs, person, other)
        
        #ignore scores of zero or lower
        if sim<=0:continue
        for item in prefs[other]:
            #only score movies I haven't seen 
            if item not in prefs[person] or prefs[person][item]==0:
                #Sim * score 
                totals.setdefault(item,0)
                totals[item]+= prefs[other][item]*sim
                simSums.setdefault(item,0)
                simSums[item]+=sim
    
    # Create the normalized list 
    rankings = [(total/simSums[item], item ) for item,total in totals.items()]
    
    #Return the sorted list 
    rankings.sort()
    rankings.reverse()
    return rankings

In [21]:
getRecommendations(critics, 'Toby')

[(3.4238686478833218, 'The Night Listener'),
 (2.795964712878916, 'Lady in the Water'),
 (2.3823181325175566, 'Just My Luck')]

In [22]:
def transformPrefs(prefs):
    result = {}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item,{})
            
            result[item][person] = prefs[person][item]
    
    return result

In [23]:
movies = transformPrefs(critics)

In [24]:
getRecommendations(movies, 'Just My Luck')

[(3.6937429138777662, 'Jack Matthews'),
 (3.2556381465577906, 'Michael Phillips'),
 (3.1877460052395947, 'Toby')]

In [25]:
## Item based Filtering 

In [26]:
def calculateSimilarItems(prefs, n = 10):
    #create a dictionary of items showing which other items 
    # are most similar to 
    result = {}
    
    # Invert the preference matrix to be item-centric
    itemPrefs = transformPrefs(prefs)
    c = 0
    for item in itemPrefs:
        # Status updates for large datasets
        c += 1 
        if c%100==0: print("%d / %d" %(c, len(itemPrefs)))
        # Find the most similar items to this one 
        scores = top_matches(itemPrefs, item, n=n, similarity=cosine_similarity)
        result[item]=scores
    return result

In [27]:
def getRecommendedItems(prefs, itemMatches, user):
    userRatings = prefs[user]
    scores = {}
    totalSim = {}
    
    #Loop over items rated by the user
    for (item,rating) in userRatings.items():
        
        # Loop over items similar to this one 
        for (similarity,item2) in itemMatches[item]:
            
            if item2 in userRatings: continue 
            
            # Weighted sum of rating times similarity
            
            scores.setdefault(item2,0)
            scores[item2] += similarity*rating
            
            # Sum of all the similarities
            
            totalSim.setdefault(item2,0)
            totalSim[item2] += similarity
            
    # Divide each total score by the total weighting to get an average
    rankings = [(score/totalSim[item],item) for item,score in scores.items()]
    
    # Return the rankings from highest to lowest 
    rankings.sort()
    rankings.reverse()
    return rankings

In [28]:
itemsim = calculateSimilarItems(critics)
getRecommendedItems(critics, itemsim, 'Toby')

[(3.1877460052395947, 'Just My Luck'),
 (3.1774592782687061, 'The Night Listener'),
 (3.177060614435486, 'Lady in the Water')]

## Let's use the Movies DataSet

In [29]:
import pandas as pd 
data = pd.read_csv('ml-latest-small/ratings.csv')

In [30]:
def loadMovieLens(path='ml-latest-small'):
    
    # Get movies titles
    movies = {}
    with open(path+'/movies.csv') as f:
        next(f)
        for line in f:
            (id_,title) = line.split(',')[0:2]
            movies[id_] = title
        
    # Load the data 
    prefs={}
    with open(path+'/ratings.csv') as f:
        next(f)
        for line in f:
            user,movieid,rating,ts =line.split(',')
            prefs.setdefault(user,{})
            prefs[user][movies[movieid]]= float(rating)
    return prefs    

In [31]:
prefs = loadMovieLens()

In [32]:
prefs['87']

{'"Birdcage': 4.0,
 '"Rock': 3.0,
 'Beavis and Butt-Head Do America (1996)': 2.0,
 'Black Sheep (1996)': 3.0,
 'Broken Arrow (1996)': 3.0,
 'Cold Comfort Farm (1995)': 5.0,
 'Eraser (1996)': 3.0,
 'Executive Decision (1996)': 4.0,
 'Fargo (1996)': 5.0,
 'Happy Gilmore (1996)': 4.0,
 'Independence Day (a.k.a. ID4) (1996)': 3.0,
 'Kids in the Hall: Brain Candy (1996)': 3.0,
 'Kingpin (1996)': 4.0,
 'Leaving Las Vegas (1995)': 4.0,
 'L\xc3\xa9on: The Professional (a.k.a. The Professional) (L\xc3\xa9on) (1994)': 5.0,
 'Mighty Aphrodite (1995)': 4.0,
 'Mission: Impossible (1996)': 3.0,
 "Mr. Holland's Opus (1995)": 1.0,
 'Phenomenon (1996)': 3.0,
 'Rumble in the Bronx (Hont faan kui) (1995)': 3.0,
 'Sabrina (1995)': 3.0,
 'Shine (1996)': 5.0,
 'Star Wars: Episode IV - A New Hope (1977)': 4.0,
 'Star Wars: Episode VI - Return of the Jedi (1983)': 3.0,
 'Striptease (1996)': 3.0,
 'Tin Cup (1996)': 1.0,
 'Toy Story (1995)': 3.0,
 'Trainspotting (1996)': 3.0,
 'Twelve Monkeys (a.k.a. 12 Monkeys

In [33]:
getRecommendations(prefs,'87')[0:30]

[(5.0000000000000009, 'Z Channel: A Magnificent Obsession (2004)'),
 (5.0000000000000009, 'Wild Zero (2000)'),
 (5.0000000000000009, 'Walker (1987)'),
 (5.0000000000000009, 'Undertow (2004)'),
 (5.0000000000000009, 'Totally F***ed Up (1993)'),
 (5.0000000000000009,
  'Through the Olive Trees (Zire darakhatan zeyton) (1994)'),
 (5.0000000000000009, "Prospero's Books (1991)"),
 (5.0000000000000009, 'Palindromes (2004)'),
 (5.0000000000000009, 'Padre padrone (1977)'),
 (5.0000000000000009, 'Offside (2006)'),
 (5.0000000000000009, "Max Keeble's Big Move (2001)"),
 (5.0000000000000009, "Love Me If You Dare (Jeux d'enfants) (2003)"),
 (5.0000000000000009, 'Howl (2010)'),
 (5.0000000000000009, 'Fox and His Friends (Faustrecht der Freiheit) (1975)'),
 (5.0000000000000009, 'Fireworks Wednesday (Chaharshanbe-soori) (2006)'),
 (5.0000000000000009, 'Fiorile (1993)'),
 (5.0000000000000009, "Eaux d'artifice (1953)"),
 (5.0000000000000009, 'Dream a Little Dream (1989)'),
 (5.0000000000000009, 'Cul-de

In [34]:
itemsim = calculateSimilarItems(prefs, n = 30)

100 / 8963
200 / 8963
300 / 8963
400 / 8963
500 / 8963
600 / 8963
700 / 8963
800 / 8963
900 / 8963
1000 / 8963
1100 / 8963
1200 / 8963
1300 / 8963
1400 / 8963
1500 / 8963
1600 / 8963
1700 / 8963
1800 / 8963
1900 / 8963
2000 / 8963
2100 / 8963
2200 / 8963
2300 / 8963
2400 / 8963
2500 / 8963
2600 / 8963
2700 / 8963
2800 / 8963
2900 / 8963
3000 / 8963
3100 / 8963
3200 / 8963
3300 / 8963
3400 / 8963
3500 / 8963
3600 / 8963
3700 / 8963
3800 / 8963
3900 / 8963
4000 / 8963
4100 / 8963
4200 / 8963
4300 / 8963
4400 / 8963
4500 / 8963
4600 / 8963
4700 / 8963
4800 / 8963
4900 / 8963
5000 / 8963
5100 / 8963
5200 / 8963
5300 / 8963
5400 / 8963
5500 / 8963
5600 / 8963
5700 / 8963
5800 / 8963
5900 / 8963
6000 / 8963
6100 / 8963
6200 / 8963
6300 / 8963
6400 / 8963
6500 / 8963
6600 / 8963
6700 / 8963
6800 / 8963
6900 / 8963
7000 / 8963
7100 / 8963
7200 / 8963
7300 / 8963
7400 / 8963
7500 / 8963
7600 / 8963
7700 / 8963
7800 / 8963
7900 / 8963
8000 / 8963
8100 / 8963
8200 / 8963
8300 / 8963
8400 / 8963
8

In [43]:
getRecommendedItems(prefs, item_dict, '87')[-10:]

KeyError: 'L\xc3\xa9on: The Professional (a.k.a. The Professional) (L\xc3\xa9on) (1994)'

In [44]:
prefs['87']

{'"Birdcage': 4.0,
 '"Rock': 3.0,
 'Beavis and Butt-Head Do America (1996)': 2.0,
 'Black Sheep (1996)': 3.0,
 'Broken Arrow (1996)': 3.0,
 'Cold Comfort Farm (1995)': 5.0,
 'Eraser (1996)': 3.0,
 'Executive Decision (1996)': 4.0,
 'Fargo (1996)': 5.0,
 'Happy Gilmore (1996)': 4.0,
 'Independence Day (a.k.a. ID4) (1996)': 3.0,
 'Kids in the Hall: Brain Candy (1996)': 3.0,
 'Kingpin (1996)': 4.0,
 'Leaving Las Vegas (1995)': 4.0,
 'L\xc3\xa9on: The Professional (a.k.a. The Professional) (L\xc3\xa9on) (1994)': 5.0,
 'Mighty Aphrodite (1995)': 4.0,
 'Mission: Impossible (1996)': 3.0,
 "Mr. Holland's Opus (1995)": 1.0,
 'Phenomenon (1996)': 3.0,
 'Rumble in the Bronx (Hont faan kui) (1995)': 3.0,
 'Sabrina (1995)': 3.0,
 'Shine (1996)': 5.0,
 'Star Wars: Episode IV - A New Hope (1977)': 4.0,
 'Star Wars: Episode VI - Return of the Jedi (1983)': 3.0,
 'Striptease (1996)': 3.0,
 'Tin Cup (1996)': 1.0,
 'Toy Story (1995)': 3.0,
 'Trainspotting (1996)': 3.0,
 'Twelve Monkeys (a.k.a. 12 Monkeys

In [35]:
import json
with open('item_sim_cos','w') as f:
    json.dump(itemsim,f)

In [37]:
import json 
with open('item_sim_cos', 'rb') as f:
    item_dict = json.load(f)

In [38]:
item_dict['Harry Potter and the Deathly Hallows: Part 1 (2010)']

[[1.0000000000000002, u'Step Up (2006)'],
 [1.0000000000000002, u'Star Trek IV: The Voyage Home (1986)'],
 [1.0000000000000002, u'Soylent Green (1973)'],
 [1.0000000000000002, u'Pretty in Pink (1986)'],
 [1.0000000000000002, u'Lethal Weapon 4 (1998)'],
 [1.0000000000000002, u'"Devil\'s Backbone'],
 [1.0, u'\xc0 nous la libert\xe9 (Freedom for Us) (1931)'],
 [1.0, u'\xa1Three Amigos! (1986)'],
 [1.0, u'xXx: State of the Union (2005)'],
 [1.0, u'eXistenZ (1999)'],
 [1.0, u'[REC] (2007)'],
 [1.0, u'Zoom (2006)'],
 [1.0, u'Zoolander 2 (2016)'],
 [1.0, u'Zombie (a.k.a. Zombie 2: The Dead Are Among Us) (Zombi 2) (1979)'],
 [1.0, u'Zero Effect (1998)'],
 [1.0, u'Zelig (1983)'],
 [1.0, u'Zeitgeist: The Movie (2007)'],
 [1.0, u'Zach Galifianakis: Live at the Purple Onion (2006)'],
 [1.0, u'Yu-Gi-Oh! (2004)'],
 [1.0, u'Youth of the Beast (Yaju no seishun) (1963)'],
 [1.0, u'Youth in Revolt (2009)'],
 [1.0, u"Your Sister's Sister (2011)"],
 [1.0, u'Your Highness (2011)'],
 [1.0, u'Young People Fu

In [39]:
new_users = {'1' : {'Harry Potter and the Goblet of Fire (2005)' : 5.0,
                    'Harry Potter and the Half-Blood Prince (2009)' : 5.0,
                    'Harry Potter and the Order of the Phoenix (2007)': 5.0,
                    'Harry Potter and the Deathly Hallows: Part 1 (2010)': 5.0},
             '2' :  {'Toy Story (1995)': 4.5,
                     'Willy Wonka & the Chocolate Factory (1971)': 4.5}}
             

In [40]:
getRecommendedItems(new_users, item_dict, '1')

[(5.0, u'\xc0 nous la libert\xe9 (Freedom for Us) (1931)'),
 (5.0, u'\xa1Three Amigos! (1986)'),
 (5.0, u'xXx: State of the Union (2005)'),
 (5.0, u'loudQUIETloud: A Film About the Pixies (2006)'),
 (5.0, u'eXistenZ (1999)'),
 (5.0, u'[REC] (2007)'),
 (5.0, u'Zorba the Greek (Alexis Zorbas) (1964)'),
 (5.0, u'Zoom (2006)'),
 (5.0, u'Zoolander 2 (2016)'),
 (5.0, u'Zombie (a.k.a. Zombie 2: The Dead Are Among Us) (Zombi 2) (1979)'),
 (5.0, u'Zero Effect (1998)'),
 (5.0, u'Zelig (1983)'),
 (5.0, u'Zelary (2003)'),
 (5.0, u'Zeitgeist: The Movie (2007)'),
 (5.0, u'Zack and Miri Make a Porno (2008)'),
 (5.0, u'Zach Galifianakis: Live at the Purple Onion (2006)'),
 (5.0, u'Yu-Gi-Oh! (2004)'),
 (5.0, u'Youth of the Beast (Yaju no seishun) (1963)'),
 (5.0, u'Youth in Revolt (2009)'),
 (5.0, u"Your Sister's Sister (2011)"),
 (5.0, u'Your Highness (2011)'),
 (5.0, u'Young at Heart (a.k.a. Young@Heart) (2007)'),
 (5.0, u'Young People Fucking (a.k.a. YPF) (2007)'),
 (5.0, u'Young Guns II (1990)'),
 

In [30]:
harry_list = [] 
for movie_name in item_dict.keys():
    if 'Harry Potter' in movie_name:
        harry_list.append(movie_name)
    

In [31]:
print(harry_list)

[u'Harry Potter and the Goblet of Fire (2005)', u'Harry Potter and the Half-Blood Prince (2009)', u'Harry Potter and the Order of the Phoenix (2007)', u"Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)", u'Harry Potter and the Prisoner of Azkaban (2004)', u'Harry Potter and the Deathly Hallows: Part 2 (2011)', u'Harry Potter and the Chamber of Secrets (2002)', u'Harry Potter and the Deathly Hallows: Part 1 (2010)']


In [57]:
item_dict.values()[10]

[[1.000000000000007, u'Invasion of the Body Snatchers (1956)'],
 [1.000000000000007, u'"Deep End'],
 [1.0000000000000033, u'Nymphomaniac: Volume I (2013)'],
 [1.0000000000000018, u'Far from Heaven (2002)'],
 [1.0000000000000018, u'Central Station (Central do Brasil) (1998)'],
 [1.0000000000000018, u'Arsenic and Old Lace (1944)'],
 [1.0000000000000007, u'Romeo Must Die (2000)'],
 [1.0000000000000007, u'Bambi (1942)'],
 [1.0000000000000004, u'Diner (1982)'],
 [1.0, u'Zelig (1983)'],
 [1.0, u'You Kill Me (2007)'],
 [1.0, u'Yes Man (2008)'],
 [1.0, u'Year One (2009)'],
 [1.0, u'X-Men: Apocalypse (2016)'],
 [1.0, u'Wrong Turn (2003)'],
 [1.0, u'Wristcutters: A Love Story (2006)'],
 [1.0,
  u'Women on the Verge of a Nervous Breakdown (Mujeres al borde de un ataque de nervios) (1988)'],
 [1.0, u"William Shakespeare's A Midsummer Night's Dream (1999)"],
 [1.0, u'Wild Tales (2014)'],
 [1.0, u'White Chicks (2004)'],
 [1.0, u"Weekend at Bernie's (1989)"],
 [1.0, u'Warm Bodies (2013)'],
 [1.0, u'W

In [59]:
itemPrefs = transformPrefs(prefs)

In [9]:
from scipy import spatial

def cosine_similarity(prefs,p1,p2):
    
    #Get list of mutual interests
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1
        
    n = len(si)
    if n == 0:
        return 0 
    
    # Calculate cosine similarity
    df1 = [prefs[p1][it] for it in si]
    df2 = [prefs[p2][it] for it in si]
    r = 1 - spatial.distance.cosine(df1,df2)
    return r

In [10]:
cosine_similarity(critics, 'Lisa Rose', 'Gene Seymour')

0.96064630139802409

In [19]:
top_matches(critics, 'Toby', n = 3)

[(0.99083016804429891, 'Michael Phillips'),
 (0.97361631056780096, 'Mick LaSalle'),
 (0.95540583589057559, 'Claudia Puig')]