In [260]:
"""
This script demonstrates how to design the simplest recommender system based on
Collaborative Filtering. In order to make these predictions, we must first measure
similarity of users or items from the rows and columns of the Utility Matrix.
We will use the Pearson Correlation Similarity Measure to find similar users.
"""
#!/bin/python
from __future__ import division

from numpy import *
from sklearn.metrics import mean_squared_error
import math
class User:
    """A person who rates items: holds a display name and an average rating."""

    def __init__(self, name):
        # Name given by the caller.
        self.name = name
        # Average rating; starts at 0.0 and is meant to be overwritten once
        # the user's ratings are known.
        self.avg_r = 0.0

class Item:
    """A rateable item, identified only by its name."""

    def __init__(self, name):
        # Name given by the caller.
        self.name = name

class Rating:
    """One rating record: which user gave which item what score."""

    def __init__(self, user_id, item_id, rating):
        # Ids are stored exactly as passed in; no validation is performed.
        self.user_id = user_id
        self.item_id = item_id
        # The score assigned by the user to the item.
        self.rating = rating

# Users are kept in a list; a user's id is its list position plus one.
user = [User(person) for person in ("Ann", "Bob", "Carl", "Doug")]

# Items are kept the same way: item id = list position plus one.
item = [Item(title) for title in ("HP1", "HP2", "HP3", "SW1", "SW2", "SW3")]

# Known ratings as (user id, item id, stars) triples, both ids 1-based.
rating = [
    Rating(who, what, stars)
    for (who, what, stars) in [
        (1, 1, 4),
        (1, 4, 1),
        (2, 1, 5),
        (2, 2, 5),
        (2, 3, 4),
        (3, 4, 4),
        (3, 5, 5),
        (4, 2, 3),
        (4, 6, 3),
    ]
]

n_users, n_items, n_ratings = len(user), len(item), len(rating)

# The utility matrix holds one row per user and one column per item;
# a 0 entry means "not rated".
utility = zeros((n_users, n_items))
for r in rating:
    utility[r.user_id - 1, r.item_id - 1] = r.rating
print(utility)

# Store on every user object the mean of the ratings that user actually gave
# (zeros, i.e. missing ratings, are excluded from the mean).
for i in range(n_users):
    given = utility[i][utility[i] > 0]
    user[i].avg_r = mean(given)


[[ 4.  0.  0.  1.  0.  0.]
 [ 5.  5.  4.  0.  0.  0.]
 [ 0.  0.  0.  4.  5.  0.]
 [ 0.  3.  0.  0.  0.  3.]]


In [261]:
def norm():
    """Return a mean-centred copy of the utility matrix.

    Reads the module-level ``utility`` matrix, the ``user`` list and the
    ``n_users`` / ``n_items`` counts.

    Returns:
        An ``(n_users, n_items)`` array in which every rated cell
        (``utility[i][j] != 0``) holds ``utility[i][j] - user[i].avg_r`` and
        every unrated cell holds positive infinity as a "missing" marker.
    """
    # Use the star-imported ``zeros`` (as the utility matrix does): the
    # notebook never binds the name ``np``, so ``np.zeros`` raises NameError
    # on a fresh kernel.
    normalize = zeros((n_users, n_items))
    for i in range(0, n_users):
        for j in range(0, n_items):
            if utility[i][j] != 0:
                normalize[i][j] = utility[i][j] - user[i].avg_r
            else:
                # Unrated cell: flag it with +inf rather than a real offset.
                normalize[i][j] = float('Inf')
    return normalize

In [262]:
"""
Definition of the pcs(x, y) and guess(user_id, i_id, top_n) functions.
Complete these after reading the project description.
"""
def pcs(x, y):
    """Pearson Correlation Similarity between users ``x`` and ``y``.

    ``x`` and ``y`` are 1-based user ids indexing rows of the module-level
    ``utility`` matrix. Each user's mean is taken over all items that user
    rated (entries > 0); the correlation sums run only over items rated by
    both users.

    Returns:
        The correlation value, or 0 when the users share no rated item or
        when the denominator is zero.
    """
    ratings_x = utility[x - 1]
    ratings_y = utility[y - 1]
    mean_x = mean([r for r in ratings_x if r > 0])
    mean_y = mean([r for r in ratings_y if r > 0])

    # Keep only the items that both users have rated.
    shared = [(rx, ry) for rx, ry in zip(ratings_x, ratings_y) if rx > 0 and ry > 0]
    if not shared:
        return 0

    covariance = sum([(rx - mean_x) * (ry - mean_y) for rx, ry in shared])
    spread_x = math.sqrt(sum([(rx - mean_x) ** 2 for rx, _ in shared]))
    spread_y = math.sqrt(sum([(ry - mean_y) ** 2 for _, ry in shared]))

    denominator = spread_x * spread_y
    # A zero denominator means at least one user has no deviation from their
    # mean on the shared items; report "no similarity" instead of dividing.
    if denominator == 0:
        return 0
    return covariance / denominator
    

    

In [263]:
def guess(user_id, i_id, top_n):
    """Guess the rating user ``user_id`` might give to item ``i_id``.

    Both ids are 1-based. The other users are ranked by their similarity to
    ``user_id`` (row ``user_id - 1`` of the module-level ``pcs_matrix``) and
    the ``top_n`` highest are kept. Among those, every neighbour who rated
    the item contributes the offset of that rating from the neighbour's own
    average; the offsets are averaged without weighting and added to the
    target user's average rating.

    Returns:
        The absolute value of (target user's average + mean offset). When
        none of the kept neighbours rated the item the offset is 0.
    """
    item_col = i_id - 1

    # (similarity, neighbour id) for everybody except the target user.
    candidates = [
        (pcs_matrix[user_id - 1][idx], idx + 1)
        for idx in range(0, n_users)
        if idx + 1 != user_id
    ]
    # sorted() is stable, so ties keep ascending user-id order.
    neighbours = sorted(candidates, key=lambda pair: pair[0], reverse=True)[:top_n]

    offsets = []
    for _, neighbour_id in neighbours:
        given = utility[neighbour_id - 1][item_col]
        if given > 0:
            offsets.append(given - user[neighbour_id - 1].avg_r)

    if len(offsets) > 0:
        shift = mean(offsets)
    else:
        shift = 0

    return abs(user[user_id - 1].avg_r + shift)
    


2.6666666666666665

In [264]:

"""
Displays utility matrix and mean squared error.
This is for answering Q1,2 of Part 1.
"""

# Display the utility matrix as given in Part 1 of your project description
set_printoptions(precision=3)
print(utility)

# Finds the average rating for each user and stores it in the user's object
for i in range(0, n_users):
    user[i].avg_r = mean([ri for ri in utility[i] if ri > 0])

# Pairwise user-user similarity matrix. ``zeros`` is the star-imported numpy
# function; the notebook never binds ``np``, so ``np.zeros`` would raise
# NameError on a fresh kernel.
pcs_matrix = zeros((n_users, n_users))
for i in range(0, n_users):
    for j in range(0, n_users):
        pcs_matrix[i][j] = pcs(i + 1, j + 1)

# Runs once, after the loops: i and j still hold their final values, so this
# reports only the last pair that was computed.
print("\rGenerating Similarity Matrix [%d:%d] = %f" % (i+1, j+1, pcs_matrix[i][j]))
print(pcs_matrix)

print(norm())
for ii in range(n_users):
    print("%s %s" % (ii + 1, user[ii].avg_r))
n = 3 # Assume top_n users

# Finds all the missing values of the utility matrix
utility_copy = copy(utility)
for i in range(0, n_users):
    for j in range(0, n_items):
        if utility_copy[i][j] == 0:
            utility_copy[i][j] = guess(i+1, j+1, n)

print(utility_copy)

# Held-out test ratings as (user id, item id, actual stars). Refer to Q2.
test_cases = [
    (1, 5, 2),  # Ann rates SW2 with 2 stars
    (3, 1, 2),  # Carl rates HP1 with 2 stars
    (3, 2, 2),  # Carl rates HP2 with 2 stars
    (4, 4, 4),  # Doug rates SW1 with 4 stars
    (4, 5, 3),  # Doug rates SW2 with 3 stars
]

# Predict every test pair once. The result is stored under ``predictions``
# rather than ``guess``: rebinding ``guess`` to an array would shadow the
# function and make this cell fail when it is run a second time.
predictions = array([guess(u_id, i_id, n) for (u_id, i_id, _) in test_cases])

# Finds the utility values of the particular users in the test set.
for (u_id, i_id, _), predicted in zip(test_cases, predictions):
    print(user[u_id - 1].name + "'s rating for " + item[i_id - 1].name
          + " should be " + str(predicted))

### Ratings from the test set
test = array([stars for (_, _, stars) in test_cases])

# Finds the mean squared error of the ratings with respect to the test set.
# Arguments follow the documented (y_true, y_pred) order.
print("Mean Squared Error is " + str(mean_squared_error(test, predictions)))

[[ 4.  0.  0.  1.  0.  0.]
 [ 5.  5.  4.  0.  0.  0.]
 [ 0.  0.  0.  4.  5.  0.]
 [ 0.  3.  0.  0.  0.  3.]]
Generating Similarity Matrix [4:4] = 0.000000
[[ 1.  1.  1.  0.]
 [ 1.  1.  0.  0.]
 [ 1.  0.  1.  0.]
 [ 0.  0.  0.  0.]]
[[ 1.5      inf    inf -1.5      inf    inf]
 [ 0.333  0.333 -0.667    inf    inf    inf]
 [   inf    inf    inf -0.5    0.5      inf]
 [   inf  0.       inf    inf    inf  0.   ]]
1 2.5
2 4.66666666667
3 4.5
4 3.0
[[ 4.     2.667  1.833  1.     3.     2.5  ]
 [ 5.     5.     4.     3.667  5.167  4.667]
 [ 5.417  4.667  3.833  4.     5.     4.5  ]
 [ 3.917  3.     2.333  2.     3.5    3.   ]]
Ann's rating for SW2 should be 3.0
Carl's rating for HP1 should be 5.41666666667
Carl's rating for HP2 should be 4.66666666667
Doug's rating for SW1 should be 2.0
Doug's rating for SW2 should be 3.5
Mean Squared Error is 4.80694444444
