In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

In [2]:
def similarity(X,Y):
    #X = (n_samples, n_features)
    #Y = (n_samples, n_features)
    X = X.toarray()
    mean_X = np.mean(X,axis=-1)
    mean_X_2 = np.mean(X*X, axis=-1)
    Z = np.zeros((X.shape[0],X.shape[0]))
    for i in range(X.shape[0]):
        for j in range(i,X.shape[0]):
            XY = X[i] * X[j]
            E_XY = np.mean(XY,axis=-1)
            up = E_XY - mean_X[i] * mean_X[j]
            down = np.sqrt(mean_X_2[i] - mean_X[i]*mean_X[i]) * np.sqrt(mean_X_2[j] - mean_X[j]*mean_X[j]) + 1e-3
            Z[i,j] = up / down
    return Z + np.transpose(Z) - np.diag(np.diag(Z))

In [3]:
class CF(object):
    """docstring for CF"""
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1 
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
    
    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)
    
    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data[ids, 1] 
            # and the corresponding ratings 
            ratings = self.Y_data[ids, 2]
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important 
        # for both memory and computing efficiency. For example, if #user = 1M, 
        # #item = 100k, then shape of the rating matrix would be (100k, 1M), 
        # you may not have enough memory to store this. Then, instead, we store 
        # nonzeros only, and, of course, their locations.
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
        
    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity() 
        
    def fit(self):
        self.refresh()
        
    
    def __pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # Step 2: 
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others 
        # who already rated i
        sim = self.S[u, users_rated_i]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-self.k:] 
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rated item i
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)

        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
    def pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)
            
    
    def recommend(self, u, num=None):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        recommended_items = []
        [recommended_items.append((i, self.__pred(u,i))) for i in range(self.n_items) if i not in items_rated_by_u and self.__pred(u,i) > 0]
        # for i in range(self.n_items):
        #     if i not in items_rated_by_u:
        #         rating = self.__pred(u, i)
        #         if rating > 0: 
        #             recommended_items.append(i)
        recommended_items.sort(key=lambda tup: tup[1], reverse=True)
        
        # return recommended_items 
        return [item[0] for item in recommended_items][:num]

    def print_recommendation(self, num):
        """
        print all items which should be recommended for each user 
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u, num)
            if self.uuCF:
                print('    Recommend item(s):', recommended_items, 'for user', u)
            else: 
                print('    Recommend item', u, 'for user(s) : ', recommended_items)

In [4]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('data/ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('data/ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.values
rate_test = ratings_test.values

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [5]:
rs_uu = CF(rate_train, k = 30, dist_func=similarity, uuCF = 1)
rs_uu.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs_uu.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print('User-user CF, RMSE =', RMSE)

[[ 9.91861871e-01  5.57351158e-02  7.94089086e-05 ...  0.00000000e+00
  -2.14835098e-02  3.14951238e-02]
 [ 5.57351158e-02  9.34482391e-01 -8.96239682e-04 ...  0.00000000e+00
   1.38870413e-03  3.72251304e-02]
 [ 7.94089086e-05 -8.96239682e-04  9.57521378e-01 ...  0.00000000e+00
  -9.14243691e-02  2.48017544e-03]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-2.14835098e-02  1.38870413e-03 -9.14243691e-02 ...  0.00000000e+00
   8.55281290e-01 -3.94484452e-03]
 [ 3.14951238e-02  3.72251304e-02  2.48017544e-03 ...  0.00000000e+00
  -3.94484452e-03  9.86138073e-01]]
User-user CF, RMSE = 0.9931657760992122


In [8]:
rs_ii = CF(rate_train, k = 30, dist_func=similarity, uuCF = 0)
rs_ii.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs_ii.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print('Item-item CF, RMSE =', RMSE)

[[0.99380924 0.07192441 0.03822453 ... 0.         0.         0.        ]
 [0.07192441 0.98267491 0.10029814 ... 0.         0.         0.        ]
 [0.03822453 0.10029814 0.98709124 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
Item-item CF, RMSE = 0.983189014061999


In [6]:
import pickle

In [None]:
rs_uu.print_recommendation(10)

In [7]:
def save_object(obj, filename):
    with open(filename, 'wb') as outp:
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)

In [8]:
save_object(rs_uu,'user_user.pkl')

In [1]:
a = [(1,2),(3,4)]
print(a[:,1])

TypeError: list indices must be integers or slices, not tuple