In [79]:
from math import sqrt, log, log10, log2
from operator import itemgetter

import numpy as np
from scipy.linalg import svd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn import metrics



In [191]:
class OCCF:
    """Neighbourhood-based one-class collaborative filtering evaluated at top-N.

    Ratings of at least ``POSITIVE_THRESHOLD`` are turned into boolean
    "liked" matrices for the training and test records.  User-user and
    item-item Jaccard similarities are computed from the training matrix and
    used to rank, for every user, the items that user has not liked in
    training.  Precision and recall of the top ``TOP_N`` items are measured
    against the test matrix.

    Attributes set by ``__init__``:
        k            -- neighbourhood size actually requested.
        n, m         -- number of users / items (largest index + 1 over both
                        record sets).
        R, R_test    -- (n, m) boolean matrices of positive feedback.
        y_user_test  -- number of positive test items per user.
        users_test   -- indices of users with at least one positive test item.
        mu, b_i      -- global density of positives and per-item bias
                        (``-10`` for items without any positive).
        s_u, s_i     -- row-normalised Jaccard similarity (users / items).
        n_u, n_i     -- indices of the most similar users / items per column
                        of ``s_u`` / ``s_i``.
    """

    # Ratings at or above this value count as positive (one-class) feedback.
    POSITIVE_THRESHOLD = 4
    # Length of every recommendation list.
    TOP_N = 5

    def __init__(self, records_train, records_test, k=None):
        """Build feedback matrices, similarities and neighbour tables.

        Args:
            records_train: 2-D integer array; columns 0, 1, 2 are read as
                zero-based user index, zero-based item index and rating.
            records_test: same layout as ``records_train``.
            k: neighbourhood size (>= 1).  When omitted, the module/notebook
                level constant ``K`` is used, which is what this class
                historically read directly.

        Raises:
            ValueError: if the resolved ``k`` is smaller than 1.
            NameError: if ``k`` is omitted and no global ``K`` exists.
        """
        if k is None:
            # Backward compatibility with notebooks that define a global K.
            k = K
        self.k = int(k)
        if self.k < 1:
            raise ValueError("k must be a positive integer")

        records = np.vstack([records_train, records_test])
        self.n = int(records[:, 0].max()) + 1
        self.m = int(records[:, 1].max()) + 1

        self.R = self._feedback_matrix(records_train)
        self.R_test = self._feedback_matrix(records_test)

        # Positive-feedback counts.
        y = self.R.astype(np.int64)
        y_item = y.sum(axis=0)
        self.y_user_test = self.R_test.sum(axis=1)

        # Global average of the binary feedback.
        self.mu = np.sum(y) / self.n / self.m

        # Bias of each item; items nobody liked get the sentinel -10.
        self.b_i = np.where(y_item,
                            y_item / self.n - self.mu,
                            -10)

        self.users_test = np.nonzero(self.y_user_test)[0]

        # User-user similarity and neighbours.
        self.s_u = self._normalised_jaccard(self.R)
        self.n_u = self._top_neighbours(self.s_u)

        # Item-item similarity and neighbours.
        self.s_i = self._normalised_jaccard(self.R.T)
        self.n_i = self._top_neighbours(self.s_i)

    # ------------------------------------------------------------------
    # Construction helpers
    # ------------------------------------------------------------------
    def _feedback_matrix(self, records):
        """Return the (n, m) boolean matrix of records rated >= threshold."""
        records = np.asarray(records)
        liked = records[records[:, 2] >= self.POSITIVE_THRESHOLD]
        matrix = np.zeros([self.n, self.m], dtype=bool)
        matrix[liked[:, 0], liked[:, 1]] = True
        return matrix

    @staticmethod
    def _normalised_jaccard(membership):
        """Pairwise Jaccard similarity between rows, scaled by each row's max.

        ``membership`` is a 2-D boolean array.  The diagonal is left at zero.
        Pairs whose union is empty get similarity 0, and rows whose maximum
        is 0 stay all-zero; dividing in those cases would produce NaN, which
        ``argsort`` orders last and would therefore select as "best"
        neighbours.
        """
        # float64 keeps the 0/1 counts exact and lets the product use BLAS.
        member = membership.astype(np.float64)
        sizes = member.sum(axis=1)
        intersect = member @ member.T
        union = sizes[:, None] + sizes[None, :] - intersect

        similarity = np.zeros_like(intersect)
        np.divide(intersect, union, out=similarity, where=union > 0)
        np.fill_diagonal(similarity, 0.0)

        row_max = similarity.max(axis=1, keepdims=True)
        normalised = np.zeros_like(similarity)
        np.divide(similarity, row_max, out=normalised, where=row_max > 0)
        return normalised

    def _top_neighbours(self, similarity):
        """Indices of the ``k`` largest entries of every column of ``similarity``.

        The width is clamped to the number of rows so that ``k`` larger than
        the population does not fail.
        NOTE(review): after row normalisation the matrix is no longer
        symmetric, yet neighbours are read column-wise (as the original code
        did) -- confirm this is intended.
        """
        count = similarity.shape[0]
        width = min(self.k, count)
        neighbours = np.zeros([count, width], dtype=np.int32)
        for idx in range(count):
            neighbours[idx] = np.argsort(similarity[:, idx])[-width:]
        return neighbours

    # ------------------------------------------------------------------
    # Scoring helpers (shared by the three strategies)
    # ------------------------------------------------------------------
    def _user_based_scores(self, user, items):
        """Score ``items`` for ``user`` from the user's neighbours.

        Returns ``(scores, has_neighbours)``, both of length ``m``.  An item's
        score is the sum of ``s_u[w, user]`` over neighbours ``w`` of the user
        that liked the item; ``has_neighbours`` marks items with at least one
        such neighbour.
        """
        scores = np.zeros(self.m)
        has_neighbours = np.zeros(self.m, dtype=bool)
        for item in items:
            neighbours = np.intersect1d(self.n_u[user],
                                        np.flatnonzero(self.R[:, item]))
            if len(neighbours) == 0:
                continue
            has_neighbours[item] = True
            scores[item] = self.s_u[neighbours, user].sum()
        return scores, has_neighbours

    def _item_based_scores(self, user, items):
        """Score ``items`` for ``user`` from the items the user already liked.

        Returns ``(scores, has_neighbours)``, both of length ``m``.  An item's
        score is the sum of the ``k`` largest ``s_i[j, item]`` over items ``j``
        liked by the user.  ``has_neighbours`` is true for every requested
        item when the user liked at least one item, and false otherwise.
        """
        scores = np.zeros(self.m)
        has_neighbours = np.zeros(self.m, dtype=bool)
        liked = np.flatnonzero(self.R[user])  # does not depend on the item
        if len(liked) == 0:
            return scores, has_neighbours
        for item in items:
            similarities = self.s_i[liked, item]
            scores[item] = np.sort(similarities)[-self.k:].sum()
        has_neighbours[items] = True
        return scores, has_neighbours

    def _hybrid_scores(self, user, items):
        """Combine item-based and user-based scores for ``user``.

        Items with both kinds of neighbours get the mean of the two scores;
        items with only item-based neighbours keep the item-based score.
        NOTE(review): that fallback is not halved, so it is on a different
        scale than the averaged scores.  Behaviour is kept as it was --
        confirm whether this is intended.
        """
        item_scores, item_ok = self._item_based_scores(user, items)
        user_scores, user_ok = self._user_based_scores(user, items)
        scores = item_scores.copy()
        both = item_ok & user_ok
        scores[both] = (item_scores[both] + user_scores[both]) / 2
        return scores

    def _recommend(self, score_fn):
        """Return an (n, TOP_N) array with each user's best unseen items.

        ``score_fn(user, items)`` must return a length-``m`` score vector.
        Ties are broken in favour of the lower item index.  Every user is
        assumed to have at least ``TOP_N`` items not liked in training.
        """
        recommendations = np.zeros([self.n, self.TOP_N], dtype=np.int32)
        for user in range(self.n):
            candidates = np.flatnonzero(~self.R[user])
            scores = score_fn(user, candidates)
            # Stable sort on negated scores == descending order, stable ties.
            order = np.argsort(-scores[candidates], kind='stable')
            recommendations[user] = candidates[order[:self.TOP_N]]
        return recommendations

    # ------------------------------------------------------------------
    # Public evaluation entry points
    # ------------------------------------------------------------------
    def user_based_performance(self):
        """Recommend with user-based scores; print precision, then recall."""
        self.i_rec_user_based = self._recommend(
            lambda user, items: self._user_based_scores(user, items)[0])
        self.get_pre(self.i_rec_user_based)
        self.get_rec(self.i_rec_user_based)

    def item_based_performance(self):
        """Recommend with item-based scores; print precision, then recall."""
        self.i_rec_item_based = self._recommend(
            lambda user, items: self._item_based_scores(user, items)[0])
        self.get_pre(self.i_rec_item_based)
        self.get_rec(self.i_rec_item_based)

    def hybrid_performance(self):
        """Recommend with the hybrid scores; print precision, then recall."""
        self.i_rec_hybrid = self._recommend(self._hybrid_scores)
        self.get_pre(self.i_rec_hybrid)
        self.get_rec(self.i_rec_hybrid)

    def get_pre(self, i_rec):
        """Compute and print mean precision of ``i_rec`` over the test users.

        ``i_rec`` is a 2-D array with one row of recommended item indices per
        user.  Sets ``self.pre_u`` (per user, 0 for non-test users) and
        ``self.pre`` (mean over ``users_test``; 0.0 when there are none), and
        prints ``self.pre`` rounded to four decimals.
        """
        i_rec = np.asarray(i_rec)
        list_length = i_rec.shape[1]
        self.pre = 0.0
        self.pre_u = np.zeros(self.n)
        for user in self.users_test:
            self.pre_u[user] = self.R_test[user, i_rec[user]].sum() / list_length
            self.pre += self.pre_u[user]
        if len(self.users_test) > 0:
            self.pre /= len(self.users_test)
        print(round(self.pre, 4))

    def get_rec(self, i_rec):
        """Compute and print mean recall of ``i_rec`` over the test users.

        Sets ``self.rec_u`` (per user, 0 for non-test users) and ``self.rec``
        (mean over ``users_test``; 0.0 when there are none), and prints
        ``self.rec`` rounded to four decimals.
        """
        i_rec = np.asarray(i_rec)
        self.rec = 0.0
        self.rec_u = np.zeros(self.n)
        for user in self.users_test:
            # users_test only holds users with a non-zero positive count.
            self.rec_u[user] = (self.R_test[user, i_rec[user]].sum()
                                / self.y_user_test[user])
            self.rec += self.rec_u[user]
        if len(self.users_test) > 0:
            self.rec /= len(self.users_test)
        print(round(self.rec, 4))

In [178]:
# Neighbourhood size; OCCF reads this notebook-level constant.
K = 50

# Training split: read it, then shift the first two columns (used by OCCF as
# user and item indices) down by one -- assumes the file uses 1-based ids.
records_train = np.loadtxt('../data/ml-100k/u1.base', dtype=np.int32)
records_train[:, :2] -= 1

# Test split, handled the same way.
records_test = np.loadtxt('../data/ml-100k/u1.test', dtype=np.int32)
records_test[:, :2] -= 1

In [192]:
# Build the model: feedback matrices, similarities and neighbour tables.
occf = OCCF(records_train=records_train, records_test=records_test)




In [180]:
# User-based recommendations; prints precision, then recall, of the top-5 lists.
occf.user_based_performance()

0.393
0.1275


In [183]:
# Item-based recommendations; prints precision, then recall, of the top-5 lists.
occf.item_based_performance()

0.3654
0.1175


In [193]:
# Hybrid (item + user) recommendations; prints precision, then recall, of the top-5 lists.
occf.hybrid_performance()

0.3939
0.132


In [123]:
# Sanity check: largest entry of the row-normalised user-similarity matrix.
print(occf.s_u.max())
# Scratch array used to try out ndarray.max(); not used by the analysis above.
tt = np.array([1, 2, 3])
tt.max()

1.0


3