In [25]:
from math import sqrt
from operator import itemgetter

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn import metrics, preprocessing


In [84]:
class CollaborativeFilter:
    """Memory-based collaborative filtering on (user, item, rating) records.

    The constructor builds a dense rating matrix from the training records,
    the global / per-user / per-item mean ratings and a user-user similarity
    matrix.  The item-item similarity matrix is only needed by
    ``item_based_performance`` and is built the first time ``s_i`` is read.

    Attributes:
        R: ``(n, m)`` int32 rating matrix; an entry of 0 means "not rated".
        n: number of user rows of ``R``.
        m: number of item columns of ``R``.
        r: mean of all training ratings.
        r_u: per-user mean rating; users without training ratings get ``r``.
        r_i: per-item mean rating; items without training ratings get ``r``.
        s_u: ``(n, n)`` symmetric user similarity.  Pairs with too few
            co-rated items or with a negative similarity are stored as 0.
        s_i: ``(m, m)`` symmetric item similarity (computed lazily, cached).
    """

    def __init__(self, records_train, records_test, threhold):
        """Build the rating matrix, the mean ratings and the user similarity.

        Args:
            records_train: 2-D integer array whose first three columns are
                (user index, item index, rating).  Indices are used directly
                as row / column positions, so they must be zero-based.
            records_test: array with the same layout; here it is only used
                to size the rating matrix so test users/items have a slot.
            threhold: (spelling kept for backward compatibility) support
                threshold - two users get a similarity only if they co-rated
                strictly more than this many items.
        """
        records = np.vstack([records_train, records_test])
        # Size by the largest index rather than by the number of distinct
        # ids: identical for contiguous zero-based ids, and it keeps the
        # direct indexing below in bounds when some id never occurs.
        n = int(records[:, 0].max()) + 1
        m = int(records[:, 1].max()) + 1

        # Rating matrix (training ratings only)
        R = np.zeros([n, m], dtype=np.int32)
        for record in records_train:
            R[record[0], record[1]] = record[2]
        self.R = R
        self.n = n
        self.m = m
        self._s_i = None  # item similarity cache, filled on first use

        # Indicator of observed ratings and the per-user / per-item counts
        rated = R != 0
        y_user = rated.sum(axis=1)
        y_item = rated.sum(axis=0)

        # Global average of rating
        self.r = np.sum(R) / np.sum(rated)

        # Average rating of each user / item.  The masked divide only touches
        # entries with at least one rating, so nothing is divided by zero;
        # the remaining entries keep the global mean they were filled with.
        self.r_u = np.divide(R.sum(axis=1), y_user,
                             out=np.full(n, self.r, dtype=np.float64),
                             where=y_user > 0)
        self.r_i = np.divide(R.sum(axis=0), y_item,
                             out=np.full(m, self.r, dtype=np.float64),
                             where=y_item > 0)

        # Per-item down-weighting 1 / (log(#ratings) + 1).  It is
        # loop-invariant, so compute it once.  Only co-rated items
        # (count >= 1) are ever looked up; the clamp just avoids log(0)
        # for items nobody rated.
        item_weight = np.log(np.maximum(y_item, 1)) + 1

        # Similarity of users: cosine of the mean-centred, item-weighted
        # rating vectors restricted to the co-rated items.
        self.s_u = np.zeros([n, n])
        for u in range(n):
            for w in range(u):
                items = rated[u] & rated[w]
                if items.sum() <= threhold:
                    continue
                vec_u = (R[u, items] - self.r_u[u]) / item_weight[items]
                vec_w = (R[w, items] - self.r_u[w]) / item_weight[items]
                mag_vec_u = sqrt(np.square(vec_u).sum())
                mag_vec_w = sqrt(np.square(vec_w).sum())
                if mag_vec_u == 0 or mag_vec_w == 0:
                    continue
                similarity = vec_u.dot(vec_w) / mag_vec_u / mag_vec_w

                # Negatively correlated users are not kept as neighbours
                if similarity < 0:
                    continue
                self.s_u[u, w] = self.s_u[w, u] = similarity

    @property
    def s_i(self):
        """Item-item similarity matrix, computed on first access and cached."""
        if getattr(self, "_s_i", None) is None:
            self._s_i = self._item_similarity()
        return self._s_i

    @s_i.setter
    def s_i(self, value):
        # Plain assignment keeps working, e.g. to inject a precomputed matrix.
        self._s_i = value

    def _item_similarity(self):
        """Return the cosine similarity of items over user-mean-centred ratings."""
        R = self.R
        rated = R != 0
        m = R.shape[1]
        s_i = np.zeros([m, m])
        for i in range(m):
            for j in range(i):
                users = rated[:, i] & rated[:, j]
                if not users.any():
                    continue
                vec_i = R[users, i] - self.r_u[users]
                vec_j = R[users, j] - self.r_u[users]
                mag_vec_i = sqrt(np.square(vec_i).sum())
                mag_vec_j = sqrt(np.square(vec_j).sum())
                if mag_vec_i == 0 or mag_vec_j == 0:
                    continue
                s_i[i, j] = s_i[j, i] = vec_i.dot(vec_j) / mag_vec_i / mag_vec_j
        return s_i

    @staticmethod
    def _top_k(candidates, similarities, K):
        """Return the (at most) K candidates with the largest similarities."""
        # NOTE: K == 0 makes the slice [-0:], i.e. it keeps every candidate.
        return candidates[np.argsort(similarities)[-K:]]

    def user_based_performance(self, records_test, K):
        """Predict ratings with user-based neighbourhoods.

        Args:
            records_test: 2-D integer array whose first two columns are
                (user index, item index).
            K: maximum number of most-similar neighbours to use.

        Returns:
            ``(len(records_test), 1)`` float array of unclipped predictions.
            A pair without any usable neighbour gets the user's mean rating.
        """
        ratings_predict = np.zeros([len(records_test), 1])
        for idx, (user, item) in enumerate(records_test[:, :2]):
            similarity = self.s_u[user]

            # Neighbours: positively similar users who rated this item
            neighbours = np.flatnonzero((similarity > 0) & (self.R[:, item] != 0))
            if len(neighbours) == 0:
                ratings_predict[idx] = self.r_u[user]
                continue

            neighbours = self._top_k(neighbours, similarity[neighbours], K)
            weights = similarity[neighbours]

            # Similarity-weighted mean of the neighbours' deviations from
            # their own mean, added to the target user's mean.
            deviation = (self.R[neighbours, item] - self.r_u[neighbours]).dot(weights)
            ratings_predict[idx] = deviation / np.abs(weights).sum() + self.r_u[user]

        return ratings_predict

    def item_based_performance(self, records_test, K):
        """Predict ratings with item-based neighbourhoods.

        Args:
            records_test: 2-D integer array whose first two columns are
                (user index, item index).
            K: maximum number of most-similar neighbour items to use.

        Returns:
            ``(len(records_test), 1)`` float array of unclipped predictions.
            A pair without any usable neighbour gets the user's mean rating.
        """
        s_i = self.s_i  # triggers the lazy computation once
        ratings_predict = np.zeros([len(records_test), 1])
        for idx, (user, item) in enumerate(records_test[:, :2]):
            # Neighbours: positively similar items that this user rated
            neighbours = np.flatnonzero((s_i[:, item] > 0) & (self.R[user, :] != 0))
            if len(neighbours) == 0:
                ratings_predict[idx] = self.r_u[user]
                continue

            neighbours = self._top_k(neighbours, s_i[item, neighbours], K)
            weights = s_i[item, neighbours]

            # Similarity-weighted mean of the user's own ratings
            ratings_predict[idx] = self.R[user, neighbours].dot(weights) / np.abs(weights).sum()

        return ratings_predict

In [16]:
def score(rating_test, rating_predict):
    """Return ``[RMSE, MAE]`` of the predictions, each rounded to 4 decimals.

    Args:
        rating_test: ground-truth ratings.
        rating_predict: predicted ratings.
    """
    mse = metrics.mean_squared_error(rating_test, rating_predict)
    mae = metrics.mean_absolute_error(rating_test, rating_predict)
    rmse = sqrt(mse)
    return [round(rmse, 4), round(mae, 4)]

In [5]:
# Read the u1 train / test split of the records as int32 arrays
records_train = np.loadtxt('../data/ml-100k/u1.base', dtype=np.int32)
records_test = np.loadtxt('../data/ml-100k/u1.test', dtype=np.int32)

# Shift the first two columns (used as user / item indices) to start at 0
records_train[:, 0:2] = records_train[:, 0:2] - 1
records_test[:, 0:2] = records_test[:, 0:2] - 1

# Third column of the test split: the ratings to predict
ratings_test = records_test[:, 2]

# Train and test records stacked row-wise
records = np.concatenate([records_train, records_test], axis=0)

In [85]:
%%time
threshold = 4
usercf = CollaborativeFilter(records_train, records_test, threshold)



CPU times: user 13 s, sys: 21.5 ms, total: 13 s
Wall time: 13 s


In [88]:
# Sanity check: top-left 5x5 corner of the rating matrix (0 = not rated)
print(usercf.R[0:5, 0:5])

[[5 3 4 3 3]
 [4 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]


In [None]:
%%time
for line in usercf.s_u:
    if line.max() != 1.0 and np.abs(line).max() != 0.0:
        print(line.max())
        break

In [87]:
# Sweep the neighbourhood size for user-based CF and print [RMSE, MAE] per K.
# The former `K = 50` was dead code: the loop variable overwrote it at once.
# After the loop `K` and `ratings_predict_user_based` hold the values of the
# last candidate (55); later cells reuse both.
for K in [30, 40, 45, 50, 55]:
    ratings_predict_user_based = usercf.user_based_performance(records_test, K)
    # score() takes (ground truth, prediction); predictions are clipped to
    # the valid 1..5 rating range first.
    print(score(ratings_test, np.clip(ratings_predict_user_based, 1, 5)))

[0.9559, 0.746]
[0.9542, 0.7447]
[0.9542, 0.7446]
[0.9542, 0.7446]
[0.9543, 0.7448]


In [18]:
%%time
ratings_predict_item_based = usercf.item_based_performance(records_test, K)

print(score(np.clip(ratings_predict_item_based, 1, 5), ratings_test))

[0.9981, 0.7776]
[0.9757, 0.7616]
[0.9688, 0.7563]
[0.9678, 0.7556]
[0.9664, 0.755]
[0.9655, 0.7542]
[0.9658, 0.7544]
[0.9665, 0.755]
[0.9673, 0.7558]
[0.968, 0.7566]
CPU times: user 21.7 s, sys: 39.2 ms, total: 21.7 s
Wall time: 21.7 s


In [None]:
# Hybrid rule: average the item-based and user-based predictions, clip to
# the 1..5 rating range, and score as (ground truth, prediction).
ratings_predict_hybrid = (ratings_predict_item_based + ratings_predict_user_based) / 2
print(score(ratings_test, np.clip(ratings_predict_hybrid, 1, 5)))

In [None]:
# One [RMSE, MAE] row per prediction rule: user-based, item-based, hybrid.
# score() takes (ground truth, prediction); predictions are clipped to 1..5.
results = np.array([
    score(ratings_test, np.clip(prediction, 1, 5))
    for prediction in (
        ratings_predict_user_based,
        ratings_predict_item_based,
        (ratings_predict_item_based + ratings_predict_user_based) / 2,
    )
])

In [None]:
# Display the score table: one row per prediction rule, two score columns
results

In [None]:
# Plot the two scores of every prediction rule.
# The tick labels are stored in `metric_names`, not `metrics`, so that the
# `sklearn.metrics` module imported at the top stays usable by `score()`.
metric_names = ['RMSE', 'MAE']  # score() returns [RMSE, mean absolute error]
labels = ['user-based CF', 'item-based CF', 'hybrid CF']

# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and removed
# later; prefer the renamed style and fall back on older installations.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
fig, ax = plt.subplots()

for result, label in zip(results, labels):
    ax.plot(metric_names, result, label=label)

# Legend below the axes; one major tick per metric
ax.legend(fontsize="x-large", loc=2, bbox_to_anchor=(0.04, -0.2), borderaxespad=0.)
ax.xaxis.set_major_locator(MultipleLocator(1))

# Format plot
ax.set_title("Comparision between different prediction rules", fontsize=24)
ax.set_xlabel('Metrics', fontsize=16)
ax.set_ylabel("Performance", fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=16)

# Leave some room left and right of the two categorical ticks (0 and 1)
ax.set_xlim(-0.4, 1.4)

plt.show()

# Score pair noted from an earlier run (presumably [RMSE, MAE] - unverified):
#[0.9527, 0.7439]

In [79]:
# Scratch check: element-wise division of two equal integer arrays gives floats
tt = np.arange(1, 6)
gg = tt.copy()
tt / gg

array([1., 1., 1., 1., 1.])

In [30]:
# Scratch check: standardise `tt` (subtract the mean, divide by the std)
(tt - tt.mean()) / tt.std()

array([[-1.41421356, -0.70710678,  0.        ,  0.70710678,  1.41421356]])