# Recommender System

This is a recommender system using collaborating method. I used Matrix Factorisation algorithm to build the recommender. The dataset used is a real world rating matrix provided by Dabolinux Technology Company. 

First we start by downloading the dataset:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_rows = 10
#from mxnet import autograd, gluon, 
import numpy as np
#from mxnet.gluon import nn
#import mxnet as mx
#npx.set_np()
from IPython import display
import math
import time

In [None]:
names = [i for i in range(1, 5826)]
df = pd.read_excel('rtmatric.xlsx', names=names)
df = pd.DataFrame(df, index=[i for i in range(1, 338)])
df.index.name, df.columns.name ='userid', 'itemid'
df

# Statistics of the Dataset

Next, we are going to inspect the data manually:

In [None]:
num_users = df.shape[0]
num_items = df.shape[1]
sparsity = 1 - len(df) / (num_users * num_items)

In [None]:
print('number of users: %d, number of items: %d.' % (num_users, num_items))
print('matrix sparsity: %f' % sparsity)
df.max().sort_values(ascending=False)

The sparsity of the data shows that the data is very sparse,(sparsity=99.9%).

In [None]:
df.info()

In [None]:
df.isnull()

Then, we plot the distribution of different rating matrix:

In [None]:
ratings = df.values.reshape(-1, 1)

In [None]:
plt.hist(ratings, bins=8, ec='black')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Distribution of Rating Matrix')
plt.show()

We can see that majority of the rating matrix is between -0.25 to 0.2.

# Splitting the dataset

Next, We split the dataset into training and test sets.

In [None]:
def split_df(df, num_users, num_items, split_mode='random', test_ratio=0.1):

  if split_mode == 'random':
    mask = [True if x == 1 else False for x in np.random.uniform(0, 1, (len(df))) < 1 - test_ratio]
    neg_mask = [not x for x in mask]
    train_data, test_data = df[mask], df[neg_mask]
    return train_data, test_data

In [None]:
train_data, test_data = split_df(df, num_users, num_items, split_mode='random', test_ratio=0.1)

In [11]:
test_data

itemid,1,2,3,4,5,6,7,8,9,10,...,5816,5817,5818,5819,5820,5821,5822,5823,5824,5825
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.866,0.227,0.233,0.220,0.221,0.366,0.836,0.863,0.372,0.619,...,-1,0.413,0.150,0.173,0.091,0.568,0.092,0.164,0.090,0.095
4,1.833,0.251,0.257,0.261,0.239,0.254,0.896,0.920,0.421,0.860,...,-1,0.390,0.195,0.476,0.296,0.340,0.251,1.059,0.412,0.247
18,3.236,0.268,0.278,0.259,0.260,2.493,0.688,1.053,0.467,0.813,...,-1,0.615,0.337,0.713,0.537,0.448,0.490,0.490,0.481,0.460
21,6.892,0.268,0.276,0.259,1.364,0.283,0.657,1.007,0.449,0.925,...,-1,0.507,0.121,0.708,0.204,0.310,0.220,0.319,0.204,0.212
56,0.814,0.219,0.229,0.211,0.212,0.453,0.880,0.907,0.354,0.580,...,-1,0.256,0.053,0.460,0.201,0.175,0.232,0.318,0.196,0.212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,6.022,0.321,0.328,0.311,0.313,6.582,5.550,0.828,0.563,6.126,...,-1,5.661,5.466,5.801,5.381,5.383,5.450,5.443,5.530,5.384
310,0.861,0.225,0.232,0.211,0.212,0.411,0.541,0.534,0.372,0.540,...,-1,0.461,0.221,0.457,0.064,0.091,0.089,0.093,0.064,0.093
311,6.247,0.239,0.295,0.229,0.227,0.448,0.563,0.866,0.398,0.677,...,-1,0.618,18.519,0.586,0.400,0.435,0.383,0.330,0.496,0.417
312,8.351,0.233,0.245,0.227,0.225,0.333,1.012,0.966,0.401,0.610,...,-1,0.299,0.128,0.183,0.139,0.146,0.237,0.168,0.134,0.134


# Loading the data

After dataset splitting, we will convert the training set and test set into lists and dictionaries/matrix
for the sake of convenience.

In [12]:
def load_df(df, num_users, num_items, feedback='explicit'):
    users, items, scores = [], [], []
    inter = np.zeros((num_items, num_users)) if feedback == 'explicit' else {}
    for line in df.itertuples():
        user_index, item_index = int(line[1] - 1), int(line[2] - 1)
        score = int(line[3]) if feedback == 'explicit' else 1
        users.append(user_index)
        items.append(item_index)
        scores.append(score)
        if feedback == 'implicit':
            inter.setdefault(user_index, []).append(item_index)
        else:
            inter[item_index, user_index] = score
    return users, items, scores, inter

Afterwards, we put the above two functions together and it will be used during training the model.

In [13]:
def split_and_load(split_mode='random', feedback='explicit', test_ratio=0.1, batch_size=256):
    df3, num_users, num_items = df, df.shape[0], df.shape[1]
    train_data, test_data = split_df(df3, num_users, num_items, split_mode, test_ratio)
    train_u, train_i, train_r, _ = load_df(train_data, num_users, num_items, feedback)
    test_u, test_i, test_r, _ = load_df(test_data, num_users, num_items, feedback)
    train_set = gluon.data.ArrayDataset(np.array(train_u), np.array(train_i), np.array(train_r))
    test_set = gluon.data.ArrayDataset(np.array(test_u), np.array(test_i), np.array(test_r))
    train_iter = gluon.data.DataLoader(train_set, shuffle=True, last_batch='rollover', batch_size=batch_size)
    test_iter = gluon.data.DataLoader(test_set, batch_size=batch_size)
    return num_users, num_items, train_iter, test_iter

Next, we define two functions which will allow the model to use GPUs or CPU otherwise: 

In [14]:
def try_gpu(i=0):
  return npx.gpu(i) if npx.num_gpus() >= i + 1 else npx.cpu()

def try_all_gpus():
  ctxes = [npx.gpu(i) for i in range(npx.num_gpus())]
  return ctxes if ctxes else [npx.cpu()]
try_gpu(), try_gpu(3), try_all_gpus()

NameError: name 'npx' is not defined

Then, we create a function that will record multiple running times: 

In [15]:
import math
import time
class Timer:
  def __init__(self):
    self.times = []
    self.start()
  def start(self):
    self.tik = time.time()
  def stop(self):
    self.times.append(time.time() - self.tik)
    return self.times[-1]
  def avg(self):
    return sum(self.times) / len(self.times)
  def sum(self):
    return sum(self.times)
  def cumsum(self):
    return np.array(self.times).cumsum().tolist()

Then, we create a function and use the svg format display in the jupyter.

In [16]:
def use_svg_display():
  display.set_matplotlib_formats('svg')

Then, a function that set the axes for matplotlib:

In [17]:
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
  axes.set_xlabel(xlabel)
  axes.set_ylabel(ylabel)
  axes.set_xscale(xscale)
  axes.set_yscale(yscale)
  axes.set_xlim(xlim)
  axes.set_ylim(ylim)
  if legend:
    axes.legend(legend)
    axes.grid()

Then, a function that incrementally plot the lines and add multiple datapoints into the figure:

In [18]:
class Animator:
  def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None, ylim=None, xscale='linear', yscale='linear', fmts=None, nrows=1, ncols=1, figsize=(4, 5)):
    if legend is None:
      legend = []
    use_svg_display()
    self.fig, self.axes = plt.subplots(nrows, ncols, figsize=figsize)
    if nrows * ncols == 1:
      self.axes = [self.axes, ]
    self.config_axes = lambda: set_axes(self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
    self.X, self.Y, self.fmts = None, None, fmts

  def add(self, x, y):
    if not hasattr(y, "__len__"):
      y = [y]
    n = len(y)
    if not hasattr(x, "__len__"):
      x = [x] * n
    if not self.X:
      self.X = [[] for _ in range(n)]
    if not self.Y:
      self.Y = [[] for _ in range(n)]
    if not self.fmts:
      self.fmts = ['-'] * n
    for i, (a, b) in enumerate(zip(x, y)):
      if a is not None and b is not None:
        self.X[i].append(a)
        self.Y[i].append(b)
    self.axes[0].cla()
    for x, y, fmt in zip(self.X, self.Y, self.fmts):
      self.axes[0].plot(x, y, fmt)
    self.config_axes()
    display.display(self.fig)
    display.clear_output(wait=True)

Then, a function that create a list of numbers over time:

In [19]:
class Accumulator:
  def __init__(self, n):
    self.data = [0.0] * n
  def add(self, *args):
    self.data = [a+float(b) for a, b in zip(self.data, args)]
  def reset(self):
    self.data = [0] * len(self.data)
  def __getitem__(self, idx):
    return self.data[idx]

# Model Implementation

In [20]:


import math as mt
from scipy.sparse.linalg import * 
from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix



def comp_svd(urm, K):
    U, s, Vt = svds(urm, K)

    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        S[i,i] = mt.sqrt(s[i])

    U = csc_matrix(U, dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)
    
    return U, S, Vt

def comp_est_matx(urm, U, S, Vt, uTest, K, test):
    rightTerm = S*Vt 
    max_recommendation = 250
    estimatedRatings = np.zeros(shape=(MAX_UID, MAX_PID), dtype=np.float16)
    recomendRatings = np.zeros(shape=(MAX_UID,max_recommendation ), dtype=np.float16)
    for userTest in uTest:
        prod = U[userTest, :]*rightTerm
        estimatedRatings[userTest, :] = prod.todense()
        recomendRatings[userTest, :] = (-estimatedRatings[userTest, :]).argsort()[:max_recommendation]
    return recomendRatings

In [21]:
from scipy.sparse import coo_matrix
from scipy import sparse
data_array = df.values
row_array = df.iloc[1:338].values
col_array = df[1:5826].values
data_sparse = coo_matrix((data_array, (row_array, col_array)))
#data_sparse = sparse.csr_matrix(data_array)


K=50
urm = data_sparse
MAX_PID = urm.shape[1]
MAX_UID = urm.shape[0]

U, S, Vt = comp_svd(urm, K)
uTest = [10]
#Get estimated rating for test user
print("Predicted ratings:")
uTest_recommended_items = comp_est_matx(urm, U, S, Vt, uTest, K, True)
#uTest_recommended_items

for user in uTest:
    print("Recommendation for user with user id {}". format(user))
    rank_value = 1
    for i in uTest_recommended_items[user,0:10]:
        item_details = train_data.columns == i
        print("The number {} recommended item is {} ".format(rank_value, list(item_details)[0]))
        rank_value+=1
        

TypeError: 'numpy.float64' object cannot be interpreted as an integer

# Evaluation Measures

We then implement the RMSE (root-mean-square error) measure:

# Training and Evaluating the Model

In [22]:
from surprise import Dataset
from surprise import Reader

In [24]:
from lightfm import LightFM

In [25]:
train_sparse = sparse.csr_matrix(train_data)
test_sparse = sparse.csr_matrix(test_data)



<337x5825 sparse matrix of type '<class 'numpy.float64'>'
	with 1963025 stored elements in Compressed Sparse Row format>

In [None]:
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

model = LightFM(learning_rate=0.05, loss='bpr')
model.fit(data_sparse, epochs=10, num_threads=4)