<a href="https://colab.research.google.com/github/bvaisakh/rec_sys/blob/master/Collaborative_Filtering_Model_Based_%5BFunk_SVD%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# importing libraries

import pandas as pd
import numpy as np

In [0]:
# defining filtering class

class CF():

    # initializing the user-product rating matrix, no. of latent features, alpha and beta.
    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    # initializing user-feature and product-feature matrix 
    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # initializing the bias terms
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # list of training samples
        self.samples = [
        (i, j, self.R[i, j])
        for i in range(self.num_users)
        for j in range(self.num_items)
        if self.R[i, j] > 0
        ]

        # stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse = np.sqrt(self.mse())
            training_process.append((i, rmse))
            if (i+1) % 20 == 0:
                print("Iteration: %d; error = %.4f" % (i+1, rmse))

    # computing total mean squared error
    def mse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        mse = error / (self.full_matrix().shape[0] * self.full_matrix().shape[1])
        return mse

    # stochastic gradient descent to get optimized P and Q matrix
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    # ratings for user i and movie j
    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    # full user-product rating matrix
    def full_matrix(self):
        return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.P.dot(self.Q.T)

In [6]:
# loading the training data

r_cols = ['user_id', 'product_id', 'rating', 'unix_timestamp']

ratings_train = pd.read_csv('ratings_train.data', sep='\t', names=r_cols,encoding='latin-1')
print("\nTraining Data:")
print("shape : ", ratings_train.shape)
print(ratings_train.head())

training_data = np.array(ratings_train.pivot(index = 'user_id', columns ='product_id', values = 'rating').fillna(0))
print("\nRating Matrix:")
print("shape : ", training_data.shape)
print(training_data)


Training Data:
shape :  (90570, 4)
   user_id  product_id  rating  unix_timestamp
0        1           1       5       874965758
1        1           2       3       876893171
2        1           3       4       878542960
3        1           4       3       876893119
4        1           5       3       889751712

Rating Matrix:
shape :  (943, 1680)
[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]


In [7]:
# initializes filtering class and trains

cf = CF(training_data, K=20, alpha=0.001, beta=0.01, iterations=100)
cf.train()

Iteration: 20; error = 0.2239
Iteration: 40; error = 0.2199
Iteration: 60; error = 0.2173
Iteration: 80; error = 0.2135
Iteration: 100; error = 0.2069


In [8]:
# loads the test data

ratings_test = pd.read_csv('ratings_test.data', sep='\t', names=r_cols,encoding='latin-1')
print("\nTest Data:")
print("shape : ", ratings_test.shape)
print(ratings_test.head())

test_data= np.array(ratings_test.pivot(index = 'user_id', columns ='product_id', values = 'rating').fillna(0))


Test Data:
shape :  (9430, 4)
   user_id  product_id  rating  unix_timestamp
0        1          20       4       887431883
1        1          33       4       878542699
2        1          61       4       878542420
3        1         117       3       874965739
4        1         155       2       878542201


In [9]:
# generates the test results and measures the accuracy

n_users = ratings_test.user_id.unique().shape[0]
n_items = ratings_test.product_id.unique().shape[0]

test_results = [
(i, j, test_data[i, j], cf.get_rating(i, j))
for i in range(n_users)
for j in range(n_items)
if test_data[i, j] > 0
]

targets = np.asarray([result[2] for result in test_results])
predictions = np.asarray([result[3] for result in test_results])
rmse = np.sqrt(((predictions - targets) ** 2).mean())

print("RMSE on test data: {}".format(rmse))

RMSE on test data: 1.1328672357296004


In [10]:
test_results

[(0, 18, 4.0, 4.024793590901463),
 (0, 31, 4.0, 3.8108878281615355),
 (0, 54, 4.0, 3.999355046047323),
 (0, 106, 3.0, 3.560711528399995),
 (0, 143, 2.0, 4.186890739320551),
 (0, 147, 4.0, 3.063396594760983),
 (0, 158, 5.0, 3.2145598498787913),
 (0, 176, 3.0, 4.073415670624072),
 (0, 189, 5.0, 4.340697509651078),
 (0, 251, 4.0, 2.7932490566587287),
 (1, 12, 4.0, 3.5657287975197884),
 (1, 43, 5.0, 3.466154747770046),
 (1, 237, 5.0, 4.025856847102739),
 (1, 266, 3.0, 3.5211432256716897),
 (1, 267, 3.0, 3.9325501775109855),
 (1, 276, 3.0, 3.6296124179466096),
 (1, 278, 4.0, 3.515872369794113),
 (1, 283, 4.0, 3.3319333702351503),
 (1, 298, 3.0, 3.0698013115152087),
 (1, 300, 1.0, 3.432496682866042),
 (2, 232, 1.0, 2.9494113713810854),
 (2, 280, 2.0, 2.7824783338952703),
 (2, 308, 2.0, 2.492494476116184),
 (2, 313, 5.0, 2.530762626031869),
 (2, 316, 4.0, 3.5290098818921307),
 (2, 317, 1.0, 4.1411376539293965),
 (2, 319, 3.0, 3.7314251392702102),
 (2, 320, 1.0, 2.7688847119869417),
 (2, 322, 