# Lab-7 - Collaborative filtering

## Before you start

- Notebook tasks can be done individually or in a group of two
- Please save notebooks with outputs filled in - this will speed up the checking process
- Send notebooks with solutions via email:
  - To: michal.wojcik@doctorate.put.poznan.pl
  - Subject format example: [IR] Lab 7 - Jan Kowalski 123456, Anna Nowak 789012
  - Attach: notebook file
- Deadline - 14 days after the class
- All of the tasks require implementation - complete the code
- The number of points for each task is next to the command


## Dataset

Data source: https://grouplens.org/datasets/movielens/ (File [ml-100k.zip](https://files.grouplens.org/datasets/movielens/ml-100k.zip))

- 943 users
- 1682 items
- 100000 ratings (1-5 scale)

### Team members:

- Sofya Aksenyuk, 150284;

- Uladzimir Ivashka, 150281.

# Imports

In [1]:
import os
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np
from surprise import AlgoBase, SlopeOne, KNNBasic, NormalPredictor, SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
import random

pd.options.display.max_columns = 50

# Read data

In [2]:
train_set_path = 'data/ua.base'
test_set_path = 'data/ua.test'
sets_column_names = ['User_ID', 'Item_ID', 'Rating', 'Timestamp']

items_path = 'data/u.item'
items_column_names = [
    'Item_ID', 'Movie_Title', 'Release_Date', 'Video_Release_Date', 
    'IMDb_URL', 'Unknown', 'Action', 'Adventure', 'Animation', 
    'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 
    'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

users_path = 'data/u.user'
users_column_names=['User_ID', 'Age', 'Gender', 'Occupation', 'Zip_Code']

train_set = pd.read_csv(train_set_path, sep='\t', names=sets_column_names, encoding = "ISO-8859-1")
display(train_set)

test_set = pd.read_csv(test_set_path, sep='\t', names=sets_column_names, encoding = "ISO-8859-1")
display(test_set)

items = pd.read_csv(items_path, sep='|', names=items_column_names, encoding = "ISO-8859-1")
display(items)

users = pd.read_csv(users_path, sep='|', names=users_column_names, encoding = "ISO-8859-1")
display(users)

Unnamed: 0,User_ID,Item_ID,Rating,Timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
90565,943,1047,2,875502146
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275


Unnamed: 0,User_ID,Item_ID,Rating,Timestamp
0,1,20,4,887431883
1,1,33,4,878542699
2,1,61,4,878542420
3,1,117,3,874965739
4,1,155,2,878542201
...,...,...,...,...
9425,943,232,4,888639867
9426,943,356,4,888639598
9427,943,570,1,888640125
9428,943,808,4,888639868


Unnamed: 0,Item_ID,Movie_Title,Release_Date,Video_Release_Date,IMDb_URL,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,User_ID,Age,Gender,Occupation,Zip_Code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [3]:
users_agg = train_set[['User_ID', 'Rating']].groupby('User_ID').agg(['mean', 'count'])
display(users_agg.sort_values([('Rating', 'mean')]))
display(users_agg.sort_values([('Rating', 'count')]))

Unnamed: 0_level_0,Rating,Rating
Unnamed: 0_level_1,mean,count
User_ID,Unnamed: 1_level_2,Unnamed: 2_level_2
181,1.489412,425
405,1.839065,727
445,2.000000,125
774,2.074766,214
685,2.100000,10
...,...,...
583,4.647059,17
225,4.647059,17
507,4.708333,48
849,4.846154,13


Unnamed: 0_level_0,Rating,Rating
Unnamed: 0_level_1,mean,count
User_ID,Unnamed: 1_level_2,Unnamed: 2_level_2
202,2.700000,10
441,3.700000,10
685,2.100000,10
34,3.800000,10
36,4.000000,10
...,...,...
276,3.474409,508
450,3.860377,530
13,3.089457,626
655,2.909630,675


In [4]:
items_agg = train_set[['Item_ID', 'Rating']].groupby('Item_ID').agg(['mean', 'count'])
display(items_agg.sort_values([('Rating', 'mean')]))
display(items_agg.sort_values([('Rating', 'count')]))

Unnamed: 0_level_0,Rating,Rating
Unnamed: 0_level_1,mean,count
Item_ID,Unnamed: 1_level_2,Unnamed: 2_level_2
1486,1.0,1
1548,1.0,1
1343,1.0,1
830,1.0,1
1617,1.0,1
...,...,...
1656,5.0,1
1189,5.0,3
1122,5.0,1
1293,5.0,2


Unnamed: 0_level_0,Rating,Rating
Unnamed: 0_level_1,mean,count
Item_ID,Unnamed: 1_level_2,Unnamed: 2_level_2
1682,3.000000,1
1571,1.000000,1
1570,1.000000,1
1569,1.000000,1
1568,1.000000,1
...,...,...
286,3.692500,400
258,3.791262,412
181,4.011390,439
100,4.148984,443


# Helpers

In [5]:
def print_true_value(dataset, user_id, item_id):
    true_df = dataset[(dataset['User_ID'] == user_id) & (dataset['Item_ID'] == item_id)]
    if len(true_df):
        print("TRUE VALUE:", true_df.iloc[0]['Rating'])
    else:
        print("TRUE VALUE: UNKNOWN")

# Models

#### *Surprise* package

"Surprise is a Python scikit for building and analyzing recommender systems that deal with explicit rating data."

The package will be used in several places. Link to the documentation: https://surprise.readthedocs.io/en/stable/index.html

#### Task 0 [3p]

- Convert dataframes (*train_set* and *test_set*) to a form that will allow you to perform *fit* and *predict* on models from the *Surprise* package **[2p]**
- Test the solution on a model that performs random predictions for the user $289$ and item $815$ **[1p]**

In [6]:
# TODO - load dataframes to Surprise framework train_set and test_set form [2p]

# Hint:
# Reader - https://surprise.readthedocs.io/en/stable/reader.html?highlight=reader#surprise.reader.Reader
# Dataset - https://surprise.readthedocs.io/en/stable/dataset.html?highlight=Dataset#surprise.dataset.Dataset
# load_from_df - https://surprise.readthedocs.io/en/stable/dataset.html?highlight=build_full_trainset#surprise.dataset.Dataset.load_from_df
# build_full_trainset - https://surprise.readthedocs.io/en/stable/dataset.html?highlight=build_full_trainset#surprise.dataset.DatasetAutoFolds.build_full_trainset
# build_testset - https://surprise.readthedocs.io/en/stable/trainset.html#surprise.Trainset.build_testset

reader = Reader()
train_set_data = Dataset.load_from_df(train_set[["User_ID", "Item_ID", "Rating"]], reader)
test_set_data = Dataset.load_from_df(test_set[["User_ID", "Item_ID", "Rating"]], reader)

trainset = train_set_data.build_full_trainset()
testset = test_set_data.build_full_trainset()

testset = testset.build_testset()

In [7]:
# TODO - test the solution with the random model [1p]

# Hint:
# NormalPredictor - https://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor

user_id = 289
item_id = 815

model = NormalPredictor()
model.fit(trainset)
model.predict(user_id, item_id)

Prediction(uid=289, iid=815, r_ui=None, est=3.2831250517585047, details={'was_impossible': False})

#### Symbols

- $r_{ui}$ - Rating given by the user $u$ to item $i$
- $\hat{r}_{ui}$ - Prediction of the rating given by the user $u$ to item $i$
- $\mu_u$ - Average rating provided by the user $u$ to all of the rated items
- $R_i(u)$ - Set of items that was rated by $u$ and was rated by at least one other user that also rated item $i$
- $U_{i}$ - A set of users who rated item $i$
- $U_{ij}$ - A set of users who rated both item $i$ and $j$
- $I_{u}$ - A set of items which was rated by $u$
- $I_{uv}$ - A set of items which was rated both by $u$ and $v$
- $\text{dev}(i, j)$ - Average difference in ratings between items $i$ and $j$

## 1) Average rating per Item

**General idea:**

The user $u$ will rate the item $i$ according to the average rating for $i$ provided by other users:

$$
\hat{r}_{ui} = \frac{\sum\limits_{v \in U_{i}} r_{vi}}{|U_{i}|}
$$

### Task 1 [4p]

- Implement the code that executes the above algorithm **[3p]**
  - Use *train_set* for training and predict the *test_set* ratings
- Calculate the root mean squared error (RMSE) on the *test_set* predictions **[1p]**

In [8]:
# TODO - Prepare a prediction for a test set based on average ratings for items in the train set [3p]
ruis = dict()

for item in train_set['Item_ID'].unique():
    x = train_set[train_set['Item_ID'] == item]
    ruis[item] = np.mean(x['Rating'])

In [9]:
# TODO - Measure RMSE for test set [1p]

# Hint: mean_squared_error - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn-metrics-mean-squared-error

pred = [ruis[i] if i in ruis else -1 for i in test_set['Item_ID']] ## -1 when not in train_set

print(f'Amount of missing Item_IDs that are not present in train_set, but are in test_set: {pred.count(-1)}')

mse = mean_squared_error(pred, test_set['Rating'])
print(f'MSE: {mse}')

Amount of missing Item_IDs that are not present in train_set, but are in test_set: 2
MSE: 1.0886091212940086


## 2) Slope One

**General idea:**

In order to predict the user $u$ rating for the item $i$, check how any other item ($j$) was rated by $u$ and what was the average difference between the ratings of items $i$ and $j$ among users who rated both of them.

To generalize the above approach to all users and items that are available in the dataset, and then aggregate the estimates into one prediction, it is worth using the following formulas.

$$
\hat{r}_{ui} = \mu_u + \frac{1}{|R_i(u)|}\sum\limits_{j \in R_i(u)} \text{dev}(i, j)
$$

$$
\text{dev}(i, j) = \frac{1}{|U_{ij}|}\sum\limits_{u \in U_{ij}} r_{ui} - r_{uj}
$$

**Algorithm:**

To estimate unknown $\hat{r}_{ui}$:

- Find all items that was rated by the user $u$
- For each of item ($j$):
  - find all of the users ($U_{ij}$) that rated both items ($i$ and $j$)
  - if $U_{ij} \neq \varnothing$, then calculate average difference ($\text{dev}(i, j)$) between ratings for item $i$ and $j$
- calculate average of each $\text{dev}(i, j)$ and add it to the average rating for the user $u$ ($\mu_u$) to get the final estimation

**Note:** This approach differs from the one presented on the lecture - here we use the arithmetic mean instead of the weighted one.

Details: https://surprise.readthedocs.io/en/stable/slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne

### Task 2 [7p]

- Implement a function that will allow you to make a prediction for a single user-item pair, based on available ratings, on the basis of a data set **[4p]**
- Predict the user $70$ rating prediction for the item $50$ **[1p]**
    - Use data from train_set for prediction
    - Print the actual rating value from test_set for the mentioned user-item pair
- Verify the result with the *Surprise* package model **[1p]**
- Calculate RMSE for the *test_set* using the *Surprise* package **[1p]**

In [10]:
# TODO - implement a function [4p]

def slope_one(dataset, user_id, item_id):
    prediction = np.mean(dataset[dataset['User_ID'] == user_id]['Rating'])
    avgs = []
    for item in dataset[(dataset['User_ID'] == user_id) & (dataset['Item_ID'] != item_id)]['Item_ID']:
        diffs = []
        for user in dataset['User_ID'].unique():
            user_data = dataset[dataset['User_ID'] == user]
            if item in user_data['Item_ID'].values and item_id in user_data['Item_ID'].values:
                diff = user_data[user_data['Item_ID'] == item_id]['Rating'].values[0] - \
                       user_data[user_data['Item_ID'] == item]['Rating'].values[0]
                
                diffs.append(diff)
        if len(diffs):
            avg = sum(diffs) / len(diffs)
            avgs.append(avg)
        else:
            continue
    prediction += sum(avgs) / len(avgs)
    
    return prediction

In [11]:
# TODO - Make a prediction based on the train_set data [1p]

user_id = 70
item_id = 50

print_true_value(test_set, user_id, item_id)
print('PREDICTED VALUE: ', slope_one(train_set, user_id, item_id))

TRUE VALUE: 4
PREDICTED VALUE:  4.3715204626187525


In [12]:
# TODO - fit and predict the SlopeOne model from the surprise package on the loaded data and check if you get the same result [1p]
user_id = 70
item_id = 50

so_model = SlopeOne()
so_model.fit(trainset)
so_model.predict(user_id, item_id)

Prediction(uid=70, iid=50, r_ui=None, est=4.3715204626187525, details={'was_impossible': False})

In [13]:
# TODO - calculate the RMSE on the test set [1p]

# Hint: rmse - https://surprise.readthedocs.io/en/stable/accuracy.html#surprise.accuracy.rmse

predictions = so_model.test(testset)
rmse = accuracy.rmse(predictions)

RMSE: 0.9649


## 3) k-NN user-based

**General idea:**

User $u$ will rate item $i$ in a similar way as people who rated other items similarly to $u$. Therefore, it is necessary to select those users for whom:
- the rating for the item $i$ is known
- it is possible to evaluate the similarity to the user $u$

The similarity between users is measurable, e.g.:

$$
\text{cosine_sim}(u, v) = \frac{
\sum\limits_{i \in I_{uv}} r_{ui} \cdot r_{vi}}
{\sqrt{\sum\limits_{i \in I_{uv}} r_{ui}^2} \cdot
\sqrt{\sum\limits_{i \in I_{uv}} r_{vi}^2}
}
$$

$$
\text{msd_distance}(u, v) = \frac{1}{|I_{uv}|} \cdot
\sum\limits_{i \in I_{uv}} (r_{ui} - r_{vi})^2
$$

$$
\text{msd_sim}(u, v) = \frac{1}{\text{msd}(u, v) + 1}
$$

The parameter $k$ must be chosen, which indicates how many nearest neighbors (NN) are being considered. In the basic approach, each of the neighbors *votes* for the rating they gave for the item $i$, and the weight of their vote is the similarity measure to user $u$.

$$
\hat{r}_{ui} = \frac{
\sum\limits_{v \in N^k_i(u)} \text{sim}(u, v) \cdot r_{vi}}
{\sum\limits_{v \in N^k_i(u)} \text{sim}(u, v)}
$$

Details: https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic

### Task 3 [7p]

- Implement a function that will allow you to make a prediction for a single user-item pair, based on available ratings, on the basis of a data set **[4p]**
    - Use mean squared difference (MSD) to estimate the similarity between the users
- Predict the user $262$ rating prediction for the item $1147$ **[1p]**
    - Use data from *train_set* for prediction
    - Print the actual rating value from *test_set* for the mentioned user-item pair
- Verify the result with the *Surprise* package model **[1p]**
- Calculate RMSE for the *test_set* using the *Surprise* package **[1p]**

In [14]:
# TODO - implement a function [4p]

def k_nn(dataset, user_id, item_id, k=40):
    sims = dict()
    main_user_data = dataset[dataset['User_ID'] == user_id]
    
    for user in dataset['User_ID'].unique():
        sim = []
        user_data = dataset[dataset['User_ID'] == user]
        for _, row in user_data.iterrows():
            if row['Item_ID'] in main_user_data['Item_ID'].values:
                diff = row['Rating'] - main_user_data[main_user_data['Item_ID'] == row['Item_ID']]["Rating"].values[0]
                sim.append(diff**2)
        sim = sum(sim) / len(sim) if sim else 0
        sims[user] = 1 / (sim + 1)
        
    sims = sorted(sims.items(), key=lambda x: x[1], reverse=True)

    nominator = 0
    denominator = 0
    count_n = 0
    for user, sim in sims:
        if count_n == 40:
            break
            
        cur = dataset[(dataset['User_ID'] == user) & (dataset['Item_ID'] == item_id)]
        if not cur.empty:
            nominator += sim * cur['Rating'].values[0]
            denominator += sim
            count_n += 1

    prediction = nominator / denominator
    
    return prediction

In [15]:
# TODO - Make a prediction based on the train_set data [1p]

user_id = 262
item_id = 1147

print_true_value(test_set, user_id, item_id)
print(k_nn(train_set, user_id, item_id))

TRUE VALUE: 4
3.702814685918104


In [16]:
# TODO - fit and predict the KNNBasic model on the loaded data and check if you get the same result [1p]

knn_model = KNNBasic()
knn_model.fit(trainset)
knn_model.predict(user_id, item_id)

Computing the msd similarity matrix...
Done computing similarity matrix.


Prediction(uid=262, iid=1147, r_ui=None, est=3.702814685918104, details={'actual_k': 19, 'was_impossible': False})

In [17]:
# TODO - calculate the RMSE on the test set [1p]

predictions = knn_model.test(testset)
rmse = accuracy.rmse(predictions)

RMSE: 0.9963


## Matrix factorization (SVD inspired)

**General idea:**

The user ratings of items are influenced by *latent factors*, which are unknown. It is possible to find such values of these factors that make it possible to reproduce the resulting matrix. Stochastic Gradient Descent (SGD) can be used for this purpose.

In the basic version of the algorithm, we are looking for two matrices $p$ and $q^T$, the dot product of which will recreate known user ratings (and also calculate predictions for unknowns):

$$
\hat{r}_{ui} = p_uq_{i}^T
$$

To find the values of the $p$ and $q$ matrices, the error for known ratings should be minimized:

$$
e_{ui} = \frac{(r_{ui} - \hat{r}_{ui})^{2}}{2}
$$

$$
e_{ui}' = r_{ui} - \hat{r}_{ui}
$$

So at each step we take a different known user rating for the item, compute the error, and then update the $p$ and $q$ matrices, thus minimizing the error.

$$
p_{u}' = p_{u} + \gamma \cdot e_{ui}' \cdot q_{i} \\
q_{i}' = q_{i} + \gamma \cdot e_{ui}' \cdot p_{u}
$$

Where $\gamma$ is *learning rate*.

After the optimization step, predictions for unknown evaluations can be obtained using the dot product for the $p$ and $q^T$ matrices.

More details about the algorithm and the more sophisticated version:
https://surprise.readthedocs.io/en/stable/matrix_factorization.html

### Task 4 [9p]

- Implement a function that optimizes $p$ and $q$ matrices with the SGD algorithm **[6p]**
    - Measure and print the mean error for the entire training set for each epoch to see if it is decreasing
    - Return a matrix that contains predictions for all user-item pairs
- Check the real and predicted rating values for the user $619$ and item $332$ **[1p]**
    - Use data from *train_set* for prediction
    - Print the actual rating value from *test_set* for the mentioned user-item pair
- Check the prediction using the *Surprise* SVN model for the same pair **[1p]**
- Calculate RMSE using the *Surprise* package **[1p]**

In [164]:
# TODO - implement a function [6p]

def matrix_factorization(dataset, epochs=20, learning_rate=0.01, factors=25):
    avg = np.mean(dataset.Rating)
    user_item_matrix = np.ndarray(shape=(np.max(dataset.User_ID.values)+1, 
                                         np.max(dataset.Item_ID.values)+1))
    user_item_matrix[dataset.User_ID.values, dataset.Item_ID.values] = dataset.Rating
    
    n_rows, n_cols = user_item_matrix.shape   
    p = np.full((n_rows, factors), (avg / factors)**(0.5))
    q = np.full((n_cols, factors), (avg / factors)**(0.5))
    rui = p @ q.T
    for epoch in range(epochs):
        error = user_item_matrix - rui
        p_new, q_new = p.copy(), q.copy()
        
        for row in range(len(error)):
            for col in range(len(error[row])):
                if not pd.isna(error[row][col]):
                    p_new[row] = p[row] + learning_rate * error[row][col] * q[col]
                    q_new[col] = q[col] + learning_rate * error[row][col] * p[row]
        
        p, q = p_new.copy(), q_new.copy()
        rui = p @ q.T
        
        mean_error = np.nanmean(error)
        print(f'epoch: {epoch}, error: {mean_error}')
        
    return p @ q.T

In [166]:
# TODO - Make a prediction based on the matrix [1p]

user_id = 619
item_id = 332

print_true_value(test_set, user_id, item_id)
u_i_matrix = matrix_factorization(train_set, factors=50)
print(u_i_matrix[user_id, item_id])

TRUE VALUE: 4
epoch: 0, error: -2.043077723644391
epoch: 1, error: -1.8935503776376015
epoch: 2, error: -1.7606066206090414
epoch: 3, error: -1.6416556316817195
epoch: 4, error: -1.5346236700937739
epoch: 5, error: -1.4378304180812485
epoch: 6, error: -1.3498993576816511
epoch: 7, error: -1.26969169110172
epoch: 8, error: -1.1962568653302847
epoch: 9, error: -1.1287950117922383
epoch: 10, error: -1.0666280710368976
epoch: 11, error: -1.009177338685456
epoch: 12, error: -0.9559458208657734
epoch: 13, error: -0.9065042349551592
epoch: 14, error: -0.8604798035838522
epoch: 15, error: -0.8175472106814291
epoch: 16, error: -0.7774212466719115
epoch: 17, error: -0.739850784836565
epoch: 18, error: -0.7046138152235366
epoch: 19, error: -0.6715133250742991
2.0474721233687436


In [165]:
# Prediction with parameters same as in SVD Surprise package

user_id = 619
item_id = 332

print_true_value(test_set, user_id, item_id)
u_i_matrix = matrix_factorization(train_set, learning_rate=0.005, factors=100)
print(u_i_matrix[user_id, item_id])

TRUE VALUE: 4
epoch: 0, error: -1.6201161191282694
epoch: 1, error: -1.5600004937055618
epoch: 2, error: -1.5030491580711436
epoch: 3, error: -1.449027471570122
epoch: 4, error: -1.397723489609553
epoch: 5, error: -1.3489452745917585
epoch: 6, error: -1.302518581340141
epoch: 7, error: -1.2582848574190522
epoch: 8, error: -1.2160995093680982
epoch: 9, error: -1.1758303944045723
epoch: 10, error: -1.1373565040415978
epoch: 11, error: -1.1005668116670038
epoch: 12, error: -1.0653592606947924
epoch: 13, error: -1.0316398736441428
epoch: 14, error: -0.9993219655822213
epoch: 15, error: -0.9683254479144459
epoch: 16, error: -0.9385762106201013
epoch: 17, error: -0.9100055727928814
epoch: 18, error: -0.8825497928191175
epoch: 19, error: -0.856149630762941
2.6974692234983606


In [82]:
# TODO - fit and predict the SVD model on the loaded data and check the result for the user-item pair [1p]

user_id = 619
item_id = 332

svd_model = SVD()
svd_model.fit(trainset)
svd_model.predict(user_id, item_id)

Prediction(uid=619, iid=332, r_ui=None, est=3.1388847482679605, details={'was_impossible': False})

In [83]:
# TODO - calculate the RMSE on the test set [1p]

predictions = svd_model.test(testset)
rmse = accuracy.rmse(predictions)

RMSE: 0.9529
