# Recommender Systems and Collaborative Filtering

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

%matplotlib inline

# Plot styling: ggplot theme and a wide default figure size.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Tab-separated table holding each user's rating for each film.
filepath = './data/user_ratedmovies.dat'
df_rates = pd.read_table(filepath)

In [None]:
# Tab-separated table with the film id plus more detailed information
# about each film; the file is read with the iso-8859-1 encoding.
filepath = './data/movies.dat'
df_movies = pd.read_table(filepath, encoding='iso-8859-1')

# Encoding film IDs and user IDs

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Separate encoders for user IDs and for movie IDs.
enc_user, enc_mov = LabelEncoder(), LabelEncoder()

In [None]:
# Fit each encoder on the IDs that occur in the ratings table.
enc_user = enc_user.fit(df_rates['userID'].values)
enc_mov = enc_mov.fit(df_rates['movieID'].values)

In [None]:
# Keep only the movies that appear in the ratings table.
# .copy() detaches the filtered frame from the original one, so the later
# in-place assignment to df_movies' `id` column operates on an independent
# DataFrame instead of a slice (avoids pandas' SettingWithCopyWarning).
idx = df_movies['id'].isin(df_rates['movieID'])
df_movies = df_movies.loc[idx].copy()

In [None]:
# Replace the raw IDs with the labels produced by the fitted encoders.
# The movie table is transformed with the same enc_mov as the ratings, so
# df_movies.id and df_rates.movieID share one label space (df_movies was
# already restricted to movies present in df_rates, which enc_mov was fit on).
df_rates.loc[:, 'userID'] = enc_user.transform(df_rates.loc[:, 'userID'].values)
df_rates.loc[:, 'movieID'] = enc_mov.transform(df_rates.loc[:, 'movieID'].values)
df_movies.loc[:, 'id'] = enc_mov.transform(df_movies.loc[:, 'id'].values)

In [None]:
# Preview the first five rows of the encoded ratings table.
df_rates.head(5)

## Building a Matrix of ratings

In [None]:
from scipy.sparse import coo_matrix, csr_matrix

In [None]:
# Sparse ratings matrix in COO form: entry (userID, movieID) holds the rating.
R = coo_matrix(
    (
        df_rates['rating'].values,
        (df_rates['userID'].values, df_rates['movieID'].values),
    )
)

In [None]:
# Display the sparse ratings matrix (notebook cell output).
R

## Similarity between users

In the future, it will be more convenient for us to work with the 'Compressed Sparse Row matrix format'. Fortunately, we can transform the resulting matrix with one command:

In [None]:
# Switch the ratings matrix to Compressed Sparse Row storage.
R = R.asformat('csr')

Now, for example, the ratings for the first user can be obtained like this:

In [None]:
# Row 0 of the ratings matrix: all ratings given by the first user.
user_1 = R[0, :]
user_1

In [None]:
# `user_2` was never defined anywhere in the notebook, so this cell raised a
# NameError; take the second row of R as the second user.
user_2 = R[1]
# Dot product of the two rating rows; the product is a 1x1 sparse matrix,
# so [0, 0] extracts the scalar.
user_1.dot(user_2.T)[0, 0]

And we can transform a sparse matrix (vector) into a dense one using the following command:

In [None]:
# toarray() converts the sparse row into a dense array.
user_1_dense = user_1.toarray()
user_1_dense

Let's implement a function that calculates the similarity between a pair of users $u$ and $v$:

$$ s_{uv} = \frac{\sum\limits_{i \in I_u\cap I_v} R_{ui} R_{vi}}{\sqrt{{\sum\limits_{i \in I_u\cap I_v}R_{ui}^2}}\sqrt{{\sum\limits_{i \in I_u\cap I_v}R_{vi}^2}}}$$

We assume that if users $u$ and $v$ have rated at most 2 movies in common (i.e. $|I_u \cap I_v| \le 2$), then their cosine similarity is 0.0.


In [None]:
def cosine_similarity_pair_users(u, v, min_common=2):
    """Cosine similarity of two users, restricted to their co-rated items.

    Parameters
    ----------
    u, v : scipy.sparse matrix of shape (1, n_items)
        Rating rows of the two users. An entry equal to zero (or not
        stored) is treated as "not rated".
    min_common : int, optional
        Overlap threshold: the similarity is 0.0 unless the two users have
        rated MORE than ``min_common`` items in common. Defaults to 2.

    Returns
    -------
    float
        ``sum(u_i * v_i) / (||u|| * ||v||)`` with the sum and both norms
        taken over the co-rated items only, or 0.0 when the overlap is
        too small.
    """
    # Sparse boolean mask of the items rated by both users.
    common_items = (u != 0).multiply(v != 0)
    if common_items.nnz <= min_common:
        return 0.0

    # Only co-rated items contribute to the dot product, because any other
    # item has a zero in at least one of the two rows.
    scalar = u.dot(v.T)[0, 0]

    # Mask each row down to the co-rated items while staying sparse, then
    # take the Euclidean norm as sqrt(sum of squares). This avoids boolean
    # fancy-indexing of a sparse matrix, which builds a dense intermediate.
    u_common = u.multiply(common_items)
    v_common = v.multiply(common_items)
    norm_u = np.sqrt(u_common.multiply(u_common).sum())
    norm_v = np.sqrt(v_common.multiply(v_common).sum())
    return scalar / (norm_u * norm_v)

## Rating prediction function

Let's implement a function that takes as input:
* User's index
* Matrix of ratings
* The number of nearest neighbors (despite the fact that each user is a closest neighbor to itself, it should not be used in calculations)

and returns a vector with predicted ratings for all products for this user.

In order to calculate the rating forecast, we will use the simplified formula for User-Based Recommender Systems:

$$ \hat{R}_{ui} = \frac{\sum_{v \in N(u)} s_{uv}R_{vi}}{\sum_{v \in N(u)} \left| s_{uv}\right|} $$


### _Solution_

In [None]:
def rate_items_user(u, R, n_neigbours=30):
    """Predict ratings of every item for user ``u`` from similar users.

    Implements the simplified user-based formula
    ``R_hat[u, i] = sum_v s_uv * R[v, i] / sum_v |s_uv|``
    where ``v`` runs over the nearest neighbours of ``u``.

    Parameters
    ----------
    u : int
        Row index of the target user in ``R``.
    R : scipy.sparse.csr_matrix of shape (n_users, n_items)
        Ratings matrix; must support row indexing.
    n_neigbours : int, optional
        Number of most similar users to aggregate over. The target user is
        removed from the ranking before the top ``n_neigbours`` are taken,
        so exactly that many other users are used whenever enough exist.
        (The parameter name keeps its original spelling so existing
        keyword calls continue to work.)

    Returns
    -------
    scipy.sparse.csr_matrix of shape (1, n_items)
        Predicted ratings. All zeros when no neighbour has a non-zero
        similarity, instead of dividing by zero.
    """
    n_users, n_items = R.shape
    target = R[u]  # hoisted: the same row is compared against every user

    s = np.array(
        [cosine_similarity_pair_users(target, R[v]) for v in range(n_users)]
    )

    # Rank users by descending similarity and drop the target user BEFORE
    # truncating; otherwise the user's own slot would silently reduce the
    # number of neighbours actually used.
    ranked = np.argsort(s)[::-1]
    neighbours = ranked[ranked != u][:n_neigbours]

    weights = s[neighbours]
    cumsim = np.abs(weights).sum()
    if cumsim == 0:
        # No usable neighbours: nothing can be predicted.
        return csr_matrix((1, n_items))

    # (1 x k) weight row times (k x n_items) neighbour ratings gives the
    # similarity-weighted sum for every item in one sparse product.
    weighted_sum = csr_matrix(weights.reshape(1, -1)).dot(R[neighbours])
    return weighted_sum / cumsim

In [None]:
# Predicted ratings for the user at row index 20, using 30 neighbours.
R_hat = rate_items_user(u=20, R=R, n_neigbours=30)

In [None]:
# Dense boolean mask of the items user 20 has already rated.
# (Comparing a sparse row with `== 0` is the inefficient direction: it
# produces an almost completely filled "sparse" matrix. `!= 0` stays sparse.)
rated_items = (R[20] != 0).toarray()[0]
# Dense vector of predictions with the already-rated items zeroed out, so
# only unseen items can rank at the top.
unseen_ratings = R_hat.toarray()[0]
unseen_ratings[rated_items] = 0.0

In [None]:
# Item indices ordered from the highest to the lowest predicted rating.
idx = np.argsort(unseen_ratings)[::-1]

In [None]:
# Predicted ratings in descending order.
unseen_ratings.take(idx)

In [None]:
# First five entries of idx, i.e. the five highest predicted ratings
# (idx is ordered by descending predicted rating).
top5 = idx[:5]

In [None]:
top5  # answer: column indices of R (encoded movie IDs) of the top-5 recommendations