# Рекомендательные системы

In [None]:
!pip install --quiet -U jsonlines pyserini jsonlines scikit-surprise fastrank

!git clone --recursive https://github.com/KarypisLab/SLIM.git

!apt install build-essential cmake python-setuptools
!cd SLIM/lib/GKlib && make config openmp=set && make
!cd SLIM/ && make config shared=1 cc=gcc cxx=gcc prefix=~/.local && make install
!python3 -m pip install  --user SLIM/python-package/

# Библиотеки

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from matplotlib import gridspec

from tqdm.notebook import tqdm
import numpy as np
import  pandas as pd
import seaborn as sns
import torch
import scipy
import json
import sys
import re
import os

from functools import partial
from multiprocess import Pool

from SLIM.core import SLIM, SLIMatrix
from surprise import Dataset, Reader, SVD, KNNBasic

## Данные

In [None]:
# Load the built-in MovieLens-100k dataset and turn *all* of it into a
# training set (no hold-out split is made here).
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
# Flatten the trainset into a (uid, iid, rating) table.
# NOTE(review): per the Surprise docs all_ratings() yields *inner* ids
# (0-based integers), not the raw ids of the dataset file -- confirm.
dataframe = pd.DataFrame(
    trainset.all_ratings(),
    columns=['uid', 'iid', 'rating'])
# Preview five random rows; the fixed seed keeps the preview reproducible.
dataframe.sample(5, random_state=42)

In [None]:
# Restrict the experiments to the 300 smallest user ids (np.unique returns
# the ids sorted) and remember each selected user's mean rating.
uids = np.unique(dataframe['uid'].values)[:300]
means = {}
for uid in uids:
    user_ratings = dataframe.loc[dataframe['uid'] == uid, 'rating']
    means[uid] = user_ratings.values.mean()

In [None]:
# mapping[uid][iid] -> rating, restricted to the users present in `means`.
#
# The columns are iterated directly instead of calling dataframe.loc[index]
# for every row: building a Series per row is very slow and, because a row
# mixes int and float columns, it silently upcasts the ids to float64.
# Iterating the columns keeps the ids as integers; lookups with either 0 or
# 0.0 still hit the same dictionary entry because they compare and hash equal.
mapping = dict()
for uid, iid, rating in tqdm(
        zip(dataframe['uid'], dataframe['iid'], dataframe['rating']),
        total=len(dataframe)):
    if uid in means:
        mapping.setdefault(uid, dict())[iid] = rating

In [None]:
# mapping_delta[uid][iid] -> rating minus the user's mean rating, restricted
# to the users present in `means`.
#
# The columns are iterated directly instead of calling dataframe.loc[index]
# for every row: building a Series per row is very slow and, because a row
# mixes int and float columns, it silently upcasts the ids to float64.
# Iterating the columns keeps the ids as integers; lookups with either 0 or
# 0.0 still hit the same dictionary entry because they compare and hash equal.
mapping_delta = dict()
for uid, iid, rating in tqdm(
        zip(dataframe['uid'], dataframe['iid'], dataframe['rating']),
        total=len(dataframe)):
    if uid in means:
        mapping_delta.setdefault(uid, dict())[iid] = rating - means[uid]

In [None]:
def r(uid, iid=None, delta=False):
    '''
    Look up stored ratings of a user.

    :param uid: user identifier
    :param iid: item identifier; when ``None`` all ratings of the user
        are returned
    :param delta: when true, read the mean-centred ratings from
        ``mapping_delta`` instead of the raw ratings from ``mapping``

    :return: ``None`` if the user is unknown; a list of ``(iid, rating)``
        pairs when ``iid`` is ``None``; otherwise the rating of the item,
        or ``None`` if the user has no rating for it
    '''
    # Users absent from `mapping` are unknown regardless of `delta`.
    if uid not in mapping:
        return None

    table = mapping_delta if delta else mapping
    user_ratings = table.get(uid)
    # Guard against a user that is missing from the selected table so the
    # caller gets None instead of an AttributeError/TypeError.
    if user_ratings is None:
        return None

    if iid is None:
        return list(user_ratings.items())
    return user_ratings.get(iid)

## Константная модель

In [None]:
def predict_user_constant(uid, iid):
    '''
    Constant baseline: predict the user's mean rating for any item.

    :param uid: user identifier (must be a key of ``means``)
    :param iid: item identifier; accepted for interface parity with the
        other predictors but not used

    :return: the mean rating of the user
    '''
    user_mean = means[uid]
    return user_mean

In [None]:
%%time
predict_user_constant(0, 377)

In [None]:
r(0, 377)

## Корреляционная система

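$$
  \hat{r}_{ui} = \bar{r}_{u} + \frac{\sum_{u' \in U_i \setminus \{u\}} S(u, u')\cdot \left(r_{u'i} - \bar{r}_{u'}\right)}{\sum_{u' \in U_i \setminus \{u\}} S(u, u')}
$$

где $U_i$ — множество пользователей, оценивших объект $i$.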

In [None]:
def predict_user_score(uid, iid, scores):
    '''
    Predict a rating as the user's mean plus the similarity-weighted
    average of the other users' mean-centred ratings of the item.

    :param uid: identifier of the user to predict for
    :param iid: identifier of the item
    :param scores: 2-D array of pairwise user similarities

    :return: the predicted rating, or ``None`` when no other user from
        ``uids`` has rated the item
    '''
    # NOTE(review): `scores` is indexed directly by user id here; this
    # presumably relies on uids[k] == k for the matrix built from `uids`
    # -- confirm if the user selection ever changes.
    neighbours = []
    weighted_score = 0

    # A plain loop replaces the former per-call process pool: the work is a
    # handful of dictionary lookups, and the pool was never closed, leaking
    # worker processes on every prediction.
    for other in uids:
        if other == uid:
            continue
        rating_delta = r(other, iid, delta=True)
        if rating_delta is None:
            # This user has not rated the item.
            continue
        neighbours.append(other)
        weighted_score += scores[uid, other] * rating_delta

    if not neighbours:
        return None
    return means[uid] + weighted_score / scores[uid, neighbours].sum()

### Мера сходства пользователей
$$
  S_{u}\bigl(u_1, u_2\bigr) = ?
$$

#### Определим через квадрат разности средних оценок
$$
  S_{u}\bigl(u_1, u_2\bigr) = \exp\bigl(-\left(\bar{r}_{u_1} - \bar{r}_{u_2}\right)^2\bigr)
$$

In [None]:
def user_metric_one(u1, u2):
    r'''
    Similarity of two users based on the squared difference of their
    mean ratings: ``exp(-(mean(u1) - mean(u2))**2)``.

    :param u1: identifier of the first user
    :type u1: int
    :param u2: identifier of the second user
    :type u2: int

    :return: similarity score of the two users
    :rtype: float
    '''
    mean_gap = means[u1] - means[u2]
    return np.exp(-(mean_gap ** 2))

##### Проанализируем расстояния

In [None]:
# Pairwise user-similarity matrix: row/column k corresponds to uids[k].
# Only off-diagonal entries are filled below; the diagonal stays 0.
scores = np.zeros(shape=[len(uids), len(uids)])

def f(pair):
    '''Return ``(i, j, similarity)`` for two positions ``i``, ``j`` in ``uids``.'''
    i, j = pair
    return i, j, user_metric_one(uids[i], uids[j])

# The metric is symmetric, so only the upper triangle (i < j) is computed.
pair_list = [(i, j)
             for i in range(len(uids))
             for j in range(i + 1, len(uids))]

# The context manager shuts the worker processes down once all results have
# been consumed; previously the pool was created inline and never closed.
with Pool(processes=2) as pool:
    for i, j, score in tqdm(pool.imap_unordered(f, pair_list), total=len(pair_list)):
        scores[i, j] = score
        scores[j, i] = score

In [None]:
sns.heatmap(data=scores)
plt.show()

In [None]:
%%time
predict_user_score(0, 377, scores)

In [None]:
%%time
predict_user_constant(0, 377)

In [None]:
r(0, 377)

## SLIM

In [None]:
# Wrap the (uid, iid, rating) table in SLIM's own matrix type.
trainmat = SLIMatrix(dataframe)

# Training options passed to SLIM.train().
# NOTE(review): presumably 'cd' selects the coordinate-descent solver and
# l1r / l2r are the L1 / L2 regularisation weights -- confirm in the SLIM docs.
params = {'algo':'cd', 'nthreads':2, 'l1r':1.0, 'l2r':1.0}

model = SLIM()
model.train(params, trainmat)

# NOTE(review): the matrix used for prediction is built from the same
# DataFrame that was used for training, so this is not a held-out evaluation.
testmat = SLIMatrix(dataframe, model)

In [None]:
# Run the recommender once and reuse the result; previously the identical
# predict() call was executed twice, once per print.
# NOTE(review): with returnscores=True element [0] presumably holds the
# recommended items and element [1] their scores -- confirm in the SLIM docs.
slim_prediction = model.predict(testmat, nrcmds=5, returnscores=True)
print(slim_prediction[1][0])
print(slim_prediction[0][0])

## SVD

In [None]:
# Fit Surprise's SVD with its default hyper-parameters on the full trainset;
# fit() hands back the fitted estimator, which is kept as `algo`.
algo = SVD().fit(trainset)

In [None]:
# Surprise's predict() expects *raw* ids, while the other models in this
# notebook are queried with inner ids (user 0, item 377). Translate the inner
# ids back to raw ids so the SVD estimate refers to the same (user, item) pair;
# passing '0' / '377' directly would address an unknown user and another item.
algo.predict(trainset.to_raw_uid(0), trainset.to_raw_iid(377)).est