# Метрики
## Imports

In [None]:
import itertools

import numba as nb
import numpy as np
import pandas as pd
import requests
from rectools import Columns
from tqdm.auto import tqdm

# Mean Reciprocal Rank

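$$
MRR = \frac{1}{|Q|}\sum_{i=1}^{|Q|}\frac{1}{rank_i}
$$

где $|Q|$ — число пользователей (запросов), а $rank_i$ — позиция первой релевантной рекомендации для $i$-го пользователя; если релевантных рекомендаций нет, слагаемое считается равным $0$.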

## Get KION dataset

In [None]:
# The archive is pinned to a specific commit so the data cannot change
# underneath the notebook.
url = (
    'https://github.com/irsafilo/KION_DATASET/raw/'
    'f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
)

# Stream the archive to disk in 1 MiB chunks. The response, the output file
# and the progress bar are all context-managed so they are released/closed
# even if the download is interrupted.
with requests.get(url, stream=True) as req:
    # Fail fast on HTTP errors instead of saving an error page as kion.zip.
    req.raise_for_status()
    # Content-Length may be absent; 0 is passed to tqdm as the total then.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    with open('kion.zip', 'wb') as fd, tqdm(
        desc='kion dataset download',
        total=total_size_in_bytes,
        unit='iB',
        unit_scale=True,
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            progress_bar.update(len(chunk))
            fd.write(chunk)


In [None]:
# Extract the downloaded archive; -o overwrites existing files without
# prompting, so the cell can be re-run non-interactively.
!unzip -o kion.zip

## Read data

In [None]:
# Load the raw interaction log and map its column names onto the rectools
# ``Columns`` constants used by the rest of the notebook.
# NOTE(review): ``rename`` silently skips keys that are not present in the
# frame, so the 'track_id' entry only has an effect if the CSV really has such
# a column — confirm against the dataset schema.
interactions = pd.read_csv('data_original/interactions.csv').rename(
    columns={
        'track_id': Columns.Item,
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    },
)

# Convert the watch date column to a pandas datetime dtype.
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

## Utils

In [None]:
def generate_subsample(users_count, top_k):
    """Draw a random user subsample together with random recommendations.

    Reads the module-level ``interactions`` frame and NumPy's global RNG.

    Args:
        users_count: number of distinct users to sample (without replacement).
        top_k: number of recommended items to generate per user.

    Returns:
        Tuple ``(df, users, recs)``:
        ``df`` — interactions of the sampled users with the datetime, weight
        and 'watched_pct' columns removed and the index reset;
        ``users`` — array of the sampled user ids;
        ``recs`` — array of shape ``(users_count, top_k)`` with items drawn
        at random from the item column of ``df``.
    """
    all_users = interactions[Columns.User].unique()
    users = np.random.choice(all_users, users_count, replace=False)

    sampled_rows = interactions[Columns.User].isin(users)
    df = interactions[sampled_rows].reset_index(drop=True)
    df = df.drop(columns=[Columns.Datetime, Columns.Weight, 'watched_pct'])

    # Recommendations are purely random: items are sampled from the
    # subsample's own interactions, so popular items are drawn more often.
    recs = np.random.choice(df[Columns.Item], size=(users_count, top_k))
    return df, users, recs

## MRR calculation functions

In [None]:
def mrr_naive(target, users, recs):
    """Compute Mean Reciprocal Rank with plain Python loops.

    Args:
        target: 2-D array of interactions; column 0 is matched against the
            user ids and column 1 holds the interacted item ids.
        users: sequence of user ids; ``recs[i]`` belongs to ``users[i]``.
        recs: per-user sequences of recommended items, best first.

    Returns:
        Average over ``users`` of ``1 / rank`` of the first recommended item
        found among the user's interactions (rank is 1-based); users without
        any hit contribute 0.
    """
    reciprocal_ranks = []
    for user_idx, user_id in enumerate(users):
        relevant_items = target[target[:, 0] == user_id][:, 1]
        reciprocal_rank = 0
        for position, item in enumerate(recs[user_idx], start=1):
            if item in relevant_items:
                # Only the first (best-ranked) hit counts.
                reciprocal_rank = 1 / position
                break
        reciprocal_ranks.append(reciprocal_rank)
    return sum(reciprocal_ranks) / len(users)

In [None]:
@nb.njit(cache=True, parallel=True)
def mrr_numba(target, users, recs):
    """Compute Mean Reciprocal Rank with a Numba-compiled parallel loop.

    Args:
        target: 2-D array of interactions; column 0 is matched against the
            user ids and column 1 holds the interacted item ids.
        users: 1-D array of user ids; ``recs[i]`` belongs to ``users[i]``.
        recs: 2-D array of recommended items per user, best first.

    Returns:
        Mean over ``users`` of ``1 / rank`` of the first recommended item
        found among the user's interactions (rank is 1-based); users without
        any hit contribute 0.
    """
    mrr = np.zeros(len(users))
    # Users are independent of each other (each iteration writes only its own
    # ``mrr[i]``), so the outer loop is the one to run in parallel.
    for i in nb.prange(len(users)):
        hit_rank = 0
        user_target = target[target[:, 0] == users[i]][:, 1]
        # The search for the first hit is order-dependent and exits early,
        # so it has to be a plain sequential ``range`` rather than ``prange``.
        for rank in range(len(recs[i])):
            if recs[i][rank] in user_target:
                hit_rank = rank + 1
                break
        mrr[i] = 1 / hit_rank if hit_rank else 0
    return mrr.mean()

In [None]:
def mrr_pandas(df, users, recs, k):
    """Compute Mean Reciprocal Rank with vectorised pandas operations.

    Args:
        df: interactions DataFrame containing the user and item columns.
        users: 1-D array of user ids; row ``i`` of ``recs`` belongs to
            ``users[i]``.
        recs: 2-D array of recommended items per user, best first.
        k: number of recommendations per user (each user id is repeated
            ``k`` times to line up with ``recs.ravel()``).

    Returns:
        Sum over users of ``1 / (len(users) * first_hit_rank)``; users with
        no hit produce NaN, which the final ``sum`` skips.
    """
    # Long format: one row per (user, recommended item) with its 1-based rank.
    recs_long = pd.DataFrame(
        {
            Columns.User: np.repeat(users, k),
            Columns.Item: recs.ravel(),
        },
    )
    recs_long[Columns.Rank] = recs_long.groupby(Columns.User).cumcount() + 1

    # Left join keeps every interaction; the rank is NaN where the interacted
    # item was not recommended to that user.
    matched = df.merge(recs_long, how='left', on=Columns.UserItem)
    first_hit_rank = matched.groupby(Columns.User)[Columns.Rank].min()
    return (1 / len(users) / first_hit_rank).sum()

<hr />

In [None]:
# Seed NumPy's global RNG so that the np.random.choice draws made by
# generate_subsample are reproducible across runs.
np.random.seed(42)

In [None]:
# Benchmark grid: subsample sizes, recommendation list lengths and the
# implementations under comparison.
users_counts = [100, 1000, 10000, 100000]
top_ks = [10, 50, 100]
algos = [mrr_naive, mrr_numba, mrr_pandas]

# Full cartesian product, with the algorithm varying fastest.
params = [
    (users_count, top_k, algo)
    for users_count in users_counts
    for top_k in top_ks
    for algo in algos
]

# Column-oriented storage for the timing results, one list per column.
measurements = {
    column: [] for column in ('users_count', 'top_k', 'algo', 'avg_time')
}

In [None]:
# Sanity check: on the smallest configuration all three implementations
# must return the same MRR value.
df, users, recs = generate_subsample(users_counts[0], top_ks[0])
target = df.values
mrr_values = [
    mrr_naive(target, users, recs),
    mrr_numba(target, users, recs),
    mrr_pandas(df, users, recs, top_ks[0]),
]

# Compare every pair of results.
for left_mrr, right_mrr in itertools.combinations(mrr_values, 2):
    np.testing.assert_almost_equal(left_mrr, right_mrr)

In [None]:
# Time every (users_count, top_k, algo) combination on a freshly generated
# subsample and record the result in ``measurements``.
for param_set in params:
    users_count, top_k, algo = param_set
    print(
        'users_count: {users_count}, top_k: {top_k}, algo: {algo}'.format(
            users_count=users_count,
            top_k=top_k,
            algo=algo.__name__,
        ),
    )
    df, users, recs = generate_subsample(users_count, top_k)
    # The loop-based implementations take the interactions as a 2-D array.
    target = df.values
    # ``%timeit -o`` returns the timing result object; ``-n 3 -r 1`` executes
    # the statement 3 times within a single repeat.
    if algo == mrr_naive:
        runs = %timeit -o -n 3 -r 1 algo(target, users, recs)
    elif algo == mrr_numba:
        # Untimed call before the measurement — presumably a warm-up so that
        # JIT compilation is not counted in the timing.
        algo(target, users, recs)
        runs = %timeit -o -n 3 -r 1 algo(target, users, recs)
    else:
        # mrr_pandas takes the DataFrame itself plus the list length.
        runs = %timeit -o -n 3 -r 1 algo(df, users, recs, top_k)
    # users_count and top_k are stored as strings, the algorithm by name.
    measurements['users_count'].append(str(users_count))
    measurements['top_k'].append(str(top_k))
    measurements['algo'].append(algo.__name__)
    measurements['avg_time'].append(np.mean(runs.timings))

In [None]:
# Turn the collected column lists into a DataFrame with one row per
# benchmarked (users_count, top_k, algo) combination.
measurements_df = pd.DataFrame(measurements)

# pFound

$$
\begin{aligned}
&pFound@k = \sum_{i=1}^{k} pLook[i] \cdot pRel[i]\\
&pLook[1] = 1\\
&pLook[i] = pLook[i-1] \cdot (1 - pRel[i-1]) \cdot (1 - pBreak), \quad i > 1\\
&pBreak = 0.15
\end{aligned}
$$

По формуле распишем первые несколько значений $pLook$ при $(pBreak = 0.15)$:
$$
\begin{aligned}
&pLook[1] = 1\\
&pLook[2] = (1 - pRel[1]) \cdot 0.85\\
&pLook[3] = (1 - pRel[1]) \cdot 0.85 \cdot (1 - pRel[2]) \cdot 0.85\\
\end{aligned}
$$
Заметим закономерность: каждый следующий шаг домножает предыдущее значение на $(1 - pRel[j]) \cdot 0.85$. Тогда $pLook[i]$ для $i > 1$:
$$
\begin{aligned}
&[1] \quad pLook[i] = (1 - pRel[1]) \cdot 0.85 \cdot (1 - pRel[2]) \cdot 0.85 \cdots (1 - pRel[i - 1]) \cdot 0.85
\end{aligned}
$$

In [None]:
def p_found(df, p_break=0.15, k=None):
    """Compute the pFound metric averaged over queries.

    Rows of each query (``qid``) are ranked 1, 2, ... in their order of
    appearance in ``df``, and then::

        pLook[1] = 1
        pLook[i] = pLook[i - 1] * (1 - pRel[i - 1]) * (1 - pBreak)
        pFound   = sum_i pLook[i] * pRel[i]

    The input frame is left untouched: every intermediate value is kept in a
    local Series instead of being written back to ``df`` as a helper column.

    Args:
        df: DataFrame with a 'qid' column (query id) and a 'p_rel[i]' column
            (relevance probability of the result).
        p_break: probability that the user stops browsing after a result.
        k: if given, only the first ``k`` results of every query contribute
            to the sum; ``None`` means all results are used.

    Returns:
        The per-query pFound values averaged over all queries.
    """
    # Work on the two needed columns with a clean positional index, so the
    # computation does not depend on the caller's index (which may contain
    # duplicates, e.g. after ``pd.concat``).
    work = df[['qid', 'p_rel[i]']].reset_index(drop=True)
    by_query = work.groupby('qid')
    p_rel = work['p_rel[i]']

    # pRel[i - 1] within each query; NaN for the first result of a query.
    prev_p_rel = by_query['p_rel[i]'].shift()
    # Per-step factor (1 - pRel[i - 1]) * (1 - pBreak).
    step = (1 - prev_p_rel) * (1 - p_break)
    # The cumulative product of the factors gives pLook[i]; the NaN left at
    # the first position of each query is replaced by pLook[1] = 1.
    p_look = step.groupby(work['qid']).cumprod().fillna(1)

    p_found_i = p_look * p_rel
    if k is not None:
        # Zero out the contribution of results ranked below k.
        rank = by_query.cumcount() + 1
        p_found_i = p_found_i * (rank <= k)

    # Sum within each query, then average over queries.
    return p_found_i.groupby(work['qid']).sum().mean()

## Read data

Данные — https://yadi.sk/d/guqki4UI4hFlXQ (архивы `open_task.zip` и `hidden_task.zip` нужно скачать и положить рядом с ноутбуком).

In [None]:
# Extract both task archives into the working directory; -o overwrites
# existing files without prompting.
# NOTE(review): no download step for these archives is visible in this
# notebook — they are expected to be present in the working directory.
!unzip -o open_task.zip
!unzip -o hidden_task.zip

In [None]:
# Column names are passed explicitly via ``names=``, so pandas reads the
# first line of each TSV as data rather than as a header.
column_names = ['qid', 'url', 'p_rel[i]']

# NOTE(review): the open part is read from open_task/ while the hidden part
# is read from the working directory — confirm this matches how the two
# archives unpack.
open_df, hidden_df = (
    pd.read_csv(tsv_path, sep='\t', names=column_names)
    for tsv_path in ('open_task/qid_url_rating.tsv', 'qid_url_rating.tsv')
)

# Union of both parts; the original row labels are kept, so index values
# repeat across the two halves.
merged_df = pd.concat([open_df, hidden_df])

In [None]:
# Report pFound for the open part, the hidden part and their union, each
# formatted with five decimal places.
pfound_reports = (
    ('pFound на открытой части датасета: {0:.5f}', open_df),
    ('pFound на закрытой части датасета: {0:.5f}', hidden_df),
    ('pFound на всем датасете: {0:.5f}', merged_df),
)
for report_template, report_df in pfound_reports:
    print(report_template.format(p_found(report_df)))
