### Качество baseline-моделей

In [333]:
import os
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

In [334]:
# IPython magic: make the ReChorus checkout the working directory so that the
# relative './data/' paths used in the cells below resolve against it.
%cd /home/jupyter/work/resources/ReChorus

/home/jupyter/work/resources/ReChorus


In [335]:
# Seed NumPy's global random generator so that any randomised step is reproducible.
# NOTE(review): none of the code visible in this section draws random numbers;
# confirm whether the seed is still needed here.
RANDOM_SEED = 3500
np.random.seed(RANDOM_SEED)

In [339]:
def recommend_top_popular(topk: list, test_df_path: str, train_df_path: str):
    """Evaluate a non-personalised "most popular items" baseline.

    The 99 most frequent ``item_id`` values of the training file form a single
    ranked list (most frequent first) that is recommended for every test row.
    The rank of a test row is the 1-based position of its ``item_id`` in that
    list; an item that is not in the list is a miss.

    Args:
        topk: Cut-offs ``k`` for which HR@k and NDCG@k are reported.
        test_df_path: Path of a tab-separated file with an ``item_id`` column.
        train_df_path: Path of a tab-separated file with an ``item_id`` column.

    Returns:
        Dict with the keys ``'HR@k'`` and ``'NDCG@k'`` for every ``k`` in
        ``topk`` (in that order), each value rounded to 4 decimals.  Because
        the candidate list holds at most 99 items, metrics for ``k > 99`` are
        equal to the metrics at ``k = 99``.
    """
    train_df = pd.read_csv(train_df_path, sep='\t')
    test_df = pd.read_csv(test_df_path, sep='\t')

    # value_counts() sorts by frequency (descending); .head() is explicitly
    # positional, unlike an integer slice on a Series with an integer index.
    top_popular = train_df['item_id'].value_counts().head(99).index

    # item -> 1-based rank; one dict lookup per test row instead of scanning
    # the whole popular list for each row.
    rank_of = {item: pos for pos, item in enumerate(top_popular, start=1)}

    # Misses get an infinite rank so that they can never satisfy `rank <= k`,
    # whatever cut-off the caller asks for (a finite sentinel such as 101
    # would be counted as a hit for k >= 101).
    rank = np.array(
        [rank_of.get(item, np.inf) for item in test_df['item_id']],
        dtype=float,
    )

    evaluations = {}
    for k in topk:
        hit = rank <= k
        evaluations['HR@{}'.format(k)] = round(hit.mean(), 4)
        # One relevant item per row, so the ideal DCG is 1 and
        # NDCG = 1 / log2(rank + 1) on a hit, 0 otherwise (0 / inf == 0).
        evaluations['NDCG@{}'.format(k)] = round(
            (hit / np.log2(rank + 1)).mean(), 4
        )
    return evaluations
    
    
def recommend_top_personal(topk: list, test_df_path: str, train_df_path: str):
    """Evaluate a per-user "most frequently bought" baseline.

    For every user in the training file, the user's own items are ranked by
    how often they occur in that user's training rows (at most 99 items, ties
    kept in order of first appearance).  The rank of a test row is the 1-based
    position of its ``item_id`` in the list of its ``user_id``.  A test row is
    a miss when the item is not in the user's list or when the user does not
    occur in the training file at all.

    Args:
        topk: Cut-offs ``k`` for which HR@k and NDCG@k are reported.
        test_df_path: Path of a tab-separated file with ``user_id`` and
            ``item_id`` columns.
        train_df_path: Path of a tab-separated file with ``user_id`` and
            ``item_id`` columns.

    Returns:
        Dict with the keys ``'HR@k'`` and ``'NDCG@k'`` for every ``k`` in
        ``topk`` (in that order), each value rounded to 4 decimals.
    """
    train_df = pd.read_csv(train_df_path, sep='\t')
    test_df = pd.read_csv(test_df_path, sep='\t')

    # user -> {item: 1-based rank within that user's most frequent items}
    personal_rank = {}
    for user_id, user_items in train_df.groupby('user_id')['item_id']:
        most_common_items = Counter(user_items.tolist()).most_common(99)
        personal_rank[user_id] = {
            item: pos
            for pos, (item, _count) in enumerate(most_common_items, start=1)
        }

    # Users without training history have no personal list: look them up with
    # .get() so they are scored as misses instead of raising KeyError.  Misses
    # get an infinite rank so they can never satisfy `rank <= k`, whatever
    # cut-off is requested.
    no_history = {}
    rank = np.array(
        [
            personal_rank.get(user, no_history).get(item, np.inf)
            for user, item in zip(test_df['user_id'], test_df['item_id'])
        ],
        dtype=float,
    )

    evaluations = {}
    for k in topk:
        hit = rank <= k
        evaluations['HR@{}'.format(k)] = round(hit.mean(), 4)
        # One relevant item per row, so the ideal DCG is 1 and
        # NDCG = 1 / log2(rank + 1) on a hit, 0 otherwise (0 / inf == 0).
        evaluations['NDCG@{}'.format(k)] = round(
            (hit / np.log2(rank + 1)).mean(), 4
        )
    return evaluations

### TTRS

In [340]:
# Select the dataset for the following cells; RAW_PATH is the directory under
# ./data/ from which train.csv and test.csv are read.
DATASET = 'ttrs'
RAW_PATH = os.path.join('./data/', DATASET)

In [341]:
# Global-popularity baseline at cut-offs 5, 10 and 15, using the train/test
# files under the current RAW_PATH (set to the 'ttrs' directory in the cell above).
topk = [5,10,15]
recommend_top_popular(topk, os.path.join(RAW_PATH, 'test.csv'), os.path.join(RAW_PATH, 'train.csv'))

{'HR@5': 0.3458,
 'NDCG@5': 0.2428,
 'HR@10': 0.4506,
 'NDCG@10': 0.2766,
 'HR@15': 0.5158,
 'NDCG@15': 0.2937}

In [342]:
# Per-user most-frequent-items baseline at the same cut-offs, on the same
# train/test files under the current RAW_PATH.
topk = [5,10,15]
recommend_top_personal(topk, os.path.join(RAW_PATH, 'test.csv'), os.path.join(RAW_PATH, 'train.csv'))

{'HR@5': 0.6614,
 'NDCG@5': 0.509,
 'HR@10': 0.75,
 'NDCG@10': 0.538,
 'HR@15': 0.779,
 'NDCG@15': 0.5457}

### TA_FENG

In [349]:
# Switch the following cells to the 'ta_feng' dataset; RAW_PATH is the
# directory under ./data/ from which train.csv and test.csv are read.
DATASET = 'ta_feng'
RAW_PATH = os.path.join('./data/', DATASET)

In [350]:
# Global-popularity baseline at cut-offs 5, 10 and 15, using the train/test
# files under the current RAW_PATH (set to the 'ta_feng' directory in the cell above).
topk = [5,10,15]
recommend_top_popular(topk, os.path.join(RAW_PATH, 'test.csv'), os.path.join(RAW_PATH, 'train.csv'))

{'HR@5': 0.0454,
 'NDCG@5': 0.0374,
 'HR@10': 0.0555,
 'NDCG@10': 0.0407,
 'HR@15': 0.0729,
 'NDCG@15': 0.0452}

In [351]:
# Per-user most-frequent-items baseline at the same cut-offs, on the same
# train/test files under the current RAW_PATH.
topk = [5,10,15]
recommend_top_personal(topk, os.path.join(RAW_PATH, 'test.csv'), os.path.join(RAW_PATH, 'train.csv'))

{'HR@5': 0.0499,
 'NDCG@5': 0.033,
 'HR@10': 0.0749,
 'NDCG@10': 0.0411,
 'HR@15': 0.0897,
 'NDCG@15': 0.045}