In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes (edits to local helper modules are picked
# up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:


# `os` and `pickle` are used throughout the notebook; import them explicitly
# instead of relying on the star imports below to leak them into the namespace.
import os
import pickle

# NOTE: the remaining lines keep their original order on purpose -- moving
# anything across the star imports could change which names they shadow.
import numpy as np
import torch
from torch import nn
from data_util import *
import metrics
import time
import pandas as pd
from MyUtil import *
import matplotlib.pyplot as plt
import plotly.express as px

import lightgbm as lgb

# Input datasets with graded relevance labels, keyed by dataset name.
graded_datapath = dict(
    yahoo='_data/ltrc_yahoo/set1.binarized_purged_querynorm_filtered.npz',
    mslr='_data/MSLR-WEB30k/Fold1/binarized_purged_querynorm_filtered.npz',
)

# Pickles holding the same datasets after their labels have been binarized.
binary_datapath = dict(
    yahoo='_data/ltrc_yahoo/set1.binarized_purged_querynorm_filtered.binrel.pkl',
    mslr='_data/MSLR-WEB30k/Fold1/binarized_purged_querynorm_filtered.binrel.pkl',
)

# Directory scanned for experiment result files.
results_dir = '_data/outlier/'

In [None]:
from tqdm.notebook import trange


def get_params(params, outliers=9, topk=20):
    """Convert per-session-type click parameters into PBM coefficient matrices.

    Parameters
    ----------
    params : dict or str
        Mapping ``session type -> {'propensity', 'epsilon_p', 'epsilon_n',
        'count'}`` (the structure built by ``create_params``), or the path of
        a pickle file containing such a mapping.
    outliers : int
        Only the first ``outliers + 1`` session types (in mapping order) are used.
    topk : int
        The per-position lists are truncated to their first ``topk`` entries.

    Returns
    -------
    alphas, betas : np.ndarray
        One row per selected session type, one column per position, with
        ``betas = propensity * epsilon_n`` and
        ``alphas = propensity * epsilon_p - betas``.
    sessions_dist : np.ndarray
        The selected ``count`` values normalised to sum to 1 (float64).
    """
    if isinstance(params, str):
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(params, 'rb') as f:
            params = pickle.load(f)
    table = pd.DataFrame(params).T
    selected = table.iloc[:outliers + 1]

    # Normalise on a fresh float array rather than in place on a DataFrame slice.
    counts = selected['count'].to_numpy(dtype=float)
    sessions_dist = counts / counts.sum()

    def _stack(column):
        # `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
        rows = [np.asarray(values, dtype=float)[None, :topk] for values in selected[column]]
        return np.concatenate(rows, 0)

    p = _stack('propensity')
    ep = _stack('epsilon_p')
    en = _stack('epsilon_n')
    betas = p * en
    alphas = p * ep - betas
    return alphas, betas, sessions_dist
    
def make_params_readable(pkl_path):
    """Summarise click parameters as one ``[zp, zn]`` pair per position.

    ``pkl_path`` is either the path of a pickle file or the already-loaded
    parameter mapping (session type -> dict with list-valued 'propensity',
    'epsilon_p' and 'epsilon_n').  The result has one row per session type and
    columns ``z_1 .. z_k``; each cell is the list
    ``[propensity * epsilon_p, propensity * epsilon_n]`` for that position.
    """
    if not isinstance(pkl_path, str):
        params = pkl_path
    else:
        with open(pkl_path, 'rb') as f:
            params = pd.DataFrame(pickle.load(f))
    df = pd.DataFrame(params).T

    # Number of positions is taken from the first row's propensity list.
    topk = len(df.propensity.iloc[0])
    positions = range(1, topk + 1)

    # Expand every list-valued column into one scalar column per position.
    for col in ('propensity', 'epsilon_n', 'epsilon_p'):
        expanded_names = [f'{col}_{pos}' for pos in positions]
        df[expanded_names] = df[col].to_list()

    z_names = []
    for pos in positions:
        zp_name, zn_name, z_name = f'zp_{pos}', f'zn_{pos}', f'z_{pos}'
        df[zp_name] = df[f'propensity_{pos}'] * df[f'epsilon_p_{pos}']
        df[zn_name] = df[f'propensity_{pos}'] * df[f'epsilon_n_{pos}']
        df[z_name] = df[[zp_name, zn_name]].agg(list, axis=1)
        z_names.append(z_name)
    return df[z_names]

def generate_clicks_pbm(params, outliers_max_pos, topk, ds, sessions_cnt, save_path, outliers_path):
    """Sample position-based-model clicks for every training query and pickle them.

    One session type is drawn per query according to the distribution returned
    by ``get_params``.  Clicks are Bernoulli draws with probability
    ``alpha[pos] * relevance + beta[pos]``, using that session type's rows of
    the ``alphas`` / ``betas`` matrices.

    Side effects:
      * ``ds.trlv`` is overwritten with binarized labels when its max exceeds 1;
      * the list of per-query click arrays is pickled to ``save_path``;
      * the concatenated per-document outlier indicator is pickled to
        ``outliers_path``.

    Returns the total number of sampled clicks.
    """
    alphas, betas, sessions_dist = get_params(params, outliers=outliers_max_pos, topk=topk)

    n_queries = ds.trdlr.shape[0] - 1
    # Draw the session type of every query up front.
    session_inds = np.random.choice(
        np.arange(alphas.shape[0]),
        size=(n_queries,), replace=True, p=sessions_dist)

    # Make the labels binary if not already!
    if ds.trlv.max() > 1:
        ds.trlv = np.round(ds.trlv / ds.trlv.max(), 0)

    clicks, outlierness = [], []
    total_clicks_cnt = 0
    for qid in trange(n_queries, leave=False):
        # ds.trdlr holds the [start, end) document offsets of consecutive queries.
        s, e = ds.trdlr[qid: qid + 2]
        n_docs = e - s
        session_ind = min(session_inds[qid], n_docs)
        alpha, beta = alphas[session_ind], betas[session_ind]
        # presumably ds.sessions[qid] is (n_rankings, n_docs) document indices,
        # tiled along axis 0 up to `sessions_cnt` rows -- TODO confirm
        q_session = np.repeat(ds.sessions[qid], sessions_cnt // ds.sessions[qid].shape[0], 0)
        rel = ds.trlv[s:e]

        # Per-position coefficients; positions past the end of the parameter
        # vector reuse its last entry.
        n_alpha = min(n_docs, alpha.shape[0])
        q_a = np.ones_like(rel) * alpha[-1]
        q_a[:n_alpha] = alpha[:n_alpha]
        n_beta = min(n_docs, beta.shape[0])
        q_b = np.ones_like(rel) * beta[-1]
        q_b[:n_beta] = beta[:n_beta]

        sampled = np.random.binomial(1, q_a * rel[q_session] + q_b)
        clicks.append(sampled)
        total_clicks_cnt += int(np.count_nonzero(sampled == 1))

        # Session type k > 0 flags position k - 1 as the outlier.
        q_outliers = np.zeros(n_docs)
        if session_ind > 0:
            q_outliers[session_ind - 1] = 1.
        outlierness.append(q_outliers)

    with open(save_path, 'wb') as f:
        pickle.dump(clicks, f)
    with open(outliers_path, 'wb') as f:
        pickle.dump(np.concatenate(outlierness, 0), f)
    return total_clicks_cnt
    

---

### Binarize levels

In [None]:
# Replace the labels of every split by np.round(label / 4) and store the
# dataset's attribute dict as a pickle.
for dataset_name in ('yahoo', 'mslr'):
    dataset = read_pkl(graded_datapath[dataset_name])
    for split_labels in ('trlv', 'valv', 'telv'):
        graded = getattr(dataset, split_labels)
        setattr(dataset, split_labels, np.round(graded / 4, 0))
    with open(binary_datapath[dataset_name], 'wb') as f:
        pickle.dump(dataset.__dict__, f)
    

---

### Production ranker

In [None]:
# Pickles that store, per dataset, the production ranker's ordering of the
# documents of every training query.
sessions_datapath = dict(
    yahoo="_data/ltrc_yahoo/sessions_fix.pkl",
    mslr="_data/MSLR-WEB30k/Fold1/sessions_fix.pkl",
)


def lambdarank(dataset, model_path=None, learning_rate=0.05, num_leaves=31, n_estimators=300, eval_at=[10], early_stopping_rounds=10000):
    """Return a LightGBM ranking booster, loading it from disk when available.

    If ``model_path`` is given and the file exists, the saved booster is loaded
    and returned immediately (the hyper-parameters are ignored in that case).
    Otherwise an ``LGBMRanker`` is fitted on the training split of ``dataset``
    with the validation split as evaluation set, saved to ``model_path`` (when
    given) and its booster returned.  Elapsed time is printed in both cases.
    """
    start = time.time()

    has_cached_model = model_path is not None and os.path.exists(model_path)
    if has_cached_model:
        booster = lgb.Booster(model_file=model_path)
        print('loading lgb took {} secs.'.format(time.time() - start))
        return booster

    ranker = lgb.LGBMRanker(
        num_leaves=num_leaves,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
    )

    # NOTE(review): `early_stopping_rounds` / `verbose` were removed from
    # `fit()` in LightGBM 4.0 (callbacks replace them) -- confirm the pinned
    # LightGBM version before upgrading.
    fit_options = dict(
        group=np.diff(dataset.trdlr),
        eval_set=[(dataset.vafm, dataset.valv)],
        eval_group=[np.diff(dataset.vadlr)],
        eval_at=eval_at,
        early_stopping_rounds=early_stopping_rounds,
        verbose=False,
    )
    ranker.fit(dataset.trfm, dataset.trlv, **fit_options)

    if model_path is not None:
        ranker.booster_.save_model(model_path)

    print('training lgb took {} secs.'.format(time.time() - start))
    return ranker.booster_


In [None]:

# Train (or load) the production ranker on a subsample of each dataset, report
# its NDCG@10, and store the ranking it induces on every training query.
for dataset_name in ['yahoo', 'mslr']:
    # NOTE: unpickling executes arbitrary code; these files are the ones
    # written by the "Binarize levels" cell.
    with open(binary_datapath[dataset_name], 'rb') as f:
        dataset = type('ltr', (object,), pickle.load(f))
    small_dataset = subsample_splits(dataset, 20, 777)
    booster = lambdarank(small_dataset, model_path=f'{dataset_name}_production.gbt')

    te_y_pred = booster.predict(dataset.tefm)
    te_metric = metrics.LTRMetrics(dataset.telv, np.diff(dataset.tedlr), te_y_pred)
    tr_y_pred = booster.predict(dataset.trfm)
    tr_metric = metrics.LTRMetrics(dataset.trlv, np.diff(dataset.trdlr), tr_y_pred)

    print(dataset_name, 'train:', tr_metric.NDCG(10), ', test:', te_metric.NDCG(10))

    sessions = []
    for qid in range(dataset.trdlr.shape[0] - 1):
        s, e = dataset.trdlr[qid: qid+2]
        y = tr_y_pred[s:e]
        # Document indices (within the query) by descending predicted score,
        # stored with a leading axis of length 1.
        argsorted = y.argsort()[::-1]
        sessions.append(argsorted[None, :])

    # Write once per dataset.  The dump used to sit inside the query loop and
    # rewrote the whole (growing) list for every single query.
    with open(sessions_datapath[dataset_name], 'wb') as f:
        pickle.dump(sessions, f)
    

---

### top10

In [None]:
# Number of top-ranked documents kept per query.
topk = 10

# Output pickles for the truncated datasets.
topk_datapath = dict(
    yahoo=f'_data/ltrc_yahoo/top{topk}.set1.binarized_purged_querynorm_filtered.binrel.pkl',
    mslr=f'_data/MSLR-WEB30k/Fold1/top{topk}.binarized_purged_querynorm_filtered.binrel.pkl',
)

# Output pickles for the matching sessions.
topk_sessions_datapath = dict(
    yahoo=f"_data/ltrc_yahoo/top{topk}.sessions_fix.pkl",
    mslr=f"_data/MSLR-WEB30k/Fold1/top{topk}.sessions_fix.pkl",
)


In [None]:

# Keep only the top-`topk` documents (in production-ranker order) of each
# training query, dropping queries whose kept labels do not sum to more than 1.
for dataset_name in ['yahoo', 'mslr']:
    with open(sessions_datapath[dataset_name], 'rb') as f:
        sessions = pickle.load(f)
    with open(binary_datapath[dataset_name], 'rb') as f:
        dataset = pickle.load(f)

    topk_sessions = []
    fms, lvs = [], []
    dlrs = [0]
    for qid, ranking in enumerate(sessions):
        session = ranking[0, :topk]
        s_i, e_i = dataset['trdlr'][qid:qid+2]
        top_labels = dataset['trlv'][s_i:e_i][session]
        if not sum(top_labels) > 1:
            continue
        fms.append(dataset['trfm'][s_i:e_i, :][session, :])
        dlrs.append(session.shape[0])
        lvs.append(top_labels)
        # Documents are stored in ranked order, so the session is the identity.
        topk_sessions.append(np.arange(session.shape[0])[None, :])

    dataset['trfm'] = np.concatenate(fms, 0)
    dataset['trdlr'] = np.cumsum(dlrs)
    dataset['trlv'] = np.concatenate(lvs, 0)

    # Scores decrease with storage index, so (assuming LTRMetrics ranks by
    # descending score -- TODO confirm) this evaluates the stored order.
    stored_order_scores = -np.arange(dataset['trlv'].shape[0])
    ndcg_at_10 = metrics.LTRMetrics(dataset['trlv'], np.diff(dataset['trdlr']), stored_order_scores).NDCG(10)
    print(f'after top{topk}:', dataset_name, ndcg_at_10)

    with open(topk_sessions_datapath[dataset_name], 'wb') as f:
        pickle.dump(topk_sessions, f)

    with open(topk_datapath[dataset_name], 'wb') as f:
        pickle.dump(dataset, f)
    
    

#### Frequency of the number of relevant documents per query

In [None]:
from collections import defaultdict

# For each dataset, count how many queries have each label sum.
for dataset_name in ['yahoo', 'mslr']:
    with open(topk_datapath[dataset_name], 'rb') as f:
        dataset = pickle.load(f)

    boundaries, labels = dataset['trdlr'], dataset['trlv']
    print(dataset_name, boundaries.shape[0] - 1, 'queries')

    freq = defaultdict(int)
    for s_i, e_i in zip(boundaries[:-1], boundaries[1:]):
        freq[sum(labels[s_i:e_i])] += 1

    print(dict(freq))

---

### Full info

In [None]:
def read_results(jobid, results_dir = '_data/outlier/'):
    """Summarise the final-epoch metrics stored in JSON-lines result files.

    Parameters
    ----------
    jobid : str or None
        Only files whose name contains ``jobid`` are read; ``None`` reads every
        ``.json`` file in ``results_dir``.
    results_dir : str
        Directory containing the result files.

    Returns
    -------
    pandas.DataFrame
        Pivot table with one row per metric ('train_clicks', 'train', plus
        'valid' / 'test' when present) and one column per dataset.  Each cell
        is the string ``'<n> -> <mean> ± <std>'`` over the rows recorded at
        that dataset's highest epoch.

    Raises
    ------
    ValueError
        If no ``.json`` file in ``results_dir`` matches ``jobid``.
    """
    # Sorted so the concatenation order does not depend on the file system.
    matching_files = sorted(
        name for name in os.listdir(results_dir)
        if name.endswith('.json') and (jobid is None or jobid in name)
    )
    if not matching_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f'no .json result files matching jobid={jobid!r} in {results_dir!r}')

    dfs = []
    for file in matching_files:
        df = pd.read_json(os.path.join(results_dir, file), lines=True)
        df['file_name'] = file
        dfs.append(df)
    full_df = pd.concat(dfs)

    # Keep, per dataset, only the rows logged at the highest epoch.
    max_epoch_dfs = []
    for dataset in full_df.dataset.unique():
        tmp_df = full_df.loc[(full_df.dataset == dataset)]
        last_epoch = tmp_df.epoch.max()
        max_epoch_dfs.append(tmp_df.loc[(tmp_df.epoch == last_epoch)])
        print(dataset, 'epoch:', last_epoch)
    full_df = pd.concat(max_epoch_dfs)

    values = ['train_clicks', 'train']
    if 'valid' in full_df:
        values.append('valid')
    if 'test' in full_df:
        values.append('test')
        # Rows without a test score are excluded from every metric.
        full_df = full_df.loc[~full_df.test.isna()]
    pv = pd.pivot_table(full_df, values=values, columns=['dataset'], aggfunc=lambda x: f'{len(x)} -> {np.nanmean(x):.04f} ' + u"\u00B1" + f' {np.nanstd(x):.04f}')
    return pv

---

### Gaussian

In [None]:
def add_dists(big, small, position, alpha):
    """Return ``(1 - alpha) * big + alpha * bump`` without modifying ``big``.

    ``bump`` is ``small`` embedded in a zero vector of ``len(big)`` such that
    ``small[len(small) // 2]`` lands at index ``position``.  Parts of ``small``
    that would fall before index 0 or past the end of ``big`` are dropped
    (the right-hand overrun previously raised a ValueError from ``np.pad``).
    """
    mid_pos = int(len(small)/2)
    if position < mid_pos:
        # Left overrun: drop the leading entries that would sit before index 0.
        small = small[mid_pos-position:]
    shifted = np.pad(small, (max(0, position-mid_pos), 0,))
    # Right overrun: clip instead of asking np.pad for a negative width.
    shifted = shifted[:len(big)]
    return (1-alpha) * big + (alpha) * np.pad(shifted, (0, len(big)-len(shifted),))

def put_on_top(big, small, position, alpha):
    """Blend ``small`` into a window of ``big`` IN PLACE and return ``big``.

    The window is placed so that ``small[len(small) // 2]`` lines up with index
    ``position``; inside it ``big`` becomes ``(1 - alpha) * big + alpha * small``
    and everything outside is left untouched.  Parts of ``small`` that would
    fall before index 0 or past the end of ``big`` are dropped (the right-hand
    overrun previously raised a shape-mismatch ValueError).

    Note that ``big`` itself is modified; pass a copy to keep the original.
    """
    small = np.asarray(small)
    mid_pos = int(len(small)/2)
    if position < mid_pos:
        # Left overrun: drop the leading entries that would sit before index 0.
        small = small[mid_pos-position:]
    start_pos = max(0, position-mid_pos)
    window = big[start_pos:start_pos + len(small)]
    # Right overrun: the window is shorter than `small`, so clip `small` to fit.
    small = small[:len(window)]
    big[start_pos:start_pos + len(small)] = (1-alpha) * window + (alpha) * small
    return big
    
def create_params(outlier_share, outliers=7, topk=20, only_outliers=True):
    """Build synthetic click-model parameters for ``outliers`` session types.

    Baseline curves over ``topk`` positions:
      * propensity ``1 / (i + 1)``,
      * epsilon_p ``0.98 - i / 100``,
      * epsilon_n ``0.05``.

    For every ``position`` in ``range(outliers)`` one session type is created
    in which a 4-point bump is blended (weight ``outlier_share``) into the
    baseline curves around that position.  When ``only_outliers`` is False an
    unmodified baseline entry ``'0.0'`` (count 100) is added first and the
    outlier keys are shifted by one.

    Returns a dict ``{str(float index): {'propensity', 'epsilon_p',
    'epsilon_n', 'count'}}`` with list-valued curves.
    """
    theta = np.array([(1./(i+1.)) for i in range(topk)])
    ep = np.array([0.98-(i/100.) for i in range(topk)])
    en = np.ones_like(theta)*0.05

    gauss_theta = np.array([0.3, 1, 0.3, 0.05])
    gaussp = np.ones_like(gauss_theta)
    gaussn = np.ones_like(gauss_theta)*0.05

    params = {}
    offset = 0.
    if not only_outliers:
        params['0.0'] = {'propensity':list(theta), 'epsilon_p':list(ep), 'epsilon_n':list(en), 'count':100}
        offset = 1.
    for position in range(outliers):
        # put_on_top writes into its first argument.  Passing copies keeps the
        # baseline curves pristine, so each session type is derived from the
        # baseline instead of inheriting the bumps of all previous positions.
        params[f'{position + offset}'] = {'propensity':list(add_dists(theta, gauss_theta, position, outlier_share)),
                                       'epsilon_p':list(put_on_top(ep.copy(), gaussp, position, outlier_share)),
                                       'epsilon_n':list(put_on_top(en.copy(), gaussn, position, outlier_share)),
                                       'count':50}

    return params



---

**PBM clicks**

#### simple pbm

In [None]:
# Baseline-only parameters: with outliers=0 no outlier session type is added,
# so only the '0.0' baseline entry is produced.
params = create_params(
    outlier_share=0,
    outliers=0,
    topk=20,
    only_outliers=False,
)

In [None]:
clicks_path = dict(
    yahoo='_data/ltrc_yahoo/top20.clicks',
    mslr='_data/MSLR-WEB30k/Fold1/top20.clicks',
)

# Simulate clicks with the baseline parameters for both datasets.
for dataset_name in ('yahoo', 'mslr'):
    ds = load_dataset(dataset_name, 'datasets_info.json', 400)
    params_real_world = 'simple'
    clicks_file = f'{clicks_path[dataset_name]}_{params_real_world}.pkl'

    click_cnt = generate_clicks_pbm(
        params=params, outliers_max_pos=0, topk=20,
        ds=ds, sessions_cnt=400,
        save_path=clicks_file,
        # Same file name with 'clicks' swapped for 'outlierness'.
        outliers_path=clicks_file.replace('clicks', 'outlierness'),
    )
    print(dataset_name, '->', click_cnt, 'clicks')

#### OPBM

In [None]:
clicks_path = dict(
    yahoo='_data/ltrc_yahoo/top10.clicks',
    mslr='_data/MSLR-WEB30k/Fold1/top10.clicks',
)

# Simulate outlier-aware clicks for a range of outlier shares on both datasets.
for dataset_name in ('yahoo', 'mslr'):
    ds = load_dataset(dataset_name, 'datasets_info.json', 400)
    for outlier_share in (0.25, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9):
        params = create_params(outlier_share=outlier_share, outliers=9, topk=10, only_outliers=True)
        params_real_world = f'opbm_{outlier_share:.02f}'
        clicks_file = f'{clicks_path[dataset_name]}_{params_real_world}.pkl'

        click_cnt = generate_clicks_pbm(
            params=params, outliers_max_pos=9, topk=10,
            ds=ds, sessions_cnt=400,
            save_path=clicks_file,
            # Same file name with 'clicks' swapped for 'outlierness'.
            outliers_path=clicks_file.replace('clicks', 'outlierness'),
        )
        print(dataset_name, outlier_share, '->', click_cnt, 'clicks')
        print()

In [None]:
params = create_params(outlier_share=0.75, outliers=9, topk=10, only_outliers=True)

# Extra session type '9.0': list-valued fields are the element-wise average of
# session types '3.0' and '8.0'; every other field is copied from '8.0'.
mixed = {}
for key, value in params['8.0'].items():
    if not isinstance(value, list):
        mixed[key] = value
    else:
        mixed[key] = list(0.5*np.array(params['3.0'][key]) + 0.5*np.array(value))
params['9.0'] = mixed
params

In [None]:

def generate_clicks_pbm_mix_lazy(params, outliers_max_pos, topk, ds, sessions_cnt, save_path, outliers_path,
                                 mix_session_ind=9, lazy_session_ind=3):
    """Sample PBM clicks and store both exact and "lazy" outlier labels.

    Clicks are generated as in ``generate_clicks_pbm``: one session type is
    drawn per query and clicks are Bernoulli draws with probability
    ``alpha[pos] * relevance + beta[pos]``.  Two outlier indicators are kept:

      * exact: session type ``k > 0`` flags position ``k - 1``;
      * lazy: identical, except that session type ``mix_session_ind`` is
        labelled as if it were ``lazy_session_ind``.

    Parameters
    ----------
    mix_session_ind, lazy_session_ind : int
        Session type to relabel and the type it is relabelled as.  The defaults
        (9 and 3) are the values that used to be hard-coded.

    Side effects: binarizes ``ds.trlv`` in place when its max exceeds 1 and
    writes four pickles: clicks to ``save_path``, exact outlierness to
    ``outliers_path``, and the same clicks / the lazy outlierness to the
    corresponding ``*_lazy.pkl`` paths.

    Returns the total number of sampled clicks.
    """
    alphas, betas, sessions_dist = get_params(params, outliers=outliers_max_pos, topk=topk)

    print(alphas.shape[0])
    clicks = []
    session_inds = np.random.choice(
        np.arange(alphas.shape[0]),
        size=(ds.trdlr.shape[0] - 1,), replace=True, p=sessions_dist)
    total_clicks_cnt = 0
    outlierness = []
    outlierness_lazy = []

    # Make the labels binary if not already!
    if ds.trlv.max() > 1:
        ds.trlv = np.round(ds.trlv/ds.trlv.max(), 0)

    print(ds.trdlr.shape, ds.trdlr[-1])
    for qid in trange(ds.trdlr.shape[0] - 1, leave=False):
        # ds.trdlr holds the [start, end) document offsets of consecutive queries.
        s, e = ds.trdlr[qid: qid+2]
        session_ind = min(session_inds[qid], e-s)
        alpha = alphas[session_ind]
        beta = betas[session_ind]
        q_session = np.repeat(ds.sessions[qid], sessions_cnt//ds.sessions[qid].shape[0], 0)
        rel = ds.trlv[s:e]
        # Positions past the end of the parameter vectors reuse the last entry.
        q_a = np.ones_like(rel) * alpha[-1]
        q_a[:min(e-s, alpha.shape[0])] = alpha[:min(e-s, alpha.shape[0])]
        q_b = np.ones_like(rel) * beta[-1]
        q_b[:min(e-s, beta.shape[0])] = beta[:min(e-s, beta.shape[0])]
        c = q_a * rel[q_session] + q_b
        clicks.append(np.random.binomial(1, c))
        total_clicks_cnt += len(np.where(clicks[-1]==1)[0])
        q_outliers = np.zeros(e-s)
        q_outliers_lazy = np.zeros(e-s)
        if session_ind > 0:
            q_outliers[session_ind - 1] = 1.
            # The lazy labels pretend the mixed session type is an ordinary one.
            lazy_ind = lazy_session_ind if session_ind == mix_session_ind else session_ind
            q_outliers_lazy[lazy_ind - 1] = 1.

        outlierness.append(q_outliers)
        outlierness_lazy.append(q_outliers_lazy)

    print(len(outlierness))
    print(np.concatenate(outlierness,0).shape)
    with open(save_path, 'wb') as f:
        pickle.dump(clicks, f)
    with open(outliers_path, 'wb') as f:
        pickle.dump(np.concatenate(outlierness,0), f)
    # The lazy variant shares the clicks; only the outlier labels differ.
    with open(save_path.replace('.pkl', '_lazy.pkl'), 'wb') as f:
        pickle.dump(clicks, f)
    with open(outliers_path.replace('.pkl', '_lazy.pkl'), 'wb') as f:
        pickle.dump(np.concatenate(outlierness_lazy,0), f)
    return total_clicks_cnt

In [None]:
clicks_path = dict(
    yahoo='_data/ltrc_yahoo/top10.clicks',
    mslr='_data/MSLR-WEB30k/Fold1/top10.clicks',
)

dataset_name = 'mslr'
ds = load_dataset(dataset_name, 'datasets_info.json', 400)
params = create_params(outlier_share=0.8, outliers=9, topk=10, only_outliers=True)

# Extra session type '9.0': list-valued fields are the element-wise average of
# session types '3.0' and '8.0'; every other field is copied from '8.0'.
mixed = {}
for key, value in params['8.0'].items():
    if not isinstance(value, list):
        mixed[key] = value
    else:
        mixed[key] = list(0.5*np.array(params['3.0'][key]) + 0.5*np.array(value))
params['9.0'] = mixed

# Vary the weight ('count') of the mixed session type.
for count in (3, 6, 12, 25, 50, 100):
    params['9.0']['count'] = count
    params_real_world = f'opbm_0.8_mix_{count}'
    clicks_file = f'{clicks_path[dataset_name]}_{params_real_world}.pkl'

    click_cnt = generate_clicks_pbm_mix_lazy(
        params=params, outliers_max_pos=10, topk=10,
        ds=ds, sessions_cnt=400,
        save_path=clicks_file,
        # Same file name with 'clicks' swapped for 'outlierness'.
        outliers_path=clicks_file.replace('clicks', 'outlierness'),
    )
    print(dataset_name, '->', click_cnt, 'clicks')
    print()