# Summary
Rerankが全く上手く行かなかったので、ルールベースで頑張ることにしました。  
候補の選び方は以下の順です。
1. 閲覧履歴にある候補を出現順に並べる
2. 閲覧履歴の宿と一次の共起関係にある宿を共起数順に並べる
3. 閲覧履歴の宿と同一`sml_cd`かつ二次の共起関係にある宿を共起数順に並べる
4. 閲覧履歴の宿と異なる`sml_cd`かつ二次の共起関係にある宿を共起数順に並べる
5. `sml_cd`・`lrg_cd`・`ken_cd`・`wid_cd`の順に閲覧数の多い宿を並べる


2個以上閲覧履歴があるセッションに対してのみRerankすると微増したので、一応最終サブではアンサンブルしています。  
(Rerankはマジで大したことしてないので割愛しました。)

|              | single history | multi history | total  | LB     |
|--------------|----------------|---------------|--------|--------|
| CV(Rule)     | 0.14002        | 0.8845        | 0.4064 | 0.4433 |
| CV(LGBM)     | Use Rule       | 0.8877        |        | 0.4436 |
| CV(Ensemble) |                |               |        | 0.4437 |

# Preprocess

In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import numpy as np

# Area code columns of the yado table that get factorized and used for the
# area-level candidates (presumably ordered from widest to smallest area — TODO confirm).
AREA_COLS = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']

def remove_duplicates(lst):
    """Return the items of ``lst`` with repeats dropped, keeping first-seen order."""
    first_seen = {}
    for item in lst:
        # dicts preserve insertion order, so only the first occurrence counts
        first_seen.setdefault(item, None)
    return list(first_seen)

# suppress SettingWithCopyWarning
pd.options.mode.chained_assignment = None

# read csvs; `log` stacks the train rows on top of the test rows
train_log = pd.read_csv('../input/train_log.csv')
test_log = pd.read_csv('../input/test_log.csv')
log = pd.concat([train_log, test_log], axis=0)

# convert session_id to session_index: every distinct session_id gets an
# integer, numbered in order of first appearance in `log`
unique_ids = log['session_id'].unique()
session_index_map = {id:i for i, id in enumerate(unique_ids)}

log['session_index'] = log['session_id'].map(session_index_map)
train_log['session_index'] = train_log['session_id'].map(session_index_map)
test_log['session_index'] = test_log['session_id'].map(session_index_map)

# drop session_id (session_index replaces it from here on)
log = log.drop('session_id', axis=1)
train_log = train_log.drop('session_id', axis=1)
test_log = test_log.drop('session_id', axis=1)

# one row per session; `df` holds the train sessions followed by the test sessions
train_df = train_log[['session_index']].drop_duplicates().reset_index(drop=True)
test_df = test_log[['session_index']].drop_duplicates().reset_index(drop=True)
df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)


def factorize_area_cols(yado_df):
    """Replace each column listed in AREA_COLS with its integer factor codes.

    The frame is modified in place and also returned.
    """
    for area_col in AREA_COLS:
        codes, _uniques = pd.factorize(yado_df[area_col])
        yado_df[area_col] = codes

    return yado_df

def add_log_features(yado_df, log_df):
    """Attach per-yado view statistics computed from ``log_df``.

    Adds two columns to ``yado_df`` (in place) and returns it:
    ``yad_counts`` is the number of log rows per ``yad_no`` and
    ``session_counts`` is the number of distinct ``session_index`` values per
    ``yad_no``. A ``yad_no`` that never occurs in ``log_df`` gets NaN in both.
    """
    # number of views (log rows) per yado
    views_per_yado = log_df['yad_no'].value_counts()
    yado_df['yad_counts'] = yado_df['yad_no'].map(views_per_yado)

    # number of distinct sessions per yado
    sessions_per_yado = log_df.groupby('yad_no')['session_index'].nunique()
    yado_df['session_counts'] = yado_df['yad_no'].map(sessions_per_yado)

    return yado_df

# load the yado table, replace missing values with 0 and integer-encode the area columns
yado_df = pd.read_csv('../input/yado.csv')
yado_df = yado_df.fillna(0)
yado_df = factorize_area_cols(yado_df)

# three variants of the view statistics: all logs, train logs only, test logs only
# (each call works on its own copy of the base table)
yado_df = add_log_features(yado_df.copy(), log)
train_yado_df = add_log_features(yado_df.copy(), train_log)
test_yado_df = add_log_features(yado_df.copy(), test_log)

# lookup: yad_no -> factorized sml_cd
sml_cd_map = yado_df.set_index('yad_no')['sml_cd'].to_dict()

def add_history_features(df, log_df):
    """Attach each session's viewing history and values derived from it.

    ``log_df`` is sorted in place by (session_index, seq_no), both ascending,
    so the caller's frame is reordered as a side effect. The columns added to
    ``df`` (in place; ``df`` is also returned) are:

    - ``histories``: list of yad_no per session in ascending seq_no order
    - ``num_histories``: length of that list
    - ``last_history``: final element of the list (highest seq_no)
    - ``last_history_sml_cd``: ``sml_cd_map`` lookup of ``last_history``
    """
    log_df.sort_values(['session_index', 'seq_no'], ascending=[True, True], inplace=True)

    history_map = defaultdict(list)
    for session_index, session_rows in tqdm(log_df.groupby('session_index')):
        # rows within a group keep the ascending seq_no order established above
        history_map[session_index] = session_rows['yad_no'].values.tolist()

    histories = df['session_index'].map(history_map)
    df['histories'] = histories
    df['num_histories'] = histories.map(len)

    last_history = histories.map(lambda viewed: viewed[-1])
    df['last_history'] = last_history
    df['last_history_sml_cd'] = last_history.map(sml_cd_map)

    return df

# attach per-session histories to each session table
# (each call also sorts the log frame it is given, in place)
df = add_history_features(df, log)
train_df = add_history_features(train_df, train_log)
test_df = add_history_features(test_df, test_log)

100%|██████████| 463398/463398 [00:14<00:00, 31761.60it/s]
100%|██████████| 288698/288698 [00:08<00:00, 32602.09it/s]
100%|██████████| 174700/174700 [00:05<00:00, 33708.13it/s]


# History candidates
最後のアイテムと同一のアイテムや、重複した予測を除外する条件においては、単に出現順に並べるのが一番いい。

In [2]:
def remove_last_item(lst):
    """Drop every occurrence of the final element of ``lst``, then de-duplicate.

    The remaining items keep their first-seen order.
    """
    # the item that appears last in the list is excluded everywhere
    final_item = lst[-1]
    earlier_items = []
    for item in lst:
        if item != final_item:
            earlier_items.append(item)
    return remove_duplicates(earlier_items)

def add_history_candidates(df):
    """Add a ``history_candidates`` column: ``remove_last_item`` applied to each ``histories`` list.

    ``df`` is modified in place and returned.
    """
    histories = df['histories']
    df['history_candidates'] = histories.map(remove_last_item)
    return df

# history-based candidates for the combined, train-only and test-only session tables
df = add_history_candidates(df)
train_df = add_history_candidates(train_df)
test_df = add_history_candidates(test_df)

# Co-visit Candidates
閲覧履歴と共起関係にある宿を候補に入れる。  
以下の4つを作成。  
- 1次のアイテムを共起数順にソートしたもの
- 1次のアイテムを共起数順にソートしたもの（同一セッション内で重複したアイテムは削除）
- 2次のアイテムを同一エリア→共起数順にソートしたもの
- 2次のアイテムを同一エリア→共起数順にソートしたもの（同一セッション内で重複したアイテムは削除）

In [3]:
def create_co_visit_graph(log_df, no_duplication=False):
    """Build yado-level and area-level co-visit tables from multi-row sessions.

    Parameters
    ----------
    log_df : DataFrame
        Needs 'session_index' and 'yad_no' columns. A 'session_length' column
        is added to it in place.
    no_duplication : bool
        If True, repeated rows for the same yad_no within a session are
        collapsed (drop_duplicates) before counting.

    Returns
    -------
    (co_visit_graph, area_co_visit_graphs)
        co_visit_graph maps yad_no -> list of (other yad_no, count) sorted by
        count, descending. area_co_visit_graphs maps area column name ->
        area value -> list of (other area value, count), sorted the same way.
        Both containers remain defaultdicts, so looking up an unseen key
        creates an empty (non-list) entry.

    Notes
    -----
    Area columns are joined from the module-level ``yado_df``.
    The nested loops visit every ordered pair and bump both directions, so
    each direction is incremented twice per pair of rows.
    """
    # only sessions with more than one log row can contribute pairs
    log_df['session_length'] = log_df['session_index'].map(log_df.groupby('session_index').size().to_dict())
    log_df2 = log_df.query('session_length > 1')
    log_df2 = log_df2.merge(yado_df[['yad_no', ] + AREA_COLS], how='left', on='yad_no')

    co_visit_graph = defaultdict(lambda: defaultdict(int))

    area_co_visit_graphs = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for session_index, group in tqdm(log_df2.groupby('session_index')):
        if no_duplication:
            group = group.drop_duplicates('yad_no')
        yados = group['yad_no'].values
        
        # co-visit relation between yados
        for yado_id1 in yados:
            for yado_id2 in yados:
                if yado_id1 == yado_id2:
                    continue
                co_visit_graph[yado_id1][yado_id2] += 1
                co_visit_graph[yado_id2][yado_id1] += 1
        
        # also build the co-visit relation between areas
        for area_col in AREA_COLS:
            areas = group[area_col].values
            for area1 in areas:
                for area2 in areas:
                    if area1 == area2:
                        continue
                    area_co_visit_graphs[area_col][area1][area2] += 1
                    area_co_visit_graphs[area_col][area2][area1] += 1

    # sort by count: each inner count dict is replaced by a list of
    # (key, count) pairs, largest count first; ties keep insertion order
    for history_index in co_visit_graph.keys():
        co_visit_graph[history_index] = sorted(co_visit_graph[history_index].items(), key=lambda x: x[1], reverse=True)

    for area_col in area_co_visit_graphs.keys():
        for history_index in area_co_visit_graphs[area_col].keys():
            area_co_visit_graphs[area_col][history_index] = sorted(area_co_visit_graphs[area_col][history_index].items(), key=lambda x: x[1], reverse=True)
    
    return co_visit_graph, area_co_visit_graphs

# co-visit tables where repeated views inside a session are counted as-is,
# built from all logs, train logs only and test logs only
co_visit_graph, area_co_visit_graphs = create_co_visit_graph(log)
train_co_visit_graph, train_area_co_visit_graphs = create_co_visit_graph(train_log)
test_co_visit_graph, test_area_co_visit_graphs = create_co_visit_graph(test_log)

# no duplicated version: each yad_no is counted at most once per session
co_visit_graph_no_dup, area_co_visit_graphs_no_dup = create_co_visit_graph(log, no_duplication=True)
train_co_visit_graph_no_dup, train_area_co_visit_graphs_no_dup = create_co_visit_graph(train_log, no_duplication=True)
test_co_visit_graph_no_dup, test_area_co_visit_graphs_no_dup = create_co_visit_graph(test_log, no_duplication=True)

100%|██████████| 164072/164072 [00:17<00:00, 9280.92it/s]
100%|██████████| 103312/103312 [00:11<00:00, 9229.11it/s]
100%|██████████| 60760/60760 [00:06<00:00, 9166.69it/s]
100%|██████████| 164072/164072 [00:54<00:00, 3033.90it/s]
100%|██████████| 103312/103312 [00:34<00:00, 2997.33it/s]
100%|██████████| 60760/60760 [00:20<00:00, 3012.67it/s]


In [4]:
def get_co_visit_candidates(yado_id, co_visit_graph):
    """Return the co-visit entry stored for ``yado_id``, or ``[]`` when that entry is empty.

    The lookup uses ``co_visit_graph[yado_id]``, so a defaultdict graph gains
    an empty entry for a previously unseen ``yado_id``.
    """
    neighbours = co_visit_graph[yado_id]
    return neighbours if len(neighbours) > 0 else []

def add_co_visit_candidates(df, co_visit_graph, col_name, k=10):
    """Add first-order co-visit candidates for every session as column ``col_name``.

    For each distinct yado in a session's ``histories``, up to ``k`` co-visited
    yados that are not already in the history are taken from
    ``co_visit_graph``. The pooled (yado, count) pairs are ordered by count,
    descending, and reduced to unique yad_no values. ``df`` is modified in
    place and returned.
    """
    per_session_candidates = []
    for raw_history in tqdm(df['histories'].values):
        history = remove_duplicates(raw_history)
        scored = []
        for yado_id in history:
            neighbours = get_co_visit_candidates(yado_id, co_visit_graph)
            unseen = [pair for pair in neighbours if pair[0] not in history]
            scored.extend(unseen[:k])
        # stable sort by count: equal counts keep the order in which they were pooled
        scored.sort(key=lambda pair: pair[1], reverse=True)
        ranked = remove_duplicates([yad_no for yad_no, _count in scored])
        per_session_candidates.append(ranked)

    df[col_name] = per_session_candidates
    return df

def add_second_co_visit_candidates(df, co_visit_graph, col_name, k=10):
    """Add second-order co-visit candidates for every session as column ``col_name``.

    For each yado in the session's ``co_visit_candidates`` column, up to ``k``
    co-visited yados are taken from ``co_visit_graph``, skipping anything
    already in the session's ``histories`` or ``co_visit_candidates``. The
    pooled (yado, count) pairs are ordered by count, descending, reduced to
    unique yad_no values, and finally reordered so that yados sharing the
    sml_cd of the session's last viewed yado come first.

    Parameters
    ----------
    df : DataFrame with 'histories', 'co_visit_candidates' and
        'last_history_sml_cd' columns; modified in place and returned.
    co_visit_graph : mapping yad_no -> list of (yad_no, count) pairs.
    col_name : name of the output column.
    k : maximum number of neighbours kept per source yado.

    NOTE(review): the first-order source column is always
    'co_visit_candidates', also when a "no_dup" graph is passed — confirm this
    is intended.
    """
    second_co_visit_candidates = []
    for history, co_visit_candidates in tqdm(df[['histories', 'co_visit_candidates']].values):
        # Build the exclusion set once per session: the original re-scanned
        # both lists for every neighbour, which dominated the run time.
        excluded = set(history)
        excluded.update(co_visit_candidates)

        candidates = []
        for yado_id in co_visit_candidates:
            neighbours = get_co_visit_candidates(yado_id, co_visit_graph)
            candidates.extend([x for x in neighbours if x[0] not in excluded][:k])
        candidates = sorted(candidates, key=lambda x: x[1], reverse=True)  # sort by count
        candidates = remove_duplicates([x[0] for x in candidates])  # drop count
        second_co_visit_candidates.append(candidates)
    df[col_name] = second_co_visit_candidates

    # Reorder the second-order candidates so the same area (sml_cd) comes first.
    ordered_second_co_visit_candidates = []
    for last_history_sml_cd, co_visit_candidates in tqdm(df[['last_history_sml_cd', col_name]].values):
        # reorder co_visit_candidates, prioritizing the same sml_cd
        co_visit_candidates_sml_cds = [sml_cd_map[yado_no] for yado_no in co_visit_candidates]
        # NOTE(review): the second key component groups the remaining yados by
        # sml_cd value; only within one sml_cd is the count order preserved
        # (stable sort). Confirm this matches the intended ordering.
        reordered = sorted(
            zip(co_visit_candidates_sml_cds, co_visit_candidates),
            key=lambda x: (x[0] != last_history_sml_cd, x[0]),
        )
        ordered_second_co_visit_candidates.append([yado_no for _, yado_no in reordered])

    df[col_name] = ordered_second_co_visit_candidates

    return df

# maximum number of co-visited yados kept per source yado
k = 20
# candidates from the graphs built on all logs
df = add_co_visit_candidates(df, co_visit_graph, "co_visit_candidates", k=k)
df = add_co_visit_candidates(df, co_visit_graph_no_dup, "co_visit_candidates_no_dup", k=k)
df = add_second_co_visit_candidates(df, co_visit_graph, "second_co_visit_candidates", k=k)
df = add_second_co_visit_candidates(df, co_visit_graph_no_dup, "second_co_visit_candidates_no_dup", k=k)

# candidates from the graphs built on train logs only
train_df = add_co_visit_candidates(train_df, train_co_visit_graph, "co_visit_candidates", k=k)
train_df = add_co_visit_candidates(train_df, train_co_visit_graph_no_dup, "co_visit_candidates_no_dup", k=k)
train_df = add_second_co_visit_candidates(train_df, train_co_visit_graph, "second_co_visit_candidates", k=k)
train_df = add_second_co_visit_candidates(train_df, train_co_visit_graph_no_dup, "second_co_visit_candidates_no_dup", k=k)

# candidates from the graphs built on test logs only
test_df = add_co_visit_candidates(test_df, test_co_visit_graph, "co_visit_candidates", k=k)
test_df = add_co_visit_candidates(test_df, test_co_visit_graph_no_dup, "co_visit_candidates_no_dup", k=k)
test_df = add_second_co_visit_candidates(test_df, test_co_visit_graph, "second_co_visit_candidates", k=k)
test_df = add_second_co_visit_candidates(test_df, test_co_visit_graph_no_dup, "second_co_visit_candidates_no_dup", k=k)

  0%|          | 0/463398 [00:00<?, ?it/s]

100%|██████████| 463398/463398 [00:05<00:00, 91297.31it/s] 
100%|██████████| 463398/463398 [00:04<00:00, 110865.13it/s]
100%|██████████| 463398/463398 [01:14<00:00, 6204.61it/s]
100%|██████████| 463398/463398 [00:11<00:00, 38942.08it/s]
100%|██████████| 463398/463398 [01:08<00:00, 6809.00it/s]
100%|██████████| 463398/463398 [00:11<00:00, 40764.73it/s]
100%|██████████| 288698/288698 [00:02<00:00, 112271.30it/s]
100%|██████████| 288698/288698 [00:02<00:00, 115178.68it/s]
100%|██████████| 288698/288698 [00:39<00:00, 7353.16it/s]
100%|██████████| 288698/288698 [00:07<00:00, 40297.34it/s]
100%|██████████| 288698/288698 [00:38<00:00, 7539.19it/s]
100%|██████████| 288698/288698 [00:05<00:00, 49205.88it/s]
100%|██████████| 174700/174700 [00:01<00:00, 140782.64it/s]
100%|██████████| 174700/174700 [00:01<00:00, 144857.28it/s]
100%|██████████| 174700/174700 [00:12<00:00, 14313.00it/s]
100%|██████████| 174700/174700 [00:02<00:00, 67668.53it/s]
100%|██████████| 174700/174700 [00:10<00:00, 16204.33it/s]

# Area Popular Candidates
閲覧履歴と同一エリアの人気宿を追加する。  
sml_cdとlrg_cdに対しては共起関係にあるエリアの人気宿も後ろに付け加える。

In [5]:
def get_topk_popular_candidates_by_area(train_df, yado_df, area_col, area_co_visit_graphs, k=10, add_co_visit_area_candidates=False):
    """Collect, per session, the most viewed yados of the areas in its history.

    Parameters
    ----------
    train_df : DataFrame with a 'histories' column (any session table, despite the name).
    yado_df : yado table with 'yad_no', ``area_col`` and 'yad_counts' columns.
    area_col : area column that defines the grouping.
    area_co_visit_graphs : mapping area column -> area value -> list of
        (co-visited area value, count); only consulted when
        ``add_co_visit_area_candidates`` is True.
    k : number of yados kept per area, ranked by 'yad_counts' descending.
    add_co_visit_area_candidates : if True, the top-k yados of every
        co-visited area are appended after those of the session's own areas.

    Returns
    -------
    list of lists of yad_no, one per row of ``train_df``. Entries are not
    de-duplicated.
    """
    area_map = yado_df.set_index('yad_no')[area_col].to_dict()
    area_group_map = {
        area_id: area_df.sort_values('yad_counts', ascending=False).head(k)['yad_no'].tolist()
        for area_id, area_df in yado_df.groupby(area_col)
    }

    candidates = []
    for raw_history in tqdm(train_df['histories'].values):
        history = remove_duplicates(raw_history)
        areas = set([area_map[yad_no] for yad_no in history])

        candidate = []
        for area in areas:
            candidate.extend(area_group_map[area])

        if add_co_visit_area_candidates:
            for area in areas:
                covisit_area_counts = area_co_visit_graphs[area_col][area]
                # only a list entry holds (area, count) pairs; anything else is skipped
                if not isinstance(covisit_area_counts, list):
                    continue
                for covisit_area, _count in covisit_area_counts:
                    candidate.extend(area_group_map[covisit_area])

        candidates.append(candidate)
    return candidates

In [6]:
# number of popular yados kept per area
k = 20
for area_col in AREA_COLS:
    # co-visited areas are appended only for the sml_cd and lrg_cd columns
    add_co_visit_area_candidates = area_col in ['sml_cd', 'lrg_cd']
    # popularity from all logs, train logs only and test logs only
    df[f'{area_col}_top{k}_candidates'] = get_topk_popular_candidates_by_area(df, yado_df, area_col, area_co_visit_graphs, k, add_co_visit_area_candidates)
    train_df[f'{area_col}_top{k}_candidates'] = get_topk_popular_candidates_by_area(train_df, train_yado_df, area_col, train_area_co_visit_graphs, k, add_co_visit_area_candidates)
    test_df[f'{area_col}_top{k}_candidates'] = get_topk_popular_candidates_by_area(test_df, test_yado_df, area_col, test_area_co_visit_graphs, k, add_co_visit_area_candidates)

100%|██████████| 463398/463398 [00:00<00:00, 722671.98it/s]
100%|██████████| 288698/288698 [00:00<00:00, 674034.71it/s]
100%|██████████| 174700/174700 [00:01<00:00, 104210.07it/s]
100%|██████████| 463398/463398 [00:00<00:00, 692528.36it/s]
100%|██████████| 288698/288698 [00:00<00:00, 668985.50it/s]
100%|██████████| 174700/174700 [00:00<00:00, 658667.83it/s]
100%|██████████| 463398/463398 [00:02<00:00, 180487.74it/s]
100%|██████████| 288698/288698 [00:03<00:00, 83294.80it/s] 
100%|██████████| 174700/174700 [00:00<00:00, 210425.97it/s]
100%|██████████| 463398/463398 [00:02<00:00, 175645.99it/s]
100%|██████████| 288698/288698 [00:01<00:00, 210286.23it/s]
100%|██████████| 174700/174700 [00:00<00:00, 217289.36it/s]


# Postprocess
セッション内の最後の宿を予測から削除する。

In [7]:
# Candidate columns from which the last viewed yado is removed.
# Read as a module-level name by remove_last_history_from_candidates.
cols = [
    'history_candidates', 
    "co_visit_candidates",
    "co_visit_candidates_no_dup",
    "second_co_visit_candidates",
    "second_co_visit_candidates_no_dup",
    f'sml_cd_top{k}_candidates', 
    f'lrg_cd_top{k}_candidates', 
    f'ken_cd_top{k}_candidates', 
    f'wid_cd_top{k}_candidates'
]

def remove_last_history_from_candidates(row):
    """Remove the session's last viewed yado from every candidate list of ``row``.

    The columns processed are those named in the module-level ``cols``.
    ``row`` is updated in place and returned.
    """
    last_viewed = row['histories'][-1]
    for col in cols:
        kept = []
        for yad_no in row[col]:
            if yad_no != last_viewed:
                kept.append(yad_no)
        row[col] = kept
    return row

# row-wise: strip each session's last viewed yado from all of its candidate lists
df = df.apply(remove_last_history_from_candidates, axis=1)
train_df = train_df.apply(remove_last_history_from_candidates, axis=1)
test_df = test_df.apply(remove_last_history_from_candidates, axis=1)

# Ensemble
co-visit candidatesは以下の4つをアンサンブルして並べ替える
- 重複を許すもの(train/testそれぞれ作成)
- 重複を許すもの(train/testをconcatして作成)
- 重複を許さないもの(train/testそれぞれ作成)
- 重複を許さないもの(train/testをconcatして作成)


area candidatesは以下の2つをアンサンブルして並べ替える
- train/testそれぞれ作成
- train/testをconcatして作成


In [8]:
# enable the pandas integration of tqdm (the `progress_apply` calls below rely on it)
tqdm.pandas()

def ensemble(row, candidate_cols, weights):
    """Merge several ranked candidate lists into one by weighted reciprocal rank.

    An item at 1-based position ``n`` of ``row[col]`` contributes
    ``weight / n`` to its score; scores are summed over all columns. Items are
    returned from highest to lowest score, and items with equal scores keep
    the order in which they were first encountered.

    ``candidate_cols`` and ``weights`` must have the same length (asserted).
    """
    assert len(candidate_cols) == len(weights)

    scores = defaultdict(float)
    for col, weight in zip(candidate_cols, weights):
        rank = 0
        for item in row[col]:
            rank += 1
            scores[item] += weight / rank

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return [item for item, _score in ranked]

def create_submission(df, each_df, session_df, prefix='test'):
    """Combine all candidate sources into one ordered ``candidates`` list per session.

    Parameters
    ----------
    df : session table whose candidates were built from the concatenated logs.
    each_df : session table whose candidates were built from one split only;
        its columns are copied in under the name ``f'{prefix}_{col}'``.
    session_df : sessions to produce output for; both tables are left-merged
        onto it by 'session_index'.
    prefix : label used for the columns taken from ``each_df``.

    Returns
    -------
    DataFrame (``session_df`` merged with ``df``) with the per-source
    ensemble columns and the final 'candidates' column added.

    Notes
    -----
    Reads the module-level ``k`` to build the area column names.
    """
    sub_df = session_df.merge(df, on='session_index', how='left')
    temp_each_df = session_df.merge(each_df, on='session_index', how='left')

    co_visit_cols = [
        "co_visit_candidates",
        "co_visit_candidates_no_dup",
    ]
    second_co_visit_cols = [
        "second_co_visit_candidates",
        "second_co_visit_candidates_no_dup",
    ]
    area_cols = [
        f'sml_cd_top{k}_candidates', 
        f'lrg_cd_top{k}_candidates', 
        f'ken_cd_top{k}_candidates', 
        f'wid_cd_top{k}_candidates'
    ]
    # local list; shadows the module-level `cols` inside this function
    cols = co_visit_cols + second_co_visit_cols + area_cols

    # copy the split-specific candidate columns next to the combined ones
    # (assignment aligns on the index of the two merged frames)
    for col in cols:
        sub_df[f'{prefix}_{col}'] = temp_each_df[col]

    # extend each list with its prefixed counterparts, so every group now
    # holds both the combined and the split-specific columns
    co_visit_cols += [f'{prefix}_{x}' for x in co_visit_cols]
    second_co_visit_cols += [f'{prefix}_{x}' for x in second_co_visit_cols]

    # equal-weight rank ensemble within each group
    sub_df['ensemble_co_visit_candidates'] = sub_df.progress_apply(lambda x: ensemble(x, co_visit_cols, [1.0] * len(co_visit_cols)), axis=1)
    sub_df['ensemble_second_co_visit_candidates'] = sub_df.progress_apply(lambda x: ensemble(x, second_co_visit_cols, [1.0] * len(second_co_visit_cols)), axis=1)
    for area_col in area_cols:
        sub_df[f'ensemble_{area_col}'] = sub_df.progress_apply(lambda x: ensemble(x, [area_col, f"{prefix}_{area_col}"], [1.0] * 2), axis=1)
    
    # Priority order of the final concatenation.
    # NOTE(review): the wid_cd ensemble is computed above but is not listed
    # here — confirm whether it should be part of the final candidates.
    candidate_cols = [
        'history_candidates', 
        'ensemble_co_visit_candidates', 
        'ensemble_second_co_visit_candidates', 
        f'ensemble_sml_cd_top{k}_candidates', 
        f'ensemble_lrg_cd_top{k}_candidates', 
        f'ensemble_ken_cd_top{k}_candidates'
    ]

    # start every row from an empty list, then append each source row-wise
    sub_df['candidates'] = [[]] * len(sub_df)
    for col in candidate_cols:
        # convert array cells to lists first (checked on the first row only)
        if isinstance(sub_df[col].iloc[0], np.ndarray):
            sub_df[col] = sub_df[col].map(lambda x: x.tolist())
        sub_df['candidates'] += sub_df[col]
        
    # keep only the first occurrence of each yado, preserving priority order
    sub_df['candidates'] = sub_df['candidates'].map(remove_duplicates)

    return sub_df

In [9]:
# train labels: the 'yad_no' column is renamed to 'label'
train_sessions = pd.read_csv('../input/train_label.csv')
train_sessions = train_sessions.rename(columns={'yad_no': 'label'})
train_sessions['session_index'] = train_sessions['session_id'].map(session_index_map)

# test sessions to predict
test_sessions = pd.read_csv('../input/test_session.csv')
test_sessions['session_index'] = test_sessions['session_id'].map(session_index_map)

# final candidate lists: combined-log candidates ensembled with the split-specific ones
train_sub_df = create_submission(df, train_df, train_sessions, prefix='train')
test_sub_df = create_submission(df, test_df, test_sessions, prefix='test')

100%|██████████| 288698/288698 [00:10<00:00, 27449.22it/s]
100%|██████████| 288698/288698 [00:22<00:00, 12948.55it/s]
100%|██████████| 288698/288698 [00:35<00:00, 8118.35it/s]
100%|██████████| 288698/288698 [00:38<00:00, 7584.13it/s]
100%|██████████| 288698/288698 [00:13<00:00, 20894.35it/s]
100%|██████████| 288698/288698 [00:06<00:00, 41501.63it/s]
100%|██████████| 174700/174700 [00:06<00:00, 29042.23it/s]
100%|██████████| 174700/174700 [00:19<00:00, 8862.23it/s] 
100%|██████████| 174700/174700 [00:20<00:00, 8631.21it/s]
100%|██████████| 174700/174700 [00:22<00:00, 7803.00it/s]
100%|██████████| 174700/174700 [00:04<00:00, 41503.44it/s]
100%|██████████| 174700/174700 [00:04<00:00, 41300.96it/s]


# Validation

In [10]:
def apk(actual, predicted, k=10):
    """
    Average precision at k when there is exactly one correct answer.

    Parameters:
    actual : int
        The single correct value
    predicted : list
        Ranked predictions, best first
    k : int, optional
        Only the first k predictions are considered

    Returns:
    float
        1 / (1-based position of ``actual`` within the first k predictions),
        or 0.0 if it is not among them
    """
    top_k = predicted[:k]
    try:
        position = top_k.index(actual)
    except ValueError:
        # not among the first k predictions
        return 0.0
    return 1.0 / (position + 1)

def calc_mapk(df, col='candidates', k=10):
    """Return the mean of ``apk`` over all rows, comparing 'label' against ``df[col]``.

    The per-row values are stored in a 'score' column of ``df`` as a side effect.
    """
    labels = df['label'].values
    predictions = df[col].values
    scores = []
    for actual, predicted in zip(labels, predictions):
        scores.append(apk(actual, predicted, k))
    df['score'] = scores
    return df['score'].mean()

def calc_recall(df, col='candidates', topk=10):
    """Return the fraction of rows whose 'label' is among the first ``topk`` items of ``df[col]``.

    The per-row hit flags are stored in an 'in_candidates' column of ``df``
    as a side effect.
    """
    def label_in_topk(row):
        return row['label'] in row[col][:topk]

    hits = df.apply(label_in_topk, axis=1)
    df['in_candidates'] = hits
    return df['in_candidates'].sum() / len(df)

def eval_candidates(df, eval_for_single_history=True, eval_for_multi_history=True):
    """Print MAP@10 and recall@{10,20,30,40,50} of the 'candidates' column.

    Metrics are printed for sessions with exactly one history item (optional),
    for sessions with more than one (optional), and for all sessions, followed
    by the mean and total number of candidates. A 'history_length' column is
    added to ``df``; the metric helpers add their own columns as well.
    """
    recall_cutoffs = (10, 20, 30, 40, 50)

    def report(frame, tag):
        # `tag` is inserted into the printed labels, e.g. "mapk single:"
        print(f"mapk{tag}:", calc_mapk(frame, col='candidates'))
        for topk in recall_cutoffs:
            print(f"recall{tag} @{topk}:", calc_recall(frame, col='candidates', topk=topk))

    df['history_length'] = df['histories'].map(len)

    # sessions with a single history item
    if eval_for_single_history:
        report(df.query('history_length == 1'), " single")

    # sessions with two or more history items
    if eval_for_multi_history:
        report(df.query('history_length > 1'), " multi")

    # all sessions
    report(df, "")

    # candidate counts
    candidate_counts = df['candidates'].map(len)
    print("num average candidates:", candidate_counts.mean())
    print("num total candidates:", candidate_counts.sum())

# validate on the train sessions, which carry a 'label' column
eval_candidates(train_sub_df)

mapk single: 0.14002880562032816
recall single @10: 0.39840656791775
recall single @20: 0.5923586462839696
recall single @30: 0.6999881328687172
recall single @40: 0.7679220653123753
recall single @50: 0.8134648786855534
mapk multi: 0.884480390421617
recall multi @10: 0.9870779773888803
recall multi @20: 0.9979770017035775
recall multi @30: 0.9992643642558464
recall multi @40: 0.9995741056218058
recall multi @50: 0.9996805792163543
mapk: 0.4064344687319214
recall @10: 0.6090655286839535
recall @20: 0.7375111708428878
recall @30: 0.8070856050267061
recall @40: 0.8508198879105501
recall @50: 0.8801030834990198
num average candidates: 339.281359760026
num total candidates: 97949850


# Submission

In [11]:
# sub
# sub: write the first 10 candidates of each test session as predict_0..predict_9
# (-1 fills a slot when a session has fewer candidates).
# NOTE(review): the column assignment aligns on the index, so this relies on
# `sub` and `test_sub_df` sharing the same row order — TODO confirm.
sub = pd.read_csv('../input/sample_submission.csv')
for i in range(10):
    sub[f'predict_{i}'] = test_sub_df['candidates'].map(lambda x: x[i] if len(x) > i else -1)
sub.to_csv('output/rule_based.csv', index=False)
sub.head(10)

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,4420,11561,5466,9534,4714,5785,2680,4545,6488
1,143,4066,613,11923,8108,7014,6129,6555,10095,11237
2,757,7710,9190,9910,1774,410,13570,3400,10485,6721
3,12341,6991,13521,3359,1542,2363,10861,4180,2795,5080
4,2862,9020,763,10826,11480,13235,5372,5650,9623,1448
5,13292,3811,11214,10857,7202,12785,5624,6178,3701,5066
6,11776,8041,8691,1462,850,28,3947,638,1089,2422
7,10904,11201,7537,2824,10606,768,5015,13347,2806,3483
8,3244,10541,682,3901,4522,3101,9717,2645,7102,13394
9,5203,12918,11586,2322,11450,4424,1013,11364,9405,5357


# Ensemble

In [23]:
# rule-based candidates of the test sessions
rule_based = test_sub_df[['session_index', 'candidates']].reset_index(drop=True)

# re-ranker predictions; expected to provide 'session_index' and 'candidates' columns.
# NOTE: pickle files can execute code on load — only read files produced by this project.
re_rank = pd.read_pickle('output/test_pred_df_re_rank.pkl')

In [24]:
# sessions covered by the re-ranker (presumably those with more than one history item)
multi_sessioin_index = re_rank['session_index'].unique().tolist()

# split the rule-based predictions into re-ranked sessions and the rest
multi_rule_based = rule_based[rule_based['session_index'].isin(multi_sessioin_index)]
single_rule_based = rule_based[~rule_based['session_index'].isin(multi_sessioin_index)]

In [26]:
# one row per re-ranked session, with both candidate lists side by side
ensemble_df = pd.DataFrame({'session_index': multi_sessioin_index})
ensemble_df = ensemble_df.merge(multi_rule_based, on='session_index', how='left').rename(columns={'candidates': 'rule_based_candidates'})
ensemble_df = ensemble_df.merge(re_rank, on='session_index', how='left').rename(columns={'candidates': 're_rank_candidates'})
ensemble_df

Unnamed: 0,session_index,rule_based_candidates,re_rank_candidates
0,288698,"[3560, 4420, 11561, 5466, 9534, 4714, 5785, 26...","[3560, 4714, 4545, 9534, 11561, 4420, 5466, 26..."
1,288700,"[757, 7710, 9190, 9910, 1774, 410, 13570, 3400...","[757, 9190, 7710, 9910, 410, 1774, 10485, 6721..."
2,288701,"[12341, 6991, 13521, 3359, 1542, 2363, 10861, ...","[12341, 6991, 3359, 13521, 1542, 5080, 4180, 1..."
3,288705,"[10904, 11201, 7537, 2824, 10606, 768, 5015, 1...","[10904, 11201, 7537]"
4,288709,"[12986, 12089, 11037, 5944, 6199, 2927, 4614, ...","[12986, 8468, 12089, 2452, 12939, 4614, 10155,..."
...,...,...,...
60755,463377,"[8827, 13552, 11436, 11476, 9709, 13547, 7228,...","[8827, 11436, 11476, 13552, 12577, 13547, 1184..."
60756,463381,"[3720, 7222, 10412, 5393, 10621, 13736, 12579,...","[3720, 7222, 5393, 10412, 3158, 1342, 7812, 10..."
60757,463382,"[4834, 10418, 10883, 2028, 7618, 8107, 3739, 8...","[4834, 10418, 10883, 7618, 2028, 7391, 13306, ..."
60758,463390,"[11561, 2680, 5785, 1959, 9534, 6563, 6058, 98...","[11561, 2680, 1959, 5785, 9534, 5466, 6563, 47..."


In [28]:
# weighted reciprocal-rank ensemble of the two lists (re-ranker weighted slightly higher).
# Note: this rebinds the module-level `cols` that
# remove_last_history_from_candidates reads.
cols = ['rule_based_candidates', 're_rank_candidates']
weights = [0.49, 0.51]
ensemble_df['candidates'] = ensemble_df.progress_apply(lambda x: ensemble(x, cols, weights), axis=1)

100%|██████████| 60760/60760 [00:09<00:00, 6494.09it/s]


In [29]:
# recombine: rule-based output for sessions without re-rank predictions,
# ensembled output for the rest
cols = ['session_index', 'candidates']
pred_df = pd.concat([single_rule_based[cols], ensemble_df[cols]]).reset_index(drop=True)
# NOTE(review): ordering by session_index is assumed to reproduce the row
# order of sample_submission.csv — TODO confirm.
pred_df.sort_values('session_index', inplace=True)
pred_df = pred_df.reset_index(drop=True)

In [30]:
# sub
# sub: write the first 10 ensembled candidates per session as predict_0..predict_9
# (-1 fills a slot when a session has fewer candidates); rows are matched by index
sub = pd.read_csv('../input/sample_submission.csv')
for i in range(10):
    sub[f'predict_{i}'] = pred_df['candidates'].map(lambda x: x[i] if len(x) > i else -1)

sub.to_csv('output/sub_ensemble.csv', index=False)
sub.head(10)

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,4714,4420,11561,9534,4545,5466,5785,2680,6488
1,143,4066,613,11923,8108,7014,6129,6555,10095,11237
2,757,9190,7710,9910,410,1774,10485,13570,6721,3400
3,12341,6991,3359,13521,1542,4180,5080,10861,2363,10746
4,2862,9020,763,10826,11480,13235,5372,5650,9623,1448
5,13292,3811,11214,10857,7202,12785,5624,6178,3701,5066
6,11776,8041,8691,1462,850,28,3947,638,1089,2422
7,10904,11201,7537,2824,10606,768,5015,13347,2806,3483
8,3244,10541,682,3901,4522,3101,9717,2645,7102,13394
9,5203,12918,11586,2322,11450,4424,1013,11364,9405,5357
