# README

关键insight
- 不同session id要视为**独立的交互事件**，也即每一个session id都视为一个**新用户**
- 使用的数据集有三个：**train**（前三周），**val**（第四周），**test**（第五周）。三个集合内的session id**没有重复**
- 任务目标是**给定一批session的前半段，预测后半段**。
  - 数据中val切分成两段，**valA是前半段交互**（用于构建model input），**valB是后半段**（就是label）
  - test其实是第五周的**前半段**，后半段在leaderboard上，用于评分，我们不可见

数据集使用：
- otto-validation 分割成三部分，主要用于训练时eval
  - train_parquet==》train
  - test_parquet==》valA
  - test_labels==》valB
  
- otto-chunk-data-inparquet-format 预估submission时就是用的test
    - train_parquet==>train+valA+valB
    - test_parquet==>test
- otto-valid-test-list 包含valA和test集用户的前半段交互（相当于用户冷启部分）

# 函数定义

In [None]:
# Version tag interpolated into the pickle / parquet file names read in later cells.
VER = 1
import pandas as pd, numpy as np
import pickle, glob, gc

from collections import Counter
import itertools
from tqdm import tqdm
import os
from functools import partial

# multiprocessing
import psutil
# psutil.cpu_count() returns None when the count cannot be determined;
# fall back to a single worker instead of failing on min(None, 32).
N_CORES = min(psutil.cpu_count() or 1, 32)     # Worker processes to use, capped at 32
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool

In [None]:
# Integer code for each event type; load_files maps the raw `type` column through this dict.
type_labels = {'clicks':0, 'carts':1, 'orders':2}
# Weight looked up by type code when suggest_clicks / suggest_buys re-rank a session's history.
type_weight_multipliers = {0: 1, 1: 6, 2: 3}
# Passed as recall_num to predict_by_chunk; also the length of the popular-item fallback lists.
RECALL_NUM = 100
# Passed as chunk_num to predict_by_chunk and get_explode_candidates.
CHUNK_NUM = 5

In [None]:
def df_parallelize_run(func, t_split):
    """Apply ``func`` to every element of ``t_split`` in a process pool.

    Args:
        func: Picklable callable taking one element of ``t_split``.
        t_split: Sized sequence of work items.

    Returns:
        List of ``func`` results in the same order as ``t_split``;
        an empty list when ``t_split`` is empty.
    """
    n_items = len(t_split)
    if n_items == 0:
        # Pool(0) raises ValueError, and there is nothing to compute anyway.
        return []
    # Never start more workers than there are items to process.
    num_cores = int(min(N_CORES, n_items))
    # The context manager tears the pool down even if func raises in a worker.
    with Pool(num_cores) as pool:
        return pool.map(func, t_split)

In [None]:
def load_files(files):
    """Read every parquet file matching a glob pattern into one DataFrame.

    Each file is expected to have ``ts``, ``session``, ``aid`` and ``type``
    columns. ``ts`` is divided by 1000 and stored as int32 (presumably
    milliseconds to seconds; confirm against the data source), ``session``
    and ``aid`` are cast to int32, and ``type`` is mapped through
    ``type_labels`` and stored as int8.

    Args:
        files: Glob pattern of the parquet files to load.

    Returns:
        The concatenated frame with a fresh 0..n-1 index.

    Raises:
        ValueError: If no file matches ``files``.
    """
    dfs = []
    for chunk_file in glob.glob(files):
        chunk = pd.read_parquet(chunk_file)
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['session'] = chunk['session'].astype('int32')
        chunk['aid'] = chunk['aid'].astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    if not dfs:
        # pd.concat([]) would raise the much less helpful "No objects to concatenate".
        raise ValueError(f"No parquet files matched pattern {files!r}")
    return pd.concat(dfs).reset_index(drop=True)
def pqt_to_dict(df):
    """Collapse a pair table into a ``{aid_x: [aid_y, ...]}`` dictionary.

    Rows are grouped by ``aid_x`` and the ``aid_y`` values of each group are
    collected into a list in row order, e.g.::

        aid1: [aid2, aid3, aid4],
        aid2: [aid1, aid3, aid4]
    """
    neighbours_by_aid = df.groupby('aid_x')['aid_y'].apply(list)
    return neighbours_by_aid.to_dict()

召回click的函数

In [None]:
def suggest_clicks(df,recall_num=50):
    """Build up to ``recall_num`` click candidates for one session.

    Args:
        df: Indexable where ``df[0]`` is the session id, ``df[1]`` the
            session's aids and ``df[2]`` the matching type codes
            (presumably ordered oldest to newest; confirm with the
            producer of the pickled lists).
        recall_num: Maximum number of candidates to return.

    Returns:
        ``(session, candidates)``. Candidates contain no duplicates, so the
        list may be shorter than ``recall_num`` when the co-visitation and
        popular-item sources run out of new aids.
    """
    session = df[0]
    aids = df[1]
    types = df[2]
    # Distinct aids of the session, last occurrence first.
    unique_aids = list(dict.fromkeys(aids[::-1]))

    if len(unique_aids) >= recall_num:
        # Enough history: rank the session's own aids. Weights rise from
        # 2**0.1 - 1 to 1 along the history, so later events count more,
        # and each event is scaled by its type multiplier.
        weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        aid_scores = Counter()
        for aid, w, t in zip(aids, weights, types):
            aid_scores[aid] += w * type_weight_multipliers[t]
        return session, [aid for aid, _score in aid_scores.most_common(recall_num)]

    # Set for O(1) membership tests instead of scanning the list each time.
    seen = set(unique_aids)

    # Gather the click co-visitation neighbours of every aid in the session
    # and rank them by how often they occur.
    neighbours = itertools.chain.from_iterable(
        top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks
    )
    ranked_neighbours = [
        aid for aid, _count in Counter(neighbours).most_common(recall_num)
        if aid not in seen
    ]
    result = unique_aids + ranked_neighbours[:recall_num - len(unique_aids)]

    # Pad with globally popular clicks, skipping aids that are already
    # present so the candidate list never contains duplicates.
    seen.update(result)
    missing = recall_num - len(result)
    popular_fill = [aid for aid in top_clicks if aid not in seen][:missing]
    return session, result + popular_fill

## 召回buy和cart的函数
（因为cart和buy行为都很稀疏，共现矩阵把两个行为放在一起计算了）

In [None]:
def suggest_buys(df,recall_num=50):
    """Build up to ``recall_num`` cart/order candidates for one session.

    Args:
        df: Indexable where ``df[0]`` is the session id, ``df[1]`` the
            session's aids and ``df[2]`` the matching type codes
            (presumably ordered oldest to newest; confirm with the
            producer of the pickled lists).
        recall_num: Maximum number of candidates to return.

    Returns:
        ``(session, candidates)``. Candidates contain no duplicates, so the
        list may be shorter than ``recall_num`` when the co-visitation and
        popular-item sources run out of new aids.
    """
    session = df[0]
    aids = df[1]
    types = df[2]

    # Distinct aids of the session, last occurrence first.
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # Same, restricted to events whose type code is 1 or 2 (carts / orders).
    unique_buys = list(dict.fromkeys(
        [aid for aid, t in zip(aids, types) if t in (1, 2)][::-1]
    ))

    if len(unique_aids) >= recall_num:
        # Enough history: rank the session's own aids by position weight
        # times type multiplier.
        weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        aid_scores = Counter()
        for aid, w, t in zip(aids, weights, types):
            aid_scores[aid] += w * type_weight_multipliers[t]
        # Small bonus for every buy2buy neighbour of the carted/ordered aids.
        buy_neighbours = itertools.chain.from_iterable(
            top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy
        )
        for aid in buy_neighbours:
            aid_scores[aid] += 0.1
        return session, [aid for aid, _score in aid_scores.most_common(recall_num)]

    # Set for O(1) membership tests instead of scanning the list each time.
    seen = set(unique_aids)

    # Neighbours from the cart/order co-visitation matrix (all session aids) ...
    aids2 = itertools.chain.from_iterable(
        top_20_buys[aid] for aid in unique_aids if aid in top_20_buys
    )
    # ... followed by neighbours from the buy2buy matrix (carted/ordered aids only).
    aids3 = itertools.chain.from_iterable(
        top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy
    )
    neighbour_counts = Counter(itertools.chain(aids2, aids3))
    ranked_neighbours = [
        aid for aid, _count in neighbour_counts.most_common(recall_num)
        if aid not in seen
    ]
    result = unique_aids + ranked_neighbours[:recall_num - len(unique_aids)]

    # Pad with globally popular orders, skipping aids that are already
    # present so the candidate list never contains duplicates.
    seen.update(result)
    missing = recall_num - len(result)
    popular_fill = [aid for aid in top_orders if aid not in seen][:missing]
    return session, result + popular_fill

## 分chunk执行

In [None]:
def save_parquet(df,folder,name):
    """Write ``df`` to ``./{folder}/{name}.pqt``, creating the folder if needed.

    Args:
        df: Object exposing ``to_parquet(path)`` (a pandas DataFrame here).
        folder: Output folder relative to the working directory; an empty
            string means the working directory itself. May be nested.
        name: File name without the ``.pqt`` extension.
    """
    folder = "./" if folder == "" else f"./{folder}"
    # makedirs handles nested folders and does not fail if the folder
    # already exists (no window between an exists() check and the create).
    os.makedirs(folder, exist_ok=True)
    path = f"{folder}/{name}.pqt"
    df.to_parquet(path)
    print(f"Save file name = {path}")

def predict_by_chunk(bysession_list, folder, chunk_num=10, recall_num=50):
    """Generate click and buy candidates for all sessions and save them.

    The sessions are processed in roughly ``chunk_num`` chunks; each chunk is
    run through ``suggest_clicks`` and ``suggest_buys`` in parallel. The
    per-session candidate lists are stored in an ``aid_list`` column indexed
    by session and written to ``clicks_candidates.pqt`` and
    ``orders_candidates.pqt`` under ``folder`` via ``save_parquet``.

    Args:
        bysession_list: Sliceable sequence of per-session records as
            accepted by ``suggest_clicks`` / ``suggest_buys``.
        folder: Output folder passed to ``save_parquet``.
        chunk_num: Target number of chunks.
        recall_num: Maximum number of candidates per session.

    Raises:
        ValueError: If ``bysession_list`` is empty.
    """
    if len(bysession_list) == 0:
        raise ValueError("bysession_list is empty; nothing to predict")

    def split(list_a, chunk_num):
        # Ceiling division with a floor of 1: a floor division would give a
        # step of 0 (and a ValueError from range) when there are fewer
        # sessions than chunks.
        chunk_size = max(1, -(-len(list_a) // chunk_num))
        for i in range(0, len(list_a), chunk_size):
            yield list_a[i:i + chunk_size]

    def to_candidate_frame(results):
        # results is a list of (session, candidate aids) pairs.
        return pd.Series(
            [f[1] for f in results], index=[f[0] for f in results]
        ).to_frame().rename(columns={0: 'aid_list'})

    clicks = []
    buys = []
    for sub_list in tqdm(split(bysession_list, chunk_num=chunk_num)):
        ## clicks
        temp = df_parallelize_run(partial(suggest_clicks, recall_num=recall_num), sub_list)
        clicks.append(to_candidate_frame(temp))

        ## buys
        temp = df_parallelize_run(partial(suggest_buys, recall_num=recall_num), sub_list)
        buys.append(to_candidate_frame(temp))

    # Stitch the per-chunk frames back together.
    clicks = pd.concat(clicks)
    orders = pd.concat(buys)
    print(f"Get clicks candidates avg len = {np.mean(clicks.aid_list.apply(lambda x:len(x)))}")
    print(f"Get buys candidates avg len = {np.mean(orders.aid_list.apply(lambda x:len(x)))}")

    # Persist both candidate tables.
    save_parquet(clicks, folder, name="clicks_candidates")
    save_parquet(orders, folder, name="orders_candidates")

In [None]:
# Explode per-session candidate lists into one row per (session, aid) and save.
def get_explode_candidates(df,folder,file_name,chunk_num=10):
    """Explode the ``aid_list`` column chunk by chunk and save the result.

    Args:
        df: Frame indexed by session with an ``aid_list`` column of lists.
        folder: Output folder passed to ``save_parquet``.
        file_name: Output file name (without extension) passed to ``save_parquet``.
        chunk_num: Number of groups the distinct sessions are split into.
    """
    # Split the plain array of session ids rather than a DataFrame:
    # np.array_split on a DataFrame emits a FutureWarning on recent pandas.
    sessions = df.index.unique().to_numpy()
    user_base_explode_chunks = []
    for session_chunk in tqdm(np.array_split(sessions, chunk_num)):
        exploded = df[df.index.isin(session_chunk)].explode("aid_list")
        user_base_explode_chunks.append(exploded.rename(columns={"aid_list": "aid"}))
    user_base_exploded = pd.concat(user_base_explode_chunks)
    save_parquet(user_base_exploded,folder,file_name)

# Valid用户召回

## load数据
val部分用户前半周的交互，用于召回的输入

In [None]:
%%time
# Load the pickled validation session records, stored as PIECES shard files,
# into one flat list (the input later handed to predict_by_chunk).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
PIECES = 5
valid_bysession_list = []
for PART in range(PIECES):
    with open(f'../input/otto-valid-test-list/valid_group_tolist_{PART}_{VER}.pkl', 'rb') as f:
        valid_bysession_list.extend(pickle.load(f))
# Number of records loaded.
print(len(valid_bysession_list))

In [None]:
%%time
# Number of shard files for the clicks and carts/orders matrices.
DISK_PIECES = 4
# LOAD THREE CO-VISITATION MATRICES
# Clicks matrix: shard 0 creates the dict, the remaining shards are merged in.
top_20_clicks = pqt_to_dict( pd.read_parquet(f'../input/otto-co-visitation-matrices/top_20_valid_clicks_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_clicks.update( pqt_to_dict( pd.read_parquet(f'../input/otto-co-visitation-matrices/top_20_valid_clicks_v{VER}_{k}.pqt') ) )

# Carts/orders matrix (note: the files are named top_15_*, the variable top_20_*).
top_20_buys = pqt_to_dict( pd.read_parquet(f'../input/otto-co-visitation-matrices/top_15_valid_carts_orders_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_buys.update( pqt_to_dict( pd.read_parquet(f'../input/otto-co-visitation-matrices/top_15_valid_carts_orders_v{VER}_{k}.pqt') ) )

# Buy2buy matrix: only shard 0 is read.
top_20_buy2buy = pqt_to_dict( pd.read_parquet(f'../input/otto-co-visitation-matrices/top_15_valid_buy2buy_v{VER}_0.pqt') )

# TOP CLICKS AND ORDERS IN ValidA
# The RECALL_NUM most frequent clicked / ordered aids, used as fallback candidates.
valid = load_files('../input/otto-validation/test_parquet/*')
top_clicks = valid.loc[valid['type']==type_labels['clicks'], 'aid'].value_counts().index.values[:RECALL_NUM]
top_orders = valid.loc[valid['type']==type_labels['orders'], 'aid'].value_counts().index.values[:RECALL_NUM]
# The raw events are no longer needed once the popular lists are built.
del valid
_ = gc.collect()

print('Here are size of our 3 co-visitation matrices:')
print(f"top_20_clicks = {len(top_20_clicks)}, top_20_buy2buy = {len(top_20_buy2buy)}, top_20_buys = {len(top_20_buys)}")
print(f"top_clicks = {len(top_clicks)}, top_orders = {len(top_orders)}")

## 执行召回

In [None]:
%%time
# Recall click and buy candidates for the validation sessions; results go to ./val/.
predict_by_chunk(valid_bysession_list,"val",chunk_num=CHUNK_NUM,recall_num=RECALL_NUM)

## explode

In [None]:
# Reload the saved validation click candidates and write the exploded
# (one row per session/aid) version next to them, then free the frame.
val_click_candidates = pd.read_parquet('./val/clicks_candidates.pqt')
get_explode_candidates(val_click_candidates,'val','clicks_exploded_candidates',chunk_num=CHUNK_NUM)
del val_click_candidates
_ = gc.collect()

In [None]:
# Reload the saved validation order candidates and write the exploded
# (one row per session/aid) version next to them, then free the frame.
val_order_candidates = pd.read_parquet('./val/orders_candidates.pqt')
get_explode_candidates(val_order_candidates,'val','orders_exploded_candidates',chunk_num=CHUNK_NUM)
del val_order_candidates
_ = gc.collect()

In [None]:
# FREE MEMORY
# Drop the validation inputs and lookup tables before the test-set versions
# are loaded under the same names.
del valid_bysession_list
del top_20_clicks, top_20_buy2buy, top_20_buys, top_clicks, top_orders
_ = gc.collect()

# Test用户召回

## load数据
load物料，co-visitation矩阵，test set时期的近期高热。作为候选

In [None]:
%%time
# Load the test-period co-visitation matrices and popular items.
# Number of shard files for the clicks and carts/orders matrices.
DISK_PIECES = 4
# Clicks matrix: shard 0 creates the dict, the remaining shards are merged in.
top_20_clicks = pqt_to_dict( pd.read_parquet(f'../input/otto-co-visitation-matrices/top_20_test_clicks_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_clicks.update( pqt_to_dict( pd.read_parquet(f'../input/otto-co-visitation-matrices/top_20_test_clicks_v{VER}_{k}.pqt') ) )

# Carts/orders matrix (note: the files are named top_15_*, the variable top_20_*).
top_20_buys = pqt_to_dict( pd.read_parquet(f'../input/otto-co-visitation-matrices/top_15_test_carts_orders_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_buys.update( pqt_to_dict( pd.read_parquet(f'../input/otto-co-visitation-matrices/top_15_test_carts_orders_v{VER}_{k}.pqt') ) )
    
# Buy2buy matrix: only shard 0 is read.
top_20_buy2buy = pqt_to_dict( pd.read_parquet(f'../input/otto-co-visitation-matrices/top_15_test_buy2buy_v{VER}_0.pqt') )

# TOP CLICKS AND ORDERS IN TEST
# The RECALL_NUM most frequent clicked / ordered aids, used as fallback candidates.
test = load_files('../input/otto-chunk-data-inparquet-format/test_parquet/*')
top_clicks = test.loc[test['type']==type_labels['clicks'], 'aid'].value_counts().index.values[:RECALL_NUM]
top_orders = test.loc[test['type']==type_labels['orders'], 'aid'].value_counts().index.values[:RECALL_NUM]
# The raw events are no longer needed once the popular lists are built.
del test
_ = gc.collect()

print('Here are size of our 3 co-visitation matrices:')
print(f"top_20_clicks = {len(top_20_clicks)}, top_20_buy2buy = {len(top_20_buy2buy)}, top_20_buys = {len(top_20_buys)}")
print(f"top_clicks = {len(top_clicks)}, top_orders = {len(top_orders)}")

In [None]:
%%time
# Load the pickled test session records, stored as PIECES shard files,
# into one flat list (the input later handed to predict_by_chunk).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
PIECES = 5
test_bysession_list = []
for PART in range(PIECES):
    with open(f'../input/otto-valid-test-list/test_group_tolist_{PART}_{VER}.pkl', 'rb') as f:
        test_bysession_list.extend(pickle.load(f))
print(f"Test user cnt = {len(test_bysession_list)}")

## 执行召回

In [None]:
%%time
# Recall both candidate sets ([click] and [buy + cart]) for the test sessions; results go to ./test/.
predict_by_chunk(test_bysession_list,"test",chunk_num=CHUNK_NUM,recall_num=RECALL_NUM)

## explode

In [None]:
# Reload the saved test click candidates and write the exploded
# (one row per session/aid) version next to them, then free the frame.
test_click_candidates = pd.read_parquet('./test/clicks_candidates.pqt')
get_explode_candidates(test_click_candidates,'test','clicks_exploded_candidates',chunk_num=CHUNK_NUM)
del test_click_candidates
_ = gc.collect()

In [None]:
# Reload the saved test order candidates and write the exploded
# (one row per session/aid) version next to them, then free the frame.
test_order_candidates = pd.read_parquet('./test/orders_candidates.pqt')
get_explode_candidates(test_order_candidates,'test','orders_exploded_candidates',chunk_num=CHUNK_NUM)
del test_order_candidates
_ = gc.collect()

In [None]:
# FREE MEMORY
# Drop the test inputs and lookup tables now that the candidates are on disk.
del test_bysession_list
del top_20_clicks, top_20_buy2buy, top_20_buys, top_clicks, top_orders
_ = gc.collect()

# Metric
评估单纯召回的效果

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reference recall values printed next to the measured ones for comparison.
benchmark = {"clicks":0.5255597442145808, "carts":0.4093328152483512, "orders":0.6487936598117477, "all":.5646320148830121}
# Per-type weights applied by otto_metric when combining the three recalls.
weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}

# Ground-truth labels for the validation sessions; otto_metric_piece reads
# the `session`, `type` and `ground_truth` columns.
valid_labels = pd.read_parquet('../input/otto-validation/test_labels.parquet')

def hits(b):
    """Count correct predictions for one session.

    Args:
        b: Indexable with ``b[0]`` the session id, ``b[1]`` the ground-truth
            aids and ``b[2]`` the predicted aids. ``b[2]`` may be ``None`` or
            NaN when the session has no predictions (a left merge leaves a
            float NaN in that case).

    Returns:
        ``(session id, number of distinct ground-truth aids that were
        predicted, number of ground-truth aids capped at 20)``.
    """
    predicted = b[2]
    if predicted is None or isinstance(predicted, float):
        # A missing prediction counts as zero hits instead of crashing on set(NaN).
        predicted = ()
    return b[0], len(set(b[1]).intersection(predicted)), np.clip(len(b[1]), 0, 20)

def otto_metric_piece(values, typ, verbose=True):
    """Compute the recall of the predictions for one event type.

    Args:
        values: Predictions indexed by session with a ``labels`` column
            holding each session's list of predicted aids.
        typ: Event type name used to filter ``valid_labels`` and to look up
            the benchmark value (``"clicks"``, ``"carts"`` or ``"orders"``).
        verbose: When true, print the recall next to the benchmark value.

    Returns:
        Total hits divided by the total number of ground-truth aids, where
        each session's ground-truth count is capped at 20 by ``hits``.
    """
    # Move the session index into a column so it can be used as the merge key.
    c1 = pd.DataFrame(values, columns=["labels"]).reset_index().rename({"index":"session"}, axis=1)
    # Left merge: every labelled session is kept, with or without predictions.
    a = valid_labels.loc[valid_labels['type']==typ].merge(c1, how='left', on=['session'])

    b = [[a0, a1, a2] for a0, a1, a2 in zip(a["session"], a["ground_truth"], a["labels"])]
    # Rows of (session, hits, capped ground-truth count).
    c = np.array(df_parallelize_run(hits, b))

    recall = c[:,1].sum() / c[:,2].sum()

    # Honour the verbose flag instead of printing unconditionally.
    if verbose:
        print('{} recall = {:.5f} (vs {:.5f} in benchmark)'.format(typ ,recall, benchmark[typ]))

    return recall

def otto_metric(clicks, carts, orders, verbose = True):
    """Return the weighted sum of the clicks, carts and orders recalls.

    Each argument is passed to ``otto_metric_piece`` for its event type and
    the resulting recall is scaled by the matching entry of ``weights``.
    When ``verbose`` is true the overall score is printed next to the
    benchmark value.
    """
    predictions_by_type = (
        ("clicks", clicks),
        ("carts", carts),
        ("orders", orders),
    )

    score = 0
    # Accumulate in a fixed clicks -> carts -> orders order.
    for typ, predictions in predictions_by_type:
        score += weights[typ] * otto_metric_piece(predictions, typ, verbose = verbose)

    if not verbose:
        return score

    separator = '============='
    print(separator)
    print('Overall Recall = {:.5f} (vs {:.5f} in benchmark)'.format(score, benchmark["all"]))
    print(separator)
    return score

In [None]:
# Reload the validation candidates, renaming the column to the `labels`
# name that otto_metric_piece expects.
val_clicks = pd.read_parquet("./val/clicks_candidates.pqt").rename(columns={'aid_list':'labels'})
val_orders = pd.read_parquet("./val/orders_candidates.pqt").rename(columns={'aid_list':'labels'})
# Keep only the first 20 candidates of each session.
val_clicks.labels = val_clicks.labels.apply(lambda x: x[:20])
val_orders.labels = val_orders.labels.apply(lambda x: x[:20])
# Compute the per-type recall and the overall score; the same buy
# candidates are scored as both the carts and the orders predictions.
_ = otto_metric(val_clicks, val_orders, val_orders)

# 提交
直接提交基于recall的test。用于debug

In [None]:
# import pandas as pd
# import numpy as np
# def to_submission_format(df,suffix):
#     sub = df.reset_index().rename(columns={'index':'session_type','aid_list':'labels'})
#     sub.labels = sub.labels.apply(lambda x: " ".join(map(str,x[:20])))
#     sub.session_type = sub.session_type.astype('str')+ f'_{suffix}'
#     return sub
# test_clicks = pd.read_parquet("./test/clicks_candidates.pqt")
# test_orders = pd.read_parquet("./test/orders_candidates.pqt")
# clicks_sub = to_submission_format(test_clicks,'clicks')
# carts_sub = to_submission_format(test_orders,'carts')
# orders_sub = to_submission_format(test_orders,'orders')
# sub = pd.concat([clicks_sub, carts_sub, orders_sub])
# assert sub.shape[0]==1671803*3

# sub.to_csv("submission.csv", index=False)
# sub.head(10)

# 创建数据集