In [None]:
VER = 1
import pandas as pd, numpy as np
import pickle, glob, gc

from collections import Counter
import itertools
import os
from tqdm import tqdm
import time

# multiprocessing 
import psutil
N_CORES = min(psutil.cpu_count(),32)     # Available CPU cores
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool
from matplotlib import pyplot as plt

import lightgbm as lgb
from lightgbm.sklearn import LGBMRanker
from sklearn.model_selection import GroupKFold

# Free-text experiment tag; it is embedded in the model file names that the
# training cells write and that predict_save reads back
# (LGBM_fold{fold}_{TARGET}_{REMARKS}.txt).
REMARKS = "过滤所有无交互"
# Number of GroupKFold splits used for training, and the number of per-fold
# models whose predictions predict_save averages.
KFOLDS = 3

# Train

In [None]:
def filter_out_neg_sessions(train_set,target,frac=0.9,random_state=None):
    """Drop a random share of the sessions that contain no positive `target` row.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Candidate rows; must contain a ``session`` column and the ``target`` column.
    target : str
        Name of the label column. It is summed per session; a sum of 0 marks
        the session as having no positive row.
    frac : float, default 0.9
        Fraction of the all-negative sessions to remove (``frac=1`` removes all
        of them, ``frac=0`` removes none).
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``sample`` so the dropped subset can be reproduced.
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The rows of ``train_set`` whose session was not selected for removal.
    """
    # Total of the target per session; 0 means the session has no positive row.
    session_target_sum = train_set.groupby('session')[target].sum()
    all_negative_sessions = session_target_sum[session_target_sum == 0]
    # Only `frac` of the all-negative sessions is dropped; the rest is kept.
    dropped_sessions = all_negative_sessions.sample(frac=frac, random_state=random_state).index
    return train_set[~train_set['session'].isin(dropped_sessions)]

def neg_sample(train_set,target,neg_sample_ratio):
    """Down-sample negative rows to roughly ``neg_sample_ratio`` negatives per positive.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Rows with a ``session`` column, an ``aid`` column and the ``target`` column.
    target : str
        Label column; rows equal to 1 are positives, rows equal to 0 are negatives.
    neg_sample_ratio : int or float
        Desired number of negatives per positive (positive : negative = 1 : ratio).

    Returns
    -------
    pandas.DataFrame
        All positives plus the sampled negatives, shuffled and then sorted by
        ``session`` (the index is rebuilt by the concat and permuted by the
        shuffle/sort).

    Side effects
    ------------
    Draws and shows a histogram of the number of rows per session.
    """
    positives = train_set[train_set[target]==1]
    negatives = train_set[train_set[target]==0]
    # `sample(n=...)` raises when n is not an integer or exceeds the number of
    # available rows, so truncate to int and cap at the negatives we have.
    n_wanted = int(len(positives) * neg_sample_ratio)
    n_negatives = min(n_wanted, len(negatives))
    negatives = negatives.sample(n=n_negatives)
    # Shuffle first so rows inside a session are in random order, then bring
    # the rows of each session together again.
    train_set = pd.concat([positives,negatives],axis=0,ignore_index=True).sample(frac=1).sort_values(['session'])
    # Visual sanity check: distribution of rows per session after sampling.
    train_set.groupby('session').aid.count().hist()
    plt.show()
    return train_set

In [None]:
# Hyper-parameters passed unchanged to LGBMRanker(**params) in the training cells.
params = {
    'device':'gpu',
    'learning_rate': 0.1,
    'max_depth': 10,
    'num_leaves':256,
    'early_stopping_round':10,
    'objective':"lambdarank",
    'metric':"ndcg",
    'colsample_bytree': 0.9,
    'subsample':0.9,
    # LightGBM only performs row bagging when the bagging frequency is non-zero;
    # with the default of 0 the subsample=0.9 setting above is silently ignored.
    'subsample_freq':1,
    'boosting_type':"gbdt",
    'n_estimators':20,
    'importance_type':'gain'
}

Train the carts model

In [None]:
%%time
# Load the training candidates for this target
TARGET = 'carts' # carts/orders
print(f"TARGET = {TARGET}")

train_set = pd.read_parquet("../feature/less_train_with_feature.pqt")
# frac=1: drop every session that has no positive row for TARGET
train_set = filter_out_neg_sessions(train_set,TARGET,frac=1)
# LightGBM reads `group` as the sizes of consecutive row blocks. The sizes
# below come from a groupby, which orders them by session id, so the rows
# themselves must be ordered by session as well or the blocks are misaligned.
if not train_set['session'].is_monotonic_increasing:
    train_set = train_set.sort_values('session', kind='stable')
display(train_set.head(10))

print(f"Session avg aid len = {np.mean(train_set.groupby('session').aid.nunique())}")
print(f"Session avg {TARGET} num = {train_set.groupby('session')[TARGET].sum().mean()}")
print(f"Train with {train_set.session.nunique()} users")

# NOTE(review): assumes the first five columns are identifiers/labels and every
# later column is a model feature - confirm against the feature file.
FEATURES = train_set.columns[5:]
print(f"FEATURES = {FEATURES}")
# Train one ranker per fold; sessions never straddle the train/valid boundary
skf = GroupKFold(n_splits=KFOLDS)
fold_splits = skf.split(train_set, train_set[TARGET], groups=train_set['session'])
for fold,(train_idx, valid_idx) in tqdm(enumerate(fold_splits), total=KFOLDS):

    # Slice each side once instead of re-indexing the full frame per variable
    train_part = train_set.iloc[train_idx]
    valid_part = train_set.iloc[valid_idx]
    X_train = train_part[FEATURES]
    y_train = train_part[TARGET]
    X_valid = valid_part[FEATURES]
    y_valid = valid_part[TARGET]
    # Rows per session, in ascending session order (matches the row order)
    group_train = train_part.groupby('session')['session'].count()
    group_valid = valid_part.groupby('session')['session'].count()
    # The full-width slices are no longer needed; release them before fitting
    del train_part, valid_part
    print(train_idx.shape,X_train.shape,y_train.shape)
    ranker = LGBMRanker(
        **params
    )
    ranker = ranker.fit(
        X_train,
        y_train,
        group=group_train,
        eval_set=[(X_train,y_train),(X_valid, y_valid)],
        eval_group=[group_train,group_valid],
        eval_at=(1,5,10,20)
    )
    lgb.plot_metric(ranker)
    lgb.plot_importance(ranker,max_num_features=20)
    plt.show()
    # predict_save later loads the model back using the same file name pattern
    ranker.booster_.save_model(f'LGBM_fold{fold}_{TARGET}_{REMARKS}.txt')

Train the orders model

In [None]:
%%time
# train lgb model
import lightgbm as lgb
from lightgbm.sklearn import LGBMRanker
from sklearn.model_selection import GroupKFold

# Load the training candidates for this target
TARGET = 'orders' # carts/orders
print(f"TARGET = {TARGET}")

train_set = pd.read_parquet("../feature/less_train_with_feature.pqt")
# frac=1: drop every session that has no positive row for TARGET
train_set = filter_out_neg_sessions(train_set,TARGET,frac=1)
# LightGBM reads `group` as the sizes of consecutive row blocks. The sizes
# below come from a groupby, which orders them by session id, so the rows
# themselves must be ordered by session as well or the blocks are misaligned.
if not train_set['session'].is_monotonic_increasing:
    train_set = train_set.sort_values('session', kind='stable')
display(train_set.head(10))

print(f"Session avg aid len = {np.mean(train_set.groupby('session').aid.nunique())}")
print(f"Session avg {TARGET} num = {train_set.groupby('session')[TARGET].sum().mean()}")
print(f"Train with {train_set.session.nunique()} users")

# NOTE(review): assumes the first five columns are identifiers/labels and every
# later column is a model feature - confirm against the feature file.
FEATURES = train_set.columns[5:]
print(f"FEATURES = {FEATURES}")

# Train one ranker per fold; sessions never straddle the train/valid boundary
skf = GroupKFold(n_splits=KFOLDS)
fold_splits = skf.split(train_set, train_set[TARGET], groups=train_set['session'])
for fold,(train_idx, valid_idx) in tqdm(enumerate(fold_splits), total=KFOLDS):

    # Slice each side once instead of re-indexing the full frame per variable
    train_part = train_set.iloc[train_idx]
    valid_part = train_set.iloc[valid_idx]
    X_train = train_part[FEATURES]
    y_train = train_part[TARGET]
    X_valid = valid_part[FEATURES]
    y_valid = valid_part[TARGET]
    # Rows per session, in ascending session order (matches the row order)
    group_train = train_part.groupby('session')['session'].count()
    group_valid = valid_part.groupby('session')['session'].count()
    # The full-width slices are no longer needed; release them before fitting
    del train_part, valid_part
    print(train_idx.shape,X_train.shape,y_train.shape)

    ranker = LGBMRanker(
        **params
    )
    ranker = ranker.fit(
        X_train,
        y_train,
        group=group_train,
        eval_set=[(X_train,y_train),(X_valid, y_valid)],
        eval_group=[group_train,group_valid],
        eval_at=(1,5,10,20)
    )
    lgb.plot_metric(ranker)
    lgb.plot_importance(ranker,max_num_features=20)
    plt.show()
    # predict_save later loads the model back using the same file name pattern
    ranker.booster_.save_model(f'LGBM_fold{fold}_{TARGET}_{REMARKS}.txt')

In [None]:
# Drop the training frame and force a garbage-collection pass before the
# evaluation / prediction cells run.
# NOTE(review): re-running this cell on its own raises NameError because
# train_set no longer exists; the per-fold slices (X_train, X_valid, ...) from
# the last training loop are still referenced and are not released here.
del train_set
_ = gc.collect()

# Metric

In [None]:
# eval lgb model
import lightgbm as lgb
from lightgbm.sklearn import LGBMRanker

def df_parallelize_run(func, t_split):
    """Apply ``func`` to every element of ``t_split`` in a process pool.

    Parameters
    ----------
    func : callable
        Function of one argument; it is sent to worker processes by
        ``Pool.map``, so it has to be picklable (a module-level function).
    t_split : sized iterable
        Work items; ``len(t_split)`` must be defined.

    Returns
    -------
    list
        ``func`` results in the order of ``t_split`` (a plain list, not a
        DataFrame, despite the function name).
    """
    n_items = len(t_split)
    if n_items == 0:
        # Pool(0) raises ValueError; there is nothing to compute anyway.
        return []
    # Never start more workers than there are items; fall back to one worker
    # when the core count is unknown (None).
    num_cores = int(min(N_CORES or 1, n_items))
    # The context manager shuts the pool down even if a worker raises, so no
    # processes are leaked on failure.
    with Pool(num_cores) as pool:
        return pool.map(func, t_split)

def predict_save(test_set,features,target,save_file_prefix='test',top_k=20):
    """Score candidates with the per-fold models and write the top-k per session.

    The KFOLDS models saved as ``LGBM_fold{fold}_{target}_{REMARKS}.txt`` are
    loaded one by one and their predictions averaged. Two CSV files are written:

    * ``{save_file_prefix}_intermediate_{target}_predictions.csv`` - one row per
      kept candidate with columns session, aid, pred, n (rank inside the session)
    * ``{save_file_prefix}_{target}_predictions.csv`` - one row per session with
      columns ``session_type`` (``"<session>_<target>"``) and ``labels``
      (space-separated aids, best first)

    Parameters
    ----------
    test_set : pandas.DataFrame
        Candidate rows with ``session`` and ``aid`` columns plus the feature columns.
    features : sequence of column labels
        Columns of ``test_set`` fed to the models.
        NOTE(review): they are passed in the order given; confirm that this
        matches the column order used when the models were trained.
    target : str
        Target name used in the model and output file names.
    save_file_prefix : str, default 'test'
        Prefix of the two output files.
    top_k : int, default 20
        Maximum number of aids kept per session.

    Returns
    -------
    None
    """
    print(f"Predicting using features {features}")
    # Select the feature columns once instead of once per fold.
    feature_frame = test_set[features]
    preds = np.zeros(len(test_set))
    for fold in tqdm(range(KFOLDS)):
        model = lgb.Booster(model_file=f'LGBM_fold{fold}_{target}_{REMARKS}.txt')
        # Plain average of the fold models
        preds += model.predict(feature_frame)/KFOLDS
    predictions = test_set[['session','aid']].copy()
    predictions['pred'] = preds

    # Best candidate first inside every session
    predictions = predictions.sort_values(['session','pred'], ascending=[True,False]).reset_index(drop=True)
    # Rank candidates inside each session and filter BEFORE narrowing the dtype:
    # casting the raw rank to int8 wraps around for sessions with more than 127
    # candidates, and the wrapped (negative) ranks would pass the `< top_k` test.
    rank = predictions.groupby('session').cumcount()
    keep = rank < top_k
    predictions = predictions.loc[keep].copy()
    # Kept ranks are at most top_k-1; int8 is only safe up to 127.
    rank_dtype = 'int8' if top_k <= 128 else 'int32'
    predictions['n'] = rank[keep].astype(rank_dtype)
    predictions.to_csv(f"{save_file_prefix}_intermediate_{target}_predictions.csv",index=False)
    # Collapse to one row per session with a space-separated aid string
    sub = predictions.groupby('session').aid.apply(list)
    sub = sub.to_frame().reset_index()
    sub.aid = sub.aid.apply(lambda x: " ".join(map(str,x)))
    sub.columns = ['session_type','labels']
    sub.session_type = sub.session_type.astype('str')+ f'_{target}'
    sub.to_csv(f"{save_file_prefix}_{target}_predictions.csv",index=False)
    display(sub.head(10))

In [None]:
# Reference recall per event type (and overall); printed next to the computed
# values by otto_metric_piece / otto_metric for comparison.
benchmark = {
    "clicks": 0.5255597442145808,
    "carts": 0.4093328152483512,
    "orders": 0.6487936598117477,
    "all": 0.5646320148830121,
}
# Weight of each event type's recall in the overall score (see otto_metric).
weights = {
    "clicks": 0.10,
    "carts": 0.30,
    "orders": 0.60,
}

# Validation ground truth; otto_metric_piece filters it by 'type' and joins it
# with the predictions on 'session'.
valid_labels = pd.read_parquet('../input/otto-validation/test_labels.parquet')

def df_parallelize_run(func, t_split):
    """Apply ``func`` to every element of ``t_split`` in a process pool.

    Parameters
    ----------
    func : callable
        Function of one argument; it is sent to worker processes by
        ``Pool.map``, so it has to be picklable (a module-level function).
    t_split : sized iterable
        Work items; ``len(t_split)`` must be defined.

    Returns
    -------
    list
        ``func`` results in the order of ``t_split`` (a plain list, not a
        DataFrame, despite the function name).
    """
    n_items = len(t_split)
    if n_items == 0:
        # Pool(0) raises ValueError; there is nothing to compute anyway.
        return []
    # Never start more workers than there are items; fall back to one worker
    # when the core count is unknown (None).
    num_cores = int(min(N_CORES or 1, n_items))
    # The context manager shuts the pool down even if a worker raises, so no
    # processes are leaked on failure.
    with Pool(num_cores) as pool:
        return pool.map(func, t_split)

def hits(b):
    """Count the ground-truth aids of one session that were predicted.

    Parameters
    ----------
    b : sequence of length 3
        ``b[0]`` session id, ``b[1]`` ground-truth aids, ``b[2]`` predicted
        aids. ``b[2]`` may be ``None``/NaN when the session has no prediction
        (the left merge in otto_metric_piece produces NaN in that case).

    Returns
    -------
    tuple
        ``(session id, number of hits, number of ground-truth aids capped at 20)``.
    """
    top_k = 20
    session, ground_truth, predicted = b[0], b[1], b[2]
    # A missing prediction is a float NaN (or None), which set() cannot
    # iterate; it simply contributes zero hits.
    if predicted is None or isinstance(predicted, float):
        predicted = ()
    # Only the first top_k predictions may score, consistent with the
    # denominator being capped at top_k.
    considered = list(predicted)[:top_k]
    n_hits = len(set(ground_truth).intersection(considered))
    return session, n_hits, min(len(ground_truth), top_k)

def otto_metric_piece(values, typ, verbose=True):
    '''Recall of the predictions for one event type.

    Parameters
    ----------
    values :
        Predictions; turned into a frame with a single ``labels`` column whose
        index becomes the ``session`` column.
        NOTE(review): presumably a DataFrame indexed by session id whose
        ``labels`` cells are lists of aids - confirm against the caller.
    typ : str
        Event type; selects rows of ``valid_labels`` and the key of ``benchmark``.
    verbose : bool, default True
        Print the recall next to the benchmark value.

    Returns
    -------
    float or int
        ``hits / min(len(ground truth), 20)`` summed over sessions, ``0.0`` when
        there is no ground truth, or ``-1`` if counting the hits failed (the
        error is printed).
    '''
    pred = pd.DataFrame(values, columns=["labels"]).reset_index().rename({"index":"session"}, axis=1)
    # Left merge: sessions without a prediction keep NaN in `labels`
    merged = valid_labels[valid_labels['type']==typ].merge(pred, how='left', on=['session'])

    tasks = [[session, truth, labels] for session, truth, labels in zip(merged["session"], merged["ground_truth"], merged["labels"])]
    try:
        counted = df_parallelize_run(hits, tasks)
    except Exception as e:
        print(f'Error {e}, labels = {merged.labels}')
        return -1
    # One row per session: (session, hits, capped ground-truth size). The
    # reshape keeps the column indexing valid even when there are no rows.
    counted = np.array(counted).reshape(-1, 3)

    n_hits = counted[:,1].sum()
    n_truth = counted[:,2].sum()
    # Avoid a 0/0 division (nan plus a RuntimeWarning) when nothing is labelled
    recall = n_hits / n_truth if n_truth > 0 else 0.0

    if verbose:
        print('{} recall = {:.5f} (vs {:.5f} in benchmark)'.format(typ ,recall, benchmark[typ]))

    return recall


def otto_metric(clicks, carts, orders, verbose = True):
    """Weighted sum of the per-type recalls.

    Each of ``clicks``, ``carts`` and ``orders`` is handed to
    otto_metric_piece together with its type name; the results are combined
    with the module-level ``weights``. With ``verbose`` the overall value is
    printed next to ``benchmark["all"]``.

    Returns the weighted score.
    """
    predictions_by_type = (
        ("clicks", clicks),
        ("carts", carts),
        ("orders", orders),
    )

    score = 0
    for typ, values in predictions_by_type:
        score += weights[typ] * otto_metric_piece(values, typ, verbose = verbose)

    if not verbose:
        return score

    separator = '============='
    print(separator)
    print('Overall Recall = {:.5f} (vs {:.5f} in benchmark)'.format(score, benchmark["all"]))
    print(separator)
    return score

In [None]:
# %%time
# train_set = pd.read_parquet(f"../feature/less_train_with_feature.pqt")
# FEATURES = train_set.columns[5:]
# print('Predicting carts...')
# predict_save(train_set,FEATURES,'carts','val')
# print('Predicting orders...')
# predict_save(train_set,FEATURES,'orders','val')

# print('Predicting clicks')
# clicks_candidates = pd.read_parquet("../recall/val/clicks_candidates.pqt")
# sub = clicks_candidates.reset_index().rename(columns={'index':'session_type','aid_list':'labels'})
# sub.labels = sub.labels.apply(lambda x: " ".join(map(str,x[:20])))
# sub.session_type = sub.session_type.astype('str')+ f'_clicks'
# sub.to_csv("./val_clicks_predictions.csv",index=False)
# display(sub.head(10))

In [None]:
# # 计算recall rate和总分数
# val_clicks = pd.read_csv("./val_clicks_predictions.csv")
# val_clicks.columns = ['session','labels']
# val_clicks.session = val_clicks.session.apply(lambda x:int(x.split('_')[0]))
# val_clicks.labels = val_clicks.labels.apply(lambda x:list(map(int,x.split(' '))))
# val_clicks = val_clicks.set_index(['session'])
# val_carts = pd.read_csv("./val_carts_predictions.csv")
# val_carts.columns = ['session','labels']
# val_carts.session = val_carts.session.apply(lambda x:int(x.split('_')[0]))
# val_carts.labels = val_carts.labels.apply(lambda x:list(map(int,x.split(' '))))
# val_carts = val_carts.set_index(['session'])
# val_orders = pd.read_csv("./val_orders_predictions.csv")
# val_orders.columns = ['session','labels']
# val_orders.session = val_orders.session.apply(lambda x:int(x.split('_')[0]))
# val_orders.labels = val_orders.labels.apply(lambda x:list(map(int,x.split(' '))))
# val_orders = val_orders.set_index(['session'])
# _ = otto_metric(val_clicks, val_carts, val_orders)

# Test

In [None]:
VER = 1
import pandas as pd, numpy as np
import pickle, glob, gc

from collections import Counter
import itertools
import os
from tqdm import tqdm
import time

# multiprocessing 
import psutil
# Same 32-process cap as the configuration cell at the top of the notebook, so
# running this section does not change the pool size used by
# df_parallelize_run. cpu_count() may return None when the count is unknown.
N_CORES = min(psutil.cpu_count() or 1, 32)     # Available CPU cores
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool

使用同一批候选，但分别用 orders 和 carts 两个模型进行预测

In [None]:
# Number of distinct sessions the feature file is required to contain.
EXPECTED_TEST_SESSIONS = 1671803

test_set = pd.read_parquet("../feature/less_test_with_feature.pqt")
# Fail early, and say how many sessions were actually found, when the feature
# generation step dropped or duplicated sessions.
n_test_sessions = test_set.session.nunique()
assert n_test_sessions==EXPECTED_TEST_SESSIONS, (
    f"User cnt is wrong (got {n_test_sessions}, expected {EXPECTED_TEST_SESSIONS}), "
    "please check your feature generation part!"
)
test_set.head(10)

In [None]:
%%time
# Every column after the first two is handed to the models.
# NOTE(review): the training cells take columns[5:] of the train file; confirm
# both selections yield the same features in the same order.
FEATURES = test_set.columns[2:]
for message, target in (('Predicting carts...', 'carts'), ('Predicting orders...', 'orders')):
    print(message)
    predict_save(test_set, FEATURES, target, 'test')

# Clicks are not re-ranked by a model: the first 20 recalled candidates of each
# session are written out directly.
clicks_candidates = pd.read_parquet("../recall/test/clicks_candidates.pqt")
sub = clicks_candidates.reset_index().rename(columns={'index':'session_type','aid_list':'labels'})
sub = sub.assign(
    labels=sub['labels'].apply(lambda aids: " ".join(str(aid) for aid in aids[:20])),
    session_type=sub['session_type'].astype('str') + '_clicks',
)
sub.to_csv("test_clicks_predictions.csv",index=False)
display(sub.head(10))

# 提交

In [None]:
%%time
import pandas as pd
# Stack the three per-type prediction files (clicks, carts, orders - in that
# order) into the single submission file.
prediction_files = (
    "test_clicks_predictions.csv",
    "test_carts_predictions.csv",
    "test_orders_predictions.csv",
)
clicks, carts, orders = (pd.read_csv(path) for path in prediction_files)
sub = pd.concat([clicks, carts, orders])
sub.to_csv("submission.csv", index=False)
sub.head(10)

In [None]:
# !kaggle competitions submit -c otto-recommender-system -f submission.csv -m "more feature"