In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict,Counter

train_df = pd.read_parquet('../data/test2/test.parquet')
train_labels_df = pd.read_parquet('../data/test2/test_labels.parquet')

In [2]:
train_labels_df

Unnamed: 0,session,type,ground_truth
0,11098528,clicks,[1679529]
1,11098528,carts,[1199737]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369..."
3,11098529,clicks,[1105029]
4,11098530,orders,[409236]
...,...,...,...
2212687,12899774,clicks,[1399483]
2212688,12899775,clicks,[1760714]
2212689,12899776,clicks,[1737908]
2212690,12899777,clicks,[384045]


In [3]:
train_df

Unnamed: 0,session,aid,ts,type
0,11098528,11830,1661119200,0
1,11098529,1105029,1661119200,0
2,11098530,264500,1661119200,0
3,11098530,264500,1661119288,0
4,11098530,409236,1661119369,0
...,...,...,...,...
7683572,12899774,33035,1661723968,0
7683573,12899775,1743151,1661723970,0
7683574,12899776,548599,1661723972,0
7683575,12899777,384045,1661723976,0


In [4]:
# 每个 session 的点击/加购/购买数量
counts = train_df.groupby(['session', 'type']).size().unstack(fill_value=0)

# 重命名列（0=click, 1=cart, 2=buy）
counts = counts.rename(columns={0: "click_num", 1: "cart_num", 2: "buy_num"}).reset_index()
counts

type,session,click_num,cart_num,buy_num
0,11098528,1,0,0
1,11098529,1,0,0
2,11098530,5,1,0
3,11098531,20,0,4
4,11098532,2,0,0
...,...,...,...,...
1801246,12899774,1,0,0
1801247,12899775,1,0,0
1801248,12899776,1,0,0
1801249,12899777,1,0,0


In [5]:
train_df = train_df.merge(counts, on="session", how="left")
train_df

Unnamed: 0,session,aid,ts,type,click_num,cart_num,buy_num
0,11098528,11830,1661119200,0,1,0,0
1,11098529,1105029,1661119200,0,1,0,0
2,11098530,264500,1661119200,0,5,1,0
3,11098530,264500,1661119288,0,5,1,0
4,11098530,409236,1661119369,0,5,1,0
...,...,...,...,...,...,...,...
7683572,12899774,33035,1661723968,0,1,0,0
7683573,12899775,1743151,1661723970,0,1,0,0
7683574,12899776,548599,1661723972,0,1,0,0
7683575,12899777,384045,1661723976,0,1,0,0


## Session级特征

In [6]:
# session 长度
train_df['session_length'] = train_df.groupby('session')['session'].transform('size')

# 比例特征
train_df['click_ratio'] = train_df['click_num'] / train_df['session_length']
train_df['cart_ratio']  = train_df['cart_num']  / train_df['session_length']
train_df['buy_ratio']   = train_df['buy_num']   / train_df['session_length']

# 时间跨度
train_df['time_diff'] = train_df.groupby('session')['ts'].transform(lambda x: x.max() - x.min())

# 去重的 item 数
train_df['item_count'] = train_df.groupby('session')['aid'].transform('nunique')

In [7]:
train_df

Unnamed: 0,session,aid,ts,type,click_num,cart_num,buy_num,session_length,click_ratio,cart_ratio,buy_ratio,time_diff,item_count
0,11098528,11830,1661119200,0,1,0,0,1,1.000000,0.000000,0.0,0,1
1,11098529,1105029,1661119200,0,1,0,0,1,1.000000,0.000000,0.0,0,1
2,11098530,264500,1661119200,0,5,1,0,6,0.833333,0.166667,0.0,1332,2
3,11098530,264500,1661119288,0,5,1,0,6,0.833333,0.166667,0.0,1332,2
4,11098530,409236,1661119369,0,5,1,0,6,0.833333,0.166667,0.0,1332,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7683572,12899774,33035,1661723968,0,1,0,0,1,1.000000,0.000000,0.0,0,1
7683573,12899775,1743151,1661723970,0,1,0,0,1,1.000000,0.000000,0.0,0,1
7683574,12899776,548599,1661723972,0,1,0,0,1,1.000000,0.000000,0.0,0,1
7683575,12899777,384045,1661723976,0,1,0,0,1,1.000000,0.000000,0.0,0,1


In [8]:
sesssion_feat = train_df.drop_duplicates("session")[['session','click_num','cart_num','buy_num','item_count','session_length','click_ratio','cart_ratio','buy_ratio','time_diff']]
sesssion_feat

Unnamed: 0,session,click_num,cart_num,buy_num,item_count,session_length,click_ratio,cart_ratio,buy_ratio,time_diff
0,11098528,1,0,0,1,1,1.000000,0.000000,0.000000,0
1,11098529,1,0,0,1,1,1.000000,0.000000,0.000000,0
2,11098530,5,1,0,2,6,0.833333,0.166667,0.000000,1332
8,11098531,20,0,4,11,24,0.833333,0.000000,0.166667,546
32,11098532,2,0,0,2,2,1.000000,0.000000,0.000000,795
...,...,...,...,...,...,...,...,...,...,...
7683572,12899774,1,0,0,1,1,1.000000,0.000000,0.000000,0
7683573,12899775,1,0,0,1,1,1.000000,0.000000,0.000000,0
7683574,12899776,1,0,0,1,1,1.000000,0.000000,0.000000,0
7683575,12899777,1,0,0,1,1,1.000000,0.000000,0.000000,0


In [9]:
recalled_df = pd.read_parquet("../output/recall.parquet")
recalled_df

Unnamed: 0,session_type,labels
0,11098528,"[1462506, 1199737, 907564, 369774, 440367, 118..."
1,11098529,"[1105029, 561858, 459126, 1140565, 1635046, 29..."
2,11098530,"[409236, 264500, 1603001, 583026, 963957, 2541..."
3,11098531,"[396199, 1271998, 452188, 1728212, 1365569, 62..."
4,11098532,"[634422, 847816, 1089, 108125, 461190, 1308930..."
...,...,...
1801246,12899774,"[33035, 819288, 1539309, 771913, 95488, 181252..."
1801247,12899775,"[1743151, 1760714, 1163166, 1255910, 155954, 1..."
1801248,12899776,"[548599, 1440959, 53600, 773354, 1401030, 1144..."
1801249,12899777,"[384045, 1308634, 395762, 1688215, 703474, 148..."


In [12]:
train_labels_df = train_labels_df[['session','ground_truth','type']]
train_labels_df

Unnamed: 0,session,ground_truth,type
0,11098528,[1679529],clicks
1,11098528,[1199737],carts
2,11098528,"[990658, 950341, 1462506, 1561739, 907564, 369...",orders
3,11098529,[1105029],clicks
4,11098530,[409236],orders
...,...,...,...
2212687,12899774,[1399483],clicks
2212688,12899775,[1760714],clicks
2212689,12899776,[1737908],clicks
2212690,12899777,[384045],clicks


In [18]:
import pickle
type2id = pickle.load(open("../data/type2id.pkl","rb"))
print(type2id)
train_labels_df['type'] = train_labels_df['type'].apply(lambda x: type2id[x])
train_labels_df

{'clicks': 0, 'carts': 1, 'orders': 2}


Unnamed: 0,session,ground_truth,type
0,11098528,[1679529],0
1,11098528,[1199737],1
2,11098528,"[990658, 950341, 1462506, 1561739, 907564, 369...",2
3,11098529,[1105029],0
4,11098530,[409236],2
...,...,...,...
2212687,12899774,[1399483],0
2212688,12899775,[1760714],0
2212689,12899776,[1737908],0
2212690,12899777,[384045],0


In [19]:
train_labels_df = train_labels_df.explode("ground_truth")
train_labels_df

Unnamed: 0,session,ground_truth,type
0,11098528,1679529,0
1,11098528,1199737,1
2,11098528,990658,2
2,11098528,950341,2
2,11098528,1462506,2
...,...,...,...
2212687,12899774,1399483,0
2212688,12899775,1760714,0
2212689,12899776,1737908,0
2212690,12899777,384045,0


In [20]:
train_labels_df = train_labels_df.rename(columns={"ground_truth":"aid"})
train_labels_df["gt"] = 1
train_labels_df

Unnamed: 0,session,aid,type,gt
0,11098528,1679529,0,1
1,11098528,1199737,1,1
2,11098528,990658,2,1
2,11098528,950341,2,1
2,11098528,1462506,2,1
...,...,...,...,...
2212687,12899774,1399483,0,1
2212688,12899775,1760714,0,1
2212689,12899776,1737908,0,1
2212690,12899777,384045,0,1


In [21]:
recalled = recalled_df.explode("labels")
recalled

Unnamed: 0,session_type,labels
0,11098528,1462506
0,11098528,1199737
0,11098528,907564
0,11098528,369774
0,11098528,440367
...,...,...
1801250,12899778,1814572
1801250,12899778,1136590
1801250,12899778,992558
1801250,12899778,796522


In [22]:
recalled = recalled.rename(columns={"session_type":"session","labels":"aid"})
recalled = recalled.merge(train_labels_df,on=["session","aid"],how="left")
recalled

Unnamed: 0,session,aid,type,gt
0,11098528,1462506,2.0,1.0
1,11098528,1199737,1.0,1.0
2,11098528,1199737,2.0,1.0
3,11098528,907564,2.0,1.0
4,11098528,369774,2.0,1.0
...,...,...,...,...
116927562,12899778,1814572,,
116927563,12899778,1136590,,
116927564,12899778,992558,,
116927565,12899778,796522,,


In [26]:
recalled = recalled.fillna(0)
recalled

  recalled = recalled.fillna(0)


Unnamed: 0,session,aid,type,gt
0,11098528,1462506,2.0,1.0
1,11098528,1199737,1.0,1.0
2,11098528,1199737,2.0,1.0
3,11098528,907564,2.0,1.0
4,11098528,369774,2.0,1.0
...,...,...,...,...
116927562,12899778,1814572,0.0,0.0
116927563,12899778,1136590,0.0,0.0
116927564,12899778,992558,0.0,0.0
116927565,12899778,796522,0.0,0.0


In [27]:
recalled_feature = recalled.merge(sesssion_feat,on="session",how="left")
recalled_feature

Unnamed: 0,session,aid,type,gt,click_num,cart_num,buy_num,item_count,session_length,click_ratio,cart_ratio,buy_ratio,time_diff
0,11098528,1462506,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0
1,11098528,1199737,1.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0
2,11098528,1199737,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0
3,11098528,907564,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0
4,11098528,369774,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116927562,12899778,1814572,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0
116927563,12899778,1136590,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0
116927564,12899778,992558,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0
116927565,12899778,796522,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0


In [28]:
item_feat = pd.read_parquet("../data/feat/item_feat.parquet")
item_feat

Unnamed: 0,aid,item_click_count,item_carts_count,item_orders_count
0,1517085,79.0,12.0,3.0
1,1563459,64.0,0.0,0.0
2,1309446,4115.0,493.0,87.0
3,16246,1033.0,117.0,38.0
4,1781822,39.0,3.0,1.0
...,...,...,...,...
201297273,385645,1.0,0.0,0.0
201304587,511976,1.0,0.0,0.0
201308463,720469,2.0,0.0,0.0
201324592,582525,1.0,0.0,0.0


In [29]:
recalled_feature = recalled_feature.merge(item_feat,on="aid",how="left").fillna(0)
recalled_feature

Unnamed: 0,session,aid,type,gt,click_num,cart_num,buy_num,item_count,session_length,click_ratio,cart_ratio,buy_ratio,time_diff,item_click_count,item_carts_count,item_orders_count
0,11098528,1462506,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,4171.0,538.0,293.0
1,11098528,1199737,1.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,215.0,43.0,25.0
2,11098528,1199737,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,215.0,43.0,25.0
3,11098528,907564,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,5071.0,982.0,579.0
4,11098528,369774,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,77.0,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116927562,12899778,1814572,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,9.0,0.0,0.0
116927563,12899778,1136590,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,35.0,2.0,0.0
116927564,12899778,992558,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,31.0,5.0,1.0
116927565,12899778,796522,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,3.0,0.0,0.0


## 交互特征

In [30]:
import pandas as pd

rank_df = pd.read_parquet("../output/aid_ranks.parquet")

if 'rank_in_co_vis_recall' in recalled_feature.columns:
    recalled_feature = recalled_feature.drop(columns=['rank_in_co_vis_recall'])
rank_df = rank_df.rename(columns={"session_type":"session"})
rank_df

Unnamed: 0,session,aid,recall1_rank,recall2_rank
0,11098528,1462506,0.0,65.0
1,11098528,1199737,1.0,41.0
2,11098528,907564,2.0,43.0
3,11098528,369774,3.0,
4,11098528,440367,4.0,
...,...,...,...,...
144658978,12899778,1814572,,24.0
144658979,12899778,1136590,,25.0
144658980,12899778,992558,,26.0
144658981,12899778,796522,,27.0


In [31]:
recalled_feature

Unnamed: 0,session,aid,type,gt,click_num,cart_num,buy_num,item_count,session_length,click_ratio,cart_ratio,buy_ratio,time_diff,item_click_count,item_carts_count,item_orders_count
0,11098528,1462506,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,4171.0,538.0,293.0
1,11098528,1199737,1.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,215.0,43.0,25.0
2,11098528,1199737,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,215.0,43.0,25.0
3,11098528,907564,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,5071.0,982.0,579.0
4,11098528,369774,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,77.0,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116927562,12899778,1814572,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,9.0,0.0,0.0
116927563,12899778,1136590,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,35.0,2.0,0.0
116927564,12899778,992558,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,31.0,5.0,1.0
116927565,12899778,796522,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,3.0,0.0,0.0


In [32]:
recalled_feature = recalled_feature.merge(rank_df,on=["session","aid"],how="left")

In [33]:
recalled_feature[['recall1_rank', 'recall2_rank']] = recalled_feature[['recall1_rank', 'recall2_rank']].fillna(999)

In [34]:
recalled_feature

Unnamed: 0,session,aid,type,gt,click_num,cart_num,buy_num,item_count,session_length,click_ratio,cart_ratio,buy_ratio,time_diff,item_click_count,item_carts_count,item_orders_count,recall1_rank,recall2_rank
0,11098528,1462506,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,4171.0,538.0,293.0,0.0,65.0
1,11098528,1199737,1.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,215.0,43.0,25.0,1.0,41.0
2,11098528,1199737,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,215.0,43.0,25.0,1.0,41.0
3,11098528,907564,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,5071.0,982.0,579.0,2.0,43.0
4,11098528,369774,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,77.0,5.0,0.0,3.0,999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116927562,12899778,1814572,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,9.0,0.0,0.0,999.0,24.0
116927563,12899778,1136590,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,35.0,2.0,0.0,999.0,25.0
116927564,12899778,992558,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,31.0,5.0,1.0,999.0,26.0
116927565,12899778,796522,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,3.0,0.0,0.0,999.0,27.0


In [41]:
train_df = pd.read_parquet('../data/CV/valid_seqs.parquet')

In [None]:
# 假设 train_df 里有: session, aid, ts, type
train_df = train_df.sort_values(["session", "ts"]).reset_index(drop=True)

# 1. 用户对 aid 的交互次数
user_aid_counts = train_df.groupby(["session","aid","type"]).size().unstack(fill_value=0).reset_index()
user_aid_counts = user_aid_counts.rename(columns={0:"user_click_aid_count",1:"user_cart_aid_count",2:"user_order_aid_count"})

train_df = train_df.merge(user_aid_counts, on=["session","aid"], how="left")

# 2. 用户对 aid 的时间特征
train_df["user_aid_first_ts"] = train_df.groupby(["session","aid"])["ts"].transform("min")
train_df["user_aid_last_ts"]  = train_df.groupby(["session","aid"])["ts"].transform("max")

# gap 到上一次点击
train_df["prev_ts"] = train_df.groupby(["session","aid"])["ts"].shift()
train_df["user_aid_ts_gap"] = train_df["ts"] - train_df["prev_ts"]
train_df["user_aid_ts_gap"] = train_df["user_aid_ts_gap"].fillna(-1)  # -1 表示首次出现

# 3. 用户行为类型统计
train_df["user_aid_mean_type"] = train_df.groupby(["session","aid"])["type"].transform("mean")
train_df["user_aid_last_type"] = train_df.groupby(["session","aid"])["type"].transform("last")

# 4. 全局特征 (item_feat)
item_feat = pd.DataFrame()
item_feat["item_click_count"] = train_df[train_df["type"]==0].groupby("aid").size()
item_feat["item_cart_count"]  = train_df[train_df["type"]==1].groupby("aid").size()
item_feat["item_order_count"] = train_df[train_df["type"]==2].groupby("aid").size()
item_feat["item_mean_ts"]     = train_df.groupby("aid")["ts"].mean()
item_feat = item_feat.fillna(0).reset_index()

# merge
train_df = train_df.merge(item_feat, on="aid", how="left")

# 计算差值特征
train_df["diff_click_count"] = (train_df["user_click_aid_count"] - train_df["item_click_count"]).abs()
train_df["diff_cart_count"]  = (train_df["user_cart_aid_count"]  - train_df["item_cart_count"]).abs()
train_df["diff_order_count"] = (train_df["user_order_aid_count"] - train_df["item_order_count"]).abs()
train_df["diff_ts_mean"]     = (train_df["user_aid_last_ts"] - train_df["item_mean_ts"]).abs()

# 清理
train_df = train_df.drop(columns=["prev_ts"])

In [23]:
# 去重，保留 aid+session 级别特征
user_item_feat = train_df.drop_duplicates(["session","aid"])[[
    "session","aid",
    "user_click_aid_count","user_cart_aid_count","user_order_aid_count",
    "user_aid_first_ts","user_aid_last_ts","user_aid_ts_gap",
    "user_aid_mean_type","user_aid_last_type",
    "item_click_count","item_cart_count","item_order_count","item_mean_ts",
    "diff_click_count","diff_cart_count","diff_order_count","diff_ts_mean"
]]

# merge 到 recalled_feature
recalled_feature = recalled_feature.merge(user_item_feat, on=["session","aid"], how="left").fillna(0)


In [24]:
recalled_feature = recalled_feature.rename(columns={"item_click_count_x":"item_click_count"})
recalled_feature

Unnamed: 0,session,aid,type,gt,click_num,cart_num,buy_num,item_count,session_length,click_ratio,...,user_aid_mean_type,user_aid_last_type,item_click_count_y,item_cart_count,item_order_count,item_mean_ts,diff_click_count,diff_cart_count,diff_order_count,diff_ts_mean
0,11098528,1462506,2.0,1.0,12,1,8,12,21,0.571429,...,2.0,2.0,236.0,33.0,7.0,1.661419e+09,236.0,33.0,6.0,298266.079710
1,11098528,1199737,0.0,1.0,12,1,8,12,21,0.571429,...,1.0,2.0,5.0,4.0,2.0,1.661363e+09,4.0,3.0,1.0,242032.818182
2,11098528,1199737,1.0,1.0,12,1,8,12,21,0.571429,...,1.0,2.0,5.0,4.0,2.0,1.661363e+09,4.0,3.0,1.0,242032.818182
3,11098528,1199737,2.0,1.0,12,1,8,12,21,0.571429,...,1.0,2.0,5.0,4.0,2.0,1.661363e+09,4.0,3.0,1.0,242032.818182
4,11098528,907564,2.0,1.0,12,1,8,12,21,0.571429,...,2.0,2.0,225.0,48.0,4.0,1.661402e+09,225.0,48.0,3.0,280938.444043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121151373,12899778,1814572,0.0,0.0,1,0,0,1,1,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.000000
121151374,12899778,1136590,0.0,0.0,1,0,0,1,1,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.000000
121151375,12899778,992558,0.0,0.0,1,0,0,1,1,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.000000
121151376,12899778,796522,0.0,0.0,1,0,0,1,1,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.000000


In [42]:
recalled_feature

Unnamed: 0,session,aid,type,gt,click_num,cart_num,buy_num,item_count,session_length,click_ratio,...,user_aid_mean_type,user_aid_last_type,item_click_count_y,item_cart_count,item_order_count,item_mean_ts,diff_click_count,diff_cart_count,diff_order_count,diff_ts_mean
0,11098528,1462506,2.0,1.0,12,1,8,12,21,0.571429,...,2.0,2.0,236.0,33.0,7.0,1.661419e+09,236.0,33.0,6.0,298266.079710
1,11098528,1199737,0.0,1.0,12,1,8,12,21,0.571429,...,1.0,2.0,5.0,4.0,2.0,1.661363e+09,4.0,3.0,1.0,242032.818182
2,11098528,1199737,1.0,1.0,12,1,8,12,21,0.571429,...,1.0,2.0,5.0,4.0,2.0,1.661363e+09,4.0,3.0,1.0,242032.818182
3,11098528,1199737,2.0,1.0,12,1,8,12,21,0.571429,...,1.0,2.0,5.0,4.0,2.0,1.661363e+09,4.0,3.0,1.0,242032.818182
4,11098528,907564,2.0,1.0,12,1,8,12,21,0.571429,...,2.0,2.0,225.0,48.0,4.0,1.661402e+09,225.0,48.0,3.0,280938.444043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121151373,12899778,1814572,0.0,0.0,1,0,0,1,1,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.000000
121151374,12899778,1136590,0.0,0.0,1,0,0,1,1,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.000000
121151375,12899778,992558,0.0,0.0,1,0,0,1,1,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.000000
121151376,12899778,796522,0.0,0.0,1,0,0,1,1,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.000000


In [35]:
from sklearn.model_selection import train_test_split

# 按 session 划分，保证不会泄露
all_sessions = recalled_feature['session'].unique()
train_sessions, valid_sessions = train_test_split(all_sessions, test_size=0.2, random_state=42)

train_recall_df = recalled_feature[recalled_feature['session'].isin(train_sessions)]
valid_recall_df = recalled_feature[recalled_feature['session'].isin(valid_sessions)]


In [36]:
recalled_feature

Unnamed: 0,session,aid,type,gt,click_num,cart_num,buy_num,item_count,session_length,click_ratio,cart_ratio,buy_ratio,time_diff,item_click_count,item_carts_count,item_orders_count,recall1_rank,recall2_rank
0,11098528,1462506,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,4171.0,538.0,293.0,0.0,65.0
1,11098528,1199737,1.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,215.0,43.0,25.0,1.0,41.0
2,11098528,1199737,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,215.0,43.0,25.0,1.0,41.0
3,11098528,907564,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,5071.0,982.0,579.0,2.0,43.0
4,11098528,369774,2.0,1.0,1,0,0,1,1,1.0,0.0,0.0,0,77.0,5.0,0.0,3.0,999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116927562,12899778,1814572,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,9.0,0.0,0.0,999.0,24.0
116927563,12899778,1136590,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,35.0,2.0,0.0,999.0,25.0
116927564,12899778,992558,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,31.0,5.0,1.0,999.0,26.0
116927565,12899778,796522,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,3.0,0.0,0.0,999.0,27.0


In [26]:
train_labels = train_labels_df.groupby("session")['aid'].apply(list)
train_labels

session
11098528    [1199737, 1462506, 950341, 1561739, 92401, 796...
11098529                                            [1298277]
11098530                     [409236, 409236, 409236, 409236]
11098531    [624163, 1271998, 1728212, 1365569, 1365569, 1...
11098532    [634422, 863592, 463529, 1224737, 1059444, 444...
                                  ...                        
12899774                                            [1399483]
12899775                                            [1760714]
12899776                                            [1737908]
12899777                                             [384045]
12899778                                              [32070]
Name: aid, Length: 1801251, dtype: object

In [37]:
valid_recall_df

Unnamed: 0,session,aid,type,gt,click_num,cart_num,buy_num,item_count,session_length,click_ratio,cart_ratio,buy_ratio,time_diff,item_click_count,item_carts_count,item_orders_count,recall1_rank,recall2_rank
195,11098531,396199,0.0,0.0,20,0,4,11,24,0.833333,0.0,0.166667,546,31.0,5.0,0.0,0.0,10.0
196,11098531,1271998,0.0,0.0,20,0,4,11,24,0.833333,0.0,0.166667,546,77.0,4.0,1.0,1.0,33.0
197,11098531,452188,0.0,0.0,20,0,4,11,24,0.833333,0.0,0.166667,546,70.0,7.0,2.0,2.0,999.0
198,11098531,1728212,0.0,0.0,20,0,4,11,24,0.833333,0.0,0.166667,546,36.0,4.0,1.0,3.0,999.0
199,11098531,1365569,2.0,1.0,20,0,4,11,24,0.833333,0.0,0.166667,546,54.0,5.0,0.0,4.0,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116926093,12899746,1510058,0.0,0.0,1,0,0,1,1,1.000000,0.0,0.000000,0,1218.0,23.0,2.0,999.0,23.0
116926094,12899746,151763,0.0,0.0,1,0,0,1,1,1.000000,0.0,0.000000,0,1538.0,36.0,5.0,999.0,24.0
116926095,12899746,202220,0.0,0.0,1,0,0,1,1,1.000000,0.0,0.000000,0,4.0,0.0,0.0,999.0,25.0
116926096,12899746,429157,0.0,0.0,1,0,0,1,1,1.000000,0.0,0.000000,0,28.0,1.0,1.0,999.0,27.0


In [38]:
def make_recall_callback(X_valid, y_valid, group_valid, target_type, topk=20):
    def _callback(env):
        if env.iteration % 50 == 0:  # 每 50 轮评估一次
            y_pred = env.model.predict(X_valid)

            start = 0
            up = 0

            if target_type == 0:  # clicks
                temp = train_labels_df[
                    (train_labels_df['type'] == target_type) &
                    (train_labels_df['session'].isin(valid_sessions))
                ].drop_duplicates(['session'])
                low = temp.shape[0]

                for g in group_valid:
                    end = start + g
                    session_preds = y_pred[start:end]
                    session_labels = y_valid[start:end]

                    # topk 预测
                    topk_idx = np.argsort(-session_preds)[:topk]

                    # ground truth: 只取第一个 1
                    gt_pos = np.where(session_labels == 1)[0]
                    if len(gt_pos) > 0:
                        gt = gt_pos[0]
                        if gt in topk_idx:
                            up += 1
                    start = end

            else:  # carts / orders
                temp = train_labels_df[
                    (train_labels_df['type'] == target_type) &
                    (train_labels_df['session'].isin(valid_sessions))
                ].drop_duplicates(['session', 'aid'])
                low = temp.groupby("session").size().clip(upper=20).sum()

                for g in group_valid:
                    end = start + g
                    session_preds = y_pred[start:end]
                    session_labels = y_valid[start:end]

                    topk_idx = np.argsort(-session_preds)[:topk]
                    topk_labels = session_labels[topk_idx]

                    up += topk_labels.sum()
                    start = end

            res = up / low if low > 0 else 0
            print(f"[{env.iteration}] {pred_types[target_type]} recall@{topk}: {res:.4f}")
    return _callback


In [39]:
from lightgbm import LGBMRanker, log_evaluation
import numpy as np
from tqdm import tqdm

# ==========================
# 参数配置
# ==========================
pred_types = ['clicks', 'carts', 'orders']
type2id = {"clicks": 0, "carts": 1, "orders": 2}
topk = 20  # recall@20

# 特征列（可根据实际情况增减）
features = [
    # Session级特征
    'session_length', 'click_num', 'cart_num', 'buy_num',
    'click_ratio', 'cart_ratio', 'buy_ratio',
    'time_diff', 'item_count',

    # Item全局特征
    'item_click_count', 'item_carts_count', 'item_orders_count',

    # 用户-物品交互特征（可按需开启）
    # 'user_click_aid_count', 'user_cart_aid_count', 'user_order_aid_count',
    # 'user_aid_first_ts', 'user_aid_last_ts', 'user_aid_ts_gap',
    # 'user_aid_mean_type', 'user_aid_last_type',

    # 差值特征 (用户行为 vs item/hots)
    # 'diff_click_count', 'diff_cart_count', 'diff_order_count', 'diff_ts_mean',

    # 召回排名特征
    'recall1_rank', 'recall2_rank'
]

# 存放训练好的模型
models = {}
# 存放每种类型的 recall
recalls = []

# ==========================
# 模型训练 & 验证
# ==========================
for pred_type in pred_types:
    print(f"\n==== Training for {pred_type} ====")
    target_type = type2id[pred_type]

    # 构造训练集（正例 + 负例）
    train_df_type = train_recall_df[
        (train_recall_df["type"] == target_type) | (train_recall_df["gt"] == 0)
    ].copy()

    # 构造验证集（正例 + 负例）
    valid_df_type = valid_recall_df[
        (valid_recall_df["type"] == target_type) | (valid_recall_df["gt"] == 0)
    ].copy()

    # 划分特征和标签
    X_train = train_df_type[features]
    y_train = train_df_type['gt']
    group_train = train_df_type.groupby('session').size().to_list()

    X_valid = valid_df_type[features]
    y_valid = valid_df_type['gt'].values
    group_valid = valid_df_type.groupby('session').size().to_list()

    # ==========================
    # 定义 LGBMRanker 模型
    # ==========================
    model = LGBMRanker(
        objective='lambdarank',
        metric='ndcg',
        boosting_type='gbdt',
        random_state=42,
        n_estimators=500,
    )

    # ==========================
    # 模型训练
    # ==========================
    model.fit(
        X_train, y_train,
        group=group_train,
        eval_set=[(X_valid, y_valid)],
        eval_group=[group_valid],
        eval_at=[5, 10, 20],
        callbacks=[
            log_evaluation(period=50),
            make_recall_callback(
                X_valid, y_valid, group_valid, target_type, topk=topk
            )
        ],
    )

    models[pred_type] = model

    # ==========================
    # 评估 recall@topk
    # ==========================
    print(f"\nEvaluating {pred_type} recall@{topk} ...")
    y_pred = model.predict(X_valid)

    start = 0
    up = 0
    # denominator = 每个 session 的正例数，clip 到 20
    temp = train_labels_df[train_labels_df['type'] == target_type]
    low = temp.groupby("session").size().clip(upper=20).sum()

    for g in tqdm(group_valid, total=len(group_valid), desc="evaluating"):
        end = start + g
        session_preds = y_pred[start:end]
        session_labels = y_valid[start:end]

        # 取 topk 索引
        topk_idx = np.argsort(-session_preds)[:topk]
        topk_labels = session_labels[topk_idx]

        # 累加命中的正例
        up += topk_labels.sum()
        start = end

    res = up / low
    print(f"{pred_type} recall@{topk}: {res:.4f}")
    recalls.append(res)

# ==========================
# 计算最终分数
# ==========================
final_score = 0.1 * recalls[0] + 0.3 * recalls[1] + 0.6 * recalls[2]
print("\nFinal score:", final_score)



==== Training for clicks ====
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.055389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2346
[LightGBM] [Info] Number of data points in the train set: 93060203, number of used features: 14
[0] clicks recall@20: 0.6012
[50]	valid_0's ndcg@5: 0.78861	valid_0's ndcg@10: 0.805147	valid_0's ndcg@20: 0.815547
[50] clicks recall@20: 0.6027
[100]	valid_0's ndcg@5: 0.789046	valid_0's ndcg@10: 0.805618	valid_0's ndcg@20: 0.816053
[100] clicks recall@20: 0.6032
[150]	valid_0's ndcg@5: 0.78934	valid_0's ndcg@10: 0.805951	valid_0's ndcg@20: 0.816363
[150] clicks recall@20: 0.6033
[200]	valid_0's ndcg@5: 0.789429	valid_0's ndcg@10: 0.806009	valid_0's ndcg@20: 0.816451


KeyboardInterrupt: 

In [32]:
valid_df_type

Unnamed: 0,session,aid,type,gt,click_num,cart_num,buy_num,item_count,session_length,click_ratio,cart_ratio,buy_ratio,time_diff,item_click_count,item_carts_count,item_orders_count,recall1_rank,recall2_rank
197,11098531,396199,2.0,1.0,15,0,0,10,15,1.0,0.0,0.0,229,31.0,5.0,0.0,0.0,10.0
199,11098531,1271998,2.0,1.0,15,0,0,10,15,1.0,0.0,0.0,229,77.0,4.0,1.0,1.0,33.0
200,11098531,452188,2.0,1.0,15,0,0,10,15,1.0,0.0,0.0,229,70.0,7.0,2.0,2.0,999.0
202,11098531,1728212,2.0,1.0,15,0,0,10,15,1.0,0.0,0.0,229,36.0,4.0,1.0,3.0,999.0
205,11098531,1365569,2.0,1.0,15,0,0,10,15,1.0,0.0,0.0,229,54.0,5.0,0.0,4.0,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118281358,12899746,1510058,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,1218.0,23.0,2.0,999.0,23.0
118281359,12899746,151763,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,1538.0,36.0,5.0,999.0,24.0
118281360,12899746,202220,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,4.0,0.0,0.0,999.0,25.0
118281361,12899746,429157,0.0,0.0,1,0,0,1,1,1.0,0.0,0.0,0,28.0,1.0,1.0,999.0,27.0
