In [49]:
import pandas as pd
from tqdm import tqdm

In [50]:
train_df = pd.read_parquet("../data/train.parquet")
test_df = pd.read_parquet("../data/test.parquet")

In [51]:
train_df.head()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0


In [52]:
all_df = pd.concat(([train_df,test_df]))

In [53]:
top20 = all_df.groupby("aid")['ts'].count().sort_values(ascending=False).head(20).to_list()

In [54]:
top20

[137874,
 135892,
 124885,
 116215,
 106512,
 94766,
 92890,
 86333,
 83865,
 81557,
 80615,
 78983,
 78288,
 75925,
 74986,
 73903,
 70235,
 67675,
 66825,
 65894]

In [55]:
pred_df = test_df.sort_values(["session", "type", "ts"]).groupby(["session"]).apply(
    lambda x: x.tail(20).aid.tolist()
)
pred_df

  pred_df = test_df.sort_values(["session", "type", "ts"]).groupby(["session"]).apply(


session
12899779                                              [59625]
12899780           [1142000, 582732, 973453, 736515, 1142000]
12899781    [141736, 199008, 57315, 194067, 199008, 199008...
12899782    [476063, 779477, 975116, 595994, 1344773, 1711...
12899783    [255297, 1114789, 255297, 300127, 198385, 3001...
                                  ...                        
14571577                                            [1141710]
14571578                                             [519105]
14571579                                             [739876]
14571580                                             [202353]
14571581                                            [1100210]
Length: 1671803, dtype: object

In [56]:
clicks_pred_df = pd.DataFrame(pred_df.add_suffix("_clicks"),columns=['labels']).reset_index()
orders_pred_df = pd.DataFrame(pred_df.add_suffix("_orders"),columns=['labels']).reset_index()
carts_pred_df = pd.DataFrame(pred_df.add_suffix("_carts"),columns=['labels']).reset_index()

In [57]:
clicks_pred_df['labels'] = clicks_pred_df['labels'].apply(lambda x : (x + top20)[:20])
orders_pred_df['labels'] = orders_pred_df['labels'].apply(lambda x : (x + top20)[:20])
carts_pred_df['labels'] = carts_pred_df['labels'].apply(lambda x : (x + top20)[:20])

In [58]:
clicks_pred_df.head()

Unnamed: 0,session,labels
0,12899779_clicks,"[59625, 137874, 135892, 124885, 116215, 106512..."
1,12899780_clicks,"[1142000, 582732, 973453, 736515, 1142000, 137..."
2,12899781_clicks,"[141736, 199008, 57315, 194067, 199008, 199008..."
3,12899782_clicks,"[476063, 779477, 975116, 595994, 1344773, 1711..."
4,12899783_clicks,"[255297, 1114789, 255297, 300127, 198385, 3001..."


In [62]:
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df.columns = ['session_type', 'labels']

# 定义自定义排序规则
type_priority = {
    'clicks': 0,  # clicks 排在最前
    'carts': 1,   # carts 排在中间
    'orders': 2   # orders 排在最后
}

# 提取 session_type 中的编号部分和关键词部分
pred_df['session_number'] = pred_df['session_type'].str.extract(r'(\d+)')  # 提取编号部分
pred_df['session_type_keyword'] = pred_df['session_type'].str.extract(r'(clicks|carts|orders)')  # 提取类型部分

# 映射类型到优先级
pred_df['type_priority'] = pred_df['session_type_keyword'].map(type_priority)

# 先按 session_number 排序，再按 type_priority 排序
pred_df = pred_df.sort_values(by=['session_number', 'type_priority'])

# 删除辅助列
pred_df = pred_df.drop(columns=['session_number', 'session_type_keyword', 'type_priority'])

# 将 labels 列中的数组转换为逗号分隔的字符串
pred_df['labels'] = pred_df['labels'].apply(lambda x: ' '.join(map(str, x)))

# 查看排序后的结果
print(pred_df)

# 保存为 CSV 文件
pred_df.to_csv("submission.csv", index=False)


            session_type                                             labels
0        12899779_clicks  59625 137874 135892 124885 116215 106512 94766...
0         12899779_carts  59625 137874 135892 124885 116215 106512 94766...
0        12899779_orders  59625 137874 135892 124885 116215 106512 94766...
1        12899780_clicks  1142000 582732 973453 736515 1142000 137874 13...
1         12899780_carts  1142000 582732 973453 736515 1142000 137874 13...
...                  ...                                                ...
1671801   14571580_carts  202353 137874 135892 124885 116215 106512 9476...
1671801  14571580_orders  202353 137874 135892 124885 116215 106512 9476...
1671802  14571581_clicks  1100210 137874 135892 124885 116215 106512 947...
1671802   14571581_carts  1100210 137874 135892 124885 116215 106512 947...
1671802  14571581_orders  1100210 137874 135892 124885 116215 106512 947...

[5015409 rows x 2 columns]


In [61]:
pred_df

Unnamed: 0,session_type,labels
0,12899779_clicks,"[59625, 137874, 135892, 124885, 116215, 106512..."
0,12899779_carts,"[59625, 137874, 135892, 124885, 116215, 106512..."
0,12899779_orders,"[59625, 137874, 135892, 124885, 116215, 106512..."
1,12899780_clicks,"[1142000, 582732, 973453, 736515, 1142000, 137..."
1,12899780_carts,"[1142000, 582732, 973453, 736515, 1142000, 137..."
...,...,...
1671801,14571580_carts,"[202353, 137874, 135892, 124885, 116215, 10651..."
1671801,14571580_orders,"[202353, 137874, 135892, 124885, 116215, 10651..."
1671802,14571581_clicks,"[1100210, 137874, 135892, 124885, 116215, 1065..."
1671802,14571581_carts,"[1100210, 137874, 135892, 124885, 116215, 1065..."


In [33]:
len(pred_df)

5015409