In [1]:
# Mount Google Drive into the Colab filesystem so that paths under
# /content/drive can be read by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [9]:
import dask.dataframe as dd
import numpy as np
import xgboost as xgb
import pandas as pd
import gc

# Target tag for this run; it is interpolated into the model paths and the
# submission names further down in this cell.
USE = 'orders'

# 1) + 2) Read the candidate/feature parquet through Dask and materialise it
#         straight into a pandas DataFrame (no lingering Dask handle to delete).
pdf = dd.read_parquet(
    "/content/drive/MyDrive/OTTO/candidates_features_engineering_final.parquet"
).compute()
gc.collect()

# 3) Turn +/- infinity into NaN.
infinite_values = [np.inf, -np.inf]
pdf = pdf.replace(infinite_values, np.nan)

# 4) Feature columns passed to the model, in this exact order.
feature = [
    # session-level features
    'session_length',
    'session_click_ratio',
    'session_cart_ratio',
    'session_order_ratio',
    'session_last_click_aid',
    'session_last_atc_aid',
    'session_last_order_aid',
    'session_last_click_timestamp',
    'session_last_atc_timestamp',
    'session_last_order_timestamp',
    # item-level features
    'item_popularity',
    'item_cart_count',
    'item_order_count',
    'item_conversion_rate',
    'item_atc_rate',
    'item_atc_conversion_rate',
    'item_recent_clicks',
    'item_recent_carts',
    'item_recent_orders',
    'item_type_mean',
    # session x item interaction features
    'session_item_click_count',
    'session_item_cart_count',
    'session_item_order_count',
    'session_item_last_interaction_type_clicks',
    'session_item_last_interaction_type_orders',
    'session_item_abs_click_hots',
    'session_item_abs_atc_hots',
    'session_item_abs_order_hots',
    'session_item_abs_click_time',
    'session_item_abs_atc_time',
    'session_item_abs_order_time',
]

# 5) One float32 score slot per candidate row, filled by the prediction loop.
preds = np.zeros(pdf.shape[0], dtype=np.float32)

#####################################
# 6) Multi-fold ensemble + batched prediction
#####################################

# Number of saved fold models. The same constant drives both the loading loop
# and the averaging divisor, so the two can never drift apart.
N_FOLDS = 5
# Rows per DMatrix; batching avoids building one oversized DMatrix.
BATCH_SIZE = 2_000_000

# Load every fold's booster once, up front.
models = []
for fold in range(N_FOLDS):
    model = xgb.Booster()
    model.load_model(f'/content/drive/MyDrive/OTTO/{USE}/model/XGB_fold{fold}_{USE}_v412.json')
    model.set_param({'device': 'cuda'})
    models.append(model)

n = len(pdf)

# Batches are the outer loop so that each DMatrix is constructed exactly once
# and then scored by all folds, instead of being rebuilt once per fold.
# Each row still accumulates fold 0, 1, ... in the same order, so the summed
# float32 result is unchanged.
for start_pos in range(0, n, BATCH_SIZE):
    end_pos = min(start_pos + BATCH_SIZE, n)

    X_batch = pdf.iloc[start_pos:end_pos][feature]
    dtest_batch = xgb.DMatrix(X_batch)

    # Predict with every fold and accumulate the running mean.
    for model in models:
        preds[start_pos:end_pos] += model.predict(dtest_batch) / N_FOLDS

    # Release the batch before building the next one.
    del X_batch, dtest_batch
    gc.collect()

del models, model
gc.collect()

#####################################
# 7) Assemble the output
#####################################

# Number of recommendations kept per session. Must stay <= 127 because the
# rank column below is stored as int8.
TOP_K = 20

# Attach the averaged scores to the candidate rows.
pdf['pred'] = preds

# Order each session's candidates from highest to lowest score.
pdf = pdf.sort_values(['session','pred'], ascending=[True,False], ignore_index=True)

# Rank within the session (0 = best score).
# The rank is clipped to TOP_K *before* narrowing to int8: casting the raw
# cumcount would silently wrap for sessions with more than 127 candidates
# (rank 128 -> -128), and those negative ranks would pass the `< TOP_K`
# filter, leaking low-scoring candidates into the submission.
pdf['n'] = pdf.groupby('session').cumcount().clip(upper=TOP_K).astype('int8')
pdf = pdf.loc[pdf.n < TOP_K]

# Build the submission frame: one row per session, the kept aids joined by
# spaces (still in descending-score order), and the session id suffixed with
# the target tag.
sub = (
    pdf.groupby('session')['aid']
       .apply(lambda aids: " ".join(map(str, aids)))
       .rename('labels')
       .reset_index()
       .rename(columns={'session': 'session_type'})
)
sub['session_type'] = sub['session_type'].astype(str) + f'_{USE}'

In [10]:
# Preview the first five submission rows.
sub.head(n=5)

Unnamed: 0,session_type,labels
0,12899779_orders,59625 679602 696438 637538 731692 475447 62051...
1,12899780_orders,1142000 736515 973453 582732 932551 103974 595...
2,12899781_orders,199008 918667 57315 141736 194067 1102089 1681...
3,12899782_orders,779477 127404 1711180 562753 987399 740494 889...
4,12899783_orders,255297 1729553 198385 1817895 300127 607638 12...


In [11]:
# `sub` is the finished submission table for this target; write it to a CSV
# in the current working directory.
submission_path = f"submission_{USE}.csv"
sub.to_csv(submission_path, index=False)
print("Done!")


Done!


# Submission

In [12]:
import pandas as pd

# 1) Read the three per-target submission files.
submission_paths = [
    "/content/submission_clicks.csv",
    "/content/submission_carts.csv",
    "/content/submission_orders.csv",
]
df_clicks, df_carts, df_orders = map(pd.read_csv, submission_paths)

# 2) Stack them row-wise with a fresh index.
df_final = pd.concat((df_clicks, df_carts, df_orders), ignore_index=True)

# 3) Save the combined table as the final submission.
df_final.to_csv("submission.csv", index=False)
