In [3]:
import pandas as pd
import numpy as np

In [1]:
def generate_label_table(df_candidates, df_labels, chunk_size=500000,
                         output_path="data/labels.parquet"):
    """Build a 0/1 label table aligned row-for-row with ``df_candidates``.

    ``df_labels`` holds one row per (session, type) with a list-like
    ``ground_truth`` of aids. Each candidate (session, aid) pair receives a 1
    in the ``clicks`` / ``carts`` / ``orders`` column whenever that aid appears
    in the ground truth of the matching type for that session, else 0.

    Parameters
    ----------
    df_candidates : pandas.DataFrame
        Must contain ``session`` and ``aid`` columns. Row order is preserved
        positionally in the output, whatever its index looks like; duplicate
        (session, aid) pairs are allowed.
    df_labels : pandas.DataFrame
        Must contain ``session``, ``type`` and ``ground_truth`` columns.
    chunk_size : int, default 500000
        Number of candidate rows merged at a time, to bound peak memory.
    output_path : str or path-like or None, default "data/labels.parquet"
        Where the result is written as parquet. Pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        Integer columns ``clicks``, ``carts``, ``orders`` with a fresh
        RangeIndex and exactly ``len(df_candidates)`` rows.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    key_cols = ["session", "aid"]
    label_cols = ["clicks", "carts", "orders"]

    # Explode ground_truth into long format: one row per (session, type, aid).
    # Only rows whose ground_truth is missing are dropped; explode() leaves an
    # object column, so cast aid to the candidates' dtype for a clean merge key.
    labels_long = (
        df_labels[["session", "type", "ground_truth"]]
        .explode("ground_truth")
        .dropna(subset=["ground_truth"])
        .rename(columns={"ground_truth": "aid"})
        .astype({"aid": df_candidates["aid"].dtype})
    )

    # One-hot encode the type once, up front: a single row per (session, aid)
    # with a 0/1 flag per label column. Unique keys guarantee that the left
    # merges below never change the number of candidate rows.
    labels_wide = (
        labels_long
        .assign(**{col: labels_long["type"].eq(col).astype(int) for col in label_cols})
        .groupby(key_cols, as_index=False)[label_cols]
        .max()
    )

    # Process the candidates in chunks to limit memory consumption.
    num_chunks = (len(df_candidates) + chunk_size - 1) // chunk_size  # ceiling division
    df_chunks = []

    print(f"Total chunks: {num_chunks}")

    for i in range(num_chunks):
        print(f"Processing chunk {i+1}/{num_chunks}...")

        # Positional slice, so the result does not depend on the index labels.
        start = i * chunk_size
        chunk_keys = df_candidates.iloc[start:start + chunk_size][key_cols]

        # A left merge keeps the chunk's row order; unmatched pairs come back
        # as NaN, which means "no such event" and therefore becomes 0.
        merged = chunk_keys.merge(labels_wide, on=key_cols, how="left")
        df_chunks.append(merged[label_cols].fillna(0).astype(int))

    # Concatenate all chunks (pd.concat raises on an empty list, so guard it).
    if df_chunks:
        df_final = pd.concat(df_chunks, ignore_index=True)
    else:
        df_final = pd.DataFrame({col: pd.Series(dtype=int) for col in label_cols})

    if output_path is not None:
        df_final.to_parquet(output_path, index=False)
        print("Parquet saved successfully")

    return df_final

In [2]:
# Load the candidate rows and the validation labels from parquet.
# NOTE(review): generate_label_table reads `session`/`aid` from the candidates
# and `session`/`type`/`ground_truth` from the labels — confirm both files
# actually carry those columns.
df_candidates = pd.read_parquet("data/candidates_features_engineering.parquet")
df_labels = pd.read_parquet("data/val_labels.parquet")

NameError: name 'pd' is not defined

In [None]:
# Build the clicks/carts/orders label table for every candidate row. The call
# also writes the table to data/labels.parquet, which the Test section reads back.
df_labels_output = generate_label_table(df_candidates, df_labels)

Total chunks: 181
Processing chunk 1/181...
Processing chunk 2/181...
Processing chunk 3/181...
Processing chunk 4/181...
Processing chunk 5/181...
Processing chunk 6/181...
Processing chunk 7/181...
Processing chunk 8/181...
Processing chunk 9/181...
Processing chunk 10/181...
Processing chunk 11/181...
Processing chunk 12/181...
Processing chunk 13/181...
Processing chunk 14/181...
Processing chunk 15/181...
Processing chunk 16/181...
Processing chunk 17/181...
Processing chunk 18/181...
Processing chunk 19/181...
Processing chunk 20/181...
Processing chunk 21/181...
Processing chunk 22/181...
Processing chunk 23/181...
Processing chunk 24/181...
Processing chunk 25/181...
Processing chunk 26/181...
Processing chunk 27/181...
Processing chunk 28/181...
Processing chunk 29/181...
Processing chunk 30/181...
Processing chunk 31/181...
Processing chunk 32/181...
Processing chunk 33/181...
Processing chunk 34/181...
Processing chunk 35/181...
Processing chunk 36/181...
Processing chunk 37

### Test

In [4]:
# Reload the saved label table from disk so the checks below inspect the persisted file.
df_labels_output = pd.read_parquet("data/labels.parquet")

In [None]:
# Dump 1,000-row slices to CSV for manual inspection of labels against candidates.
# NOTE(review): the label table and the candidates use rows 10000-10999, while
# df_labels uses rows 100-1099 — presumably because df_labels has a different row
# layout; confirm that the slices actually cover the same sessions.
df_labels_output[10000:11000].to_csv("test_labels_output.csv", index=False)
df_candidates[10000:11000].to_csv("test_candidates.csv", index=False)
df_labels[100:1100].to_csv("test_labels.csv", index=False)

In [5]:
# Print a summary of the label table; pandas truncates a frame this large to its
# first and last rows plus the overall shape.
print(df_labels_output)

          clicks  carts  orders
0              0      0       1
1              0      0       0
2              0      0       0
3              0      0       0
4              0      0       0
...          ...    ...     ...
90062545       0      0       0
90062546       0      0       0
90062547       0      0       0
90062548       0      0       0
90062549       0      0       0

[90062550 rows x 3 columns]
