In [1]:
import numpy as np
import pandas as pd
import datetime
import pickle #pickle序列化对象并保存到磁盘中，并在需要的时候读取出来
import os
import re
import catboost as cb
from catboost import CatBoostClassifier
import xgboost as xgb
import sklearn

In [2]:
def load_data():
    with open("clean_data_without_no_buy.pkl", "rb") as f:
        df_user, df_sku, df_action = pickle.load(f)
    return df_user, df_sku, df_action
df_user, df_sku, df_action = load_data()

In [3]:
#获取从某个时间段内的总的用户行为特征
def calculate_user_action_features(start, end, actions):
    filtered_actions = actions[(actions["time"] >= start) & (actions["time"] < end)]
    filtered_actions = filtered_actions[["user_id", "type", "time"]]
    
    action_counts = filtered_actions.groupby(["user_id", "type"]).size().reset_index(name="count")
    
    action_counts["browse_cnt"] = ((action_counts["type"] == 1) * action_counts["count"]).astype(np.float32)
    action_counts["addcart_cnt"] = ((action_counts["type"] == 2) * action_counts["count"]).astype(np.float32)
    action_counts["delcart_cnt"] = ((action_counts["type"] == 3) * action_counts["count"]).astype(np.float32)
    action_counts["buy_cnt"] = ((action_counts["type"] == 4) * action_counts["count"]).astype(np.float32)
    action_counts["follow_cnt"] = ((action_counts["type"] == 5) * action_counts["count"]).astype(np.float32)
    action_counts["click_cnt"] = ((action_counts["type"] == 6) * action_counts["count"]).astype(np.float32)
    
    action_counts.drop(columns=["type", "count"], inplace=True)
    user_action_summary = action_counts.groupby("user_id").sum().reset_index()
    
    epsilon = 1e-6
    user_action_summary["browse_rate"] = user_action_summary["buy_cnt"] / (user_action_summary["browse_cnt"] + epsilon)
    user_action_summary["addcart_rate"] = user_action_summary["buy_cnt"] / (user_action_summary["addcart_cnt"] + epsilon)
    user_action_summary["delcart_rate"] = user_action_summary["buy_cnt"] / (user_action_summary["delcart_cnt"] + epsilon)
    user_action_summary["follow_rate"] = user_action_summary["buy_cnt"] / (user_action_summary["follow_cnt"] + epsilon)
    user_action_summary["click_rate"] = user_action_summary["buy_cnt"] / (user_action_summary["click_cnt"] + epsilon)
    
    user_action_summary["browse_rate"] = user_action_summary["browse_rate"].apply(lambda x: 1.0 if x > 0.9999 else x).astype(np.float32)
    user_action_summary["addcart_rate"] = user_action_summary["addcart_rate"].apply(lambda x: 1.0 if x > 0.9999 else x).astype(np.float32)
    user_action_summary["delcart_rate"] = user_action_summary["delcart_rate"].apply(lambda x: 1.0 if x > 0.9999 else x).astype(np.float32)
    user_action_summary["follow_rate"] = user_action_summary["follow_rate"].apply(lambda x: 1.0 if x > 0.9999 else x).astype(np.float32)
    user_action_summary["click_rate"] = user_action_summary["click_rate"].apply(lambda x: 1.0 if x > 0.9999 else x).astype(np.float32)
    
    print(user_action_summary.head())
    return user_action_summary

calculate_user_action_features(pd.to_datetime("2016-02-01"), pd.to_datetime("2016-04-18"), df_action)

   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200001        57.0         12.0          6.0      1.0         0.0   
1   200005        36.0          0.0          0.0      0.0         0.0   
2   200014       144.0         10.0          4.0      0.0         0.0   
3   200015       392.0          2.0          2.0      0.0         0.0   
4   200017        16.0          0.0          0.0      1.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0       86.0     0.017544      0.083333      0.166667          1.0    0.011628  
1       56.0     0.000000      0.000000      0.000000          0.0    0.000000  
2      151.0     0.000000      0.000000      0.000000          0.0    0.000000  
3      660.0     0.000000      0.000000      0.000000          0.0    0.000000  
4       17.0     0.062500      1.000000      1.000000          1.0    0.058824  


Unnamed: 0,user_id,browse_cnt,addcart_cnt,delcart_cnt,buy_cnt,follow_cnt,click_cnt,browse_rate,addcart_rate,delcart_rate,follow_rate,click_rate
0,200001,57.0,12.0,6.0,1.0,0.0,86.0,0.017544,0.083333,0.166667,1.0,0.011628
1,200005,36.0,0.0,0.0,0.0,0.0,56.0,0.000000,0.000000,0.000000,0.0,0.000000
2,200014,144.0,10.0,4.0,0.0,0.0,151.0,0.000000,0.000000,0.000000,0.0,0.000000
3,200015,392.0,2.0,2.0,0.0,0.0,660.0,0.000000,0.000000,0.000000,0.0,0.000000
4,200017,16.0,0.0,0.0,1.0,0.0,17.0,0.062500,1.000000,1.000000,1.0,0.058824
...,...,...,...,...,...,...,...,...,...,...,...,...
29425,305301,124.0,1.0,0.0,0.0,0.0,269.0,0.000000,0.000000,0.000000,0.0,0.000000
29426,305308,98.0,1.0,0.0,1.0,0.0,192.0,0.010204,1.000000,1.000000,1.0,0.005208
29427,305313,38.0,1.0,0.0,1.0,0.0,87.0,0.026316,1.000000,1.000000,1.0,0.011494
29428,305317,54.0,0.0,0.0,0.0,0.0,52.0,0.000000,0.000000,0.000000,0.0,0.000000


In [4]:
#获取某时间段内用户针对某类别cate商品的行为特征
def get_user_cate_feature_from_to(start_time, end_time, df_action, df_sku):
    df_user_action = df_action.loc[(df_action.loc[:, "time"] >= start_time) & (df_action.loc[:, "time"] < end_time)]
    df_user_action = df_user_action.loc[:, ["user_id", "sku_id", "type", "time"]]
    df_user_action = pd.merge(df_user_action, df_sku.loc[:, ["sku_id", "cate"]], how="left", on="sku_id")
    df_user_action.pop("sku_id")
    df_user_action_type_cnt = df_user_action.groupby(["user_id", "cate", "type"]).count().reset_index()
    df_user_action_type_cnt.rename(columns={"time":"cnt"}, inplace=True)
    print(df_user_action_type_cnt.head())
    
    df_user_action_type_cnt.loc[:, "browse_cnt"] = ((df_user_action_type_cnt.loc[:, "type"] == 1) \
                                            * df_user_action_type_cnt.loc[:, "cnt"]).astype(np.float32)
    df_user_action_type_cnt.loc[:, "addcart_cnt"] = ((df_user_action_type_cnt.loc[:, "type"] == 2) \
                                            * df_user_action_type_cnt.loc[:, "cnt"]).astype(np.float32)
    df_user_action_type_cnt.loc[:, "delcart_cnt"] = ((df_user_action_type_cnt.loc[:, "type"] == 3) \
                                            * df_user_action_type_cnt.loc[:, "cnt"]).astype(np.float32)
    df_user_action_type_cnt.loc[:, "buy_cnt"] = ((df_user_action_type_cnt.loc[:, "type"] == 4) \
                                            * df_user_action_type_cnt.loc[:, "cnt"]).astype(np.float32)
    df_user_action_type_cnt.loc[:, "follow_cnt"] = ((df_user_action_type_cnt.loc[:, "type"] == 5) \
                                            * df_user_action_type_cnt.loc[:, "cnt"]).astype(np.float32)
    df_user_action_type_cnt.loc[:, "click_cnt"] = ((df_user_action_type_cnt.loc[:, "type"] == 6) \
                                            * df_user_action_type_cnt.loc[:, "cnt"]).astype(np.float32)
    
    df_user_action_type_cnt.pop("type")
    df_user_action_type_cnt.pop("cnt")
    df_user_action_type_cnt = df_user_action_type_cnt.groupby(["user_id", "cate"]).sum().reset_index()
    
    print(df_user_action_type_cnt.head())
    eps = 10**(-6)
    df_user_action_type_cnt.loc[:, "browse_rate"] = df_user_action_type_cnt.loc[:, "buy_cnt"] \
                                                    / (df_user_action_type_cnt.loc[:, "browse_cnt"] + eps)
    df_user_action_type_cnt.loc[:, "addcart_rate"] = df_user_action_type_cnt.loc[:, "buy_cnt"] \
                                                    / (df_user_action_type_cnt.loc[:, "addcart_cnt"] + eps)
    df_user_action_type_cnt.loc[:, "delcart_rate"] = df_user_action_type_cnt.loc[:, "buy_cnt"] \
                                                    / (df_user_action_type_cnt.loc[:, "delcart_cnt"] + eps)
    df_user_action_type_cnt.loc[:, "follow_rate"] = df_user_action_type_cnt.loc[:, "buy_cnt"] \
                                                    / (df_user_action_type_cnt.loc[:, "follow_cnt"] + eps)
    df_user_action_type_cnt.loc[:, "click_rate"] = df_user_action_type_cnt.loc[:, "buy_cnt"] \
                                                    / (df_user_action_type_cnt.loc[:, "click_cnt"] + eps)
    
    df_user_action_type_cnt.loc[:, "browse_rate"] = (df_user_action_type_cnt.loc[:, "browse_rate"] \
                                            .apply(lambda x : 1.0 if x>0.9999 else x)).astype(np.float32)
    df_user_action_type_cnt.loc[:, "addcart_rate"] = (df_user_action_type_cnt.loc[:, "addcart_rate"] \
                                            .apply(lambda x : 1.0 if x>0.9999 else x)).astype(np.float32)
    df_user_action_type_cnt.loc[:, "delcart_rate"] = (df_user_action_type_cnt.loc[:, "delcart_rate"] \
                                            .apply(lambda x : 1.0 if x>0.9999 else x)).astype(np.float32)
    df_user_action_type_cnt.loc[:, "follow_rate"] = (df_user_action_type_cnt.loc[:, "follow_rate"] \
                                            .apply(lambda x : 1.0 if x>0.9999 else x)).astype(np.float32)
    df_user_action_type_cnt.loc[:, "click_rate"] = (df_user_action_type_cnt.loc[:, "click_rate"] \
                                            .apply(lambda x : 1.0 if x>0.9999 else x)).astype(np.float32)
                                              
    df_user_cate_feature = df_user_action_type_cnt
    return df_user_cate_feature
    
get_user_cate_feature_from_to(pd.to_datetime("2016-03-01"), pd.to_datetime("2016-03-30"), df_action, df_sku)

   user_id  cate  type  cnt
0   200001     8     1   47
1   200001     8     2   11
2   200001     8     4    1
3   200001     8     6   65
4   200005     8     1   10
   user_id  cate  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200001     8        47.0         11.0          0.0      1.0         0.0   
1   200005     8        10.0          0.0          0.0      0.0         0.0   
2   200014     8       123.0          5.0          3.0      0.0         0.0   
3   200015     8        30.0          0.0          0.0      0.0         0.0   
4   200017     8        16.0          0.0          0.0      1.0         0.0   

   click_cnt  
0       65.0  
1       19.0  
2      126.0  
3       26.0  
4       17.0  


Unnamed: 0,user_id,cate,browse_cnt,addcart_cnt,delcart_cnt,buy_cnt,follow_cnt,click_cnt,browse_rate,addcart_rate,delcart_rate,follow_rate,click_rate
0,200001,8,47.0,11.0,0.0,1.0,0.0,65.0,0.021277,0.090909,1.0,1.0,0.015385
1,200005,8,10.0,0.0,0.0,0.0,0.0,19.0,0.000000,0.000000,0.0,0.0,0.000000
2,200014,8,123.0,5.0,3.0,0.0,0.0,126.0,0.000000,0.000000,0.0,0.0,0.000000
3,200015,8,30.0,0.0,0.0,0.0,0.0,26.0,0.000000,0.000000,0.0,0.0,0.000000
4,200017,8,16.0,0.0,0.0,1.0,0.0,17.0,0.062500,1.000000,1.0,1.0,0.058824
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24319,305292,8,14.0,2.0,1.0,1.0,0.0,18.0,0.071429,0.500000,1.0,1.0,0.055556
24320,305296,8,26.0,1.0,0.0,0.0,0.0,32.0,0.000000,0.000000,0.0,0.0,0.000000
24321,305301,8,118.0,1.0,0.0,0.0,0.0,257.0,0.000000,0.000000,0.0,0.0,0.000000
24322,305313,8,5.0,0.0,0.0,0.0,0.0,7.0,0.000000,0.000000,0.0,0.0,0.000000


In [5]:
#获取某时间段内用户针对某品牌brand商品的行为特征
def extract_user_brand_metrics(start, end, actions_df, sku_df):
    filtered_actions = actions_df[(actions_df["time"] >= start) & (actions_df["time"] < end)]
    user_actions = filtered_actions[["user_id", "sku_id", "type", "time"]]
    merged_data = user_actions.merge(sku_df[["sku_id", "brand"]], on="sku_id", how="left").drop(columns="sku_id")
    
    action_counts = merged_data.groupby(["user_id", "brand", "type"]).size().reset_index(name="count")
    
    for action_type in range(1, 7):
        action_counts[f"type_{action_type}_count"] = (action_counts["type"] == action_type).astype(int) * action_counts["count"]
    
    action_counts.drop(columns=["type", "count"], inplace=True)
    summary = action_counts.groupby(["user_id", "brand"]).sum().reset_index()
    
    epsilon = 1e-6
    summary["browse_rate"] = summary["type_4_count"] / (summary["type_1_count"] + epsilon)
    summary["addcart_rate"] = summary["type_4_count"] / (summary["type_2_count"] + epsilon)
    summary["delcart_rate"] = summary["type_4_count"] / (summary["type_3_count"] + epsilon)
    summary["follow_rate"] = summary["type_4_count"] / (summary["type_5_count"] + epsilon)
    summary["click_rate"] = summary["type_4_count"] / (summary["type_6_count"] + epsilon)
    
    for rate in ["browse_rate", "addcart_rate", "delcart_rate", "follow_rate", "click_rate"]:
        summary[rate] = summary[rate].apply(lambda x: min(x, 0.9999)).astype(np.float32)
    
    return summary

extract_user_brand_metrics(pd.to_datetime("2016-02-01"), pd.to_datetime("2016-05-02"), df_action, df_sku)

Unnamed: 0,user_id,brand,type_1_count,type_2_count,type_3_count,type_4_count,type_5_count,type_6_count,browse_rate,addcart_rate,delcart_rate,follow_rate,click_rate
0,200001,214,2,0,0,0,0,4,0.000000,0.000000,0.0000,0.0000,0.000000
1,200001,306,26,3,1,1,0,43,0.038462,0.333333,0.9999,0.9999,0.023256
2,200001,403,20,4,3,0,0,34,0.000000,0.000000,0.0000,0.0000,0.000000
3,200001,489,5,3,1,0,0,0,0.000000,0.000000,0.0000,0.0000,0.000000
4,200001,800,4,2,1,0,0,5,0.000000,0.000000,0.0000,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129641,305317,30,8,0,0,0,0,9,0.000000,0.000000,0.0000,0.0000,0.000000
129642,305317,800,46,0,0,0,0,43,0.000000,0.000000,0.0000,0.0000,0.000000
129643,305318,30,28,3,0,1,0,38,0.035714,0.333333,0.9999,0.9999,0.026316
129644,305318,214,20,0,0,0,0,38,0.000000,0.000000,0.0000,0.0000,0.000000


In [6]:
#获取某时间段内用户针对某商品sku_id的行为特征
def calculate_user_sku_features(start, end, actions):
    filtered_actions = actions[(actions["time"] >= start) & (actions["time"] < end)]
    selected_columns = filtered_actions[["user_id", "sku_id", "type", "time"]]
    action_counts = selected_columns.groupby(["user_id", "sku_id", "type"]).size().reset_index(name="count")
    
    action_counts["browse_cnt"] = (action_counts["type"] == 1).astype(int) * action_counts["count"]
    action_counts["addcart_cnt"] = (action_counts["type"] == 2).astype(int) * action_counts["count"]
    action_counts["delcart_cnt"] = (action_counts["type"] == 3).astype(int) * action_counts["count"]
    action_counts["buy_cnt"] = (action_counts["type"] == 4).astype(int) * action_counts["count"]
    action_counts["follow_cnt"] = (action_counts["type"] == 5).astype(int) * action_counts["count"]
    action_counts["click_cnt"] = (action_counts["type"] == 6).astype(int) * action_counts["count"]
    
    action_counts.drop(columns=["type", "count"], inplace=True)
    aggregated_counts = action_counts.groupby(["user_id", "sku_id"]).sum().reset_index()
    
    epsilon = 1e-6
    aggregated_counts["browse_rate"] = aggregated_counts["buy_cnt"] / (aggregated_counts["browse_cnt"] + epsilon)
    aggregated_counts["addcart_rate"] = aggregated_counts["buy_cnt"] / (aggregated_counts["addcart_cnt"] + epsilon)
    aggregated_counts["delcart_rate"] = aggregated_counts["buy_cnt"] / (aggregated_counts["delcart_cnt"] + epsilon)
    aggregated_counts["follow_rate"] = aggregated_counts["buy_cnt"] / (aggregated_counts["follow_cnt"] + epsilon)
    aggregated_counts["click_rate"] = aggregated_counts["buy_cnt"] / (aggregated_counts["click_cnt"] + epsilon)
    
    for rate in ["browse_rate", "addcart_rate", "delcart_rate", "follow_rate", "click_rate"]:
        aggregated_counts[rate] = aggregated_counts[rate].apply(lambda x: min(x, 0.9999)).astype(np.float32)
    
    return aggregated_counts

calculate_user_sku_features(pd.to_datetime("2016-03-01"), pd.to_datetime("2016-03-02"), df_action)

Unnamed: 0,user_id,sku_id,browse_cnt,addcart_cnt,delcart_cnt,buy_cnt,follow_cnt,click_cnt,browse_rate,addcart_rate,delcart_rate,follow_rate,click_rate
0,200074,62872,1,0,1,0,0,1,0.0,0.0,0.0,0.0,0.0
1,200074,90521,1,0,0,0,0,2,0.0,0.0,0.0,0.0,0.0
2,200074,142829,1,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0
3,200074,145946,3,0,1,0,0,4,0.0,0.0,0.0,0.0,0.0
4,200074,148856,5,1,0,0,0,9,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11528,305225,131300,12,1,0,0,0,31,0.0,0.0,0.0,0.0,0.0
11529,305225,141167,2,0,0,0,0,4,0.0,0.0,0.0,0.0,0.0
11530,305225,153395,6,1,0,0,0,14,0.0,0.0,0.0,0.0,0.0
11531,305225,166876,0,0,0,0,0,4,0.0,0.0,0.0,0.0,0.0


In [7]:
#获取商品某时间段内被用户行为的特征
def compute_sku_action_metrics(start, end, actions):
    filtered_actions = actions[(actions["time"] >= start) & (actions["time"] < end)]
    relevant_columns = filtered_actions[["sku_id", "type", "time"]]
    action_counts = relevant_columns.groupby(["sku_id", "type"]).size().reset_index(name="count")
    
    action_counts["browse_cnt"] = (action_counts["type"] == 1).astype(int) * action_counts["count"]
    action_counts["addcart_cnt"] = (action_counts["type"] == 2).astype(int) * action_counts["count"]
    action_counts["delcart_cnt"] = (action_counts["type"] == 3).astype(int) * action_counts["count"]
    action_counts["buy_cnt"] = (action_counts["type"] == 4).astype(int) * action_counts["count"]
    action_counts["follow_cnt"] = (action_counts["type"] == 5).astype(int) * action_counts["count"]
    action_counts["click_cnt"] = (action_counts["type"] == 6).astype(int) * action_counts["count"]
    
    action_counts.drop(columns=["type", "count"], inplace=True)
    aggregated_counts = action_counts.groupby("sku_id").sum().reset_index()
    
    epsilon = 1e-6
    aggregated_counts["browse_rate"] = aggregated_counts["buy_cnt"] / (aggregated_counts["browse_cnt"] + epsilon)
    aggregated_counts["addcart_rate"] = aggregated_counts["buy_cnt"] / (aggregated_counts["addcart_cnt"] + epsilon)
    aggregated_counts["delcart_rate"] = aggregated_counts["buy_cnt"] / (aggregated_counts["delcart_cnt"] + epsilon)
    aggregated_counts["follow_rate"] = aggregated_counts["buy_cnt"] / (aggregated_counts["follow_cnt"] + epsilon)
    aggregated_counts["click_rate"] = aggregated_counts["buy_cnt"] / (aggregated_counts["click_cnt"] + epsilon)
    
    for rate in ["browse_rate", "addcart_rate", "delcart_rate", "follow_rate", "click_rate"]:
        aggregated_counts[rate] = aggregated_counts[rate].apply(lambda x: min(x, 0.9999)).astype(np.float32)
    
    return aggregated_counts

compute_sku_action_metrics(pd.to_datetime("2016-03-01"), pd.to_datetime("2016-03-02"), df_action)

Unnamed: 0,sku_id,browse_cnt,addcart_cnt,delcart_cnt,buy_cnt,follow_cnt,click_cnt,browse_rate,addcart_rate,delcart_rate,follow_rate,click_rate
0,95,1,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0
1,276,23,0,0,0,0,74,0.0,0.0,0.0,0.0,0.0
2,281,3,0,0,0,0,5,0.0,0.0,0.0,0.0,0.0
3,499,21,0,0,0,1,31,0.0,0.0,0.0,0.0,0.0
4,661,11,0,0,0,0,25,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1273,170561,22,0,0,0,0,41,0.0,0.0,0.0,0.0,0.0
1274,170587,6,0,0,0,0,11,0.0,0.0,0.0,0.0,0.0
1275,170870,17,1,0,0,0,17,0.0,0.0,0.0,0.0,0.0
1276,170998,1,0,0,0,0,2,0.0,0.0,0.0,0.0,0.0


In [8]:
#一个辅助函数，用来命名多个可能重名的属性特征
def rename_columns_with_day(df, fixstr):
    column_dict = {"browse_cnt" : "browse_cnt"+fixstr, \
                  "addcart_cnt" : "addcart_cnt"+fixstr, \
                  "delcart_cnt" : "delcart_cnt"+fixstr, \
                  "buy_cnt" : "buy_cnt"+fixstr, \
                  "follow_cnt" : "follow_cnt"+fixstr, \
                  "click_cnt" : "click_cnt"+fixstr, \
                  "browse_rate" : "browse_rate"+fixstr, \
                  "addcart_rate" : "addcart_rate"+fixstr, \
                  "delcart_rate" : "delcart_rate"+fixstr, \
                  "follow_rate" : "follow_rate"+fixstr, \
                  "click_rate" : "click_rate"+fixstr}
    df.rename(columns=column_dict, inplace=True)
    return df

In [9]:
#获取用户的固有特征（基本特征，整个时间段的行为特征）
def generate_user_features(user_df, sku_df, action_df):
    base_features = user_df.copy()
    
    start = pd.to_datetime("2016-01-01")
    end = pd.to_datetime("2016-06-07")
    
    action_features = calculate_user_action_features(start, end, action_df)
    category_features = get_user_cate_feature_from_to(start, end, action_df, sku_df)
    brand_features = extract_user_brand_metrics(start, end, action_df, sku_df)
    sku_features = calculate_user_sku_features(start, end, action_df)
    
    action_features = rename_columns_with_day(action_features, "_action_all")
    category_features = rename_columns_with_day(category_features, "_category_all")
    brand_features = rename_columns_with_day(brand_features, "_brand_all")
    sku_features = rename_columns_with_day(sku_features, "_sku_all")
    
    feature_list = [base_features, action_features, category_features, brand_features, sku_features]
    
    return feature_list

feature_list = generate_user_features(df_user, df_sku, df_action)

   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200001        57.0         12.0          6.0      1.0         0.0   
1   200005        36.0          0.0          0.0      0.0         0.0   
2   200014       144.0         10.0          4.0      0.0         0.0   
3   200015       392.0          2.0          2.0      0.0         0.0   
4   200017        16.0          0.0          0.0      1.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0       86.0     0.017544      0.083333      0.166667          1.0    0.011628  
1       56.0     0.000000      0.000000      0.000000          0.0    0.000000  
2      151.0     0.000000      0.000000      0.000000          0.0    0.000000  
3      660.0     0.000000      0.000000      0.000000          0.0    0.000000  
4       17.0     0.062500      1.000000      1.000000          1.0    0.058824  
   user_id  cate  type  cnt
0   200001     8     1   57
1   200001     8   

In [10]:
#获取商品的固有特征（基本特征，整个时间段的被用户行为特征）
def create_sku_features(sku_data, action_data):
    sku_base = sku_data.copy()
    sku_action_data = compute_sku_action_metrics(pd.to_datetime("2016-01-01"), pd.to_datetime("2016-06-07"), action_data)
    sku_action_data = rename_columns_with_day(sku_action_data, "_sku_all")
    combined_features = [sku_base, sku_action_data]
    return combined_features

resulting_features = create_sku_features(df_sku, df_action)

In [11]:
#获取一个时间窗口的购买前多个不同时间段的特征    
def compute_window_features(start_time, end_time, action_data, sku_data):
    if (end_time - start_time).days < 30:
        raise ValueError("时间区间太短啦！")
    
    time_intervals = [1, 2, 3, 5, 7, 10, 15, 20, 30]
    
    user_action_features = []
    user_category_features = []
    user_brand_features = []
    user_sku_features = []
    sku_action_features = []
    
    for days in time_intervals:
        interval_start = end_time - datetime.timedelta(days=days)
        
        user_action_f = calculate_user_action_features(interval_start, end_time, action_data)
        user_category_f = get_user_cate_feature_from_to(interval_start, end_time, action_data, sku_data)
        user_brand_f = extract_user_brand_metrics(interval_start, end_time, action_data, sku_data)
        user_sku_f = calculate_user_sku_features(interval_start, end_time, action_data)
        sku_action_f = compute_sku_action_metrics(interval_start, end_time, action_data)
        
        user_action_f = rename_columns_with_day(user_action_f, f"_user_{days}")
        user_category_f = rename_columns_with_day(user_category_f, f"_user_cate_{days}")
        user_brand_f = rename_columns_with_day(user_brand_f, f"_user_brand_{days}")
        user_sku_f = rename_columns_with_day(user_sku_f, f"_user_sku_{days}")
        sku_action_f = rename_columns_with_day(sku_action_f, f"_sku_{days}")
        
        user_action_features.append(user_action_f)
        user_category_features.append(user_category_f)
        user_brand_features.append(user_brand_f)
        user_sku_features.append(user_sku_f)
        sku_action_features.append(sku_action_f)
    
    return (user_action_features, user_category_features, user_brand_features, user_sku_features, sku_action_features)

lists_features_one_window = compute_window_features(pd.to_datetime("2016-02-01"), pd.to_datetime("2016-03-10"), df_action, df_sku)

   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200049         2.0          0.0          0.0      0.0         0.0   
1   200106         6.0          0.0          0.0      0.0         0.0   
2   200113         4.0          0.0          0.0      0.0         0.0   
3   200143        40.0          0.0          0.0      0.0         0.0   
4   200165        14.0          0.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0        3.0          0.0           0.0           0.0          0.0         0.0  
1        7.0          0.0           0.0           0.0          0.0         0.0  
2        6.0          0.0           0.0           0.0          0.0         0.0  
3       96.0          0.0           0.0           0.0          0.0         0.0  
4       12.0          0.0           0.0           0.0          0.0         0.0  
   user_id  cate  type  cnt
0   200049     8     1    2
1   200049     8   

   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200015        30.0          0.0          0.0      0.0         0.0   
1   200017         4.0          0.0          0.0      0.0         0.0   
2   200020        14.0          0.0          0.0      1.0         0.0   
3   200032         2.0          0.0          0.0      0.0         0.0   
4   200034        16.0          0.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0       26.0     0.000000           0.0           0.0          0.0    0.000000  
1        3.0     0.000000           0.0           0.0          0.0    0.000000  
2       49.0     0.071429           1.0           1.0          1.0    0.020408  
3        2.0     0.000000           0.0           0.0          0.0    0.000000  
4       19.0     0.000000           0.0           0.0          0.0    0.000000  
   user_id  cate  type  cnt
0   200015     8     1   30
1   200015     8   

In [12]:
#获取用户在某时间段内是否购买的标签，令 end_time - start_time = 5，用来生成数据集label
def generate_user_sku_labels(start, end, actions):
    print(start, end)
    user_sku_features = calculate_user_sku_features(start, end, actions)
    buy_labels = user_sku_features.loc[:, ["user_id", "sku_id", "buy_cnt"]]
    buy_labels["buy_label"] = buy_labels["buy_cnt"].apply(lambda count: True if count >= 1 else False)
    buy_labels.drop(columns=["buy_cnt"], inplace=True)
    return buy_labels

user_sku_labels = generate_user_sku_labels(pd.to_datetime("2016-02-05"), pd.to_datetime("2016-02-06"), df_action)
print(user_sku_labels[user_sku_labels["buy_label"] == True])

2016-02-05 00:00:00 2016-02-06 00:00:00
      user_id  sku_id  buy_label
64     201108    5505       True
65     201108   88295       True
341    216670  154636       True
347    216922   33445       True
383    218498  149854       True
441    221842   75877       True
469    222886  154636       True
505    224173   39253       True
574    227271  152478       True
605    227692  133477       True
767    234902  116489       True
1033   242994   26796       True
1143   244996   34517       True
1148   244996   85557       True
1188   247371  149641       True
1248   251645  154636       True
1290   252966  154636       True
1421   262182  128988       True
1518   268041  128988       True
1521   268482  166707       True
1587   270816   12564       True
1652   273152  108399       True
1733   276932  116497       True
1841   283907   32465       True
1984   289710   60861       True
2012   290655  128988       True


In [13]:
def clean_dataset(dataframe):
    cols = dataframe.columns.tolist()
    dataframe.fillna(0, inplace=True)

    for col in cols:
        if re.match(r"user_id[0-9]", col) or re.match(r"sku_id[0-9]", col):
            dataframe.drop(columns=[col], inplace=True)
        elif re.match(r"cate[0-9]", col) or re.match(r"brand[0-9]", col):
            dataframe.drop(columns=[col], inplace=True)

    return dataframe


In [14]:
#获取一个时间窗口的训练集
def construct_window_dataset(start_date, end_date, user_data, sku_data, action_data, user_constant_features, sku_constant_features):
    user_action_features, user_cate_features, user_brand_features, user_sku_features, sku_action_features = compute_window_features(start_date, end_date, action_data, sku_data)
    
    user_sku_labels = generate_user_sku_labels(end_date, end_date + datetime.timedelta(days=5), action_data)
    
    dataset = user_sku_features[-1].loc[:, ["user_id", "sku_id"]].drop_duplicates()
    print("len_df_dataset: ", len(dataset))
    dataset = pd.merge(dataset, sku_data.loc[:, ["sku_id", "cate", "brand"]], how="left", on="sku_id")
    
    suffix_counter = 1
    for user_feature in user_constant_features:
        if "sku_id" in user_feature.columns:
            dataset = pd.merge(dataset, user_feature, how="left", on=["user_id", "sku_id"], suffixes=("", str(suffix_counter)), sort=False)
        elif "cate" in user_feature.columns:
            dataset = pd.merge(dataset, user_feature, how="left", on=["user_id", "cate"], suffixes=("", str(suffix_counter)), sort=False)
        elif "brand" in user_feature.columns:
            dataset = pd.merge(dataset, user_feature, how="left", on=["user_id", "brand"], suffixes=("", str(suffix_counter)), sort=False)
        else:
            dataset = pd.merge(dataset, user_feature, how="left", on="user_id", suffixes=("", str(suffix_counter)), sort=False)
        
        suffix_counter += 1
        
    for sku_feature in sku_constant_features:
        dataset = pd.merge(dataset, sku_feature, how="left", on="sku_id", suffixes=("", str(suffix_counter)), sort=False)
        suffix_counter += 1
        
    for action_feature in user_action_features:
        dataset = pd.merge(dataset, action_feature, how="left", on="user_id", suffixes=("", str(suffix_counter)), sort=False)
        suffix_counter += 1
        
    for cate_feature in user_cate_features:
        dataset = pd.merge(dataset, cate_feature, how="left", on=["user_id", "cate"], suffixes=("", str(suffix_counter)), sort=False)
        suffix_counter += 1
        
    for brand_feature in user_brand_features:
        dataset = pd.merge(dataset, brand_feature, how="left", on=["user_id", "brand"], suffixes=("", str(suffix_counter)), sort=False)
        suffix_counter += 1
        
    for sku_feature in user_sku_features:
        dataset = pd.merge(dataset, sku_feature, how="left", on=["user_id", "sku_id"], suffixes=("", str(suffix_counter)), sort=False)
        suffix_counter += 1
        
    for action_feature in sku_action_features:
        dataset = pd.merge(dataset, action_feature, how="left", on="sku_id", suffixes=("", str(suffix_counter)), sort=False)
        suffix_counter += 1
        
    dataset = pd.merge(dataset, user_sku_labels, how="left", on=["user_id", "sku_id"], suffixes=("", str(suffix_counter)), sort=False)
    dataset["buy_label"] = dataset["buy_label"].apply(lambda x: False if pd.isna(x) else x)
    
    dataset = clean_dataset(dataset)
    print("len_df_dataset: ", len(dataset))
    return dataset



In [15]:
#获取数据集
def generate_dataset(window_size, stride, start_date, end_date, user_data, sku_data, action_data, user_features, sku_features):
    dataset = pd.DataFrame()
    current_start = start_date
    current_end = current_start + datetime.timedelta(days=window_size)
    
    while current_end < end_date:
        window_data = construct_window_dataset(current_start, current_end, user_data, sku_data, action_data, user_features, sku_features)
        dataset = window_data.append(dataset)
        
        current_start += datetime.timedelta(days=stride)
        current_end = current_start + datetime.timedelta(days=window_size)
        
    return dataset

df_dataset = generate_dataset(30, 10, pd.to_datetime("2016-02-01"), pd.to_datetime("2016-04-15"), \
                         df_user, df_sku, df_action, \
                         feature_list, resulting_features)
df_dataset

   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200074        12.0          1.0          2.0      0.0         0.0   
1   200085        16.0          0.0          0.0      0.0         0.0   
2   200089         2.0          0.0          0.0      0.0         0.0   
3   200122        16.0          2.0          0.0      0.0         1.0   
4   200242         2.0          0.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0       18.0          0.0           0.0           0.0          0.0         0.0  
1       33.0          0.0           0.0           0.0          0.0         0.0  
2        6.0          0.0           0.0           0.0          0.0         0.0  
3       39.0          0.0           0.0           0.0          0.0         0.0  
4        5.0          0.0           0.0           0.0          0.0         0.0  
   user_id  cate  type  cnt
0   200074     8     1   12
1   200074     8   

   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200015         2.0          0.0          0.0      0.0         0.0   
1   200033         1.0          0.0          0.0      0.0         0.0   
2   200063         6.0          0.0          0.0      0.0         0.0   
3   200067        14.0          0.0          0.0      0.0         0.0   
4   200074        13.0          1.0          2.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0        2.0          0.0           0.0           0.0          0.0         0.0  
1        3.0          0.0           0.0           0.0          0.0         0.0  
2        9.0          0.0           0.0           0.0          0.0         0.0  
3       16.0          0.0           0.0           0.0          0.0         0.0  
4       19.0          0.0           0.0           0.0          0.0         0.0  
   user_id  cate  type  cnt
0   200015     8     1    2
1   200015     8   

  dataset = window_data.append(dataset)


   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200038         2.0          0.0          0.0      0.0         0.0   
1   200043         2.0          0.0          0.0      0.0         0.0   
2   200055        16.0          0.0          0.0      0.0         0.0   
3   200067        20.0          1.0          0.0      1.0         0.0   
4   200089         4.0          0.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0        3.0         0.00           0.0           0.0          0.0    0.000000  
1       26.0         0.00           0.0           0.0          0.0    0.000000  
2       20.0         0.00           0.0           0.0          0.0    0.000000  
3       39.0         0.05           1.0           1.0          1.0    0.025641  
4        6.0         0.00           0.0           0.0          0.0    0.000000  
   user_id  cate  type  cnt
0   200038     8     1    2
1   200038     8   

   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200015        30.0          0.0          0.0      0.0         0.0   
1   200017         4.0          0.0          0.0      0.0         0.0   
2   200020        14.0          0.0          0.0      1.0         0.0   
3   200032         2.0          0.0          0.0      0.0         0.0   
4   200034        16.0          0.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0       26.0     0.000000           0.0           0.0          0.0    0.000000  
1        3.0     0.000000           0.0           0.0          0.0    0.000000  
2       49.0     0.071429           1.0           1.0          1.0    0.020408  
3        2.0     0.000000           0.0           0.0          0.0    0.000000  
4       19.0     0.000000           0.0           0.0          0.0    0.000000  
   user_id  cate  type  cnt
0   200015     8     1   30
1   200015     8   

  dataset = window_data.append(dataset)


   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200005        10.0          0.0          0.0      0.0         0.0   
1   200028         6.0          1.0          1.0      0.0         0.0   
2   200038        12.0          0.0          1.0      0.0         0.0   
3   200049         2.0          0.0          0.0      0.0         0.0   
4   200053         4.0          0.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0       19.0          0.0           0.0           0.0          0.0         0.0  
1       11.0          0.0           0.0           0.0          0.0         0.0  
2       16.0          0.0           0.0           0.0          0.0         0.0  
3        3.0          0.0           0.0           0.0          0.0         0.0  
4        4.0          0.0           0.0           0.0          0.0         0.0  
   user_id  cate  type  cnt
0   200005     8     1   10
1   200005     8   

   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200001         6.0          1.0          0.0      0.0         0.0   
1   200005        10.0          0.0          0.0      0.0         0.0   
2   200014        98.0          3.0          1.0      0.0         0.0   
3   200017        12.0          0.0          0.0      1.0         0.0   
4   200020         2.0          0.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0        7.0     0.000000           0.0           0.0          0.0    0.000000  
1       19.0     0.000000           0.0           0.0          0.0    0.000000  
2       87.0     0.000000           0.0           0.0          0.0    0.000000  
3       14.0     0.083333           1.0           1.0          1.0    0.071429  
4        5.0     0.000000           0.0           0.0          0.0    0.000000  
   user_id  cate  type  cnt
0   200001     8     1    6
1   200001     8   

  dataset = window_data.append(dataset)


   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200001         0.0          1.0          0.0      0.0         0.0   
1   200038         2.0          0.0          0.0      0.0         0.0   
2   200063         0.0          0.0          3.0      0.0         0.0   
3   200100         2.0          0.0          0.0      0.0         0.0   
4   200120         2.0          0.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0        0.0          0.0           0.0           0.0          0.0         0.0  
1        3.0          0.0           0.0           0.0          0.0         0.0  
2        0.0          0.0           0.0           0.0          0.0         0.0  
3        3.0          0.0           0.0           0.0          0.0         0.0  
4        4.0          0.0           0.0           0.0          0.0         0.0  
   user_id  cate  type  cnt
0   200001     8     2    1
1   200038     8   

   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200001        41.0         11.0          4.0      1.0         0.0   
1   200014        25.0          2.0          2.0      0.0         0.0   
2   200020         1.0          0.0          0.0      0.0         0.0   
3   200028        11.0          0.0          0.0      0.0         0.0   
4   200033         0.0          1.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0       58.0      0.02439      0.090909          0.25          1.0    0.017241  
1       39.0      0.00000      0.000000          0.00          0.0    0.000000  
2        2.0      0.00000      0.000000          0.00          0.0    0.000000  
3       15.0      0.00000      0.000000          0.00          0.0    0.000000  
4        0.0      0.00000      0.000000          0.00          0.0    0.000000  
   user_id  cate  type  cnt
0   200001     8     1   41
1   200001     8   

  dataset = window_data.append(dataset)


   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200028         6.0          0.0          0.0      0.0         0.0   
1   200043         2.0          0.0          0.0      0.0         0.0   
2   200054        20.0          0.0          0.0      0.0         0.0   
3   200077         2.0          0.0          0.0      0.0         0.0   
4   200092         2.0          0.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0        9.0          0.0           0.0           0.0          0.0         0.0  
1        4.0          0.0           0.0           0.0          0.0         0.0  
2       21.0          0.0           0.0           0.0          0.0         0.0  
3        5.0          0.0           0.0           0.0          0.0         0.0  
4        2.0          0.0           0.0           0.0          0.0         0.0  
   user_id  cate  type  cnt
0   200028     8     1    6
1   200028     8   

   user_id  browse_cnt  addcart_cnt  delcart_cnt  buy_cnt  follow_cnt  \
0   200001        10.0          0.0          0.0      0.0         0.0   
1   200005        26.0          0.0          0.0      0.0         0.0   
2   200014        21.0          5.0          1.0      0.0         0.0   
3   200015        74.0          2.0          0.0      0.0         0.0   
4   200028        17.0          0.0          0.0      0.0         0.0   

   click_cnt  browse_rate  addcart_rate  delcart_rate  follow_rate  click_rate  
0       21.0          0.0           0.0           0.0          0.0         0.0  
1       37.0          0.0           0.0           0.0          0.0         0.0  
2       25.0          0.0           0.0           0.0          0.0         0.0  
3      126.0          0.0           0.0           0.0          0.0         0.0  
4       32.0          0.0           0.0           0.0          0.0         0.0  
   user_id  cate  type  cnt
0   200001     8     1   10
1   200001     8   

  dataset = window_data.append(dataset)


Unnamed: 0,user_id,sku_id,cate,brand,age,sex,user_lv_cd,browse_cnt_action_all,addcart_cnt_action_all,delcart_cnt_action_all,...,delcart_cnt_sku_30,buy_cnt_sku_30,follow_cnt_sku_30,click_cnt_sku_30,browse_rate_sku_30,addcart_rate_sku_30,delcart_rate_sku_30,follow_rate_sku_30,click_rate_sku_30,buy_label
0,200001,14398,8,403,-1,-1,5,57.0,12.0,6.0,...,0,0,0,55,0.000000,0.000000,0.000000,0.000000,0.000000,False
1,200001,20308,8,306,-1,-1,5,57.0,12.0,6.0,...,127,56,36,10724,0.008327,0.174455,0.440945,0.999900,0.005222,False
2,200001,46186,8,403,-1,-1,5,57.0,12.0,6.0,...,8,0,1,806,0.000000,0.000000,0.000000,0.000000,0.000000,False
3,200001,94944,8,800,-1,-1,5,57.0,12.0,6.0,...,45,13,12,5885,0.003457,0.114035,0.288889,0.999900,0.002209,False
4,200001,113402,8,403,-1,-1,5,57.0,12.0,6.0,...,1,0,0,125,0.000000,0.000000,0.000000,0.000000,0.000000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108707,305292,32465,8,489,-1,-1,4,29.0,2.0,1.0,...,100,33,37,10428,0.004963,0.116197,0.330000,0.891892,0.003165,False
108708,305292,79636,8,489,-1,-1,4,29.0,2.0,1.0,...,53,25,25,4537,0.008582,0.177305,0.471698,0.999900,0.005510,False
108709,305292,128193,8,489,-1,-1,4,29.0,2.0,1.0,...,0,0,0,424,0.000000,0.000000,0.000000,0.000000,0.000000,False
108710,305292,140687,8,489,-1,-1,4,29.0,2.0,1.0,...,3,0,0,933,0.000000,0.000000,0.000000,0.000000,0.000000,False


In [16]:
def save_df_dataset(df_dataset):
    if not os.path.exists("df_dataset.pkl"):
        with open("df_dataset.pkl", "wb") as f:
            pickle.dump(df_dataset, f)
            
def load_df_dataset():
    with open("df_dataset.pkl", "rb") as f:
        df_dataset = pickle.load(f)
    return df_dataset


df_dataset = load_df_dataset()
save_df_dataset(df_dataset)

In [17]:
def get_train_test_dataset(df_dataset, test_size):
    train_size = len(df_dataset) - test_size
    train_dataset = df_dataset.head(train_size)
    test_dataset = df_dataset.tail(test_size)
    return train_dataset, test_dataset

df_train_dataset, df_test_dataset = get_train_test_dataset(df_dataset, 254062)

In [19]:
#进行训练，使用catboost算法
def train_model(dataset, num_iterations, model_file_path):
    data = dataset.copy()
    data.drop(columns=["buy_label", "user_id", "sku_id"], inplace=True)
    print(len(data.columns))

    categorical_features = ["cate", "brand", "age", "sex", "a1", "a2", "a3"]
    present_categorical_features = [feature for feature in categorical_features if feature in data.columns]

    labels = dataset["buy_label"].apply(lambda label: 1 if label else 0)

    model = CatBoostClassifier(
        iterations=num_iterations,
        loss_function="Logloss",
        task_type="CPU",
        depth=6,
        leaf_estimation_method="Newton",
        one_hot_max_size=2
    )

    model.fit(data, labels, cat_features=present_categorical_features)

    with open(model_file_path, "wb") as model_file:
        pickle.dump(model, model_file)

    return model

train_model(df_train_dataset, 150, "model.cb")

561
0:	learn: 0.6035915	total: 607ms	remaining: 1m 30s
1:	learn: 0.4969222	total: 1.28s	remaining: 1m 34s
2:	learn: 0.4277169	total: 2.13s	remaining: 1m 44s
3:	learn: 0.3706725	total: 2.74s	remaining: 1m 40s
4:	learn: 0.3185801	total: 3.48s	remaining: 1m 40s
5:	learn: 0.2753212	total: 4.03s	remaining: 1m 36s
6:	learn: 0.2380220	total: 4.62s	remaining: 1m 34s
7:	learn: 0.1907580	total: 5.42s	remaining: 1m 36s
8:	learn: 0.1664159	total: 6.03s	remaining: 1m 34s
9:	learn: 0.1455721	total: 6.72s	remaining: 1m 34s
10:	learn: 0.1144327	total: 7.47s	remaining: 1m 34s
11:	learn: 0.0904062	total: 8.21s	remaining: 1m 34s
12:	learn: 0.0713151	total: 9.07s	remaining: 1m 35s
13:	learn: 0.0566772	total: 9.81s	remaining: 1m 35s
14:	learn: 0.0470784	total: 10.6s	remaining: 1m 35s
15:	learn: 0.0382957	total: 11.3s	remaining: 1m 34s
16:	learn: 0.0310271	total: 11.9s	remaining: 1m 33s
17:	learn: 0.0261223	total: 12.5s	remaining: 1m 31s
18:	learn: 0.0220136	total: 13.1s	remaining: 1m 30s
19:	learn: 0.01846

<catboost.core.CatBoostClassifier at 0x1c4775f5d50>

In [26]:
#测试打分
def calculate_test_score(results):
    """
    results is a DataFrame with columns: user_id, sku_id, buy_label, predict_label
    """
    user_grouped = results.groupby("user_id").sum().reset_index()
    
    user_tp = len(user_grouped[(user_grouped["buy_label"] > 0) & (user_grouped["predict_label"] > 0)])
    user_fp = len(user_grouped[(user_grouped["buy_label"] == 0) & (user_grouped["predict_label"] > 0)])
    user_fn = len(user_grouped[(user_grouped["buy_label"] > 0) & (user_grouped["predict_label"] == 0)])
    user_tn = len(user_grouped[(user_grouped["buy_label"] == 0) & (user_grouped["predict_label"] == 0)])
    
    user_precision = user_tp / (user_tp + user_fp)
    user_recall = user_tp / (user_tp + user_fn)
    f1_user = 6 * user_recall * user_precision / (5 * user_recall + user_precision)
    
    sku_tp = len(results[(results["buy_label"] > 0) & (results["predict_label"] > 0)])
    sku_fp = len(results[(results["buy_label"] == 0) & (results["predict_label"] > 0)])
    sku_fn = len(results[(results["buy_label"] > 0) & (results["predict_label"] == 0)])
    sku_tn = len(results[(results["buy_label"] == 0) & (results["predict_label"] == 0)])
    
    sku_precision = sku_tp / (sku_tp + sku_fp)
    sku_recall = sku_tp / (sku_tp + sku_fn)
    f1_sku = 5 * sku_recall * sku_precision / (2 * sku_recall + 3 * sku_precision)
    
    print("user_tp", user_tp)
    print("user_fp", user_fp)
    print("user_fn", user_fn)
    print("user_tn", user_tn)
    print("user_precision", user_precision)
    print("user_recall", user_recall)
    print("f1_user", f1_user)
    
    print("sku_tp", sku_tp)
    print("sku_fp", sku_fp)
    print("sku_fn", sku_fn)
    print("sku_tn", sku_tn)
    print("sku_precision", sku_precision)
    print("sku_recall", sku_recall)
    print("f1_sku", f1_sku)
    
    final_score = 0.4 * f1_user + 0.6 * f1_sku
    
    return final_score

In [27]:
#进行测试
import pickle
import sklearn.metrics
import pandas as pd
import numpy as np

def evaluate_model(test_data, model_file, threshold=0.5):
    results = test_data[["user_id", "sku_id", "buy_label"]].reset_index()
    features = test_data.drop(columns=["buy_label", "user_id", "sku_id"]).copy()
    
    with open(model_file, "rb") as model_f:
        model = pickle.load(model_f)

    probabilities = model.predict(features, prediction_type='Probability')
    class_probabilities = probabilities[:, 1]
    predicted_labels = class_probabilities > threshold
    
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(results["buy_label"].tolist(), class_probabilities)

    with open("roc_curve", "w") as roc_file:
        for false_pos_rate, true_pos_rate, thres in zip(fpr, tpr, thresholds):
            roc_file.write(f"{false_pos_rate}\t{true_pos_rate}\t{thres}\n")
    
    auc = sklearn.metrics.roc_auc_score(results["buy_label"].tolist(), class_probabilities)
    print("auc_score: ", auc)
    
    results["predict_label"] = pd.Series(predicted_labels).astype(np.int32)

    score = calculate_test_score(results)
    
    return score

evaluate_model(df_test_dataset, "model.cb", 0.25)

auc_score:  0.9981035508536477
user_tp 789
user_fp 623
user_fn 188
user_tn 17557
user_precision 0.5587818696883853
user_recall 0.8075742067553736
f1_user 0.5890257558790594
sku_tp 751
sku_fp 780
sku_fn 229
sku_tn 252302
sku_precision 0.49052906596995427
sku_recall 0.7663265306122449
f1_sku 0.625624791736088


0.6109851773932765

In [57]:
################## xgboost模型的训练和测试 ########################

In [28]:
# 由于xgboost不原生支持类别特征，将类别特征转换为 OneHot 编码
def encode_categorical_features(dataframe):
    categorical_columns = ["cate", "brand", "age", "sex", "a1", "a2", "a3"]
    existing_categorical_columns = [col for col in categorical_columns if col in dataframe.columns]
    encoded_dataframe = pd.get_dummies(dataframe, columns=existing_categorical_columns)
    return encoded_dataframe

encoded_dataset = encode_categorical_features(df_dataset)
print(encoded_dataset.head())

   user_id  sku_id  user_lv_cd  browse_cnt_user_all  addcart_cnt_user_all  \
0   200001   14398           5                 57.0                  12.0   
1   200001   20308           5                 57.0                  12.0   
2   200001   46186           5                 57.0                  12.0   
3   200001   94944           5                 57.0                  12.0   
4   200001  113402           5                 57.0                  12.0   

   delcart_cnt_user_all  buy_cnt_user_all  follow_cnt_user_all  \
0                   6.0               1.0                  0.0   
1                   6.0               1.0                  0.0   
2                   6.0               1.0                  0.0   
3                   6.0               1.0                  0.0   
4                   6.0               1.0                  0.0   

   click_cnt_user_all  browse_rate_user_all  ...  a1_-1  a1_1  a1_2  a1_3  \
0                86.0              0.017544  ...      0     1  

In [29]:
train_data, test_data = get_train_test_dataset(encoded_dataset, 254062)

In [30]:
def train_xgboost_model(dataset, model_filepath):
    features = dataset.drop(columns=["buy_label", "user_id", "sku_id"]).copy()
    print(len(features.columns))

    labels = dataset["buy_label"].apply(lambda x: 1 if x else 0)

    dmatrix = xgb.DMatrix(features, label=labels)

    params = {
        "learning_rate": 0.3,
        "gamma": 0.6,
        "max_depth": 6,
        "lambda": 1,
        "tree_method": "hist",
        "scale_pos_weight": 1,
        "predictor": "cpu_predictor",
        "objective": "binary:logistic",
        "eval_metric": "logloss"
    }

    model = xgb.train(params, dmatrix, num_boost_round=150, evals=[(dmatrix, "train")])

    with open(model_filepath, "wb") as model_file:
        pickle.dump(model, model_file)
    return model

train_xgboost_model(train_data, "xgb_model")

609


Parameters: { "predictor" } are not used.



[0]	train-logloss:0.09537
[1]	train-logloss:0.07018
[2]	train-logloss:0.05206
[3]	train-logloss:0.03893
[4]	train-logloss:0.02937
[5]	train-logloss:0.02233
[6]	train-logloss:0.01718
[7]	train-logloss:0.01336
[8]	train-logloss:0.01053
[9]	train-logloss:0.00846
[10]	train-logloss:0.00690
[11]	train-logloss:0.00576
[12]	train-logloss:0.00489
[13]	train-logloss:0.00424
[14]	train-logloss:0.00374
[15]	train-logloss:0.00339
[16]	train-logloss:0.00310
[17]	train-logloss:0.00288
[18]	train-logloss:0.00272
[19]	train-logloss:0.00259
[20]	train-logloss:0.00249
[21]	train-logloss:0.00240
[22]	train-logloss:0.00232
[23]	train-logloss:0.00224
[24]	train-logloss:0.00219
[25]	train-logloss:0.00217
[26]	train-logloss:0.00214
[27]	train-logloss:0.00207
[28]	train-logloss:0.00203
[29]	train-logloss:0.00198
[30]	train-logloss:0.00193
[31]	train-logloss:0.00191
[32]	train-logloss:0.00189
[33]	train-logloss:0.00187
[34]	train-logloss:0.00185
[35]	train-logloss:0.00183
[36]	train-logloss:0.00182
[37]	train-

<xgboost.core.Booster at 0x1711ae7be90>

In [31]:
def evaluate_xgb_model(test_dataset, model_file, threshold=0.5):
    results = test_dataset[["user_id", "sku_id", "buy_label"]].reset_index()
    features = test_dataset.drop(columns=["buy_label", "user_id", "sku_id"]).copy()
    dmatrix = xgb.DMatrix(features)
    
    true_labels = results["buy_label"].tolist()
    
    with open(model_file, "rb") as model_f:
        model = pickle.load(model_f)
    
    predicted_scores = model.predict(dmatrix)
    predicted_labels = [score > threshold for score in predicted_scores]
    
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(true_labels, predicted_scores)
    
    with open("roc_curve", "w") as roc_file:
        for fp, tp, th in zip(fpr, tpr, thresholds):
            roc_file.write(f"{fp}\t{tp}\t{th}\n")
    
    auc = sklearn.metrics.roc_auc_score(true_labels, predicted_scores)
    print("auc_score: ", auc)
    
    results["predict_label"] = pd.Series(predicted_labels).astype(np.int32)
    score = calculate_test_score(results)
    
    return score

evaluate_xgb_model(test_data, "xgb_model", 0.165)

auc_score:  0.9979856734342294
user_tp 833
user_fp 757
user_fn 144
user_tn 17423
user_precision 0.5238993710691824
user_recall 0.8526100307062436
f1_user 0.5598745379186738
sku_tp 802
sku_fp 991
sku_fn 178
sku_tn 252091
sku_precision 0.4472950362520915
sku_recall 0.8183673469387756
f1_sku 0.6144652160588417


0.5926289448027746

In [None]:
########### 利用catboost对特征重要性进行排序 ##############

In [20]:
# 将 df 转换成 catboost 里面的 pool
def dataframe_to_pool(df):
    df_copy = df.copy()
    df_copy.drop(columns=["buy_label", "user_id", "sku_id"], inplace=True)
    
    categorical_features = ["cate", "brand", "age", "sex", "a1", "a2", "a3"]
    existing_categorical_features = [feat for feat in categorical_features if feat in df_copy.columns]
    
    labels = df["buy_label"].apply(lambda x: 1 if x else 0)
    
    pool = cb.Pool(df_copy, labels, cat_features=existing_categorical_features)
    
    return pool

In [21]:
def calculate_feature_importance(model, dataset):
    df_copy = dataset.copy()
    df_copy.drop(columns=["buy_label", "user_id", "sku_id"], inplace=True)
    
    categorical_features = ["cate", "brand", "age", "sex", "a1", "a2", "a3"]
    existing_categorical_features = [feat for feat in categorical_features if feat in df_copy.columns]
    
    labels = dataset["buy_label"].apply(lambda x: 1 if x else 0)
    
    pool = cb.Pool(df_copy, labels, cat_features=existing_categorical_features)
    
    features = df_copy.columns.tolist()
    importances = model.get_feature_importance(pool, type="FeatureImportance")
    
    feature_importance = [(feature, importance) for feature, importance in zip(features, importances)]
    
    return feature_importance

with open("model.cb", "rb") as f:
    model = pickle.load(f)
feature_importance_list = calculate_feature_importance(model, df_train_dataset)
fun = lambda x:-x[1]
sorted_feature = sorted(feature_importance_list, key=fun)
for f, ipt in sorted_feature:
    print(f, " : ", ipt)

browse_rate_user_sku_all  :  4.881386876470655
browse_cnt_user_sku_5  :  3.795555723935232
addcart_rate_user_30  :  3.7681879135908525
buy_cnt_user_cate_30  :  3.763674280600984
addcart_rate_user_cate_30  :  3.182502144562689
follow_rate_user_sku_all  :  3.028083968331891
click_rate_user_15  :  2.5943070734352625
follow_rate_user_30  :  2.5181457822385274
delcart_rate_user_sku_all  :  2.374607059019883
browse_rate_user_brand_all  :  2.3526586733465
browse_cnt_user_brand_all  :  2.0693021930529483
buy_cnt_user_sku_all  :  2.0143196160493684
follow_rate_user_cate_30  :  1.859077815645591
click_rate_user_cate_20  :  1.828883524830059
buy_cnt_user_brand_30  :  1.7863887522315238
buy_cnt_user_30  :  1.6748267911865182
click_cnt_user_brand_all  :  1.665418374626246
delcart_rate_user_brand_30  :  1.5904824781705622
addcart_cnt_user_sku_20  :  1.518473618982225
addcart_rate_user_sku_all  :  1.502379264481769
delcart_rate_user_30  :  1.4999010229834042
click_rate_user_brand_30  :  1.36264591701

In [22]:
# 得到待删除的特征
def features_to_pop(feature_importance_list):
    feat_pop = []
    for f, v in feature_importance_list:
        if v < 1e-3:
            feat_pop.append(f)
    return feat_pop

feat_pop = features_to_pop(feature_importance_list)
len(feat_pop)


322

In [23]:
# 获得删除不重要特征的数据集
def get_feature_poped_dataset(df_dataset, feat_pop):
    for feat in feat_pop:
        df_dataset.pop(feat)
    return df_dataset
    
df_feature_poped_dataset = get_feature_poped_dataset(df_dataset, feat_pop)

In [44]:
df_train_dataset, df_test_dataset = get_train_test_dataset(df_feature_poped_dataset, 254062)

In [None]:
train_model(df_train_dataset, 255, "model.cb")

239
0:	learn: 0.5702437	total: 227ms	remaining: 57.5s
1:	learn: 0.4532694	total: 443ms	remaining: 56s
2:	learn: 0.3911200	total: 632ms	remaining: 53.1s
3:	learn: 0.3073702	total: 942ms	remaining: 59.1s
4:	learn: 0.2441487	total: 1.2s	remaining: 59.8s
5:	learn: 0.1888486	total: 1.41s	remaining: 58.7s
6:	learn: 0.1658953	total: 1.64s	remaining: 57.9s
7:	learn: 0.1457907	total: 1.94s	remaining: 59.8s
8:	learn: 0.1260199	total: 2.15s	remaining: 58.7s
9:	learn: 0.1010253	total: 2.43s	remaining: 59.5s
10:	learn: 0.0817478	total: 2.62s	remaining: 58s
11:	learn: 0.0733102	total: 2.81s	remaining: 57s
12:	learn: 0.0578028	total: 3.04s	remaining: 56.6s
13:	learn: 0.0458932	total: 3.29s	remaining: 56.6s
14:	learn: 0.0374286	total: 3.52s	remaining: 56.3s
15:	learn: 0.0303539	total: 3.75s	remaining: 56s
16:	learn: 0.0246953	total: 4s	remaining: 55.9s
17:	learn: 0.0205319	total: 4.26s	remaining: 56.1s
18:	learn: 0.0174855	total: 4.52s	remaining: 56.1s
19:	learn: 0.0152508	total: 4.75s	remaining: 55.8

161:	learn: 0.0028312	total: 41.4s	remaining: 23.8s
162:	learn: 0.0028311	total: 41.6s	remaining: 23.5s
163:	learn: 0.0028286	total: 41.9s	remaining: 23.2s
164:	learn: 0.0028281	total: 42.1s	remaining: 22.9s
165:	learn: 0.0028277	total: 42.3s	remaining: 22.7s
166:	learn: 0.0028242	total: 42.6s	remaining: 22.4s
167:	learn: 0.0028238	total: 42.8s	remaining: 22.2s
168:	learn: 0.0028235	total: 43s	remaining: 21.9s
169:	learn: 0.0028233	total: 43.3s	remaining: 21.6s
170:	learn: 0.0028203	total: 43.5s	remaining: 21.4s
171:	learn: 0.0028196	total: 43.7s	remaining: 21.1s
172:	learn: 0.0028183	total: 43.9s	remaining: 20.8s
173:	learn: 0.0028172	total: 44.1s	remaining: 20.5s
174:	learn: 0.0028149	total: 44.4s	remaining: 20.3s


In [110]:
evaluate_model(df_test_dataset, "model.cb", 0.25)

auc_score:  0.9981228839438827
user_tp 788
user_fp 635
user_fn 189
user_tn 17545
user_precision 0.5537596626844694
user_recall 0.8065506653019447
f1_user 0.5842807711319822
sku_tp 752
sku_fp 795
sku_fn 228
sku_tn 252287
sku_precision 0.4861021331609567
sku_recall 0.7673469387755102
f1_sku 0.6231355651309248


0.6075936475313477

In [112]:
from sklearn.metrics import roc_curve
def get_roc_curve(df_test_dataset, model_path):
    data_pool = dataframe_to_pool(df_test_dataset)
    with open(model_path, "rb") as f:
        model = pickle.load(f)
    predictions = model.predict_proba(data_pool)[:, 1]
    fpr, tpr, thresholds = roc_curve(data_pool.get_label(), predictions)
    return fpr, tpr, thresholds

fpr, tpr, thresholds = get_roc_curve(df_test_dataset, "model.cb")

with open("roc_curve", "w") as f:
    for i in range(len(thresholds)):
        f.write(f"{fpr[i]}\t{tpr[i]}\t{thresholds[i]}\n")