In [2]:
import sys
from pathlib import Path

# Put the parent of the current working directory on sys.path so that the
# `src` package can be imported when the notebook runs from a sub-directory.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.data_prep import load_sessions, flatten_events, build_session_features

# Tunable inputs, named so the data location and sample size are easy to find.
# DATA_PATH points at the same file as "../data/otto-recsys-train.jsonl", but is
# derived from PROJECT_ROOT so the two cannot drift apart.
DATA_PATH = PROJECT_ROOT / "data" / "otto-recsys-train.jsonl"
N_SESSIONS = 50_000

# Passed as str, matching the type of the original argument.
sessions = load_sessions(str(DATA_PATH), n=N_SESSIONS)
event_df = flatten_events(sessions)
session_features = build_session_features(event_df)

# Fail fast if the feature table lacks a column that later cells index by name.
REQUIRED_COLUMNS = {"total_events", "click_cnt", "cart_cnt", "converted"}
missing_columns = REQUIRED_COLUMNS - set(session_features.columns)
assert not missing_columns, f"session_features is missing columns: {sorted(missing_columns)}"

相关性矩阵。

In [3]:
# Pairwise Pearson correlation between the session-level feature columns.
session_features.corr(method="pearson")

Unnamed: 0,total_events,click_cnt,cart_cnt,order_cnt,converted
total_events,1.0,0.994433,0.680712,0.491501,0.486636
click_cnt,0.994433,1.0,0.602871,0.426384,0.453109
cart_cnt,0.680712,0.602871,1.0,0.618487,0.474275
order_cnt,0.491501,0.426384,0.618487,1.0,0.605349
converted,0.486636,0.453109,0.474275,0.605349,1.0


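通过构建 Session 级别特征并进行相关性分析，发现加购次数（cart_cnt）与转化的相关系数为 0.474，略高于点击次数（click_cnt，0.453）；用户整体活跃度（total_events，0.487）也与转化呈中度正相关。order_cnt 与 converted 的相关系数最高（0.605），但未转化 Session 的 order_cnt 均为 0，它与转化标签直接绑定，不应作为预测特征。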

In [4]:
# Column means for each value of the `converted` flag.
session_features.groupby(by='converted').mean()

Unnamed: 0_level_0,total_events,click_cnt,cart_cnt,order_cnt
converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,31.791713,30.346245,1.445468,0.0
True,121.637924,106.858113,10.880324,3.899486


转化 Session 的平均行为量（121.6）约为未转化 Session（31.8）的 3.8 倍，说明用户活跃度与转化呈明显的正相关。

In [5]:
# Share of each session's events that are clicks and add-to-carts respectively.
event_totals = session_features['total_events']
session_features['click_ratio'] = session_features['click_cnt'].div(event_totals)
session_features['cart_ratio'] = session_features['cart_cnt'].div(event_totals)

# Average share per group, split by the `converted` flag.
ratio_cols = ['click_ratio', 'cart_ratio']
session_features.groupby('converted')[ratio_cols].mean()

Unnamed: 0_level_0,click_ratio,cart_ratio
converted,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.966389,0.033611
True,0.845002,0.10209


由此可见，转化 Session 的点击占比更低（0.845 对 0.966），加购占比更高（0.102 对 0.034）：与转化相关的不是“点击多”，而是“加购占比高”。

准备数据，构建 Logistic Regression 模型。

In [7]:
import pandas as pd
import numpy as np

# Each scikit-learn name is imported exactly once (StandardScaler and
# make_pipeline were previously imported twice in this cell).
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Modelling choices, named so they are easy to find and change.
FEATURE_COLS = ['click_cnt', 'cart_cnt']
TARGET_COL = 'converted'
TEST_SIZE = 0.2
SEED = 42

X = session_features[FEATURE_COLS]
y = session_features[TARGET_COL]

# NOTE(review): the split is not stratified on the target; consider
# `stratify=y` if converted sessions turn out to be rare in the sample.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Features are standardised before the regression, so the fitted coefficients
# are expressed per standard deviation of each feature.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000)
)

model.fit(X_train, y_train)

# Column 1 of predict_proba is read as the score for the positive class.
y_pred_prob = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)

print("AUC:", auc)

AUC: 0.8950338841696261


In [8]:
# Label each coefficient with the column the model was actually fitted on,
# instead of repeating the feature list by hand: a hand-written copy would
# silently mislabel coefficients if the training columns ever changed.
# The coefficients come from a pipeline that standardises its inputs first.
coef_df = pd.DataFrame({
    "feature": list(X_train.columns),
    "coef": model.named_steps['logisticregression'].coef_[0]
})

coef_df

Unnamed: 0,feature,coef
0,click_cnt,0.439754
1,cart_cnt,1.445519


在标准化后的特征上，cart_cnt 的系数（1.45）约为 click_cnt（0.44）的 3 倍：在控制点击行为的情况下，加购次数每增加一个标准差，对转化对数几率的提升明显大于点击次数，说明高意图行为与转化的关联更强。