# WOE编码

> 以下代码仅用于演示。由于系统安全问题，请勿直接在生产中使用。

建议使用 [jupyter](https://jupyter.org/) 运行本教程。

分箱是一种基于排序为自变量创建分桶的方法。分箱可以帮助我们将连续变量转换为分类变量。

WOE分箱实现了针对二元目标变量的数值变量的分箱。



```none
bin_total = bin_positives + bin_negatives

total_labels = total_positives + total_negatives

bin_woe = log((bin_positives / total_positives) / (bin_negatives / total_negatives))

bin_iv = ((bin_positives / total_positives) - (bin_negatives / total_negatives)) * bin_woe
```

目前我们为垂直分割的数据集提供WOE编码。

让我们先加载一个样本数据集。

### 1 准备数据

In [1]:
import secretflow as sf
from secretflow.data.vertical import VDataFrame
from secretflow.utils.simulation.datasets import load_linear

In [2]:
# Shut down any previous SecretFlow session before starting a fresh local one
# with the two parties used throughout this tutorial.
sf.shutdown()
sf.init(['alice', 'bob'], address='local')
# One PYU handle per party, plus an SPU whose cluster definition lists both
# parties; the SPU is passed to VertWoeBinning later on.
alice, bob = sf.PYU('alice'), sf.PYU('bob')
spu = sf.SPU(sf.utils.testing.cluster_def(['alice', 'bob']))

  self.pid = _posixsubprocess.fork_exec(
2024-08-23 19:23:35,672	INFO worker.py:1724 -- Started a local Ray instance.


In [3]:
# Column range handed to each party when loading the sample dataset.
# Judging by the partition dumps shown further down, bob's (1, 11) yields
# x1..x10 and alice's (11, 22) yields x11..x20 plus the label column y.
parts = {
    bob: (1, 11),
    alice: (11, 22),
}
vdf = load_linear(parts=parts)

INFO:root:Create proxy actor <class 'secretflow.device.proxy.ActorPartitionAgent'> with party bob.
INFO:root:Create proxy actor <class 'secretflow.device.proxy.ActorPartitionAgent'> with party alice.


In [4]:
# Select the label column and reveal alice's partition of it in plaintext.
# Demo only: revealing labels defeats the privacy protection.
label_data = vdf['y']
y = sf.reveal(label_data.partitions[alice].data).values

In [5]:
vdf.columns

['x1',
 'x2',
 'x3',
 'x4',
 'x5',
 'x6',
 'x7',
 'x8',
 'x9',
 'x10',
 'x11',
 'x12',
 'x13',
 'x14',
 'x15',
 'x16',
 'x17',
 'x18',
 'x19',
 'x20',
 'y']

In [8]:
sf.reveal(vdf.partitions[alice].data)

Unnamed: 0,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,y
0,0.241531,-0.705729,-0.020094,-0.486932,0.851992,0.035219,-0.796096,0.810261,0.048303,0.937679,1
1,-0.402727,0.115744,0.468149,-0.697152,0.386395,0.712798,0.239583,0.312728,0.526637,0.589773,1
2,0.872675,-0.559321,0.390246,0.000472,0.225594,-0.639674,0.279511,0.039087,-0.753417,0.516735,0
3,-0.644718,-0.409382,0.141747,-0.797517,0.314084,-0.802476,0.348878,-0.855979,0.250944,0.979465,1
4,-0.949669,-0.940787,-0.951708,0.187475,0.272346,0.124419,0.853226,-0.238805,0.243109,-0.121446,1
...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.031331,-0.078700,-0.020636,-0.575713,0.210120,-0.288943,-0.262945,-0.847253,0.069960,0.786748,1
9996,0.047039,0.965614,-0.921435,-0.092970,0.205778,0.155392,0.922683,-0.502486,-0.076290,-0.604832,1
9997,0.269438,-0.115586,0.928880,0.430016,0.269042,-0.331772,0.520971,-0.424209,0.434947,0.998955,1
9998,0.999325,0.433372,-0.805999,0.311548,0.072405,0.973399,-0.123470,0.914291,-0.473056,0.616257,1


In [9]:
sf.reveal(vdf.partitions[bob].data)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,-0.514226,0.730010,-0.730391,0.970483,-0.350854,-0.800808,-0.201530,-0.499206,-0.750112,-0.910640
1,-0.725537,0.482244,-0.823223,0.202119,-0.270679,-0.139781,-0.368098,-0.652901,0.438065,0.830206
2,0.608353,-0.071102,-0.775098,-0.391496,-0.521392,0.082370,-0.410503,-0.183506,-0.783842,-0.729929
3,-0.686642,0.160470,0.914477,-0.269052,-0.519369,-0.547841,-0.598098,-0.269405,-0.974268,-0.800515
4,-0.198111,0.212909,0.950474,0.775259,0.814052,-0.840528,-0.881926,0.800389,0.185542,0.183614
...,...,...,...,...,...,...,...,...,...,...
9995,-0.367246,-0.296454,0.558596,-0.403504,-0.403741,0.000142,-0.389204,-0.470127,-0.247682,-0.552526
9996,0.010913,0.629268,-0.384093,-0.552787,-0.382902,-0.100838,0.158053,0.592903,-0.577123,-0.811461
9997,-0.238097,0.904069,-0.344859,-0.687887,0.355400,0.223052,-0.811309,-0.172245,0.713149,-0.184585
9998,0.453686,-0.375173,0.899238,0.908135,0.924383,0.524051,0.519569,-0.558997,0.610076,-0.862191


### 2 执行WOE分箱和替换。

### 类定义

```python
class VertWoeBinning(object):
    # Interface excerpt shown for reference: WOE binning for vertically
    # partitioned datasets.
    def __init__(self, secure_device: SPU | HEU):
        """
        Initialize the VertWoeBinning class.

        Args:
            secure_device: SPU or HEU device used for the secure computation.
        """
        self.secure_device = secure_device

    def binning(self, vdata: VDataFrame, binning_method: str = 'quantile', bin_num: int = 10, bin_names: Dict[PYU, List[str]] = {}, label_name: str = '', positive_label: str = '1', chimerge_init_bins: int = 100, chimerge_target_bins: int = 10, chimerge_target_pvalue: float = 0.1, audit_log_path: Dict[str, str] = {}):
        """
        Build WOE substitution rules from vdata. Only binary-label datasets
        are supported.

        Args:
            vdata: Vertically partitioned dataset. All numeric features are
                binned with {binning_method}; string features are binned by
                category; the "else" bin counts the np.nan samples.
            binning_method: Binning method for numeric features. Options:
                "quantile" (equal frequency) / "chimerge" (ChiMerge from
                AAAI92-019) / "eq_range" (equal width). Default: "quantile".
            bin_num: Maximum number of bins for a single feature.
                Range: (0, ∞). Default: 10.
            bin_names: Features to be binned, keyed by the owning party.
            label_name: Name of the label column.
            positive_label: Label value that marks a positive sample.
            chimerge_init_bins: Maximum number of bins in ChiMerge's initial
                binning. Range: (2, ∞). Default: 100.
            chimerge_target_bins: Target number of bins at which merging
                stops. Range: [2, {chimerge_init_bins}). Default: 10.
            chimerge_target_pvalue: p-value threshold at which merging stops.
                Range: (0, 1). Default: 0.1.
            audit_log_path: Output paths for the audit log of HEU encryption,
                given as local paths on each party's device. Empty means
                disabled. Example: {'alice': '/path/to/alice/audit/filename',
                'bob': 'bob/audit/filename'}. NOTE: do not change this option
                casually; leave it empty (disabled) unless you fully
                understand what it does and accept the risk.

        Returns:
            Dict[PYU, PYUObject]: for each party, the rules of all of that
            party's features.
        """
```



In [10]:
from secretflow.preprocessing.binning.vert_woe_binning import VertWoeBinning
from secretflow.preprocessing.binning.vert_bin_substitution import VertBinSubstitution

# Build the WOE binning rules using the SPU device created earlier.
binning = VertWoeBinning(spu)
bin_rules = binning.binning(
    vdf,
    binning_method="chimerge",
    bin_num=4,
    # Only bob's x5 and x7 are binned; alice contributes no features here.
    bin_names={alice: [], bob: ["x5", "x7"]},
    # The label column y lives in alice's partition (see the dump above).
    label_name="y",
)

INFO:root:Create proxy actor <class 'secretflow.device.proxy.ActorVertWoeBinningPyuWorker'> with party alice.
INFO:root:Create proxy actor <class 'secretflow.device.proxy.ActorVertWoeBinningPyuWorker'> with party bob.


[36m(_run pid=1016187)[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': 
[36m(_run pid=1016187)[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
[36m(_run pid=1016187)[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory


[36m(_run pid=1016187)[0m [2024-08-23 19:26:47.229] [info] [thread_pool.cc:30] Create a fixed thread pool with size 7


In [11]:
# Apply the binning rules to the original frame; the substituted frame is
# read back below as sub_data[0].
woe_sub = VertBinSubstitution()
sub_data = woe_sub.substitution(vdf, bin_rules)

INFO:root:Create proxy actor <class 'secretflow.device.proxy.ActorPartitionAgent'> with party bob.
INFO:root:Create proxy actor <class 'secretflow.device.proxy.ActorPartitionAgent'> with party alice.


### 分箱替换后的数据为sub_data[0]

In [21]:
# this is for demo only, be careful with reveal
print(sf.reveal(sub_data[0].partitions[alice].data))
print(sf.reveal(sub_data[0].partitions[bob].data))

INFO:root:Create proxy actor <class 'secretflow.device.proxy.ActorPartitionAgent'> with party bob.
INFO:root:Create proxy actor <class 'secretflow.device.proxy.ActorPartitionAgent'> with party alice.


           x11       x12       x13       x14       x15       x16       x17  \
0     0.241531 -0.705729 -0.020094 -0.486932  0.851992  0.035219 -0.796096   
1    -0.402727  0.115744  0.468149 -0.697152  0.386395  0.712798  0.239583   
2     0.872675 -0.559321  0.390246  0.000472  0.225594 -0.639674  0.279511   
3    -0.644718 -0.409382  0.141747 -0.797517  0.314084 -0.802476  0.348878   
4    -0.949669 -0.940787 -0.951708  0.187475  0.272346  0.124419  0.853226   
...        ...       ...       ...       ...       ...       ...       ...   
9995 -0.031331 -0.078700 -0.020636 -0.575713  0.210120 -0.288943 -0.262945   
9996  0.047039  0.965614 -0.921435 -0.092970  0.205778  0.155392  0.922683   
9997  0.269438 -0.115586  0.928880  0.430016  0.269042 -0.331772  0.520971   
9998  0.999325  0.433372 -0.805999  0.311548  0.072405  0.973399 -0.123470   
9999 -0.203443  0.772931 -0.146181 -0.195646  0.274590  0.803816 -0.312047   

           x18       x19       x20  y  
0     0.810261  0.04830

### 3. IV值
有时我们可能需要IV值。根据GitHub issue（https://github.com/secretflow/secretflow/issues/565），
公开各分箱的iv值可能会泄漏标签信息。因此，目前我们选择将各分箱的iv值保存在标签持有者的设备中。标签持有者可以选择：

1. 不共享iv信息
2. 将部分选定的iv信息分享给其他方

我们将演示如何共享特征iv。

回想一下，bin_rules（即WOE规则）是一个字典 {PYU: PYUObject}，其中每个 PYUObject 本身是如下结构的字典：


```
{
    "variables":[
        {
            "name": str, # feature name
            "type": str, # "string" or "numeric", if feature is discrete or continuous
            "categories": list[str], # categories for discrete feature
            "split_points": list[float], # left-open right-close split points
            "total_counts": list[int], # total samples count in each bins.
            "else_counts": int, # np.nan samples count
            "filling_values": list[float], # woe values for each bins.
            "else_filling_value": float, # woe value for np.nan samples.
        },
        # ... other features
    ],
    # label holder's PYUObject only
    # warning: giving bin_ivs to other party will leak positive samples in each bin.
    # it is up to label holder's will to give feature iv or bin ivs or all info to workers.
    # for more information, look at: https://github.com/secretflow/secretflow/issues/565

    # in the following comment, by safe we mean label distribution info is not leaked.
    "feature_iv_info" :[
        {
            "name": str, #feature name
            "ivs": list[float], #iv values for each bins, not safe to share with workers in any case.
            "else_iv": float, #iv for nan values, may be shared with workers
            "feature_iv": float, #sum of bin_ivs, safe to share with workers when bin num > 2.
        }
    ]
}
```

In [29]:
bin_rules

{PYURuntime(bob): <secretflow.device.device.pyu.PYUObject at 0x7f3c22be2d70>,
 PYURuntime(alice): <secretflow.device.device.pyu.PYUObject at 0x7f3c22be1540>}

In [31]:
# alice is the label holder, so her rules object is the one that carries
# the "feature_iv_info" entry described in the schema above.
# TODO: add a reference for sorting features by IV value
dict_pyu_object = bin_rules[alice]


def extract_name_and_feature_iv(list_of_feature_iv_info):
    """Pair every feature name with its feature-level IV.

    Args:
        list_of_feature_iv_info: Iterable of mappings, each providing the
            keys "name" and "feature_iv".

    Returns:
        A list of (name, feature_iv) tuples, in input order.
    """
    pairs = []
    for info in list_of_feature_iv_info:
        pairs.append((info["name"], info["feature_iv"]))
    return pairs


# Run the extraction through alice's PYU so that only the (name, feature_iv)
# pairs are taken out of her rules object; the per-bin "ivs" are not
# included in the result.
feature_ivs = alice(
    lambda dict_pyu_object: extract_name_and_feature_iv(
        dict_pyu_object["feature_iv_info"]
    )
)(dict_pyu_object)

In [37]:
# we can give the feature_ivs to bob
feature_ivs.to(bob)
# and/or we can reveal it to see it
sf.reveal(feature_ivs)

[('x5', 0.37848298069087766), ('x7', 0)]