# 1 数据读取

#### 将DataFrame数据按行和按列分别分割并保存为csv文件，以模拟各参与方各自持有数据的情况；再将这些csv文件分别读取为HDataFrame和VDataFrame类型的变量

## 1.1 初始化

In [1]:
import secretflow as sf

# Report which SecretFlow release this notebook runs against.
print(f'The version of SecretFlow: {sf.__version__}')

# Shut down first, in case a secretflow runtime is already running.
sf.shutdown()

# Start a local runtime for the two parties and create one PYU per party.
sf.init(['alice', 'bob'], address='local')
alice = sf.PYU('alice')
bob = sf.PYU('bob')

The version of SecretFlow: 1.7.0b0


  self.pid = _posixsubprocess.fork_exec(
2024-08-12 19:27:46,517	INFO worker.py:1724 -- Started a local Ray instance.


## 1.2 查看DataFrame数据

In [2]:
import pandas as pd
from sklearn.datasets import load_iris

# Load iris as pandas objects and place the target column next to the features.
iris = load_iris(as_frame=True)
data = pd.concat(objs=[iris.data, iris.target], axis="columns")

# Copy the row index into an explicit "uid" column.
data["uid"] = data.index

# Last expression of the cell: display the assembled table.
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,uid
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,1
2,4.7,3.2,1.3,0.2,0,2
3,4.6,3.1,1.5,0.2,0,3
4,5.0,3.6,1.4,0.2,0,4
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,145
146,6.3,2.5,5.0,1.9,2,146
147,6.5,3.0,5.2,2.0,2,147
148,6.2,3.4,5.4,2.3,2,148


## 1.3 数据横向分割（每个参与方持有一定数量的完整数据）

In [3]:
# Horizontal partitioning: each party gets a disjoint set of complete rows
# (the first 75 rows go to alice, the remaining rows to bob).
h_alice, h_bob = data.iloc[:75, :], data.iloc[75:, :]

# Save to temporary files.
import os

temp_dir = "/home/beng003/python_project/sf-test/data/"
# to_csv does not create missing parent directories, so make sure the
# output directory exists before writing into it.
os.makedirs(temp_dir, exist_ok=True)

h_alice_path = os.path.join(temp_dir, 'h_alice.csv')
h_bob_path = os.path.join(temp_dir, 'h_bob.csv')
# index=False keeps the pandas row index out of the files.
h_alice.to_csv(h_alice_path, index=False)
h_bob.to_csv(h_bob_path, index=False)

## 1.4 数据纵向分割（每个参与方持有一定数量的特征数据）

In [4]:
import numpy as np

# Vertical partitioning: each party gets all rows but only some columns.
# alice takes the first three columns plus the last one; bob takes every
# column from position 3 onward, so both slices contain the last column.
v_alice = data.iloc[:, np.r_[0:3, -1]]
v_bob = data.iloc[:, 3:]

# Write each party's slice to its own CSV file, without the pandas row index.
v_alice_path = os.path.join(temp_dir, 'v_alice.csv')
v_alice.to_csv(v_alice_path, index=False)

v_bob_path = os.path.join(temp_dir, 'v_bob.csv')
v_bob.to_csv(v_bob_path, index=False)

## 1.5 读取横向分割的csv数据

In [5]:

from secretflow.data.horizontal import read_csv as h_read_csv
from secretflow.security.aggregation import SecureAggregator
from secretflow.security import SecureAggregator
from secretflow.security.compare import SPUComparator

# The aggregator and comparator are respectively used to aggregate
# or compare data in subsequent data analysis operations.
# The aggregator is created on alice's device with both parties participating.
aggr = SecureAggregator(device=alice, participants=[alice, bob])

# Build an SPU from a testing cluster definition for the two parties.
# It backs the comparator here and is passed to v_read_csv in a later cell.
spu = sf.SPU(sf.utils.testing.cluster_def(parties=['alice', 'bob']))
comp = SPUComparator(spu)
# Each party reads its own row partition from its own CSV file.
hdf = h_read_csv(
    {alice: h_alice_path, bob: h_bob_path},
    aggregator=aggr,
    comparator=comp,
)

# Drop the "uid" column in place; it was added earlier from the row index.
hdf.drop(columns=["uid"], inplace=True)

INFO:root:Create proxy actor <class 'secretflow.security.aggregation.secure_aggregator._Masker'> with party alice.
INFO:root:Create proxy actor <class 'secretflow.security.aggregation.secure_aggregator._Masker'> with party bob.
INFO:root:Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.
INFO:root:Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party bob.


## 1.6 读取纵向分割的csv数据

In [6]:
from secretflow.data.vertical import read_csv as v_read_csv

# Each party reads its own column partition from its own CSV file.
vdf = v_read_csv(
    {alice: v_alice_path, bob: v_bob_path},
    # SPU created in the horizontal-read cell; the PSI log lines below are
    # emitted by its runtime.
    spu=spu,
    # "uid" is the field selected for PSI (see the PSI config in the log output).
    keys="uid",
    # "uid" does not appear in the statistics computed on vdf later on.
    drop_keys="uid",
    # NOTE(review): the keyword is spelled `psi_protocl` here and the PSI log
    # shows ECDH_PSI_2PC being used; check the v_read_csv signature before
    # "correcting" the spelling.
    psi_protocl="ECDH_PSI_2PC",
)

INFO:root:Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.
INFO:root:Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party bob.


[36m(SPURuntime(device_id=None, party=alice) pid=2462239)[0m [2024-08-12 19:28:03.704] [info] [launch.cc:164] LEGACY PSI config: {"psi_type":"ECDH_PSI_2PC","broadcast_result":true,"input_params":{"path":"/home/beng003/python_project/sf-test/data/v_alice.csv","select_fields":["uid"],"precheck":true},"output_params":{"path":"/home/beng003/python_project/sf-test/data/v_alice.csv.psi_output_83250","need_sort":true},"curve_type":"CURVE_25519","bucket_size":1048576}
[36m(SPURuntime(device_id=None, party=alice) pid=2462239)[0m [2024-08-12 19:28:03.704] [info] [bucket_psi.cc:400] bucket size set to 1048576
[36m(SPURuntime(device_id=None, party=alice) pid=2462239)[0m [2024-08-12 19:28:03.705] [info] [bucket_psi.cc:252] Begin sanity check for input file: /home/beng003/python_project/sf-test/data/v_alice.csv, precheck_switch:true
[36m(SPURuntime(device_id=None, party=alice) pid=2462239)[0m [2024-08-12 19:28:03.707] [info] [csv_checker.cc:135] Executing duplicated scripts: LC_ALL=C sort --

In [7]:
from typing import Union
import pandas as pd
from secretflow.data.horizontal import HDataFrame
from secretflow.data.vertical import VDataFrame

def table_statistics_vh(
    table: Union[pd.DataFrame, VDataFrame, HDataFrame]
) -> pd.DataFrame:
    """Build a per-column summary table for a plain or federated DataFrame.

    Args:
        table: The data to summarise; a pd.DataFrame, VDataFrame or
            HDataFrame.

    Returns:
        A pd.DataFrame indexed by the columns of ``table`` with the columns
        ``datatype``, ``total_count``, ``count(non-NA count)``,
        ``count_na(NA count)``, ``na_ratio``, ``min``, ``max``, ``mean`` and
        ``sum``. The last four are computed with ``numeric_only=True``.

    Raises:
        AssertionError: If ``table`` is not one of the supported types.
    """
    # Kept as an assert so callers see the same exception type as before.
    assert isinstance(
        table, (pd.DataFrame, VDataFrame, HDataFrame)
    ), "table must be a pd.DataFrame, VDataFrame or HDataFrame"

    # Row count and NA counts each feed two result columns; compute them once
    # instead of repeating the same aggregation on the table.
    total_count = table.shape[0]

    result = pd.DataFrame(index=table.columns)
    result["datatype"] = table.dtypes
    result["total_count"] = total_count
    result["count(non-NA count)"] = table.count()

    na_count = table.isna().sum()
    result["count_na(NA count)"] = na_count
    result["na_ratio"] = na_count / total_count

    result["min"] = table.min(numeric_only=True)
    result["max"] = table.max(numeric_only=True)
    result["mean"] = table.mean(numeric_only=True)
    result["sum"] = table.sum(numeric_only=True)

    return result

In [8]:
from secretflow.stats.table_statistics import table_statistics

# Lift the row limit so the statistics table is displayed in full.
pd.options.display.max_rows = None

# Summarise the vertically partitioned frame and display the result.
data_stats = table_statistics_vh(vdf)
data_stats

Unnamed: 0,datatype,total_count,count(non-NA count),count_na(NA count),na_ratio,min,max,mean,sum
sepal length (cm),float64,150,150,0,0.0,4.3,7.9,5.843333,876.5
sepal width (cm),float64,150,150,0,0.0,2.0,4.4,3.057333,458.6
petal length (cm),float64,150,150,0,0.0,1.0,6.9,3.758,563.7
petal width (cm),float64,150,150,0,0.0,0.1,2.5,1.199333,179.9
target,int64,150,150,0,0.0,0.0,2.0,1.0,150.0


In [9]:
# Same summary for the horizontally partitioned frame; as the last expression
# of the cell, the resulting table is displayed.
table_statistics_vh(hdf)

[36m(_run pid=2461833)[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': 
[36m(_run pid=2461833)[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
[36m(_run pid=2461833)[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory


Unnamed: 0,datatype,total_count,count(non-NA count),count_na(NA count),na_ratio,min,max,mean,sum
sepal length (cm),float64,150,150,0,0.0,4.3,7.9,5.843333,876.499996
sepal width (cm),float64,150,150,0,0.0,2.0,4.4,3.057333,458.599998
petal length (cm),float64,150,150,0,0.0,1.0,6.9,3.758,563.699997
petal width (cm),float64,150,150,0,0.0,0.1,2.5,1.199333,179.899998
target,int64,150,150,0,0.0,0.0,2.0,1.0,150.0


In [10]:
# Shut down the SecretFlow runtime that was started with sf.init in the first cell.
sf.shutdown()