In [100]:
import pandas as pd
import tensorflow as tf
import os
import time
import torch

In [101]:
LINEITEM_CSV = r"D:\DataSci\data\lineitem.csv"
torch.cuda.set_device(0)            # 这里传入索引 0，而不是 torch.device 对象
device = torch.device("cuda:0")     # 之后构造一个 torch.device 对象，表示“cuda 上的第 0 块设备”

# 验证
print("当前 CUDA 设备索引：", torch.cuda.current_device())
print("CUDA 设备名称：", torch.cuda.get_device_name(torch.cuda.current_device()))

当前 CUDA 设备索引： 0
CUDA 设备名称： NVIDIA GeForce RTX 4060 Laptop GPU


In [None]:
# 读取lineitem 表csv并将几列转换成pytorch张量
def load_data(csv_path, nrows=None):
    usecols = ['l_orderkey',
               'l_quantity',
               'l_extendedprice',
               'l_discount',
               'l_partkey',
               'l_suppkey',
               'l_linenumber',
               'l_tax']
    df = pd.read_csv(csv_path,usecols=usecols,dtype={
        'l_orderkey': 'int64',
        'l_quantity': 'float32',
        'l_extendedprice': 'float32',
        'l_discount': 'float32',
        'l_partkey': 'int64',
        'l_suppkey': 'int64',
        'l_linenumber': 'int64',
        'l_tax':        'float32'
    },
    nrows=nrows
    )
    tensors = {
        'l_orderkey':      torch.from_numpy(df['l_orderkey'].values).long().to(device),
        'l_quantity':      torch.from_numpy(df['l_quantity'].values).to(device),
        'l_extendedprice': torch.from_numpy(df['l_extendedprice'].values).to(device),
        'l_discount':      torch.from_numpy(df['l_discount'].values).to(device),
        'l_partkey':       torch.from_numpy(df['l_partkey'].values).long().to(device),
        'l_suppkey':       torch.from_numpy(df['l_suppkey'].values).long().to(device),
        'l_linenumber':    torch.from_numpy(df['l_linenumber'].values).long().to(device),
        'l_tax':           torch.from_numpy(df['l_tax'].values).to(device)
    }
    return tensors,df

In [103]:
# cond为筛选函数
# 使用 torch.masked_select 实现布尔掩码过滤
def tensor_selection(tensors, cond):
    mask = cond(tensors)
    result = {}
    for col_name, col_tensor in tensors.items():
        result[col_name] = torch.masked_select(col_tensor, mask)
    return result

In [104]:
"""
    用 Pandas 原生做一次筛选（对比用）。
    这里以示例条件： l_quantity > 30 AND l_discount < 0.05
    返回一个新的 DataFrame，只保留满足条件的行。
"""
def pandas_selection(df):
    return df[
        (df['l_quantity']  > 30.0) &
        (df['l_discount']  < 0.05) &
        (df['l_partkey']   > 1000) &
        (df['l_suppkey']   < 5000) &
        (df['l_linenumber'] == 1) &
        (df['l_tax']       <= 0.08)
    ]

In [None]:
# 加载数据到张量中
start_time = time.time()
tensors, df_full = load_data(LINEITEM_CSV, nrows=None)
load_time = time.time() - start_time
print(f"  数据加载 + 张量转换耗时: {load_time:.2f} 秒")
print(f"  原始行数: {df_full.shape[0]}")

  数据加载 + 张量转换耗时: 3.58 秒
  原始行数: 6001215


In [106]:
# 使用pandas做筛选
start_pd = time.time()
df_sel_pd = pandas_selection(df_full)
pd_time = time.time() - start_pd
print(f"  Pandas 筛选耗时: {pd_time:.3f} 秒")
print(f"  Pandas 筛选后行数: {df_sel_pd.shape[0]}")

  Pandas 筛选耗时: 0.046 秒
  Pandas 筛选后行数: 135709


In [107]:
# 使用张量做筛选
def cond_fn(ts):
        # ts['l_quantity'] 是一个 torch.float32 张量 (形状 = (N,))
        # ts['l_discount'] 同理
        mask_qty  = ts['l_quantity'] > 30.0      # dtype=torch.bool
        mask_disc = ts['l_discount'] < 0.05      # dtype=torch.bool
        mask_part  = ts['l_partkey'] > 1000        # torch.bool
        mask_supp  = ts['l_suppkey'] < 5000        # torch.bool
        mask_line  = ts['l_linenumber'] == 1       # torch.bool
        mask_tax   = ts['l_tax'] <= 0.08           # torch.bool
        mask = torch.logical_and(mask_qty, mask_disc)
        mask = torch.logical_and(mask, mask_part)
        mask = torch.logical_and(mask, mask_supp)
        mask = torch.logical_and(mask, mask_line)
        mask = torch.logical_and(mask, mask_tax)
        return mask

starter = torch.cuda.Event(enable_timing=True)
ender   = torch.cuda.Event(enable_timing=True)

# start_tf = time.time()
starter.record()   
sel_tensors = tensor_selection(tensors, cond_fn)
ender.record()
# tf_time = time.time() - start_tf
torch.cuda.synchronize()
elapsed_ms = starter.elapsed_time(ender)
# 计算筛选后行数（以任意一个字段长度为准）
tf_count = sel_tensors['l_orderkey'].shape[0]
# print(f"  Tensor 筛选耗时: {tf_time:.2f} 秒")
print(f"  Tensor 筛选耗时：{elapsed_ms:.3f} ms")
print(f"  Tensor 筛选后行数: {tf_count}")

  Tensor 筛选耗时：3.125 ms
  Tensor 筛选后行数: 135709


In [108]:
df_sel_tf = pd.DataFrame({
        'l_orderkey':      sel_tensors['l_orderkey'].cpu().numpy(),
        'l_quantity':      sel_tensors['l_quantity'].cpu().numpy(),
        'l_extendedprice': sel_tensors['l_extendedprice'].cpu().numpy(),
        'l_discount':      sel_tensors['l_discount'].cpu().numpy(),
        'l_partkey':       sel_tensors['l_partkey'].cpu().numpy(),
        'l_suppkey':       sel_tensors['l_suppkey'].cpu().numpy(),
        'l_linenumber':    sel_tensors['l_linenumber'].cpu().numpy(),
        'l_tax':           sel_tensors['l_tax'].cpu().numpy(),
    })
cols = [
    'l_orderkey',
    'l_quantity',
    'l_extendedprice',
    'l_discount',
    'l_partkey',
    'l_suppkey',
    'l_linenumber',
    'l_tax'
]
df_sel_pd_sorted = (
    df_sel_pd
    .sort_values(by=cols)      
    .reset_index(drop=True)     
)[cols]  
df_sel_tf_sorted = (
    df_sel_tf
    .sort_values(by=cols)
    .reset_index(drop=True)
)[cols]
are_equal = df_sel_pd_sorted.equals(df_sel_tf_sorted)
if are_equal :
    print("Pandas 与张量筛选结果完全一致")
else:
    print("Pandas 与张量筛选结果不一致")

Pandas 与张量筛选结果完全一致
