In [4]:
import numpy as np
import numba as nb
import torch as th
import SharedArray as sa
import os
import nutils
import common as cm
import cupy as cp
from tqdm import tqdm
from numba import prange
import modin.pandas as pd

In [5]:
# Instrument codes to process, taken from the project-local `common` module.
codes = cm.SELECTED_CODES
def get_data(code):
    """Load the raw inputs for one instrument code.

    Parameters
    ----------
    code
        Identifier interpolated into the CSV file name and the shared-array names.

    Returns
    -------
    tuple
        ``(x, y, z)`` where ``x`` is the header-less CSV read through
        ``pd.read_csv`` (``pd`` is ``modin.pandas`` in this notebook), ``y`` is
        the shared array attached as ``label_{code}`` and ``z`` is the shared
        array attached as ``timestamp_{code}``.
    """
    # NOTE(review): absolute, user-specific path; consider a configurable DATA_DIR.
    features = pd.read_csv(
        f"/home/ywang/workspace/alphagen/t0/stkCode_{code}.csv", header=None
    )
    labels = sa.attach(f"label_{code}")
    timestamps = sa.attach(f"timestamp_{code}")
    return features, labels, timestamps

In [6]:
# Column-wise (global) z-score of every code's factor matrix on the GPU,
# saved as float32 .npy files.
for code in tqdm(codes):
    # Only the factor matrix `x` is used in this cell; `y` and `ts` are ignored.
    x, y, ts = get_data(code)
    with cp.cuda.Device(1):  # Use GPU 1
        # Convert to CuPy array and ensure float32
        x_cp = cp.array(x, dtype=cp.float32)

        # Replace NaN and +/-inf with 0. With only `nan=0.0`, infinities are
        # mapped to the largest finite float32, which overflows the column
        # mean/std and corrupts the whole column.
        x_cp = cp.nan_to_num(x_cp, nan=0.0, posinf=0.0, neginf=0.0)

        # Per-column statistics over all rows
        mean = cp.mean(x_cp, axis=0)
        std = cp.std(x_cp, axis=0)

        # Avoid division by zero: constant columns are divided by 1
        std[std == 0] = 1.0

        x_cp = (x_cp - mean) / std

        # Convert back to NumPy array
        x_np = cp.asnumpy(x_cp)

    # Save the standardized data
    np.save(f"/mnt/nas/data/WY/factors/{code}.npy", x_np)

100%|██████████| 100/100 [2:53:22<00:00, 104.02s/it] 


In [7]:
# NOTE: `code` is not defined in this cell; it relies on the loop variable left
# over from a previous `for code in tqdm(codes)` cell (i.e. the last code processed).
a = np.load(f"/mnt/nas/data/WY/factors/{code}.npy")

In [8]:
# Inspect the dtype of the array that was just loaded from disk.
a.dtype

dtype('float32')

In [3]:
    # from sklearn.preprocessing import StandardScaler
    # for code in tqdm(codes):
    #     x, y, ts = get_data(code)
    #     x = np.nan_to_num(x, 0, 0, 0).astype(np.float32)
    #     scaler = StandardScaler()
    #     x = scaler.fit_transform(x)
    #     np.save(f"/mnt/nas/data/WY/factors/{code}.npy", x)

  0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
@nb.njit(parallel=True)
def global_standardize(x):
    """Z-score every column of a 2-D array using statistics over all rows.

    Parameters
    ----------
    x : 2-D array
        Input matrix; its shape is unpacked as ``(rows, columns)``.

    Returns
    -------
    array
        New array created with ``np.zeros_like(x)``. Entry ``[i, j]`` is
        ``(x[i, j] - mean_j) / std_j`` when ``std_j > 0``; columns whose
        standard deviation is not strictly positive stay at 0.
    """
    n_rows, n_cols = x.shape

    # Per-column mean and standard deviation, computed one column at a time.
    col_mean = np.zeros(n_cols)
    col_std = np.zeros(n_cols)
    for c in range(n_cols):
        values = x[:, c]
        col_mean[c] = np.mean(values)
        col_std[c] = np.std(values)

    # Output starts as all zeros, so columns without spread need no write.
    out = np.zeros_like(x)

    # Rows are independent, so the outer loop is parallelised with prange.
    for r in prange(n_rows):
        for c in range(n_cols):
            sigma = col_std[c]
            if sigma > 0:
                out[r, c] = (x[r, c] - col_mean[c]) / sigma

    return out






In [5]:

@nb.njit(parallel=True)
def standardize_by_day_numba(x, ts):
    """Z-score the columns of ``x`` separately within each group of equal ``ts``.

    Rows are grouped by the distinct values found in ``ts`` (one group per
    value returned by ``np.unique(ts)``); within each group every column is
    centred on the group mean and divided by the group standard deviation.

    Parameters
    ----------
    x : 2-D array
        Input matrix; its shape is unpacked as ``(rows, columns)``.
    ts : 1-D array
        Group key per row of ``x``.
        NOTE(review): the name suggests day identifiers; if these are finer
        timestamps each group may hold a single row (std 0 -> output 0). Confirm.

    Returns
    -------
    array
        New array created with ``np.zeros_like(x)``; entries whose group/column
        standard deviation is not strictly positive are 0.
    """
    n_rows, n_cols = x.shape
    day_values = np.unique(ts)
    out = np.zeros_like(x)

    # Each distinct ts value is processed independently (parallel over groups).
    for d in prange(len(day_values)):
        # Row positions belonging to this group.
        rows = np.where(ts == day_values[d])[0]
        block = x[rows]

        # Column statistics restricted to this group's rows.
        mu = np.zeros(n_cols)
        sigma = np.zeros(n_cols)
        for c in range(n_cols):
            col = block[:, c]
            mu[c] = np.mean(col)
            sigma[c] = np.std(col)

        # Write the standardized values back at the original row positions.
        for r in rows:
            for c in range(n_cols):
                if sigma[c] > 0:
                    out[r, c] = (x[r, c] - mu[c]) / sigma[c]
                else:
                    out[r, c] = 0

    return out

In [7]:
# Per-group (by `ts`) standardization of every code's factors with the numba
# kernel, saved as float32 .npy files.
for code in tqdm(codes):
    x, y, ts = get_data(code)  # `y` (labels) is not used in this cell
    # Use keyword arguments: positionally, `np.nan_to_num(x, 0, 0, 0)` binds to
    # copy=0, nan=0, posinf=0 and leaves `neginf` at its default, so -inf would
    # become the most negative finite value instead of 0.
    x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
    x_standardized = standardize_by_day_numba(x, ts)
    # x_standardized_cpu = cp.asnumpy(x_standardized)
    np.save(f"/mnt/nas/data/WY/factors/{code}.npy", x_standardized)

  1%|          | 1/100 [02:57<4:52:24, 177.21s/it]

In [None]:
def standardize_by_day_on_gpu(x, ts, device_id=0):
    """Z-score the columns of ``x`` within each group of equal ``ts`` on a GPU.

    Parameters
    ----------
    x : array-like
        Data to standardize; copied to the GPU with ``cp.array``.
    ts : array-like
        Group key per row of ``x``; rows sharing a value are standardized together.
        NOTE(review): presumably day identifiers, per the function name; confirm.
    device_id : int, default 0
        CUDA device on which the computation runs.

    Returns
    -------
    numpy.ndarray
        Standardized data copied back to host memory. Columns whose standard
        deviation within a group is not strictly positive are set to 0 for that
        group (instead of NaN/inf from a division by zero).
    """
    with cp.cuda.Device(device_id):
        # Move the inputs onto the selected GPU.
        x_cp = cp.array(x)
        ts_cp = cp.array(ts)

        # Distinct group keys.
        unique_days = cp.unique(ts_cp)

        # Output buffer with the same shape and dtype as the input.
        x_standardized = cp.zeros_like(x_cp)

        # Standardize one group at a time.
        for day in unique_days:
            mask = ts_cp == day
            x_day = x_cp[mask]

            # Per-column mean and standard deviation for this group.
            mean = cp.mean(x_day, axis=0)
            std = cp.std(x_day, axis=0)

            # Guard against zero standard deviation: divide by 1 there, then
            # force the result to 0.
            has_spread = std > 0
            safe_std = cp.where(has_spread, std, cp.ones_like(std))
            scaled = (x_day - mean) / safe_std
            x_standardized[mask] = cp.where(
                has_spread, scaled, cp.zeros_like(scaled)
            )

        # Copy the result from the GPU back to a NumPy array on the host.
        return cp.asnumpy(x_standardized)

In [None]:
# Per-group (by `ts`) standardization of every code's factors on the GPU,
# saved as .npy files.
for code in tqdm(codes):
    x, y, ts = get_data(code)  # `y` (labels) is not used in this cell
    # Use keyword arguments: positionally, `np.nan_to_num(x, 0, 0, 0)` binds to
    # copy=0, nan=0, posinf=0 and leaves `neginf` at its default, so -inf would
    # become the most negative finite value instead of 0.
    x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
    # standardize_by_day_on_gpu already returns a host-side NumPy array, so no
    # further cp.asnumpy conversion is needed.
    x_standardized_cpu = standardize_by_day_on_gpu(x, ts)
    # NOTE(review): this writes to ".../WY/factor/" while the other cells write
    # to ".../WY/factors/"; confirm which directory is intended.
    np.save(f"/mnt/nas/data/WY/factor/{code}.npy", x_standardized_cpu)