### 1 生成人工数据

In [None]:
import pandas as pd
import numpy as np

# Build a deliberately imbalanced synthetic binary-classification data set:
# N_NEG rows labelled 0, N_POS rows labelled 1, and N_FEATURES Gaussian features.
SEED = 42                  # fixed seed so Restart & Run All reproduces the same frame
N_NEG, N_POS = 900, 100    # class sizes (negatives, positives)
N_FEATURES = 10            # number of feature columns x0..x9

rng = np.random.default_rng(SEED)

# Constant label columns (explicit zeros/ones instead of a degenerate randint range).
df_y_neg = pd.DataFrame(np.zeros(N_NEG, dtype=int))  # N_NEG zeros
df_y_pos = pd.DataFrame(np.ones(N_POS, dtype=int))   # N_POS ones

# Stack the two label blocks and shuffle the rows to form the label column y.
df_y = (
    pd.concat([df_y_neg, df_y_pos], ignore_index=True)
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# Features: one row per label, drawn from N(mean=1, std=0.1).
df_X = pd.DataFrame(rng.normal(1, 0.1, size=(N_NEG + N_POS, N_FEATURES)))

# Join features and labels column-wise, then name the columns x0..x{N_FEATURES-1}, y.
df = pd.concat([df_X, df_y], axis=1)
df.columns = ['x' + str(n) for n in range(N_FEATURES)] + ['y']
df

### 2 计算权重

In [66]:
# Count the rows of each class, then weight each class by total / class_count
# so that the rarer class receives the larger weight.
is_positive = df['y'] == 1
is_negative = df['y'] == 0
num_pos = int(is_positive.sum())
num_neg = int(is_negative.sum())

total_rows = num_pos + num_neg
pos_weight = total_rows / num_pos
neg_weight = total_rows / num_neg

print(num_pos, num_neg)
print(pos_weight, neg_weight)  # the resulting class weights

100 900
10.0 1.1111111111111112


### 3 添加权重列

In [67]:
# Add a per-row weight column: rows labelled 1 get pos_weight, all others neg_weight.
weight_by_is_positive = {True: pos_weight, False: neg_weight}
df['y_weight'] = df['y'].eq(1).map(weight_by_is_positive)
df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,y,y_weight
0,1.068456,1.063596,1.013166,1.104790,0.880366,0.988808,0.942886,1.094549,0.910249,0.972088,0,1.111111
1,0.833696,0.958102,0.993866,0.924555,0.861291,1.152786,0.961652,0.999858,1.161463,0.803388,0,1.111111
2,1.008791,0.822793,0.852561,1.118715,1.086271,1.069529,0.873995,0.951702,1.028626,1.058171,0,1.111111
3,0.857424,1.160453,0.815194,1.048349,0.903609,0.940855,0.997933,0.882895,0.998632,0.914564,0,1.111111
4,0.913075,1.042431,1.097095,0.876134,1.001605,0.916581,1.011120,1.121976,0.932108,1.033714,1,10.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.923476,0.931259,1.195409,1.016071,0.902529,0.911233,1.092697,1.037648,1.247636,0.969477,0,1.111111
996,1.075055,1.069097,1.059678,1.056545,1.036952,0.880974,1.005536,0.974187,1.000009,0.799761,0,1.111111
997,0.989936,0.980064,0.926465,1.064285,0.914808,0.989019,1.063873,1.039628,0.895884,1.038405,0,1.111111
998,0.989914,0.976585,0.923614,1.040148,1.131505,0.956910,1.134970,1.073546,1.038679,1.090961,1,10.000000


### 4 定义WeightedRandomSampler抽样器

In [68]:
import torch

# Per-row sampling weights. The pandas column is converted to a NumPy array
# first and then handed to torch.tensor (the original author notes that passing
# the pandas object directly raises an error).
row_weights = df['y_weight'].to_numpy()
data_y_w = torch.tensor(row_weights, dtype=torch.float)

# Number of indices drawn per pass; here equal to the data set size, but it can
# be set to any other value.
num_samples = len(df)

# Sampler that draws row indices according to the prepared weights,
# with replacement.
sampler = torch.utils.data.WeightedRandomSampler(
    weights=data_y_w,
    num_samples=num_samples,
    replacement=True,
)

### 5 构建dataset，建立Dataloader

In [69]:
# Select columns by name rather than by position, so the split into features
# and labels stays correct even if columns are added or reordered (the
# positional version silently depended on 'y_weight' being the last column and
# 'y' the second to last).
label_col = 'y'
weight_col = 'y_weight'
feature_cols = [c for c in df.columns if c not in (label_col, weight_col)]

# Features followed by the label, with the weight column left out.
data_features = torch.tensor(
    df[feature_cols + [label_col]].to_numpy(), dtype=torch.float)

data_features_X = data_features[:, :-1]        # feature matrix X
data_features_y = data_features[:, -1].long()  # labels y, cast to long

# TensorDataset packs tensors together, similar to zip: (features, label) pairs.
dataset = torch.utils.data.TensorDataset(data_features_X, data_features_y)

batch_size = 64
# Batches are drawn through the weighted sampler instead of sequentially.
data_iter = torch.utils.data.DataLoader(
    dataset=dataset, sampler=sampler, batch_size=batch_size)

In [71]:
# Pull one mini-batch from the loader and display its labels.
batch_iterator = iter(data_iter)
X, y = next(batch_iterator)
y

tensor([1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1])