# sklearn

## train_test_split

### 随机划分

In [1]:
from sklearn.model_selection import train_test_split

# Toy data: four two-feature samples with alternating binary labels.
X = [[1, 2], [3, 4], [5, 6], [7, 8]]
y = [0, 1, 0, 1]

# Ask for a 20% test share. With only four samples the test count is rounded
# up to one, so the effective split here is 3:1 rather than a true 8:2.
# random_state fixes the shuffle so the same split comes back on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show features and labels of each part side by side.
print("训练集:", X_train, y_train)
print("测试集:", X_test, y_test)

训练集: [[7, 8], [1, 2], [5, 6]] [1, 0, 0]
测试集: [[3, 4]] [1]


### 分层划分

In [2]:
from sklearn.model_selection import train_test_split

# Toy data: six two-feature samples.
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]
y = [0, 0, 1, 1, 0, 1]  # three samples of each class (balanced, but not alternating)

# stratify=y makes both parts keep the class proportions of y.
# A 0.33 test share of six samples rounds up to two test samples,
# so the test part receives one sample of each class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

# Show features and labels of each part side by side.
print("训练集:", X_train, y_train)
print("测试集:", X_test, y_test)

训练集: [[3, 4], [11, 12], [1, 2], [7, 8]] [0, 1, 0, 1]
测试集: [[5, 6], [9, 10]] [1, 0]


### 多数据集划分

In [4]:
from sklearn.model_selection import train_test_split

# Toy data: two feature sets describing the same four samples, plus labels.
X1 = [[1, 2], [3, 4], [5, 6], [7, 8]]
X2 = [[9, 10], [11, 12], [13, 14], [15, 16]]
y = [0, 1, 0, 1]

# Every array passed in is cut with the same shuffled indices, so row i of
# X1, X2 and y always lands in the same part. Results come back as a
# (train, test) pair per input, in the order the inputs were given.
(
    X1_train, X1_test,
    X2_train, X2_test,
    y_train, y_test,
) = train_test_split(X1, X2, y, test_size=0.5, random_state=42)

# Print the training part of each array to show that the rows stay aligned.
for caption, part in (
    ("X1_train:", X1_train),
    ("X2_train:", X2_train),
    ("y_train:", y_train),
):
    print(caption, part)

X1_train: [[1, 2], [5, 6]]
X2_train: [[9, 10], [13, 14]]
y_train: [0, 0]


# torch

## random_split

`random_split` 按给定的长度把一个 `Dataset` 随机划分为若干互不重叠的子集（`Subset`），常用于从同一个 Dataset 对象中切出训练集和测试集。

In [5]:
import torch
from torch.utils.data import random_split, TensorDataset

# Toy dataset: ten samples with two features each, labelled 0..9.
data = torch.arange(20).view(-1, 2)  # features, shape (10, 2)
labels = torch.arange(10)  # one label per sample

# TensorDataset pairs data[i] with labels[i].
dataset = TensorDataset(data, labels)

# 80:20 split. The test size is the remainder, so the two sizes always add
# up to len(dataset) even when 0.8 * len(dataset) is not a whole number.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Without an explicit generator random_split draws from torch's global RNG
# and returns a different split on every run. A dedicated seeded generator
# makes the split reproducible without touching the global RNG state.
SEED = 42
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

# Inspect the result: each element is a (features, label) pair.
print("训练集:", list(train_dataset))
print("测试集:", list(test_dataset))

训练集: [(tensor([6, 7]), tensor(3)), (tensor([2, 3]), tensor(1)), (tensor([10, 11]), tensor(5)), (tensor([18, 19]), tensor(9)), (tensor([0, 1]), tensor(0)), (tensor([12, 13]), tensor(6)), (tensor([8, 9]), tensor(4)), (tensor([4, 5]), tensor(2))]
测试集: [(tensor([16, 17]), tensor(8)), (tensor([14, 15]), tensor(7))]


## SubsetRandomSampler

`SubsetRandomSampler` 只在给定的索引列表内做随机（无放回）采样，适合自定义数据划分：先自行划分索引，再让训练和测试的 `DataLoader` 共用同一个 `Dataset`。

In [8]:
import torch
import random
from torch.utils.data import DataLoader, SubsetRandomSampler, TensorDataset

# Toy dataset: ten samples with two features each, labelled 0..9.
data = torch.arange(20).view(-1, 2)  # features, shape (10, 2)
labels = torch.arange(10)  # one label per sample
dataset = TensorDataset(data, labels)

SEED = 42

# Candidate indices: one per sample.
dataset_size = len(dataset)
indices = list(range(dataset_size))

# Shuffle the indices so that membership of each part is random;
# seeding Python's RNG makes that membership repeatable.
random.seed(SEED)
random.shuffle(indices)

# First 80% of the shuffled indices go to training, the rest to testing.
split = int(0.8 * dataset_size)
train_indices, test_indices = indices[:split], indices[split:]

# SubsetRandomSampler re-shuffles its indices on every pass using torch's
# RNG, which random.seed() does not affect. Give each sampler its own seeded
# generator so the batch order is reproducible as well as the membership.
train_sampler = SubsetRandomSampler(
    train_indices, generator=torch.Generator().manual_seed(SEED)
)
test_sampler = SubsetRandomSampler(
    test_indices, generator=torch.Generator().manual_seed(SEED)
)

# Both loaders read from the same dataset; the sampler decides which rows
# each one sees.
train_loader = DataLoader(dataset, batch_size=3, sampler=train_sampler)
test_loader = DataLoader(dataset, batch_size=3, sampler=test_sampler)

# Materialise one pass over each loader, then show the batches.
# Each batch is a [features, labels] pair.
train_batches = list(train_loader)
test_batches = list(test_loader)
print("训练集:", train_batches)
print("测试集:", test_batches)

训练集: [[tensor([[ 8,  9],
        [16, 17],
        [14, 15]]), tensor([4, 8, 7])], [tensor([[ 6,  7],
        [ 4,  5],
        [18, 19]]), tensor([3, 2, 9])], [tensor([[10, 11],
        [12, 13]]), tensor([5, 6])]]
测试集: [[tensor([[2, 3],
        [0, 1]]), tensor([1, 0])]]
