<a href="https://colab.research.google.com/github/boshuaiYu/CaiCai_DL/blob/main/CaiCai_DL_Week5.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **手动实现训练集和测试集的划分**

In [13]:
import random
import torch

In [14]:
import sys
sys.path.append("/content/drive/MyDrive/Colab Notebooks/week4_function")
from torchLearning import *

In [15]:
def data_split(features,labels,rate=0.7):
  """Randomly split features and labels into a training part and a test part.

  :param features: feature tensor; rows are selected along its first dimension
  :param labels: label tensor, selected with the same row indices as ``features``
  :param rate: fraction of the rows that goes to the training set
  :return: Xtrain, Xtest, ytrain, ytest -- training/test features followed by
      training/test labels

  The permutation is drawn with ``random.shuffle``, so it is controlled by
  ``random.seed`` and not by ``torch.manual_seed``.
  """
  num_examples = len(features)
  indices = list(range(num_examples))
  random.shuffle(indices)
  num_train = int(num_examples*rate)
  # Force an integer dtype: torch.tensor([]) defaults to float32, which is not a
  # valid index type, so an empty side (rate=0 or rate=1) used to raise IndexError.
  indices_train = torch.tensor(indices[:num_train],dtype=torch.long)
  indices_test = torch.tensor(indices[num_train:],dtype=torch.long)
  Xtrain = features[indices_train]
  ytrain = labels[indices_train]
  Xtest = features[indices_test]
  ytest = labels[indices_test]
  return Xtrain, Xtest, ytrain, ytest

In [18]:
# Seed Python's `random` as well as torch: data_split permutes the rows with
# random.shuffle, which torch.manual_seed does not control.
random.seed(420)
torch.manual_seed(420)
features,labels = tensorGenReg()
Xtrain, Xtest, ytrain, ytest = data_split(features,labels)

# Hyper-parameters of the hand-written training loop.
batch_size = 10
lr = 0.03
num_epochs = 5
w = torch.zeros(3,1,requires_grad=True)  # 3x1 parameter tensor, initialised to zero

# linreg / MSE_loss / data_iter / sgd are not defined in this notebook;
# presumably they come from the torchLearning star import.
net = linreg
loss = MSE_loss

for epoch in range(num_epochs):
  for X,y in data_iter(batch_size,Xtrain,ytrain):
    l = loss(net(X,w),y)
    l.backward()
    sgd(w,lr)  # presumably updates w in place and clears its gradient -- TODO confirm in torchLearning

In [19]:
# Display the parameter tensor after the training loop.
w

tensor([[ 2.0005],
        [-1.0001],
        [ 1.0000]], requires_grad=True)

In [20]:
MSE_loss(torch.mm(Xtrain,w),ytrain) # MSE on the training split

tensor(0.0001, grad_fn=<DivBackward0>)

In [21]:
MSE_loss(torch.mm(Xtest,w),ytest) # MSE on the held-out test split returned by data_split

tensor(9.7272e-05, grad_fn=<DivBackward0>)

# **Dataset和DataLoader基本使用方法与数据集切分函数**

## **1. Dataset和DataLoader的基本使用方法**



### <font color="orange">**random_split随机划分函数**</font>

In [22]:
from torch.utils.data import random_split

In [23]:
# Small 4x3 integer tensor used to demonstrate random_split.
t = torch.arange(12).reshape(4,3)
t

tensor([[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]])

In [24]:
random_split(t,[2,2])  # second argument: list giving the number of samples in each part
# Returns a list of Subset objects (one per requested part), not a generator

[<torch.utils.data.dataset.Subset at 0x7fe951b80890>,
 <torch.utils.data.dataset.Subset at 0x7fe951d49210>]

In [25]:
# NOTE(review): this loop iterates over the two Subset objects returned by
# random_split; each Subset holds 2 rows of t, so `tr, te` are the two rows of
# ONE subset on each iteration, not a train/test pair.
for tr,te in random_split(t,[2,2]):
  print(tr,te)

tensor([3, 4, 5]) tensor([6, 7, 8])
tensor([0, 1, 2]) tensor([ 9, 10, 11])


### <font color="orange">**Dataset和DataLoader**</font>
单独使用TensorDataset打包时，必须保证特征是张量形式才可以，具有一定局限性

In [49]:
from torch.utils.data import Dataset

In [50]:
# Subclassing Dataset lets instances be handed to DataLoader (and random_split)
# directly: no TensorDataset wrapper and no forced conversion to tensors.
class LBCDataset(Dataset):
  """Map-style dataset over an object exposing ``data`` and ``target``.

  ``data.data`` is stored as ``features`` and ``data.target`` as ``labels``;
  the number of samples is cached in ``lens``.
  """

  def __init__(self,data):
    self.features, self.labels = data.data, data.target
    self.lens = len(self.features)

  def __len__(self):
    return self.lens

  def __getitem__(self,index):
    sample = self.features[index,:]
    target = self.labels[index]
    return sample,target

# This class is tailored to one specific dataset layout (.data / .target).

In [51]:
from sklearn.datasets import load_breast_cancer as LBC
# Load the breast-cancer data and wrap it in the custom Dataset defined above.
data = LBC()
LBC_data = LBCDataset(data)

In [52]:
# Display the stored feature array (a NumPy array, not a tensor).
LBC_data.features

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [53]:
# Number of samples cached by LBCDataset.__init__.
LBC_data.lens

569

In [54]:
# Fetch the (features, label) pair at index 2 through the subscript protocol.
LBC_data[2]

(array([1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
        1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
        4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02,
        2.250e-02, 4.571e-03, 2.357e+01, 2.553e+01, 1.525e+02, 1.709e+03,
        1.444e-01, 4.245e-01, 4.504e-01, 2.430e-01, 3.613e-01, 8.758e-02]), 0)

此时可以使用random_split方法对其进行切分

In [55]:
# 70% of the samples go to training, the remainder to testing.
num_train = int(len(LBC_data)*0.7)
num_test = len(LBC_data) - num_train

In [56]:
# Display the two split sizes; they sum to LBC_data.lens by construction.
num_train,num_test

(398, 171)

In [57]:
LBC_train, LBC_test = random_split(LBC_data,[num_train,num_test]) # each part is stored as an index mapping onto LBC_data
# A Subset exposes two attributes: `dataset` (the original dataset object) and
# `indices` (the positions in the original dataset of the samples in this part)

In [58]:
# Only the value of the last expression is displayed by the cell.
LBC_train.dataset == LBC_data
LBC_test.dataset == LBC_data
# Both subsets refer back to the original dataset object

True

In [59]:
# First ten positions (in LBC_data) of the samples assigned to the training part.
LBC_train.indices[:10]

[443, 222, 324, 400, 359, 181, 472, 312, 371, 381]

In [60]:
# Print only the first (features, label) sample of the training subset.
for i in LBC_train:
  print(i)
  break

(array([1.057e+01, 1.832e+01, 6.682e+01, 3.409e+02, 8.142e-02, 4.462e-02,
       1.993e-02, 1.111e-02, 2.372e-01, 5.768e-02, 1.818e-01, 2.542e+00,
       1.277e+00, 1.312e+01, 1.072e-02, 1.331e-02, 1.993e-02, 1.111e-02,
       1.717e-02, 4.492e-03, 1.094e+01, 2.331e+01, 6.935e+01, 3.663e+02,
       9.794e-02, 6.542e-02, 3.986e-02, 2.222e-02, 2.699e-01, 6.736e-02]), 1)


In [61]:
# 443 is the first entry of LBC_train.indices shown above, so this is the same
# sample that iterating LBC_train yielded first.
LBC_data[443]

(array([1.057e+01, 1.832e+01, 6.682e+01, 3.409e+02, 8.142e-02, 4.462e-02,
        1.993e-02, 1.111e-02, 2.372e-01, 5.768e-02, 1.818e-01, 2.542e+00,
        1.277e+00, 1.312e+01, 1.072e-02, 1.331e-02, 1.993e-02, 1.111e-02,
        1.717e-02, 4.492e-03, 1.094e+01, 2.331e+01, 6.935e+01, 3.663e+02,
        9.794e-02, 6.542e-02, 3.986e-02, 2.222e-02, 2.699e-01, 6.736e-02]), 1)

DataLoader常用参数如下：
1.   batch_size：小批量大小
2.   shuffle：一般训练集需要乱序，测试集不需要(无意义)
3.   num_workers：用于加载数据的子进程数量（0表示在主进程中加载数据）



In [64]:
from torch.utils.data import DataLoader
# Training samples are drawn in shuffled order, 10 per batch.
train_loader = DataLoader(LBC_train,batch_size=10,shuffle=True)
test_loader = DataLoader(LBC_test,batch_size=10,shuffle=False) # test data is not shuffled. NOTE(review): the original note says the test batch size is usually the whole test set (no batching), but 10 is used here

In [65]:
LBC_train == train_loader.dataset  # the loader keeps a reference to the dataset it was built from

True

> 这里值得一提的是，市面上有很多教材在介绍PyTorch深度学习建模过程中的数据集划分过程，会推荐使用<font color=yellow>**scikit-learn**</font>中的<font color=yellow>**train_test_split**</font>函数。该函数是可以非常便捷的完成数据集切分，但这种做法只能用于单机运行的数据，并且切分之后还要调用Dataset、DataLoader模块进行数据封装和加载，切分过程看似简单，但其实会<font color=yellow>**额外占用非常多的存储空间和计算资源**</font>，当进行超大规模数据训练时，所造成的影响会非常明显（当然，也有可能由于数据规模过大，本地无法运行）。因此，为了更好的适应深度学习真实应用场景，在使用包括数据切分等常用函数时，函数使用优先级是:     
<center><font color="orange">PyTorch原生函数和类 > 依据张量及其常用方法手动创建的函数 > Scikit-Learn函数</font></center>

## **2. 建模与评估过程**

数据准备

In [71]:
# Generate the data
features,labels = tensorGenReg()
features = features[:,:-1]  # drop the last feature column -- presumably a constant bias column, since nn.Linear adds its own bias; TODO confirm against tensorGenReg

# Dataset subclass wrapping an in-memory (features, labels) pair
class GenData(Dataset):
  """Map-style dataset over a feature container and a label container.

  :param features: indexable container of samples, indexed as ``features[index, :]``
  :param labels: indexable container of labels with one entry per sample;
      may be 1-D (one scalar per sample) or have more dimensions
  """
  def __init__(self,features,labels):
    self.features = features
    self.labels = labels
    self.lens = len(features)

  def __getitem__(self,index):
    # labels[index] instead of labels[index,:]: identical for 2-D labels, and it
    # also works for 1-D label vectors, which the old form rejected with an
    # IndexError. Same convention as LBCDataset.__getitem__.
    return self.features[index,:],self.labels[index]

  def __len__(self):
    return self.lens
# Instantiate the dataset (NOTE: this rebinds `data`, which earlier held the load_breast_cancer result)
data = GenData(features,labels)
# Split the dataset: 70% training, the remainder test
num_train = int(data.lens*0.7)
num_test = data.lens - num_train
data_train,data_test = random_split(data,[num_train,num_test])
# Build the loaders; only the training loader shuffles
train_loader = DataLoader(data_train,batch_size=10,shuffle=True)
test_loader = DataLoader(data_test,batch_size=10,shuffle=False)

In [72]:
from torch import nn,optim
batch_size = 10  # NOTE(review): not read again in the visible notebook; the loaders above hard-code batch_size=10
lr = 0.03  # learning rate passed to optim.SGD below
num_epochs = 3  # number of passes over train_loader in the fit(...) call below

class LR(nn.Module):
  """Linear-regression model: a single fully connected layer, no activation.

  :param in_features: number of input features per sample
  :param out_features: number of outputs per sample
  """
  def __init__(self,in_features=2,out_features=1):
    super().__init__()
    self.linear = nn.Linear(in_features,out_features)

  def forward(self,x):
    """Return the output of the linear layer for input ``x``."""
    return self.linear(x)
# Model, loss function and optimizer used by the fit(...) call below.
LR_model = LR()
criterion = nn.MSELoss()
optimizer = optim.SGD(LR_model.parameters(),lr=lr)

def fit(net,criterion,optimizer,batchdata,epochs):
  """Train ``net`` in place with mini-batch gradient descent.

  :param net: model to train (an ``nn.Module``)
  :param criterion: loss function, called as ``criterion(yhat, y)``
  :param optimizer: optimizer whose ``zero_grad``/``step`` drive the update
  :param batchdata: iterable yielding ``(X, y)`` batches, e.g. a DataLoader
  :param epochs: number of full passes over ``batchdata``
  :return: None
  """
  for epoch in range(epochs):
    for X,y in batchdata:
      # Call the module itself instead of net.forward(X) so that registered
      # forward hooks are executed.
      yhat = net(X)
      loss = criterion(yhat,y)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

In [73]:
# Train LR_model on the training loader for num_epochs passes.
fit(net=LR_model, criterion=criterion, optimizer=optimizer, batchdata=train_loader, epochs=num_epochs)

In [76]:
# NOTE(review): `F` is never imported explicitly in this notebook; presumably it
# arrives through `from torchLearning import *` -- confirm.
F.mse_loss(LR_model(data[data_train.indices][0]),data[data_train.indices][1]) # MSE on the training set

tensor(0.0001, grad_fn=<MseLossBackward0>)

In [77]:
F.mse_loss(LR_model(data[data_test.indices][0]),data[data_test.indices][1]) # MSE on the test set

tensor(9.8884e-05, grad_fn=<MseLossBackward0>)

# **实用函数补充**

## **数据封装、切分和加载函数**

In [78]:
def split_loader(features,labels,batch_size=10,rate=0.7):
  """Wrap the data in a GenData dataset, split it and build two DataLoaders.

  :param features: feature tensor passed to GenData
  :param labels: label tensor passed to GenData
  :param batch_size: batch size used by both loaders
  :param rate: fraction of the samples assigned to the training part
  :return: (train_loader, test_loader)
  """
  data = GenData(features,labels)
  # Honour `rate`; the split fraction was previously hard-coded to 0.7 here,
  # so the argument had no effect.
  num_train = int(data.lens * rate)
  num_test = data.lens - num_train
  data_train, data_test = random_split(data,[num_train,num_test])
  train_loader = DataLoader(data_train,batch_size=batch_size,shuffle=True)
  # Evaluation does not benefit from shuffling; keep the test order fixed, as the
  # loaders built earlier in this notebook do.
  test_loader = DataLoader(data_test,batch_size=batch_size,shuffle=False)
  return (train_loader,test_loader)

## **模型训练函数**

In [80]:
def fit(net,criterion,optimizer,batchdata,epochs=3,cla=False):
  """Train ``net`` in place with mini-batch gradient descent.

  :param net: model to train (an ``nn.Module``)
  :param criterion: loss function, called as ``criterion(yhat, y)``
  :param optimizer: optimizer whose ``zero_grad``/``step`` drive the update
  :param batchdata: iterable yielding ``(X, y)`` batches, e.g. a DataLoader
  :param epochs: number of full passes over ``batchdata``
  :param cla: pass True for classification; each label batch is then flattened
      and cast to int64 before the loss is computed
  :return: None
  """
  for epoch in range(epochs):
    for X,y in batchdata:
      if cla:
        y = y.flatten().long()   # classification: labels as a 1-D integer tensor
      # Call the module itself instead of net.forward(X) so that registered
      # forward hooks are executed.
      yhat = net(X)
      loss = criterion(yhat,y)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

In [82]:
def mse_cal(data_loader,net):
  """Compute the mean squared error of ``net`` over a loader's whole dataset.

  :param data_loader: DataLoader whose ``dataset`` supports ``dataset[:]`` and
      returns features at position 0 and labels at position 1
  :param net: model mapping the feature tensor to predictions
  :return: scalar tensor holding the MSE (detached from the autograd graph)
  """
  # Index the dataset once; the previous version materialised it twice.
  batch = data_loader.dataset[:]
  X,y = batch[0],batch[1]
  # Pure evaluation: no autograd graph is needed.
  with torch.no_grad():
    yhat = net(X)
    # Fully qualified name, so this helper does not rely on an `F` alias that
    # the notebook never imports explicitly.
    return torch.nn.functional.mse_loss(yhat,y)

In [83]:
def accuracy_cal(data_loader,net):
  """Compute the classification accuracy of ``net`` over a loader's whole dataset.

  :param data_loader: DataLoader whose ``dataset`` supports ``dataset[:]`` and
      returns features at position 0 and labels at position 1
  :param net: model mapping the feature tensor to per-class scores, one row per sample
  :return: scalar float tensor, the fraction of samples whose highest-scoring
      class equals the label
  """
  # Index the dataset once; the previous version materialised it twice.
  batch = data_loader.dataset[:]
  X,y = batch[0],batch[1]
  # Pure evaluation: no autograd graph is needed.
  with torch.no_grad():
    zhat = net(X)
  # softmax is strictly increasing within a row, so the argmax of the raw scores
  # is the predicted class; the explicit softmax was redundant.
  pred = torch.argmax(zhat,1)
  acc_bool = pred.flatten() == y.flatten()
  acc = torch.mean(acc_bool.float())
  return acc