## Description:
本 Jupyter notebook 用于梳理刚哥台风预测代码的逻辑，并在测试集上完整运行一遍。

In [32]:
import os 
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from torch.utils.data import DataLoader, Dataset
from torch.optim import lr_scheduler
from torch import optim

# 模型
from resnext import ResNeXt
from sknet_typhoon import SKNet, SKNet50
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from torch.utils.data.distributed import DistributedSampler

## 导入数据集

In [33]:
class TyphoonDataset(Dataset):
    """Typhoon samples stored as three aligned ``.npy`` arrays.

    For the selected split the constructor loads ``img_<mode>.npy``,
    ``input_trace_<mode>.npy`` and ``output_trace_<mode>.npy`` from
    ``root_path``. The image array is transposed from N*H*W*C to N*C*H*W
    once, at load time.

    Args:
        mode: Which split to load, ``'train'`` or ``'test'``.
        root_path: Directory that contains the ``.npy`` files.

    Raises:
        ValueError: If ``mode`` is neither ``'train'`` nor ``'test'``.
    """

    _MODES = ('train', 'test')

    def __init__(self, mode='train', root_path="/home/qiao/TyphoonDatasets"):
        super(TyphoonDataset, self).__init__()
        # Fail early with a clear message; previously an unknown mode fell
        # through both branches and crashed later on the missing attributes.
        if mode not in self._MODES:
            raise ValueError(
                "mode must be one of {}, got {!r}".format(self._MODES, mode)
            )

        # Both splits share the same file naming scheme, only the suffix differs.
        img_file_path = os.path.join(root_path, 'img_{}.npy'.format(mode))
        trace_file_path = os.path.join(root_path, 'input_trace_{}.npy'.format(mode))
        out_file_path = os.path.join(root_path, 'output_trace_{}.npy'.format(mode))

        self.trace = np.load(trace_file_path)
        self.output = np.load(out_file_path)
        self.num = self.output.shape[0]
        self.img = np.load(img_file_path).transpose((0, 3, 1, 2))   # N*H*W*C -> N*C*H*W

    def __len__(self):
        """Return the number of samples (first dimension of the output array)."""
        return self.num

    def __getitem__(self, index):
        """Return ``(img, trace, output)`` for ``index``, each cast to float32."""
        return (
            self.img[index, :, :].astype(np.float32),
            self.trace[index, :].astype(np.float32),
            self.output[index, :].astype(np.float32),
        )

In [34]:
# Load the training split and wrap it in a shuffling loader (16 samples per batch).
train_data = TyphoonDataset(mode="train")
train_loader = DataLoader(
    dataset=train_data,
    batch_size=16,
    shuffle=True,
)

In [35]:
# Look at the first batch only, to check the tensor shapes.
for img, trace, y in train_loader:
    # img:   18 channels, 25*25
    # trace: 22 values
    # y:     coordinates? -- TODO confirm
    print(img.shape, trace.shape, y.shape, sep='\n')
    break

torch.Size([16, 18, 25, 25])
torch.Size([16, 22])
torch.Size([16, 2])


In [36]:
# Split the loaded data 80/20 into a training part and a validation part.
train_size = int(0.8*len(train_data))
test_size = len(train_data) - train_size   # size of the validation part
train_dataset, valid_dataset = torch.utils.data.random_split(train_data, [train_size, test_size])
# random_split only fixes which samples land in each part; shuffle=True keeps the
# per-epoch reshuffling that the original training loader had.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
# Evaluation does not depend on sample order, so no shuffling here.
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, shuffle=False)

## 模型的训练和测试

In [31]:
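# Initialization (disabled): with the env:// rendezvous the call below fails unless the RANK environment variable is set.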
#torch.distributed.init_process_group(backend="nccl",  init_method='env:// rendezvous: wzq_env:')

ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set

In [37]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

def weights_init(m):
    """Initialise a single module in place; meant for ``net.apply(weights_init)``.

    * ``nn.Conv2d``: Xavier-uniform weights, bias filled with 0.1.
    * ``nn.BatchNorm2d``: weight filled with 1, bias zeroed.
    * ``nn.Linear``: weights drawn from N(0, 0.01), bias zeroed.

    Any other module type is left untouched. Layers created with
    ``bias=False`` (or ``affine=False`` for batch norm) hold ``None`` for the
    corresponding parameter; those are skipped instead of raising
    ``AttributeError``.

    Args:
        m: The module to initialise.
    """
    if isinstance(m, nn.Conv2d):
        init.xavier_uniform_(m.weight.data)
        if m.bias is not None:
            init.constant_(m.bias.data, 0.1)

    elif isinstance(m, nn.BatchNorm2d):
        if m.weight is not None:
            m.weight.data.fill_(1)
        if m.bias is not None:
            m.bias.data.zero_()

    elif isinstance(m, nn.Linear):
        m.weight.data.normal_(0, 0.01)
        if m.bias is not None:
            m.bias.data.zero_()

In [38]:
# Model
net = SKNet50()

# Adam with weight decay; the scheduler is later stepped with the validation
# loss and uses a patience of 3 epochs ('min' mode: lower is better).
optimizer = optim.Adam(
    net.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=1e-2,
)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3)

# Move the model to the selected device.
net = net.to(device)
#net = nn.DataParallel(net)

In [39]:
# apply() recursively walks every sub-module of the network and calls the given
# function on each of them, so weights_init initialises the weights of all Conv layers.
net.apply(weights_init)

SKNet50(
  (basic_conv): Sequential(
    (0): Conv2d(18, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (stage_1): Sequential(
    (0): SKUnit(
      (feas): Sequential(
        (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1))
        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SKConv(
          (convs): ModuleList(
            (0): Sequential(
              (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=8)
              (1): SyncBatchNorm(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): ReLU()
            )
            (1): Sequential(
              (0): Conv2d(128, 128, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=8)
              (1): SyncBatchNorm(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): Re

In [40]:
# Training function
def train():
    """Run one training epoch over ``train_loader``.

    Works on the notebook-level globals ``net``, ``optimizer``,
    ``train_loader``, ``device`` and ``state``. The MSE loss of every batch is
    back-propagated, and the mean per-batch loss of the epoch is stored in
    ``state['train_loss']``. The loss is printed every 50 steps.
    """
    net.train()
    loss_avg = 0.0
    for step, (img, _, y) in enumerate(train_loader):
        # Follow the notebook-level `device` so the CPU fallback chosen there
        # keeps working; a hard-coded .cuda() crashes on CPU-only machines.
        img, y = img.to(device), y.to(device)

        optimizer.zero_grad()
        output = net(img)

        if output.size() != y.size():
            # Presumably the model squeezed away the batch dimension of a
            # single-sample batch -- TODO confirm against the model code.
            print(output.size(), y.size())
            output = output.view(1, output.size(0))

        loss = F.mse_loss(output, y)
        loss.backward()
        optimizer.step()

        loss_avg += loss.item()

        if step % 50 == 0:
            print('step {}, loss {}'.format(step, loss.item()))

    state['train_loss'] = loss_avg / len(train_loader)

def test():
    """Evaluate ``net`` on ``valid_loader`` without tracking gradients.

    Works on the notebook-level globals ``net``, ``valid_loader``, ``device``
    and ``state``. Stores two per-batch averages in ``state``:

    * ``state['test_loss']`` -- mean MSE loss.
    * ``state['distance']`` -- mean Euclidean distance between prediction and
      target, where the squared differences are summed over the coordinate
      dimension (dim 1) of each sample.
    """
    net.eval()
    loss_avg = 0.0
    distance = 0.0
    with torch.no_grad():
        for step, (img, _, y) in enumerate(valid_loader):
            # Follow the notebook-level `device` instead of a hard-coded .cuda().
            img, y = img.to(device), y.to(device)

            output = net(img)
            if output.size() != y.size():
                # Presumably the model squeezed away the batch dimension of a
                # single-sample batch -- TODO confirm against the model code.
                print(output.size(), y.size())
                output = output.view(1, output.size(0))
            loss = F.mse_loss(output, y)

            # Per-sample Euclidean distance, averaged over the batch. The sum
            # runs over the coordinate dimension, not over the batch.
            dealt = torch.pow(output - y, 2)
            distance += torch.mean(torch.sqrt(torch.sum(dealt, dim=1))).item()

            loss_avg += loss.item()

        state['test_loss'] = loss_avg / len(valid_loader)
        state['distance'] = distance / len(valid_loader)

In [41]:
# Shared results dictionary filled in by train() and test().
state = {}
# Start from infinity so the first epoch always sets the best value; a fixed
# starting value such as 100 would be reported as "best" if no epoch beat it.
best_distance = float('inf')
for epoch in range(100):
    state['epoch'] = epoch
    print('==========epoch{}==========='.format(epoch))
    train()
    test()
    # ReduceLROnPlateau is driven by the validation loss of this epoch.
    scheduler.step(state['test_loss'])
    print("epoch: {}, loss is: {}, test is :{}".format(epoch, state["train_loss"], state["test_loss"]))
    if state["distance"] < best_distance:
        best_distance = state["distance"]
    print("Best distance: {}".format(best_distance))



AssertionError: Default process group is not initialized