In [2]:
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm
from torch.utils.data import Dataset, DataLoader,RandomSampler,SubsetRandomSampler
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import random
import json
# import optuna
from torch.nn import functional
import datetime
import gc
import os
import glob
from tqdm import tqdm

In [3]:
# Load the pre-built 3-D feature array from disk and display its shape.
# NOTE(review): this is an absolute local path; consider a configurable data directory.
all_data = np.load('D:/myfiles/project/bike_prediction/feature_data/tcn_data_3d.npy')
all_data.shape

(753, 3312, 8)

In [5]:
# Peek at the first 72 time steps (all 8 feature columns) of the series at index 1.
all_data[1:2,0:72,0:8]

array([[[  9.,  17.,   8.,  12.,   2.,  13.,   6.,   0.],
        [  9.,  19.,  10.,   9.,   2.,  10.,   6.,   1.],
        [ 10.,  21.,  11.,  13.,   2.,  13.,   7.,   2.],
        [ 10.,  22.,  12.,  11.,   1.,  12.,   6.,   3.],
        [ 10.,  26.,  16.,   8.,  -1.,   9.,   8.,   4.],
        [  2.,  27.,  25.,   0.,   0.,   7.,   3.,   5.],
        [ -2.,  27.,  29.,  -5.,   0.,   6.,   1.,   6.],
        [ -2.,  28.,  30.,  -6.,   0.,   0.,   1.,   7.],
        [ -4.,  26.,  30., -12.,  -1.,  -6.,  -1.,   8.],
        [-10.,  18.,  28., -14.,  -1., -10.,  -4.,   9.],
        [ -8.,  19.,  27., -14.,  -2., -10.,  -4.,  10.],
        [ -9.,  17.,  26., -14.,  -2., -10.,  -4.,  11.],
        [-10.,  15.,  25., -12.,  -2.,  -9.,  -3.,  12.],
        [-10.,  13.,  23., -10.,  -2.,  -6.,  -3.,  13.],
        [-11.,  11.,  22., -14.,  -2.,  -9.,  -4.,  14.],
        [-11.,  10.,  21., -12.,  -1.,  -8.,  -3.,  15.],
        [-11.,   6.,  17.,  -9.,   1.,  -5.,  -5.,  16.],
        [ -3.,

In [4]:
# Array layout: [number of stations, sequence length, number of features]
class MyDataset(Dataset):
    """Sliding-window dataset over per-station time series.

    Every sample is an input window of ``seq_num`` consecutive days
    (``seq_num * time_of_day`` time steps) of one station, paired with the
    ``output_size`` label steps that immediately follow that window.

    Args:
        his_datas: array indexed as [station, time step, feature].
        his_label: array indexed as [station, time step, label channel];
            only channel 0 is read.
        output_size: number of label steps returned per sample.
        feature_size: number of leading feature columns kept from ``his_datas``.
        seq_num: input window length, in days.
        time_of_day: number of time steps per day.
    """

    def __init__(self, his_datas, his_label, output_size, feature_size, seq_num, time_of_day):
        self.his_datas = his_datas
        self.his_label = his_label
        self.output_size = output_size
        self.feature_size = feature_size
        self.seq_num = seq_num
        self.time_of_day = time_of_day

        n_sites, n_steps = his_datas.shape[0], his_datas.shape[1]
        # Number of stations.
        self.site_num = n_sites
        # Windows per station: whole days available minus (window length + 3).
        self.time_num = n_steps // time_of_day - (seq_num + 3)
        # Flat sample count over all stations.
        self.sample_num = self.time_num * self.site_num

        print('单个样本数量：', self.time_num)
        print('站点数量：', self.site_num)
        print('总样本数量：', self.sample_num)
        print("a", his_datas.shape, his_label.shape)

    def __getitem__(self, index):
        """Return ``(inputs, labels)`` as float32 tensors for the flat sample ``index``."""
        # Flat index -> (station, window number within that station).
        site, window = divmod(index, self.time_num)
        steps_per_day = self.time_of_day
        first = window * steps_per_day
        last = (window + self.seq_num) * steps_per_day

        # Input window: [seq_num * time_of_day, feature_size].
        features = self.his_datas[site, first:last, 0:self.feature_size].astype(float)
        # Labels start right where the input window ends: [output_size, 1].
        labels = self.his_label[site, last:last + self.output_size, 0:1].astype(float)

        return (torch.tensor(features, dtype=torch.float32),
                torch.tensor(labels, dtype=torch.float32))

    def __len__(self):
        """Total number of samples across all stations."""
        return self.sample_num


In [5]:
def train_test_split(all_data, train_days=76, val_days=31, test_days=31, time_of_day=24):
    """Split the time axis chronologically into train / validation / test.

    The defaults reproduce the original fixed split of a 138-day series:
    days [0, 76) for training, [76, 107) for validation and [107, 138)
    for testing, with 24 time steps per day.

    Args:
        all_data: array-like indexed as [station, time step, feature].
        train_days: number of leading days used for training.
        val_days: number of days, right after training, used for validation.
        test_days: number of days, right after validation, used for testing.
        time_of_day: number of time steps per day.

    Returns:
        ``(train_data, train_label, val_data, val_label, test_data,
        test_label)``. Each ``*_data`` keeps all features; each ``*_label``
        is feature column 0 of the same time range, kept as a trailing
        axis of size 1.
    """
    data = np.array(all_data)

    train_end = train_days * time_of_day
    val_end = train_end + val_days * time_of_day
    test_end = val_end + test_days * time_of_day

    def _window(start, end):
        # Features and label (column 0) for one contiguous time range.
        chunk = data[:, start:end, :]
        return chunk, chunk[:, :, 0:1]

    train_data, train_label = _window(0, train_end)
    val_data, val_label = _window(train_end, val_end)
    test_data, test_label = _window(val_end, test_end)
    return train_data, train_label, val_data, val_label, test_data, test_label



def load_data(all_data, batch_size):
    """Build the train / validation / test data loaders.

    The training loader draws, in random order, from a random 40% subset of
    the training samples (the subset is fixed when this function is called).
    Validation and test loaders iterate over their full datasets in order.

    Args:
        all_data: array indexed as [station, time step, feature].
        batch_size: batch size used by all three loaders.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    (train_x, train_y,
     val_x, val_y,
     test_x, test_y) = train_test_split(all_data)

    # Window settings shared by the three splits.
    window_cfg = dict(output_size=24, feature_size=8, seq_num=7, time_of_day=24)

    # Training set: keep a random 40% of the sample indices.
    train_set = MyDataset(his_datas=train_x, his_label=train_y, **window_cfg)
    total = len(train_set)
    order = list(range(total))
    np.random.shuffle(order)
    kept = order[:int(total * 0.4)]
    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        sampler=SubsetRandomSampler(kept),
        pin_memory=True,  # speeds up host-to-GPU transfers
    )

    # Validation and test sets are used in full, in order.
    val_set = MyDataset(his_datas=val_x, his_label=val_y, **window_cfg)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

    test_set = MyDataset(his_datas=test_x, his_label=test_y, **window_cfg)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader


# def load_multitask_data(all_data, sta_data, batch_size, output_sizes={0: 24, 1: 24, 2: 24}):
#     # 自定义collate函数处理多目标数据
#     def multitask_collate(batch):
#         time_data = torch.stack([item[0] for item in batch])
#         static_data = torch.stack([item[1] for item in batch])
#         labels = {}
#         for target_idx in output_sizes.keys():
#             labels[target_idx] = torch.stack([item[2][target_idx] for item in batch])
#         return time_data, static_data, labels
#     train_data, train_data_sta, train_label, val_data, val_data_sta, val_label, test_data, test_data_sta, test_label = train_test_split(all_data, sta_data)
#     train_dataset = MyDataset(his_datas=train_data, sta_datas = train_data_sta, his_label=train_label, 
#                               output_sizes=output_sizes, time_feature_size=22, static_feature_size=7, seq_num=14, time_of_day=24)
#     n_samples = len(train_dataset)
#     indices = list(range(n_samples))
#     # 随机选择50%的样本
#     split = int(0.4 * n_samples)
#     np.random.shuffle(indices)
#     train_indices = indices[:split]  # 前50%作为本次训练样本
#     # 创建采样器
#     train_sampler = SubsetRandomSampler(train_indices)
#     train_dataloader = DataLoader(
#         train_dataset,
#         batch_size=batch_size,
#         sampler=train_sampler,
#         collate_fn=multitask_collate
#     )
#     # train_rand_sampler = RandomSampler(train_dataset, replacement=False, num_samples=int(len(train_dataset)*0.3))
#     # train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle=False, sampler=train_rand_sampler) 
#     # train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle=True) 
      
#     val_dataset = MyDataset(his_datas=val_data, sta_datas = val_data_sta, his_label=val_label,
#                             output_sizes=output_sizes, time_feature_size=22, static_feature_size=7, seq_num=14, time_of_day=24)
#     val_dataloader = DataLoader(val_dataset, batch_size = batch_size, shuffle=False, collate_fn=multitask_collate)

#     test_dataset = MyDataset(his_datas=test_data, sta_datas = test_data_sta, his_label=test_label,
#                              output_sizes=output_sizes, time_feature_size=22, static_feature_size=7, seq_num=14, time_of_day=24)
#     test_dataloader = DataLoader(test_dataset, batch_size = 4, shuffle=False, collate_fn=multitask_collate)

#     return train_dataloader , val_dataloader, test_dataloader


In [6]:
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm

def huber_loss(y_pred, y_true):
    """Mean smooth-L1 (Huber-style) loss with ``beta=5.0`` between prediction and target."""
    criterion = torch.nn.SmoothL1Loss(reduction='mean', beta=5.0)
    return criterion(y_pred, y_true)


def mse_loss(y_pred, y_true):
    """Mean squared error between prediction and target."""
    criterion = torch.nn.MSELoss(reduction='mean')
    return criterion(y_pred, y_true)
    

def printbar():
    """Print a separator bar followed by the current local timestamp."""
    separator = '===========' * 8
    print(separator + str(datetime.datetime.now()))


import os
def setup_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED``, and configures cuDNN to
    avoid autotuned / non-deterministic kernels.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    # Fixed typo: this was ``daterministic``, which only created an unused
    # attribute and left deterministic mode switched off.
    torch.backends.cudnn.deterministic = True
    

# models/informer_full.py
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# ---------------------------
# Positional / Temporal Embedding
# ---------------------------
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding table.

    Even columns hold ``sin(position * div_term)`` and odd columns hold
    ``cos(position * div_term)``. The table is stored as a non-trainable
    buffer named ``pe`` of shape ``(max_len, d_model)``.

    Args:
        d_model: embedding width; both even and odd values are supported.
        max_len: number of positions pre-computed.
    """

    def __init__(self, d_model, max_len=10000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so drop the last frequency; for even d_model this slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)  # (max_len, d_model)

    def forward(self, x):
        """Return the encoding for the first ``x.size(1)`` positions.

        Args:
            x: tensor whose dimension 1 is the sequence length; only its
                length and device are used.

        Returns:
            Tensor of shape ``(1, L, d_model)`` on ``x``'s device.
        """
        return self.pe[:x.size(1)].unsqueeze(0).to(x.device)


class DataEmbedding(nn.Module):
    """Linear value embedding plus sinusoidal positional encoding.

    The full Informer additionally uses temporal embeddings (time of day,
    day of week); those are not implemented here and could be added if such
    features are available.

    Args:
        input_dim: number of input features per time step.
        d_model: embedding width.
        max_len: maximum sequence length covered by the positional table.
    """

    def __init__(self, input_dim, d_model, max_len=10000):
        super().__init__()
        self.value_embedding = nn.Linear(input_dim, d_model)
        self.position = PositionalEncoding(d_model, max_len)

    def forward(self, x):
        """Embed ``x`` of shape (B, L, input_dim) into (B, L, d_model)."""
        values = self.value_embedding(x)
        # The positional term only depends on x's sequence length and device;
        # it is (1, L, d_model) and broadcasts over the batch.
        positions = self.position(x)
        return values + positions


# ---------------------------
# ProbSparse Attention (efficient attention) implementation
# This is a simplified but working form of ProbSparse used in Informer.
# ---------------------------
def _get_topk_indices(scores, k):
    # scores: (..., Lk)
    # return top-k indices along last dim
    _, idx = torch.topk(scores, k, dim=-1)
    return idx


class ProbAttention(nn.Module):
    """Simplified ProbSparse attention.

    When there are at most 64 keys, ordinary dense softmax attention is
    computed. Otherwise only the queries with the highest sparsity score
    (max minus mean of their scores against randomly sampled keys) receive
    real attention output; every other query position outputs zeros.

    Args:
        mask_flag: apply ``attn_mask`` (when one is given) to the scores.
        factor: multiplier for the number of sampled keys and of selected
            queries; both are ``factor * ceil(ln(L + 1))``.
        scale: if not None, scores are divided by ``sqrt(scale)``; if None
            the scores are left unscaled.
    """

    def __init__(self, mask_flag=False, factor=5, scale=None):
        super().__init__()
        self.mask_flag = mask_flag
        self.factor = factor  # factor for sample size
        self.scale = scale

    def _prob_QK(self, Q, K):
        """Pick the most informative queries.

        Args:
            Q: (B, H, Lq, D) queries.
            K: (B, H, Lk, D) keys.

        Returns:
            Long tensor (B, H, topk) with the positions of the selected
            queries, where ``topk = min(Lq, factor * ceil(ln(Lq + 1)))``.
        """
        Lq = Q.shape[2]
        Lk = K.shape[2]

        # Number of keys sampled for every query.
        u = max(1, min(Lk, int(self.factor * math.ceil(math.log(Lk + 1)))))
        index_sample = torch.randint(0, Lk, (Lq, u), device=Q.device)  # (Lq, u)
        K_sample = K[:, :, index_sample, :]  # (B, H, Lq, u, D)
        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-1, -2)).squeeze(-2)  # (B, H, Lq, u)

        # Sparsity measurement: max score minus mean score per query.
        M = Q_K_sample.max(-1)[0] - Q_K_sample.mean(-1)  # (B, H, Lq)

        # Clamp to Lq: torch.topk raises when asked for more entries than
        # exist, which used to happen with few queries and many keys.
        topk = max(1, min(Lq, int(self.factor * math.ceil(math.log(Lq + 1)))))
        return torch.topk(M, topk, dim=-1).indices  # (B, H, topk)

    def _dense_attention(self, Q, K, V, attn_mask):
        """Softmax attention of every query in ``Q`` over all keys."""
        scores = torch.matmul(Q, K.transpose(-1, -2))  # (B, H, Lq', Lk)
        if self.scale is not None:
            scores = scores / math.sqrt(self.scale)
        if self.mask_flag and attn_mask is not None:
            # NOTE(review): the two unsqueezes suggest a (B, Lk) key mask
            # broadcast over heads and queries - confirm with callers.
            scores = scores.masked_fill(attn_mask.unsqueeze(1).unsqueeze(2), -1e9)
        return torch.matmul(torch.softmax(scores, dim=-1), V)

    def forward(self, Q, K, V, attn_mask=None):
        """Compute attention.

        Args:
            Q: (B, H, Lq, D) queries.
            K: (B, H, Lk, D) keys.
            V: (B, H, Lk, D) values.
            attn_mask: optional boolean mask, only used when ``mask_flag``
                is True; True entries are filled with -1e9 before softmax.

        Returns:
            Tensor (B, H, Lq, D).
        """
        D = Q.shape[3]
        Lk = K.shape[2]

        # Short key sequences: plain dense attention.
        if Lk <= 64:
            return self._dense_attention(Q, K, V, attn_mask)

        # ProbSparse path: attend only from the selected queries.
        topk_idx = self._prob_QK(Q, K)  # (B, H, topk)
        gather_idx = topk_idx.unsqueeze(-1).expand(-1, -1, -1, D)
        Q_topk = torch.gather(Q, 2, gather_idx)  # (B, H, topk, D)
        out_topk = self._dense_attention(Q_topk, K, V, attn_mask)  # (B, H, topk, D)

        # Scatter the results back; non-selected queries stay at zero.
        # NOTE(review): the Informer paper fills those positions with the
        # mean of V instead; zeros are kept here to preserve the behavior of
        # models already trained with this layer.
        out = torch.zeros_like(Q)
        return out.scatter(2, gather_idx, out_topk)


# ---------------------------
# MultiHead wrapper
# ---------------------------
class MultiHeadAttention(nn.Module):
    """Multi-head wrapper around a pluggable attention callable.

    Queries, keys and values are linearly projected, split into ``n_heads``
    heads of width ``d_model // n_heads``, passed to ``attention`` and the
    merged result is projected once more.

    Args:
        d_model: model width; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        attention: callable ``attention(q, k, v, attn_mask=...)`` working on
            (B, H, L, d_head) tensors and returning (B, H, Lq, d_head).
    """

    def __init__(self, d_model, n_heads, attention):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.d_model = d_model

        self.w_qs = nn.Linear(d_model, d_model)
        self.w_ks = nn.Linear(d_model, d_model)
        self.w_vs = nn.Linear(d_model, d_model)
        self.proj = nn.Linear(d_model, d_model)
        self.attention = attention

    def _to_heads(self, projected, batch, length):
        # (B, L, d_model) -> (B, H, L, d_head)
        return projected.view(batch, length, self.n_heads, self.d_head).transpose(1, 2)

    def forward(self, q, k, v, attn_mask=None):
        """Attend from ``q`` (B, Lq, D) over ``k`` / ``v`` (B, Lk, D); returns (B, Lq, D)."""
        batch, q_len, _ = q.shape
        _, k_len, _ = k.shape

        q_heads = self._to_heads(self.w_qs(q), batch, q_len)
        k_heads = self._to_heads(self.w_ks(k), batch, k_len)
        v_heads = self._to_heads(self.w_vs(v), batch, k_len)

        attended = self.attention(q_heads, k_heads, v_heads, attn_mask=attn_mask)  # (B, H, Lq, d_head)
        # Merge the heads back into the model dimension.
        merged = attended.transpose(1, 2).contiguous().view(batch, q_len, self.d_model)
        return self.proj(merged)


# ---------------------------
# Encoder/DecoderLayer & stacks
# ---------------------------
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a position-wise feed-forward.

    Both sub-layers use a residual connection, dropout and a LayerNorm
    applied after the residual sum. The feed-forward part is two
    kernel-size-1 ``Conv1d`` layers with a ReLU in between.

    Args:
        d_model: model width.
        n_heads: number of attention heads.
        d_ff: hidden width of the feed-forward part.
        dropout: dropout probability shared by all dropout sites.
    """

    def __init__(self, d_model, n_heads, d_ff=2048, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, n_heads, ProbAttention(scale=d_model))
        self.conv1 = nn.Conv1d(d_model, d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(d_ff, d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu

    def _feed_forward(self, x):
        # Conv1d expects channels first: (B, L, D) -> (B, D, L) and back.
        hidden = self.activation(self.conv1(x.transpose(-1, -2)))
        return self.conv2(self.dropout(hidden)).transpose(-1, -2)

    def forward(self, x, attn_mask=None):
        """Transform ``x`` of shape (B, L, D); the output has the same shape."""
        attended = self.attn(x, x, x, attn_mask=attn_mask)
        x = self.norm1(x + self.dropout(attended))
        return self.norm2(x + self.dropout(self._feed_forward(x)))


class Encoder(nn.Module):
    """Stack of encoder layers with optional distilling (down-sampling) between them.

    Args:
        layer: prototype encoder layer. It is deep-copied ``num_layers``
            times so that every layer owns independent parameters; the
            copies start from the prototype's initial weights. When
            ``distil`` is True, ``layer.attn.d_model`` is read to size the
            down-sampling convolutions.
        num_layers: number of stacked layers.
        distil: if True, a stride-2 ``Conv1d`` + ReLU is applied between
            consecutive layers, roughly halving the sequence length.
    """

    def __init__(self, layer, num_layers, distil=True):
        import copy  # local import: only needed while building the stack

        super().__init__()
        # Previously the same ``layer`` object was registered num_layers
        # times, which silently tied the weights of all layers together.
        # state_dict keys (layers.0.*, layers.1.*, ...) are unchanged, so
        # existing checkpoints still load.
        self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_layers)])
        self.distil = distil
        if distil:
            d_model = layer.attn.d_model
            # 1D conv for down-sampling between layers (as in the paper).
            self.conv_layers = nn.ModuleList([
                nn.Conv1d(in_channels=d_model, out_channels=d_model,
                          kernel_size=3, padding=1, stride=2)
                for _ in range(max(0, num_layers - 1))
            ])
            self.activation = nn.ReLU()

    def forward(self, x, attn_mask=None):
        """Encode ``x`` of shape (B, L, D).

        Returns:
            Memory tensor (B, L', D); ``L'`` is shorter than ``L`` when
            distilling is enabled and there is more than one layer.
        """
        last = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            x = layer(x, attn_mask=attn_mask)
            # Distillation between layers: down-sample the temporal dimension.
            if self.distil and i < last:
                x = self.conv_layers[i](x.transpose(1, 2))  # (B, D, ~L/2)
                x = self.activation(x).transpose(1, 2)  # (B, ~L/2, D)
        return x


class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention and feed-forward.

    Each of the three sub-layers uses a residual connection, dropout and a
    LayerNorm applied after the residual sum. The feed-forward part is two
    kernel-size-1 ``Conv1d`` layers with a ReLU in between.

    Args:
        d_model: model width.
        n_heads: number of attention heads.
        d_ff: hidden width of the feed-forward part.
        dropout: dropout probability shared by all dropout sites.
    """

    def __init__(self, d_model, n_heads, d_ff=2048, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, ProbAttention(scale=d_model, mask_flag=True))
        self.cross_attn = MultiHeadAttention(d_model, n_heads, ProbAttention(scale=d_model))
        self.conv1 = nn.Conv1d(d_model, d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(d_ff, d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu

    def _feed_forward(self, x):
        # Conv1d expects channels first: (B, L, D) -> (B, D, L) and back.
        hidden = self.activation(self.conv1(x.transpose(-1, -2)))
        return self.conv2(self.dropout(hidden)).transpose(-1, -2)

    def forward(self, x, memory, self_mask=None, cross_mask=None):
        """Decode ``x`` against the encoder ``memory``; the output has ``x``'s shape."""
        own = self.self_attn(x, x, x, attn_mask=self_mask)
        x = self.norm1(x + self.dropout(own))
        crossed = self.cross_attn(x, memory, memory, attn_mask=cross_mask)
        x = self.norm2(x + self.dropout(crossed))
        return self.norm3(x + self.dropout(self._feed_forward(x)))


class Decoder(nn.Module):
    """Stack of decoder layers followed by an output projection.

    Args:
        layer: prototype decoder layer. It is deep-copied ``num_layers``
            times so that every layer owns independent parameters; the
            copies start from the prototype's initial weights.
        num_layers: number of stacked layers.
        projection: module applied to the output of the last layer
            (the final mapping to the output dimension).
    """

    def __init__(self, layer, num_layers, projection):
        import copy  # local import: only needed while building the stack

        super().__init__()
        # Previously the same ``layer`` object was registered num_layers
        # times, which silently tied the weights of all layers together.
        # state_dict keys are unchanged, so existing checkpoints still load.
        self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_layers)])
        self.projection = projection  # final linear to output dim

    def forward(self, x, memory, self_mask=None, cross_mask=None):
        """Run all layers on ``x`` against ``memory`` and project the result."""
        for layer in self.layers:
            x = layer(x, memory, self_mask, cross_mask)
        return self.projection(x)


# ---------------------------
# Informer model
# ---------------------------
class Informer(nn.Module):
    """Encoder-decoder forecasting model assembled from the blocks above.

    Args:
        enc_in: number of encoder input features.
        dec_in: number of decoder input features.
        c_out: number of output channels per predicted step.
        seq_len: encoder input length (stored; not otherwise used here).
        label_len: length of the known decoder prefix (stored; not otherwise used here).
        out_len: number of steps returned by ``forward`` (stored as ``pred_len``).
        d_model: model width.
        n_heads: number of attention heads.
        e_layers: number of encoder layers.
        d_layers: number of decoder layers.
        d_ff: feed-forward hidden width.
        dropout: dropout probability.
        distil: whether the encoder down-samples between layers.
    """

    def __init__(self,
                 enc_in, dec_in, c_out,
                 seq_len=168, label_len=24, out_len=24,
                 d_model=512, n_heads=8,
                 e_layers=3, d_layers=2,
                 d_ff=2048, dropout=0.05, distil=True):
        super().__init__()

        self.seq_len, self.label_len, self.pred_len = seq_len, label_len, out_len
        self.d_model = d_model

        # Input embeddings for both branches.
        self.enc_embedding = DataEmbedding(enc_in, d_model)
        self.dec_embedding = DataEmbedding(dec_in, d_model)

        # Encoder stack.
        self.encoder = Encoder(
            EncoderLayer(d_model, n_heads, d_ff=d_ff, dropout=dropout),
            e_layers,
            distil=distil,
        )

        # Decoder stack; the final projection to c_out lives inside the decoder.
        decoder_layer = DecoderLayer(d_model, n_heads, d_ff=d_ff, dropout=dropout)
        self.decoder = Decoder(decoder_layer, d_layers, nn.Linear(d_model, c_out))

    def forward(self, x_enc, x_dec, enc_mask=None, dec_mask=None):
        """Forecast the last ``pred_len`` steps.

        Args:
            x_enc: (B, seq_len, enc_in) encoder input.
            x_dec: (B, label_len + pred_len, dec_in) decoder input, i.e. the
                known prefix followed by placeholders for the future steps.
            enc_mask: optional mask forwarded to the encoder attention.
            dec_mask: optional mask forwarded to the decoder self-attention.

        Returns:
            Tensor (B, pred_len, c_out).
        """
        memory = self.encoder(self.enc_embedding(x_enc), attn_mask=enc_mask)
        decoded = self.decoder(self.dec_embedding(x_dec), memory,
                               self_mask=dec_mask, cross_mask=None)
        # Only the trailing pred_len steps are the forecast.
        return decoded[:, -self.pred_len:, :]


In [10]:
class PeakHuberLoss(nn.Module):
    """Loss that penalises errors on peak targets more heavily.

    Elements whose target is >= 5 ("peaks") contribute a Huber loss with
    threshold ``delta``; all other elements contribute a mean absolute
    error. The result is ``2 * peak_loss + non_peak_loss``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true, delta = 5):
        """Compute the loss.

        Args:
            y_pred: predictions, e.g. [B, 24, 1]; 2-D input gets a trailing axis.
            y_true: targets with the same layout as ``y_pred``.
            delta: Huber threshold used for the peak elements.

        Returns:
            Scalar tensor.
        """
        # Normalise shapes so both tensors broadcast against each other.
        if y_pred.ndim == 2:
            y_pred = y_pred.unsqueeze(-1)
        if y_true.ndim == 2:
            y_true = y_true.unsqueeze(-1)

        residual = y_true - y_pred
        is_peak = y_true >= 5
        is_flat = ~is_peak

        # An empty selection yields a zero tensor rather than a Python float.
        if is_peak.any():
            peak_res = residual[is_peak]
            peak_abs = torch.abs(peak_res)
            peak_term = torch.where(peak_abs <= delta,
                                    0.5 * peak_res**2,
                                    delta * (peak_abs - 0.5 * delta)).mean()
        else:
            peak_term = torch.zeros((), device=residual.device)

        if is_flat.any():
            flat_term = torch.abs(residual[is_flat]).mean()
        else:
            flat_term = torch.zeros((), device=residual.device)

        return peak_term * 2 + flat_term
    
class MultiTaskPHLoss(nn.Module):
    """Weighted sum of ``PeakHuberLoss`` over several prediction heads.

    Args:
        loss_weights: optional mapping from head key to weight; when it is
            missing or empty every head gets weight 1.0.
    """

    def __init__(self, loss_weights=None):
        super().__init__()
        self.peakhuberloss = PeakHuberLoss()
        self.loss_weights = loss_weights

    def forward(self, predictions, targets, delta = 5):
        """Combine the per-head losses.

        Args:
            predictions: mapping from head key to predicted tensor.
            targets: mapping with the same keys holding the target tensors.
            delta: Huber threshold forwarded to every head's loss.

        Returns:
            ``(total_loss, losses)`` where ``losses`` maps each key to its
            unweighted loss as a Python float.
        """
        total_loss = 0
        losses = {}
        for key, head_pred in predictions.items():
            head_loss = self.peakhuberloss(head_pred, targets[key], delta = delta)
            head_weight = self.loss_weights[key] if self.loss_weights else 1.0
            losses[key] = head_loss.item()
            total_loss += head_weight * head_loss
        return total_loss, losses

In [8]:
# Seed all RNGs before the loaders are built, because load_data() draws a
# random subset of the training samples.
setup_seed(12345)
# Prefer the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
output_sizes = 24

# device = 'cpu'
print('device:', device)
print(all_data.shape)
# print(static_all_data.shape)

# Load the data: build the train / validation / test loaders with batch size 1024.
train_dataloader, val_dataloader, test_dataloader = load_data(all_data[:, :, :], 1024)


device: cuda
(753, 3312, 8)
单个样本数量： 66
站点数量： 753
总样本数量： 49698
a (753, 1824, 8) (753, 1824, 1)
单个样本数量： 21
站点数量： 753
总样本数量： 15813
a (753, 744, 8) (753, 744, 1)
单个样本数量： 21
站点数量： 753
总样本数量： 15813
a (753, 744, 8) (753, 744, 1)


In [25]:
def prepare_decoder_inputs(x_enc, y_true, label_len=24, pred_len=24, feature_index=0):
    """Build the Informer decoder input from the encoder input.

    The last ``label_len`` values of the target channel of ``x_enc`` are
    followed by ``pred_len`` zeros (placeholders for the future steps). That
    sequence becomes channel 0 of the result; all remaining channels are
    zero-filled so the decoder input is as wide as the encoder input.

    Args:
        x_enc: (B, seq_len, feat_dim) encoder input containing the target in
            channel ``feature_index``.
        y_true: ground truth for the loss; not used when building the input.
        label_len: number of trailing known target steps to copy.
        pred_len: number of future steps to pad with zeros.
        feature_index: channel of ``x_enc`` that holds the target.

    Returns:
        Tensor (B, label_len + pred_len, feat_dim).
    """
    batch, _, n_feat = x_enc.shape
    dev = x_enc.device

    # Known part: the most recent target values from the encoder window.
    known = x_enc[:, -label_len:, feature_index:feature_index+1].clone()  # (B, label_len, 1)
    # Unknown part: zeros for the steps to be predicted.
    future = torch.zeros(batch, pred_len, 1, device=dev)
    target_channel = torch.cat([known, future], dim=1)  # (B, label_len + pred_len, 1)

    if n_feat <= 1:
        return target_channel

    # Zero-fill the other channels (known future covariates could go here).
    filler = torch.zeros(batch, label_len + pred_len, n_feat - 1, device=dev)
    return torch.cat([target_channel, filler], dim=-1)  # (B, label_len + pred_len, feat_dim)


def train_one_epoch(model, loader, optimizer, criterion, DEVICE):
    """Run one optimisation pass over ``loader``.

    Args:
        model: network called as ``model(x_enc, x_dec)``.
        loader: iterable of ``(x, y)`` batches.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(pred, y)``.
        DEVICE: device the batches are moved to.

    Returns:
        Sample-weighted mean training loss of the epoch.
    """
    model.train()
    loss_sum, seen = 0.0, 0
    for inputs, targets in loader:
        inputs = inputs.float().to(DEVICE)    # (B, 168, feat)
        targets = targets.float().to(DEVICE)  # (B, 24, 1)
        dec_inputs = prepare_decoder_inputs(inputs, targets, label_len=24, pred_len=24, feature_index=0)

        optimizer.zero_grad()
        loss = criterion(model(inputs, dec_inputs), targets)  # prediction is (B, 24, 1)
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so the epoch mean is per sample.
        batch = inputs.size(0)
        loss_sum += loss.item() * batch
        seen += batch
    return loss_sum / seen


def evaluate(model, loader, criterion, DEVICE):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: network called as ``model(x_enc, x_dec)``.
        loader: iterable of ``(x, y)`` batches.
        criterion: loss called as ``criterion(pred, y)``.
        DEVICE: device the batches are moved to.

    Returns:
        ``(mean_loss, mae, rmse, preds, trues)`` where ``mean_loss`` is the
        sample-weighted mean loss and ``preds`` / ``trues`` are NumPy arrays
        with all batches concatenated along axis 0.
    """
    model.eval()
    loss_sum, seen = 0.0, 0
    pred_chunks, true_chunks = [], []
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.float().to(DEVICE)
            targets = targets.float().to(DEVICE)
            dec_inputs = prepare_decoder_inputs(inputs, targets, label_len=24, pred_len=24, feature_index=0)
            outputs = model(inputs, dec_inputs)  # (B, 24, 1)
            batch = inputs.size(0)
            loss_sum += criterion(outputs, targets).item() * batch
            seen += batch
            pred_chunks.append(outputs.cpu().numpy())
            true_chunks.append(targets.cpu().numpy())

    preds = np.concatenate(pred_chunks, axis=0)
    trues = np.concatenate(true_chunks, axis=0)
    # Element-wise MAE / RMSE over every predicted step.
    errors = preds - trues
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors ** 2))
    return loss_sum / seen, mae, rmse, preds, trues


def evaluate_metrics(y_pred, y_true):
    """Report MSE / MAPE / WMAPE, counting only elements whose true value is > 5.

    Metrics are computed for the morning hours (7-9), the evening hours
    (18-20) and all 24 hours, printed, and returned.

    Args:
        y_pred: array [N, 24, 1] of predictions.
        y_true: array [N, 24, 1] of ground-truth values.

    Returns:
        Dict with keys ``'morning'``, ``'evening'`` and ``'all_day'``, each
        mapping to ``{'mse', 'mape', 'wmape'}``; values are NaN when no
        element of that period exceeds the threshold.
    """
    print(y_pred.shape)
    # Drop the trailing singleton feature axis -> [N, 24].
    y_pred = y_pred.squeeze(-1)
    y_true = y_true.squeeze(-1)

    def _scores_above(truth, pred, gt_min=5):
        """Metrics restricted to elements with truth > gt_min."""
        keep = truth > gt_min
        if not np.any(keep):
            return float('nan'), float('nan'), float('nan')
        kept_true = truth[keep]
        diff = pred[keep] - kept_true
        mse = float(np.mean(diff ** 2))
        mape = float(np.mean(np.abs(diff / kept_true)))
        total = float(np.sum(np.abs(kept_true)))
        wmape = float(np.sum(np.abs(diff)) / total) if total > 0 else float('nan')
        return mse, mape, wmape

    # Hour-of-day columns that make up each reporting period.
    periods = (
        ('morning', np.array([7, 8, 9])),
        ('evening', np.array([18, 19, 20])),
        ('all_day', np.arange(24)),
    )
    report = {}
    for period, hours in periods:
        scores = _scores_above(y_true[:, hours].reshape(-1), y_pred[:, hours].reshape(-1))
        report[period] = dict(zip(('mse', 'mape', 'wmape'), scores))

    am, pm, day = report['morning'], report['evening'], report['all_day']
    print("\n=== Test Metrics (y_true > 5 only) ===")
    print(f"Morning 7-9   -> MSE: {am['mse']:.4f}, MAPE: {am['mape']:.4f}, WMAPE: {am['wmape']:.4f}")
    print(f"Evening 18-20 -> MSE: {pm['mse']:.4f}, MAPE: {pm['mape']:.4f}, WMAPE: {pm['wmape']:.4f}")
    print(f"All-day 0-23  -> MSE: {day['mse']:.4f}, MAPE: {day['mape']:.4f}, WMAPE: {day['wmape']:.4f}")

    return report

def main_train(all_data, batch_size=512, epochs=30, model_save="pred_model/net_divvy_informer_1.pth"):
    """Train the Informer, keep the best checkpoint and evaluate it on the test set.

    Args:
        all_data: array indexed as [station, time step, feature].
        batch_size: batch size for all data loaders.
        epochs: number of training epochs.
        model_save: path of the best-checkpoint file; its parent directory
            is created when missing.

    Returns:
        ``(model, (preds, trues))`` with the test-set predictions and
        targets as NumPy arrays.
    """
    # Prepare the data loaders.
    train_loader, val_loader, test_loader = load_data(all_data, batch_size=batch_size)
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    enc_in = 8  # number of input features
    dec_in = 8
    c_out = 1

    model = Informer(
        enc_in=enc_in, dec_in=dec_in, c_out=c_out,
        seq_len=168, label_len=24, out_len=24,
        d_model=256, n_heads=8, e_layers=3, d_layers=2, d_ff=1024, distil=True
    ).to(DEVICE)

    criterion = PeakHuberLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    # torch.save does not create missing directories.
    checkpoint_dir = os.path.dirname(model_save)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    best_val_loss = 1e9
    checkpoint_written = False
    for epoch in range(1, epochs + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, DEVICE)
        val_loss, val_mae, val_rmse, _, _ = evaluate(model, val_loader, criterion, DEVICE)
        scheduler.step()

        print(f"[Epoch {epoch}] train_loss={train_loss:.6f} val_loss={val_loss:.6f} val_mae={val_mae:.6f} val_rmse={val_rmse:.6f}")

        # Keep the checkpoint with the lowest validation loss.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_save)
            checkpoint_written = True
            print("Saved best model.")

    # Reload the best weights of THIS run before testing. map_location keeps
    # the load working when the checkpoint was written on another device.
    if checkpoint_written:
        model.load_state_dict(torch.load(model_save, map_location=DEVICE))
    else:
        # No epoch improved on the initial threshold (e.g. epochs == 0 or a
        # NaN loss); do not silently load a stale or missing file.
        print("No checkpoint was saved during this run; testing the final weights.")

    test_loss, test_mae, test_rmse, preds, trues = evaluate(model, test_loader, criterion, DEVICE)
    print(f"TEST: loss={test_loss:.6f} mae={test_mae:.6f} rmse={test_rmse:.6f}")
    evaluate_metrics(preds, trues)
    # Return the model and the test results.
    return model, (preds, trues)

# model, results = main_train(all_data)

In [None]:

# Rebuild the network with the hyper-parameters used for training and
# evaluate the saved checkpoint on the test split.
model = Informer(
    enc_in=8, dec_in=8, c_out=1,
    seq_len=168, label_len=24, out_len=24,
    d_model=256, n_heads=8, e_layers=3, d_layers=2, d_ff=1024, distil=True
).to(device)
# map_location lets a checkpoint written on the GPU load on a CPU-only machine.
model.load_state_dict(torch.load("pred_model/net_divvy_informer_1.pth", map_location=device))
criterion = PeakHuberLoss()
test_loss, test_mae, test_rmse, preds, trues = evaluate(model, test_dataloader, criterion, device)
print(f"TEST: loss={test_loss:.6f} mae={test_mae:.6f} rmse={test_rmse:.6f}")


TEST: loss=55.015587 mae=4.080723 rmse=9.547833


In [26]:
# Print the peak-hour / all-day metrics for the test predictions, then the
# shapes of the target and prediction arrays.
evaluate_metrics(preds, trues)
print(trues.shape,preds.shape)

(15813, 24, 1)

=== Test Metrics (y_true > 5 only) ===
Morning 7-9   -> MSE: 589.6521, MAPE: 0.2628, WMAPE: 0.4143
Evening 18-20 -> MSE: 423.0138, MAPE: 0.2415, WMAPE: 0.3759
All-day 0-23  -> MSE: 438.8655, MAPE: 0.2563, WMAPE: 0.3528
(15813, 24, 1) (15813, 24, 1)
