In [None]:
import numpy as np

# 加载 .npy 文件
data = np.load('./processed_data/SMD/machine-1-1_train.npy')
print(f"数据已加载，类型: {type(data)}")  

数据已加载，类型: <class 'numpy.ndarray'>


In [3]:
# 查看数据形状（样本数, 特征数）
print(f"数据形状: {data.shape}")

# 查看数据类型（如 float32, int64 等）
print(f"数据类型: {data.dtype}")

# 查看数据维度
print(f"数据维度: {data.ndim}")

# 查看数据集行数和列数
rows, columns = data.shape

if columns == 38:
    print("数据符合38维特征要求")
else:
    print(f"数据维度异常，当前数据特征维度为{columns}")

数据形状: (28479, 38)
数据类型: float64
数据维度: 2
数据符合38维特征要求


In [7]:
import numpy as np

def inspect_npy_file(file_path):
    try:
        # 加载数据
        data = np.load(file_path)
        print(f"数据已加载，类型: {type(data)}")
        
        # 基本信息
        print("\n=== 基本信息 ===")
        print(f"数据形状: {data.shape}")
        print(f"数据类型: {data.dtype}")
        print(f"数据维度: {data.ndim}")
        
        # 统计信息
        print("\n=== 整体统计信息 ===")
        print(f"数据均值: {data.mean()}")
        print(f"数据标准差: {data.std()}")
        print(f"数据最小值: {data.min()}")
        print(f"数据最大值: {data.max()}")
        
        # 前几行信息
        rows, columns = data.shape
        print("\n=== 数据前几行内容 ===")
        if rows < 5:
            print(f"数据前{rows}行内容：")
            print(data[:rows, :])
        else:
            print("数据前5行内容：")
            print(data[:5, :])
        
        # 异常值检查
        print("\n=== 异常值检查 ===")
        has_nan = np.isnan(data).any()
        print(f"数据是否包含 NaN: {has_nan}")
        
        has_inf = np.isinf(data).any()
        print(f"数据是否包含无穷大: {has_inf}")
        
        # 如果有NaN，统计每列的NaN数量
        if has_nan:
            print("\n=== NaN 分布 ===")
            for i in range(data.shape[1]):
                nan_count = np.isnan(data[:, i]).sum()
                if nan_count > 0:
                    print(f"特征 {i+1} 包含 {nan_count} 个 NaN 值")
        
        return data
    
    except FileNotFoundError:
        print(f"错误：文件 '{file_path}' 不存在")
        return None
    except Exception as e:
        print(f"错误：加载文件时发生异常: {e}")
        return None

# 使用示例
if __name__ == "__main__":
    file_path = './processed_data/SMD/machine-1-1_labels.npy'  # 替换为你的文件路径
    data = inspect_npy_file(file_path)
    
    # 如果数据加载成功，可以进一步处理
    if data is not None:
        # 例如：保存数据的前100行到新文件
        np.save('your_data_first_100.npy', data[:100])


数据已加载，类型: <class 'numpy.ndarray'>

=== 基本信息 ===
数据形状: (28479, 38)
数据类型: float64
数据维度: 2

=== 整体统计信息 ===
数据均值: 0.03460352133889976
数据标准差: 0.18277340520395194
数据最小值: 0.0
数据最大值: 1.0

=== 数据前几行内容 ===
数据前5行内容：
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

=== 异常值检查 ===
数据是否包含 NaN: False
数据是否包含无穷大: False


In [16]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from time import time

# 设置颜色输出
class color:
    HEADER = '\033[95m'
    GREEN = '\033[92m'
    BOLD = '\033[1m'
    ENDC = '\033[0m'

# 修复后的LSTM_AD模型定义
class LSTM_AD(nn.Module):
    def __init__(self, feats, window_size=10):
        super(LSTM_AD, self).__init__()
        self.name = 'LSTM_AD'
        self.lr = 0.002
        self.n_feats = feats
        self.n_hidden = 64
        self.window_size = window_size
        
        # 修改LSTM层以处理窗口数据
        self.lstm = nn.LSTM(feats, self.n_hidden, batch_first=True)
        self.lstm2 = nn.LSTM(self.n_hidden, self.n_feats, batch_first=True)
        self.fcn = nn.Sequential(nn.Linear(self.n_feats, self.n_feats), nn.Sigmoid())

    def forward(self, x):
        # 重塑输入以匹配[batch, seq_len, features]
        batch_size = x.size(0)
        x = x.view(batch_size, self.window_size, self.n_feats)
        
        hidden = (torch.rand(1, batch_size, self.n_hidden, dtype=torch.float32).to(x.device), 
                  torch.randn(1, batch_size, self.n_hidden, dtype=torch.float32).to(x.device))
        hidden2 = (torch.rand(1, batch_size, self.n_feats, dtype=torch.float32).to(x.device), 
                   torch.randn(1, batch_size, self.n_feats, dtype=torch.float32).to(x.device))
        
        out, hidden = self.lstm(x, hidden)
        out, hidden2 = self.lstm2(out, hidden2)
        out = self.fcn(out[:, -1, :])  # 只取最后一个时间步的输出
        return out

# 修复后的数据窗口化函数
def convert_to_windows(data, model):
    windows = []
    w_size = model.window_size
    
    for i in range(len(data)):
        if i >= w_size:
            w = data[i - w_size:i]
        else:
            # 处理序列开始部分的填充
            padding = data[0].repeat(w_size - i, 1)
            w = torch.cat([padding, data[0:i]])
        windows.append(w)  # 不再展平，保持[window_size, features]形状
    
    return torch.stack(windows)

# 加载数据集
def load_dataset(dataset):
    folder = os.path.join("processed_data", dataset)  # 修改为processed/SMD目录
    if not os.path.exists(folder):
        raise Exception(f'Processed Data not found in {folder}')
    
    # 加载SMD数据
    train = np.load(os.path.join(folder, 'machine-1-1_train.npy'))
    test = np.load(os.path.join(folder, 'machine-1-1_test.npy'))
    labels = np.load(os.path.join(folder, 'machine-1-1_labels.npy'))
    
    print(f"Loaded dataset from {folder}")
    print(f"Train shape: {train.shape}, Test shape: {test.shape}, Labels shape: {labels.shape}")
    
    return train, test, labels

# 训练函数
def train(model, train_loader, optimizer, scheduler, num_epochs=5):
    model.train()
    criterion = nn.MSELoss()
    accuracy_list = []
    
    for epoch in tqdm(range(num_epochs)):
        total_loss = 0
        for data in train_loader:
            data = data.float()  # 确保数据为float32
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, data[:, -1, :])  # 比较预测值与窗口最后一个时间步
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        scheduler.step()
        avg_loss = total_loss / len(train_loader)
        lr = optimizer.param_groups[0]['lr']
        accuracy_list.append((avg_loss, lr))
        tqdm.write(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}, LR: {lr:.6f}')
    
    return accuracy_list

# 测试函数
def test(model, test_loader):
    model.eval()
    criterion = nn.MSELoss(reduction='none')
    all_losses = []
    all_predictions = []
    
    with torch.no_grad():
        for data in test_loader:
            data = data.float()  # 确保数据为float32
            output = model(data)
            loss = criterion(output, data[:, -1, :])  # 比较预测值与窗口最后一个时间步
            all_losses.append(loss.cpu().numpy())
            all_predictions.append(output.cpu().numpy())
    
    return np.vstack(all_losses), np.vstack(all_predictions)

# POT评估函数（Peak Over Threshold）
def pot_eval(train_loss, test_loss, labels):
    # 简化版POT评估（实际应用中可能需要更复杂的实现）
    # 这里使用训练损失的95%分位数作为阈值
    threshold = np.quantile(train_loss, 0.95)
    
    # 预测异常
    predictions = (test_loss > threshold).astype(int)
    
    # 计算指标
    tp = np.sum((predictions == 1) & (labels == 1))
    fp = np.sum((predictions == 1) & (labels == 0))
    fn = np.sum((predictions == 0) & (labels == 1))
    tn = np.sum((predictions == 0) & (labels == 0))
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'TP': tp,
        'FP': fp,
        'FN': fn,
        'TN': tn
    }, predictions

# 保存模型
def save_model(model, optimizer, scheduler, epoch, accuracy_list, dataset):
    folder = f'checkpoints/LSTM_AD_{dataset}/'
    os.makedirs(folder, exist_ok=True)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'accuracy_list': accuracy_list
    }, f'{folder}/model.ckpt')
    print(f"Model saved to {folder}")

# 主函数
def main():
    dataset = 'SMD'
    print(f"{color.HEADER}Training LSTM_AD on {dataset}{color.ENDC}")
    
    # 加载数据
    train_data, test_data, labels = load_dataset(dataset)
    
    # 转换为PyTorch张量，使用float32（单精度）
    train_tensor = torch.FloatTensor(train_data)
    test_tensor = torch.FloatTensor(test_data)
    labels_tensor = torch.FloatTensor(labels)
    
    # 修复：动态创建Args对象
    args = type('Args', (object,), {
        'model': 'LSTM_AD',
        'dataset': dataset,
        'test': False,
        'retrain': False,
        'less': False
    })()
    
    # 初始化模型，传入窗口大小和特征数
    window_size = 10
    model = LSTM_AD(train_data.shape[1], window_size)
    
    # 初始化优化器和调度器
    optimizer = torch.optim.AdamW(model.parameters(), lr=model.lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5, 0.9)
    
    # 窗口化数据
    train_windows = convert_to_windows(train_tensor, model)
    test_windows = convert_to_windows(test_tensor, model)
    
    # 创建数据加载器
    train_loader = DataLoader(train_windows, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_windows, batch_size=64)
    
    # 训练模型
    start_time = time()
    accuracy_list = train(model, train_loader, optimizer, scheduler, num_epochs=5)
    end_time = time()
    print(f"Training time: {end_time - start_time:.2f} seconds")
    
    # 保存模型
    save_model(model, optimizer, scheduler, 5, accuracy_list, dataset)
    
    # 测试模型
    train_loss, _ = test(model, train_loader)
    test_loss, predictions = test(model, test_loader)
    
    # 评估
    print(f"{color.HEADER}Evaluating LSTM_AD on {dataset}{color.ENDC}")
    
    # 计算每个特征的评估结果
    df = pd.DataFrame()
    for i in range(test_loss.shape[1]):
        train_feat_loss = train_loss[:, i]
        test_feat_loss = test_loss[:, i]
        feat_labels = labels[:, i]
        
        result, _ = pot_eval(train_feat_loss, test_feat_loss, feat_labels)
        df = df._append({
            'Feature': i,
            'Precision': result['Precision'],
            'Recall': result['Recall'],
            'F1-Score': result['F1-Score']
        }, ignore_index=True)
    
    # 计算整体评估结果（平均所有特征）
    overall_loss = np.mean(test_loss, axis=1)
    overall_labels = (np.sum(labels, axis=1) >= 1).astype(int)
    overall_result, _ = pot_eval(np.mean(train_loss, axis=1), overall_loss, overall_labels)
    
    print("\nPer-feature Results:")
    print(df)
    print("\nOverall Results:")
    for key, value in overall_result.items():
        print(f"{key}: {value:.4f}")

if __name__ == "__main__":
    main()

[95mTraining LSTM_AD on SMD[0m
Loaded dataset from processed_data\SMD
Train shape: (28479, 38), Test shape: (28479, 38), Labels shape: (28479, 38)


 20%|██        | 1/5 [00:04<00:18,  4.74s/it]

Epoch 1/5, Loss: 0.009853, LR: 0.002000


 40%|████      | 2/5 [00:09<00:13,  4.58s/it]

Epoch 2/5, Loss: 0.003040, LR: 0.002000


 60%|██████    | 3/5 [00:17<00:12,  6.43s/it]

Epoch 3/5, Loss: 0.003036, LR: 0.002000


 80%|████████  | 4/5 [00:26<00:07,  7.12s/it]

Epoch 4/5, Loss: 0.003031, LR: 0.002000


100%|██████████| 5/5 [00:31<00:00,  6.32s/it]


Epoch 5/5, Loss: 0.001303, LR: 0.001800
Training time: 31.60 seconds
Model saved to checkpoints/LSTM_AD_SMD/
[95mEvaluating LSTM_AD on SMD[0m

Per-feature Results:
    Feature  Precision    Recall  F1-Score
0       0.0   0.315381  0.946617  0.473130
1       1.0   0.156659  0.462471  0.234039
2       2.0   0.108661  0.529412  0.180313
3       3.0   0.103463  0.567059  0.174997
4       4.0   0.000000  0.000000  0.000000
5       5.0   0.039037  0.409747  0.071283
6       6.0   0.024401  0.102888  0.039446
7       7.0   0.000000  0.000000  0.000000
8       8.0   0.260467  0.263592  0.262020
9       9.0   0.225324  0.588722  0.325911
10     10.0   0.143590  0.175686  0.158025
11     11.0   0.338600  0.225564  0.270758
12     12.0   0.444877  0.257218  0.325968
13     13.0   0.140866  0.143982  0.142407
14     14.0   0.258498  0.245219  0.251684
15     15.0   0.140635  0.177255  0.156836
16     16.0   0.000000  0.000000  0.000000
17     17.0   0.000000  0.000000  0.000000
18     18.0   0.0