### WESAD 数据集分析与模型训练

#### 加载数据
我们首先加载WESAD数据集中的特定受试者的数据。每个受试者的数据存储在一个`.pkl`文件中，包含了信号数据、标签和受试者信息。



In [19]:
import os
import pickle
import numpy as np

def load_wesad_data(participant_id, data_path='WESAD'):
    file_path = os.path.join(data_path, f'S{participant_id}', f'S{participant_id}.pkl')
    with open(file_path, 'rb') as file:
        data = pickle.load(file, encoding='latin1')
    return data

# 加载特定受试者的数据
participant_id = 2  # 示例中加载第2个受试者的数据
data = load_wesad_data(participant_id)

# 查看数据结构
print(data.keys())  # dict_keys(['signal', 'label', 'subject'])
print(data['signal'].keys())  # dict_keys(['chest', 'wrist'])
print(data['signal']['chest'].keys())  # dict_keys(['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp'])
print(data['signal']['chest']['ECG'].shape)  # 查看具体数据的形状
# 查看label的标签
print(data['subject'])  # S2


dict_keys(['signal', 'label', 'subject'])
dict_keys(['chest', 'wrist'])
dict_keys(['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp'])
(4255300, 1)
S2


#### 数据结构分析

- **信号数据（signal）**：包括来自`chest`和`wrist`传感器的数据。
  - **胸部传感器（chest）**：
    - **ACC**：加速度计数据，用于测量运动和振动。
    - **ECG**：心电图数据，用于分析心脏活动。
    - **EMG**：肌电图数据，用于测量肌肉活动。
    - **EDA**：皮肤电活动数据，用于测量皮肤电导率变化。
    - **Temp**：体温数据，用于测量皮肤表面温度。
    - **Resp**：呼吸数据，用于测量呼吸频率和深度。
- **标签数据（label）**：情绪和压力状态的标签。
- **受试者信息（subject）**：包含受试者的基本信息。

数据的维度：
- 数据包含4255300个样本，每个样本有6个特征。


In [8]:
from sklearn.preprocessing import StandardScaler

def preprocess_data(data):
    ecg_data = data['signal']['chest']['ECG']
    eda_data = data['signal']['chest']['EDA'].reshape(-1, 1)
    temp_data = data['signal']['chest']['Temp'].reshape(-1, 1)
    acc_data = data['signal']['chest']['ACC']
    
    # 确保所有数据的长度相同
    min_length = min(len(ecg_data), len(eda_data), len(temp_data), len(acc_data))
    ecg_data = ecg_data[:min_length]
    eda_data = eda_data[:min_length]
    temp_data = temp_data[:min_length]
    acc_data = acc_data[:min_length]

    combined_data = np.hstack((ecg_data, eda_data, temp_data, acc_data))
    
    scaler = StandardScaler()
    combined_data = scaler.fit_transform(combined_data)
    
    labels = data['label'][:min_length]
    
    # 输出标签的唯一值和范围
    print(f"标签的唯一值: {np.unique(labels)}")
    print(f"标签的最大值: {np.max(labels)}")
    print(f"标签的最小值: {np.min(labels)}")
    
    return combined_data, labels

preprocessed_data, labels = preprocess_data(data)
print(preprocessed_data.shape)
print(labels.shape)


标签的唯一值: [0 1 2 3 4 6 7]
标签的最大值: 7
标签的最小值: 0
(4255300, 6)
(4255300,)


In [21]:
def preprocess_data(data):
    ecg_data = data['signal']['chest']['ECG']
    eda_data = data['signal']['chest']['EDA'].reshape(-1, 1)
    temp_data = data['signal']['chest']['Temp'].reshape(-1, 1)
    acc_data = data['signal']['chest']['ACC']
    
    # 确保所有数据的长度相同
    min_length = min(len(ecg_data), len(eda_data), len(temp_data), len(acc_data))
    ecg_data = ecg_data[:min_length]
    eda_data = eda_data[:min_length]
    temp_data = temp_data[:min_length]
    acc_data = acc_data[:min_length]

    combined_data = np.hstack((ecg_data, eda_data, temp_data, acc_data))
    
    scaler = StandardScaler()
    combined_data = scaler.fit_transform(combined_data)
    
    labels = data['label'][:min_length]
    
    # 输出标签的唯一值和范围
    unique_labels = np.unique(labels)
    print(f"标签的唯一值: {unique_labels}")
    print(f"标签的最大值: {np.max(labels)}")
    print(f"标签的最小值: {np.min(labels)}")
    
    # 输出每一个维度的标签以及对应的维度
    dimension_labels = ['ECG', 'EDA', 'Temp', 'ACC_X', 'ACC_Y', 'ACC_Z']
    for i, label in enumerate(dimension_labels):
        print(f"维度 {i+1} ({label}) 的标签值: {combined_data[:, i]}")
    
    return combined_data, labels

preprocessed_data, labels = preprocess_data(data)
print(preprocessed_data.shape)
print(labels.shape)


标签的唯一值: [0 1 2 3 4 6 7]
标签的最大值: 7
标签的最小值: 0
维度 1 (ECG) 的标签值: [ 0.13117504  0.12404464  0.09938534 ... -0.04322267 -0.00697647
  0.01857413]
维度 2 (EDA) 的标签值: [ 3.00556138  3.01911462  2.99970884 ... -0.94305162 -0.93873923
 -0.93904726]
维度 3 (Temp) 的标签值: [-0.59005113 -0.58321442 -0.57642536 ...  0.45472658  0.47428385
  0.45241591]
维度 4 (ACC_X) 的标签值: [1.1508615  0.98183069 0.88132631 ... 0.67346382 0.68031688 0.66432756]
维度 5 (ACC_Y) 的标签值: [-1.38394068 -1.37770742 -1.34654205 ...  0.14629919  0.15253245
  0.1743484 ]
维度 6 (ACC_Z) 的标签值: [-0.45070835 -0.43671941 -0.38809102 ...  0.39462741  0.39995645
  0.40128876]
(4255300, 6)
(4255300,)


In [None]:
# 单独实现
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 将数据转换为PyTorch张量
X_tensor = torch.tensor(preprocessed_data, dtype=torch.float32)
y_tensor = torch.tensor(labels, dtype=torch.long)

# 划分训练集和测试集
train_size = int(0.8 * len(X_tensor))
test_size = len(X_tensor) - train_size
X_train, X_test = torch.utils.data.random_split(TensorDataset(X_tensor, y_tensor), [train_size, test_size])

# 创建数据加载器
train_loader = DataLoader(X_train, batch_size=32, shuffle=True)
test_loader = DataLoader(X_test, batch_size=32, shuffle=False)

# 定义CNN模型
class SimpleCNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * (input_size // 2), 100)
        self.fc2 = nn.Linear(100, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = x.view(-1, 64 * (x.shape[2]))
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# 初始化模型、损失函数和优化器
input_size = preprocessed_data.shape[1]
num_classes = len(np.unique(labels))
model = SimpleCNN(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.unsqueeze(1)  # 添加通道维度
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')


#### 划分训练集和测试集

我们将预处理后的数据集划分为训练集和测试集，其中20%的数据用于测试，80%的数据用于训练。


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, labels, test_size=0.2, random_state=42)


#### 构建和训练模型

使用PyTorch构建和训练一个简单的卷积神经网络（CNN）模型。


In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# 将数据转换为PyTorch张量
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# 创建数据加载器
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 打印标签的唯一值，确保标签范围正确
print(f"训练集标签的唯一值: {np.unique(y_train_tensor.numpy())}")
print(f"测试集标签的唯一值: {np.unique(y_test_tensor.numpy())}")

# 确定类别数
num_classes = len(np.unique(y_train_tensor.numpy()))
print(f"类别数: {num_classes}")

# 定义CNN模型
class SimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=2)
        self.pool = nn.MaxPool1d(2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * ((X_train_tensor.shape[1] - 1) // 2), 100)
        self.fc2 = nn.Linear(100, num_classes)  # 动态设置类别数

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = SimpleCNN(num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.unsqueeze(1)  # 添加通道维度
        print(f"输入张量的形状: {inputs.shape}")
        print(f"标签张量的形状: {labels.shape}, 标签: {labels}")
        optimizer.zero_grad()
        outputs = model(inputs)
        print(f"模型输出的形状: {outputs.shape}, 输出: {outputs}")
        loss = criterion(outputs, labels)
        print(f"损失: {loss.item()}")
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

# 测试模型
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.unsqueeze(1)  # 添加通道维度
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'测试集准确率: {accuracy:.2f}%')


训练集标签的唯一值: [0 1 2 3 4 6 7]
测试集标签的唯一值: [0 1 2 3 4 6 7]
类别数: 7
输入张量的形状: torch.Size([32, 1, 6])
标签张量的形状: torch.Size([32]), 标签: tensor([4, 0, 0, 4, 0, 4, 0, 3, 2, 4, 0, 1, 0, 4, 0, 0, 0, 0, 1, 4, 0, 4, 0, 1,
        0, 4, 3, 1, 0, 0, 0, 0])
模型输出的形状: torch.Size([32, 7]), 输出: tensor([[ 0.0706,  0.0507,  0.2244,  0.1398, -0.1710,  0.2390, -0.1726],
        [ 0.1827, -0.0148,  0.2556,  0.1626, -0.0368,  0.1507, -0.0920],
        [ 0.4400,  0.1939,  0.2683,  0.1646, -0.2736,  0.3212, -0.0038],
        [ 0.1223,  0.0279,  0.2056,  0.1041, -0.0901,  0.1405, -0.0888],
        [-0.0193,  0.0948,  0.2325,  0.1827, -0.2816,  0.2274, -0.1541],
        [ 0.0451,  0.0693,  0.2068,  0.1399, -0.1906,  0.2479, -0.1803],
        [ 0.2539,  0.0896,  0.3333,  0.1771, -0.1736,  0.2243, -0.1259],
        [ 0.1679, -0.0799,  0.2240,  0.1677,  0.0016,  0.1273, -0.0868],
        [ 0.1137, -0.0570,  0.1935,  0.1327,  0.0543,  0.0816, -0.0471],
        [ 0.1080,  0.0640,  0.2073,  0.0865, -0.1021,  0.1646, -0.0938],

IndexError: Target 7 is out of bounds.