In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from ydata_synthetic.synthesizers.timeseries import TimeGAN
from ydata_synthetic.synthesizers import ModelParameters
import tensorflow as tf
from tensorflow.keras import layers

#==========================
# 1. Data loading and preprocessing
#==========================
# Load the patient table (log-transformed, judging by the file name — confirm).
data_path = 'data_GAN_log_transformed.csv'
data = pd.read_csv(data_path)

# Identifier, date, day-index and demographic columns are not model features.
exclude_cols = ['唯一编号', '日期', '住院日序号', '性别', '年龄']
feature_cols = [c for c in data.columns if c not in exclude_cols]

# Coerce every feature column to numeric; unparseable or missing cells become 0.
data[feature_cols] = data[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

unique_ids = data['唯一编号'].unique()
max_time_step = data['住院日序号'].max()
num_samples = len(unique_ids)  # number of distinct patients in the original data set
# Cast to a plain int so the value is a valid array dimension even when the
# day-index column was parsed as a float dtype.
num_time_steps = int(max_time_step)
num_features = len(feature_cols)

# Reshape into (samples, time steps, features). A patient with fewer rows than
# num_time_steps keeps trailing all-zero rows as padding.
time_series_data = np.zeros((num_samples, num_time_steps, num_features))

# Sort once by day index and split with a single groupby instead of scanning
# the whole frame with a boolean mask for every patient. The stable sort keeps
# the file order for rows that share a day index, and groupby preserves row
# order inside each group.
row_of = {uid: i for i, uid in enumerate(unique_ids)}
ordered = data.sort_values('住院日序号', kind='stable')
for uid, patient_data in ordered.groupby('唯一编号', sort=False):
    patient_series = patient_data[feature_cols].to_numpy()
    seq_len = patient_series.shape[0]
    time_series_data[row_of[uid], :seq_len, :] = patient_series

print("输入数据形状:", time_series_data.shape)

# Standardize each feature over all (patient, time step) rows, padding rows
# included, then restore the 3-D layout.
ts_reshaped = time_series_data.reshape(-1, num_features)
scaler = StandardScaler()
ts_scaled = scaler.fit_transform(ts_reshaped)
time_series_data_scaled = ts_scaled.reshape(num_samples, num_time_steps, num_features)

#==========================
# 2. TimeGAN variant with attention-augmented generator / discriminator
#==========================
class TimeGANWithAttention(TimeGAN):
    """TimeGAN subclass that defines self-attention generator/discriminator builders.

    NOTE(review): nothing in this file calls ``build_generator`` or
    ``build_discriminator``. Whether ``TimeGAN.train()`` invokes them depends
    on the installed ydata_synthetic version — verify; if it does not,
    training uses the stock TimeGAN networks and the attention layers defined
    here are never part of the model.
    """

    def __init__(self, model_parameters, hidden_dim, seq_len, n_seq, gamma, attention_dim=64):
        """Forward the TimeGAN arguments to the base class and store the attention size.

        Args:
            model_parameters: passed unchanged to ``TimeGAN.__init__``.
            hidden_dim: passed to the base class; used here as the LSTM width.
            seq_len: passed to the base class; used here as the time dimension.
            n_seq: passed to the base class; used here as the feature dimension.
            gamma: passed unchanged to ``TimeGAN.__init__``.
            attention_dim: ``key_dim`` of the multi-head attention layers.
        """
        super().__init__(model_parameters, hidden_dim, seq_len, n_seq, gamma)
        self.attention_dim = attention_dim

    def build_generator(self):
        """Build ``self.generator``: LSTM -> self-attention (residual) -> LSTM -> Dense(tanh).

        Reads ``self.noise_dim``, which is presumably set by the base class
        from ``model_parameters`` — TODO confirm.
        """
        # tf.variable_scope is not available in the TF2 namespace; name_scope
        # gives the same grouping of op names for Keras layers.
        with tf.name_scope('generator'):
            z = layers.Input(shape=(self.seq_len, self.noise_dim))
            x = layers.LSTM(self.hidden_dim, return_sequences=True)(z)
            x = layers.LayerNormalization()(x)
            # Self-attention over the time axis, added back as a residual.
            attention = layers.MultiHeadAttention(num_heads=4, key_dim=self.attention_dim)(x, x)
            x = layers.Add()([x, attention])
            x = layers.LSTM(self.hidden_dim, return_sequences=True)(x)
            x = layers.LayerNormalization()(x)
            # One output vector of size n_seq per time step, squashed to [-1, 1].
            output = layers.Dense(self.n_seq, activation='tanh')(x)
            self.generator = tf.keras.models.Model(z, output)

    def build_discriminator(self):
        """Build ``self.discriminator``: LSTM -> self-attention (residual) -> LSTM -> Dense(sigmoid)."""
        with tf.name_scope('discriminator'):
            x = layers.Input(shape=(self.seq_len, self.n_seq))
            x_seq = layers.LSTM(self.hidden_dim, return_sequences=True)(x)
            x_seq = layers.LayerNormalization()(x_seq)
            attention = layers.MultiHeadAttention(num_heads=4, key_dim=self.attention_dim)(x_seq, x_seq)
            x_seq = layers.Add()([x_seq, attention])
            # The second LSTM returns only its last state, collapsing the time axis.
            x_seq = layers.LSTM(self.hidden_dim)(x_seq)
            x_seq = layers.LayerNormalization()(x_seq)
            # Single sigmoid score per input sequence.
            output = layers.Dense(1, activation='sigmoid')(x_seq)
            self.discriminator = tf.keras.models.Model(x, output)

#==========================
# 3. Train TimeGAN
#==========================
# Sequence length and feature count come straight from the prepared tensor.
seq_len, n_seq = num_time_steps, num_features

# Optimizer / network hyper-parameters handed to the synthesizer.
gan_args = ModelParameters(batch_size=128, lr=5e-4, noise_dim=32, layers_dim=128)

timegan_attention = TimeGANWithAttention(
    gan_args,
    hidden_dim=24,
    seq_len=seq_len,
    n_seq=n_seq,
    gamma=1,
    attention_dim=64,
)

print("开始训练 TimeGANWithAttention 模型...")
timegan_attention.train(time_series_data_scaled, train_steps=4000)
print("TimeGANWithAttention 模型训练完成！")

#==========================
# 4. Generate synthetic data
#    Request more sequences than needed (twice the patient count), then keep
#    only the first num_samples of them. If the synthesizer can return an
#    exact count, generated_count could simply be num_samples.
#==========================
generated_count = 2 * num_samples

# Sample, then crop so the synthetic set has as many sequences as the original.
synthetic_data = timegan_attention.sample(generated_count)[:num_samples]

print("合成数据形状:", synthetic_data.shape)

#==========================
# 5. Score how much each synthetic patient varies over time
#    Per patient: standard deviation of every feature across the time steps,
#    averaged over all features.
#==========================
variation_scores = np.array([
    np.mean(np.std(synthetic_data[i, :, :], axis=0))
    for i in range(num_samples)
])

# Rank patients from most to least variable; the top 25% (rounded up) are
# treated as anomalies.
sorted_indices = np.flip(np.argsort(variation_scores))
top_25_percent = int(np.ceil(0.25 * num_samples))
anomaly_indices = sorted_indices[:top_25_percent]

# Label vector: 1 = anomalous, 0 = normal.
labels = np.zeros(num_samples, dtype=int)
np.put(labels, anomaly_indices, 1)

#==========================
# 6. Amplify the data of the anomalous patients
#    For each anomalous patient, pick 1-2 random time steps and a random 10%
#    of the features (at least one), and scale those cells by scaling_factor.
#    The same feature subset is used for every selected time step.
#==========================
np.random.seed(42)  # fixed seed so the injected anomalies are reproducible
scaling_factor = 1.5

# Loop-invariant: number of features boosted per patient.
num_anomaly_features = max(1, int(num_features * 0.1))
# Sampling without replacement cannot draw more time steps than exist, so cap
# the upper bound at seq_len (only matters for sequences of length 1).
max_anomaly_time_points = min(2, seq_len)

for idx in anomaly_indices:
    # 1..max_anomaly_time_points distinct time steps (randint's upper bound is exclusive).
    num_anomaly_time_points = np.random.randint(1, max_anomaly_time_points + 1)
    anomaly_time_steps = np.random.choice(seq_len, num_anomaly_time_points, replace=False)

    # Distinct feature columns to boost for this patient.
    anomaly_features = np.random.choice(num_features, num_anomaly_features, replace=False)

    # The column-vector / row-vector index pair broadcasts to every
    # (time step, feature) combination, which is scaled in place.
    synthetic_data[idx, anomaly_time_steps[:, None], anomaly_features] *= scaling_factor

#==========================
# 7. Save the results
#==========================
# Write the synthetic sequences and their anomaly labels as .npy files.
for out_path, out_array in (
    ('synthetic_data.npy', synthetic_data),
    ('synthetic_labels.npy', labels),
):
    np.save(out_path, out_array)

# Class balance summary for the saved labels.
num_anomalies = (labels == 1).sum()
num_normals = (labels == 0).sum()
print("数据与标签已保存。")
print(f"异常样本数: {num_anomalies}, 正常样本数: {num_normals}")


输入数据形状: (99, 3, 57)
开始训练 TimeGANWithAttention 模型...


Emddeding network training:   0%|          | 0/4000 [00:00<?, ?it/s]



Emddeding network training: 100%|██████████| 4000/4000 [00:36<00:00, 110.82it/s]
Supervised network training: 100%|██████████| 4000/4000 [00:25<00:00, 156.31it/s]
Joint networks training:   0%|          | 0/4000 [00:00<?, ?it/s]



Joint networks training: 100%|██████████| 4000/4000 [15:03<00:00,  4.43it/s]


TimeGANWithAttention 模型训练完成！


Synthetic data generation: 100%|██████████| 2/2 [00:00<00:00, 14.70it/s]

合成数据形状: (99, 3, 57)
数据与标签已保存。
异常样本数: 25, 正常样本数: 74



