In [215]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np

In [244]:
# 数据读取和预处理
# mut_matrix是提供的突变矩阵，data_y是样本的疾病状态标签
data = pd.read_csv("./clean_data_final.csv")
data_y = data["overall_survival"]
mut_matrix = data.iloc[:,497:]

In [245]:
# 数据集和批量大小
data = TensorDataset(torch.from_numpy(mut_matrix.values).float(), torch.from_numpy(data_y.values).float())
batch_size = 64
data_loader = DataLoader(data, batch_size=batch_size, shuffle=True)

In [246]:
# 定义训练集和验证集的比例
train_ratio = 0.8
valid_ratio = 0.2

# 计算对应的数据量
train_size = int(train_ratio * len(data))
valid_size = len(data) - train_size

# 使用 random_split() 随机分配训练集和验证集
train_set, valid_set = random_split(data, [train_size, valid_size])

# 创建数据加载器
train_loader = DataLoader(train_set, batch_size=batch_size)
valid_loader = DataLoader(valid_set, batch_size=batch_size)

In [247]:
valid_indices = valid_set.indices

In [248]:
# 定义VAE模型结构
class VAE(nn.Module):
    def __init__(self, input_size, hidden_size, latent_size):
        super().__init__()

        # 定义编码器
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.Sigmoid()
        )
        self.mean = nn.Linear(hidden_size, latent_size)
        self.log_var = nn.Linear(hidden_size, latent_size)

        # 定义解码器
        self.decoder = nn.Sequential(
            nn.Linear(latent_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, input_size),
            nn.Sigmoid()
        )

    def encode(self, x):
        h = self.encoder(x)
        mean = self.mean(h)
        log_var = self.log_var(h)
        return mean, log_var

    def reparameterize(self, mean, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        z = mean + eps * std
        return z

    def decode(self, z):
        x_hat = self.decoder(z)
        return x_hat

    def forward(self, x):
        mean, log_var = self.encode(x)
        z = self.reparameterize(mean, log_var)
        x_hat = self.decode(z)
        return x_hat, mean, log_var


# 训练VAE模型
input_size = mut_matrix.shape[1]
hidden_size = 256
latent_size = 64
lr = 1e-3
num_epochs = 50

model = VAE(input_size, hidden_size, latent_size)
optimizer = Adam(model.parameters(), lr=lr)
loss_func = nn.BCELoss(reduction='sum')

for epoch in range(num_epochs):
    for i, (x, _) in enumerate(train_loader):
        x_hat, mean, log_var = model(x)
        kl_divergence = -0.5 * torch.sum(1 + log_var - mean.pow(2) - log_var.exp())
        reconstruction_loss = loss_func(x_hat, x)
        loss = kl_divergence + reconstruction_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item()/batch_size:.4f}")


Epoch [1/50], Step [10/24], Loss: 55.7305
Epoch [1/50], Step [20/24], Loss: 22.9559
Epoch [2/50], Step [10/24], Loss: 19.3815
Epoch [2/50], Step [20/24], Loss: 20.8169
Epoch [3/50], Step [10/24], Loss: 19.1017
Epoch [3/50], Step [20/24], Loss: 19.8626
Epoch [4/50], Step [10/24], Loss: 18.8397
Epoch [4/50], Step [20/24], Loss: 19.5237
Epoch [5/50], Step [10/24], Loss: 18.4583
Epoch [5/50], Step [20/24], Loss: 19.3582
Epoch [6/50], Step [10/24], Loss: 18.4667
Epoch [6/50], Step [20/24], Loss: 19.3943
Epoch [7/50], Step [10/24], Loss: 18.5105
Epoch [7/50], Step [20/24], Loss: 19.3567
Epoch [8/50], Step [10/24], Loss: 18.5992
Epoch [8/50], Step [20/24], Loss: 19.4463
Epoch [9/50], Step [10/24], Loss: 18.6196
Epoch [9/50], Step [20/24], Loss: 19.3839
Epoch [10/50], Step [10/24], Loss: 18.5940
Epoch [10/50], Step [20/24], Loss: 19.3321
Epoch [11/50], Step [10/24], Loss: 18.5320
Epoch [11/50], Step [20/24], Loss: 19.5412
Epoch [12/50], Step [10/24], Loss: 18.4602
Epoch [12/50], Step [20/24], 

In [249]:
# 获取VAE模型的Z表达
z_matrix = np.zeros((len(valid_set), latent_size))
with torch.no_grad():
    model.eval()
    for i, x in enumerate(valid_loader):
        z_mean, _ = model.encode(x[0])
        z_matrix[i*batch_size:(i+1)*batch_size] = z_mean.cpu().numpy()



In [250]:
# 在Z表达的基础上构建分类器模型进行分类
class Classifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()

        layers = []
        for i in range(len(hidden_sizes)):
            if i == 0:
                layers.append(nn.Linear(input_size, hidden_sizes[i],dtype=torch.float64))
            else:
                layers.append(nn.Linear(hidden_sizes[i-1], hidden_sizes[i],dtype=torch.float64))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_sizes[-1], output_size,dtype=torch.float64))
        layers.append(nn.Sigmoid())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

In [251]:
# 分类器模型结构和训练过程
input_size = latent_size
hidden_sizes = [32, 16]
output_size = 1
lr = 1e-3
num_epochs = 50

model = Classifier(input_size, hidden_sizes, output_size)
optimizer = Adam(model.parameters(), lr=lr)
loss_func = nn.BCELoss()

In [252]:
for epoch in range(num_epochs):
    for i, (z, y) in enumerate(zip(z_matrix, data_y)):
        z = torch.from_numpy(z).double()
        y = torch.tensor([[y]]).double().reshape([1,1])
        y_pred = model(z).reshape([1,1])

        loss = loss_func(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(valid_loader)}], Loss: {loss.item():.4f}")

Epoch [1/50], Step [10/6], Loss: 0.7360
Epoch [1/50], Step [20/6], Loss: 0.6556
Epoch [1/50], Step [30/6], Loss: 0.6479
Epoch [1/50], Step [40/6], Loss: 0.6431
Epoch [1/50], Step [50/6], Loss: 0.6412
Epoch [1/50], Step [60/6], Loss: 0.6372
Epoch [1/50], Step [70/6], Loss: 0.7604
Epoch [1/50], Step [80/6], Loss: 0.7590
Epoch [1/50], Step [90/6], Loss: 0.6298
Epoch [1/50], Step [100/6], Loss: 0.7696
Epoch [1/50], Step [110/6], Loss: 0.6190
Epoch [1/50], Step [120/6], Loss: 0.6071
Epoch [1/50], Step [130/6], Loss: 0.7803
Epoch [1/50], Step [140/6], Loss: 0.8048
Epoch [1/50], Step [150/6], Loss: 0.6025
Epoch [1/50], Step [160/6], Loss: 0.7947
Epoch [1/50], Step [170/6], Loss: 0.6063
Epoch [1/50], Step [180/6], Loss: 0.5993
Epoch [1/50], Step [190/6], Loss: 0.6087
Epoch [1/50], Step [200/6], Loss: 0.6095
Epoch [1/50], Step [210/6], Loss: 0.7799
Epoch [1/50], Step [220/6], Loss: 0.7756
Epoch [1/50], Step [230/6], Loss: 0.6250
Epoch [1/50], Step [240/6], Loss: 0.6291
Epoch [1/50], Step [250/6

In [253]:
with torch.no_grad():
    model.eval()
    predictions = []
    for z in z_matrix:
        z=torch.from_numpy(z)
        y_pred = model(z)
        predictions.append(y_pred.item())
    predictions = np.array(predictions)

In [254]:
threshold = 0.5
predictions[predictions >= threshold] = 1
predictions[predictions < threshold] = 0
predictions

array([1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1.,
       1., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1.,
       1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1.,
       1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1.,
       1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
       1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1.,
       0., 1., 1., 1., 1.

In [255]:
np.sum(predictions==np.array(data_y[valid_indices]))/len(predictions)

0.5172413793103449