In [63]:

import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

from utils import get_loss_function , calculate_metrics_all, load_data, plot_scatter, FlexibleMLP, Autoencoder, set_seed
from train import CombinedModel, CustomDataset


In [None]:

# Root of the project checkout. Every path in this cell is derived from it, so
# the notebook can be relocated by exporting CPP_TL_ROOT (or editing the default)
# instead of rewriting each absolute path by hand.
PROJECT_ROOT = os.environ.get("CPP_TL_ROOT", "/root/CPP_Customize_Transfer_Learning")

# "mol" is only used to name the saved checkpoint; "P_spectra_dim" is forwarded
# to CombinedModel; the two CSV paths are the training inputs and labels.
config = {
    "mol": 'all',
    "P_spectra_dim": 101,
    "input_csv_path": os.path.join(PROJECT_ROOT, "data", "demo", "train_x.csv"),
    "label_csv_path": os.path.join(PROJECT_ROOT, "data", "demo", "train_y.csv"),
}

# Spreadsheets handed to CustomDataset (mol1-p.xlsx ... mol3-p.xlsx).
file_paths = [
    os.path.join(PROJECT_ROOT, "data", "P_mol_spectra", f"mol{mol_index}-p.xlsx")
    for mol_index in (1, 2, 3)
]

# Pre-trained autoencoder that is loaded before training starts.
autoencoder_checkpoint = os.path.join(PROJECT_ROOT, "ckpt", "autoencoder", "1500.pth")
# Directory that receives the trained combined model.
ckpt_dir = os.path.join(PROJECT_ROOT, "ckpt", "demo")
# Not referenced by the cells visible in this notebook; kept for other users of the name.
results_file_path = os.path.join(PROJECT_ROOT, "result", "base_results_summary.csv")


In [None]:
# --- Reproducibility -------------------------------------------------------
random_seed = 2024          # passed to set_seed() and to CustomDataset

# --- Model ------------------------------------------------------------------
input_dim = 125             # not referenced by the cells visible in this notebook
output_dim_autoencoder = 121  # not referenced by the cells visible in this notebook
output_dim_combined = 501   # used as both output_dim and mlp_output_dim of CombinedModel
onehot_dim = 7              # passed to CombinedModel as embedding_dim
freeze = False              # True -> autoencoder parameters get requires_grad=False

# --- Data -------------------------------------------------------------------
sampling_frequency = 15     # forwarded to CustomDataset
batch_size = 128            # DataLoader batch size for training
augment_times = 10          # not referenced by the cells visible in this notebook
noise_std = 1e-7            # not referenced by the cells visible in this notebook
test_size = 0.1             # not referenced by the cells visible in this notebook
k_folds = 10                # not referenced by the cells visible in this notebook

# --- Optimisation -----------------------------------------------------------
loss_function_name = 'mae'  # resolved to a criterion by get_loss_function()
num_epochs = 300
learning_rate = 5e-5        # Adam learning rate
lr_decay_step = 200         # StepLR step_size (in epochs)
lr_decay_gamma = 0.7        # StepLR multiplicative decay factor

In [None]:
# Training cell: load the pre-trained autoencoder, wrap it in CombinedModel,
# train on the full training CSVs and save the resulting model once at the end.

# Ask CUDA to run kernels synchronously so device-side errors surface at the
# call that triggered them (debugging aid; slows GPU execution).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inputs, labels = load_data(config["input_csv_path"], config["label_csv_path"])

# Not used by this cell; kept because other cells may rely on the names.
all_predictions = []
all_labels = []
set_seed(random_seed)

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
# NOTE(review): the checkpoint is used as a module right after loading, so it is
# presumably a fully pickled model; recent PyTorch releases default to
# weights_only=True and may need weights_only=False here - confirm against the
# installed version. Only load checkpoints from a trusted source (pickle).
autoencoder = torch.load(autoencoder_checkpoint, map_location=device)
autoencoder = autoencoder.to(device)

if freeze:
    # Keep the pre-trained autoencoder fixed; only the remaining layers train.
    for param in autoencoder.parameters():
        param.requires_grad = False

combined_model = CombinedModel(
    autoencoder,
    P_spectra_dim=config['P_spectra_dim'],
    output_dim=output_dim_combined,
    mlp_output_dim=output_dim_combined,
    embedding_dim=onehot_dim,
).to(device)

criterion = get_loss_function(loss_function_name)
# Only hand trainable parameters to the optimizer (matters when freeze=True).
optimizer = optim.Adam(
    [param for param in combined_model.parameters() if param.requires_grad],
    lr=learning_rate
)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=lr_decay_step, gamma=lr_decay_gamma)

train_inputs = inputs
train_labels = labels

train_dataset = CustomDataset(train_inputs, train_labels, file_paths, sampling_frequency, random_seed, is_train=True)


def _seed_worker(worker_id):
    """Seed one DataLoader worker process, offset by its id so workers differ."""
    set_seed(random_seed + worker_id)


# The original code wrote `worker_init_fn=set_seed(random_seed)`, which called
# set_seed immediately and passed its return value instead of a callable.
# The explicit re-seed is kept at the same point so the RNG state entering the
# training loop is unchanged; the loader now receives a real callable, which is
# only invoked when num_workers > 0.
set_seed(random_seed)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=_seed_worker)

# Create the output directory up front so a missing folder cannot make
# torch.save fail after the whole training run has finished.
os.makedirs(ckpt_dir, exist_ok=True)

for epoch in range(num_epochs):
    combined_model.train()
    total_loss = 0
    for inputs_tr, labels_tr in train_loader:
        inputs_tr, labels_tr = inputs_tr.to(device), labels_tr.to(device)

        optimizer.zero_grad()

        outputs = combined_model(inputs_tr)
        loss = criterion(outputs, labels_tr)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # One scheduler step per epoch; the LR printed below is the value in effect
    # after this step.
    scheduler.step()

    print(f' Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader)}, LR: {scheduler.get_last_lr()[0]}')


# Save the whole trained model (not just a state_dict) once training is done.
model_save_path = os.path.join(ckpt_dir, f"model_{config['mol']}_epochs_{num_epochs}_seed_{random_seed}.pth")
torch.save(combined_model, model_save_path)
print(f"Model weights saved to {model_save_path}")

 Epoch [1/300], Loss: 0.8475263118743896, LR: 5e-05
 Epoch [2/300], Loss: 0.7922291874885559, LR: 5e-05
 Epoch [3/300], Loss: 0.7040727972984314, LR: 5e-05
 Epoch [4/300], Loss: 0.6087739884853363, LR: 5e-05
 Epoch [5/300], Loss: 0.5484233617782592, LR: 5e-05
 Epoch [6/300], Loss: 0.5280864804983139, LR: 5e-05
 Epoch [7/300], Loss: 0.5119353652000427, LR: 5e-05
 Epoch [8/300], Loss: 0.4949608951807022, LR: 5e-05
 Epoch [9/300], Loss: 0.468742698431015, LR: 5e-05
 Epoch [10/300], Loss: 0.41805826127529144, LR: 5e-05
 Epoch [11/300], Loss: 0.3713924527168274, LR: 5e-05
 Epoch [12/300], Loss: 0.3412347912788391, LR: 5e-05
 Epoch [13/300], Loss: 0.31182089149951936, LR: 5e-05
 Epoch [14/300], Loss: 0.29267154932022094, LR: 5e-05
 Epoch [15/300], Loss: 0.28103497326374055, LR: 5e-05
 Epoch [16/300], Loss: 0.27368472814559935, LR: 5e-05
 Epoch [17/300], Loss: 0.2715899258852005, LR: 5e-05
 Epoch [18/300], Loss: 0.2676327958703041, LR: 5e-05
 Epoch [19/300], Loss: 0.2653378590941429, LR: 5e-0

In [None]:
# eval

# Build the checkpoint path with the same naming scheme the training cell uses
# when saving, so evaluation follows config['mol'], num_epochs and random_seed
# instead of a separately hard-coded file name that can silently go stale.
model_path = os.path.join(ckpt_dir, f"model_{config['mol']}_epochs_{num_epochs}_seed_{random_seed}.pth")
# Held-out inputs and labels used by predict().
input_csv_path = "/root/CPP_Customize_Transfer_Learning/data/demo/test_x.csv"
label_csv_path = "/root/CPP_Customize_Transfer_Learning/data/demo/test_y.csv"

In [68]:

def predict():
    """Evaluate the saved model on the test CSVs and print aggregate metrics.

    Reads the notebook-level names ``model_path``, ``input_csv_path``,
    ``label_csv_path``, ``file_paths``, ``sampling_frequency`` and
    ``random_seed``. Predictions and labels of every test sample are flattened
    and pooled before the metrics are computed by ``calculate_metrics_all``.

    Returns:
        None. The metrics are printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the model. The loaded object is used directly as a module, so the
    # checkpoint is expected to hold a whole pickled model rather than a
    # state_dict; only load checkpoints from a trusted source.
    model = torch.load(model_path, map_location=device)
    model.eval()

    # Load the test data.
    inputs, labels = load_data(input_csv_path, label_csv_path)
    test_dataset = CustomDataset(inputs, labels, file_paths, sampling_frequency, random_seed, is_train=False)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    # Collect per-batch arrays and concatenate once at the end; growing an
    # array with np.concatenate inside the loop copies all earlier data on
    # every iteration.
    prediction_chunks = []
    label_chunks = []

    # Run inference without tracking gradients.
    with torch.no_grad():
        for inputs_te, labels_te in test_loader:
            inputs_te, labels_te = inputs_te.to(device), labels_te.to(device)
            output = model(inputs_te)
            prediction_chunks.append(output.cpu().numpy().flatten())
            label_chunks.append(labels_te.cpu().numpy().flatten())

    # The leading empty float64 array keeps the result dtype (float64) and the
    # empty-loader behaviour identical to the previous incremental version.
    test_predictions = np.concatenate([np.empty(0)] + prediction_chunks)
    test_labels = np.concatenate([np.empty(0)] + label_chunks)

    mae, mse, rmse, r2, pearson_corr = calculate_metrics_all(test_labels, test_predictions)
    print(f'Average MAE: {mae}, MSE: {mse}, RMSE: {rmse}, R2: {r2}, Pearson: {pearson_corr} random_seed={random_seed}')


predict()
 

Average MAE: 0.14366924738510345, MSE: 0.04585729283807755, RMSE: 0.21414315968080222, R2: 0.8899666493575359, Pearson: 0.95434617802537 random_seed=2024
