In [1]:
import kagglehub
import torch
import torch.optim as optim
from data_loader_BERT import DataLoader4BERT
from model import TransformerForNumericPrediction
from utils import train, evaluate, adjust_learning_rate
from config import LEARNING_RATE, NUM_EPOCHS, PREVIOUS_LOSS, BERT_MODEL_NAME

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Download the GDSC dataset through kagglehub and report where it landed.
path = kagglehub.dataset_download("samiraalipour/genomics-of-drug-sensitivity-in-cancer-gdsc")
print("Path to dataset files:", path)

# Hand the four source files inside the downloaded directory to the loader.
dataloader = DataLoader4BERT(
    gdsc_path=path + '/GDSC_DATASET.csv',
    compounds_path=path + '/Compounds-annotation.csv',
    gdsc2_path=path + '/GDSC2-dataset.csv',
    cell_lines_path=path + '/Cell_Lines_Details.xlsx',
)

# get_data() yields six values: numeric features, text features and targets,
# first for the training split and then for the test split.
(
    X_train_numeric,
    X_train_text,
    y_train_tensor,
    X_test_numeric,
    X_test_text,
    y_test_tensor,
) = dataloader.get_data()

# Place every tensor used downstream on the selected device (which is the
# CPU when CUDA is unavailable). The text features are indexed by
# "input_ids" and "attention_mask".
input_ids_train, attention_mask_train = (
    X_train_text[field].to(device) for field in ("input_ids", "attention_mask")
)
X_train_numeric, y_train_tensor = X_train_numeric.to(device), y_train_tensor.to(device)

input_ids_test, attention_mask_test = (
    X_test_text[field].to(device) for field in ("input_ids", "attention_mask")
)
X_test_numeric, y_test_tensor = X_test_numeric.to(device), y_test_tensor.to(device)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/andrew-root/.cache/kagglehub/datasets/samiraalipour/genomics-of-drug-sensitivity-in-cancer-gdsc/versions/2


  warn(msg)


Loading Done!
Preprocess Done!
數值特徵與文本特徵分配完成


In [2]:
# Build the model named by BERT_MODEL_NAME, sized to the number of numeric
# feature columns (second dimension of X_train_numeric), and put it on the
# same device as the data.
model = TransformerForNumericPrediction(
    model_name=BERT_MODEL_NAME,
    num_numeric_features=X_train_numeric.shape[1],
).to(device)

# Adam over all model parameters, using the learning rate from config.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training loop: one train() call per epoch over the full training tensors.
# previous_loss starts from the configured PREVIOUS_LOSS value and is then
# replaced by each epoch's loss.
num_epochs, previous_loss = NUM_EPOCHS, PREVIOUS_LOSS
for epoch in range(num_epochs):
    train_loss = train(
        model,
        input_ids_train,
        attention_mask_train,
        X_train_numeric,
        y_train_tensor,
        optimizer,
    )
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss:.4f}")

    # Give the helper both the current and the prior loss so it can decide
    # how to adjust the optimizer, then roll the current loss forward.
    adjust_learning_rate(optimizer, train_loss, previous_loss)
    previous_loss = train_loss

In [None]:
# Score the trained model on the held-out test tensors. evaluate() returns a
# pair that is unpacked as (mae, mse); only the MSE is printed at the moment,
# so `mae` is currently unused.
mae, mse = evaluate(
    model,
    input_ids_test,
    attention_mask_test,
    X_test_numeric,
    y_test_tensor,
)
# print(f"Mean Absolute Error on Test Data: {mae:.4f}")
print(f"Mean Square Error on Test Data: {mse:.4f}")

RoBERTa — Mean Square Error on Test Data: 3.1066