In [1]:
# Move data to CPU initially and only transfer to GPU when needed
# Ensure non-model data stays on CPU as much as possible
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import numpy as np


class DataLoader4BERT:
    def __init__(self, gdsc_path, compounds_path, gdsc2_path, cell_lines_path, bert_model_name="bert-base-uncased"):
        self.gdsc_path = gdsc_path
        self.compounds_path = compounds_path
        self.gdsc2_path = gdsc2_path
        self.cell_lines_path = cell_lines_path
        self.bert_model_name = bert_model_name
        
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        
        self.gdsc_dataset = None
        self.compounds_annotation = None
        self.gdsc2_dataset = None
        self.cell_lines_details = None
        self.final_df = None
        self.X_train_text = None
        self.X_test_text = None
        self.X_train_numeric = None
        self.X_test_numeric = None
        self.y_train = None
        self.y_test = None
        
        self.load_data()
        self.preprocess_data()
        self.define_features_and_target()

    def load_data(self):
        self.gdsc_dataset = pd.read_csv(self.gdsc_path)
        self.compounds_annotation = pd.read_csv(self.compounds_path)
        self.gdsc2_dataset = pd.read_csv(self.gdsc2_path)
        self.cell_lines_details = pd.read_excel(self.cell_lines_path)
        print('Loading Done!')

    def preprocess_data(self):
        self.gdsc_dataset = self.gdsc_dataset.dropna()
        self.compounds_annotation = self.compounds_annotation.dropna()
        self.gdsc2_dataset = self.gdsc2_dataset.dropna()
        self.cell_lines_details = self.cell_lines_details.dropna()

        merged_df = pd.merge(self.gdsc2_dataset, self.cell_lines_details, left_on='COSMIC_ID', right_on='COSMIC identifier', how='left')
        self.final_df = pd.merge(merged_df, self.compounds_annotation, on='DRUG_ID', how='left')
        print('Preprocess Done!')

    def define_features_and_target(self):
        numeric_features = ['AUC', 'Z_SCORE']
        text_features = ['Cancer Type\n(matching TCGA label)', 
                        'GDSC\nTissue descriptor 1', 
                        'GDSC\nTissue\ndescriptor 2']

        for feature in numeric_features:
            if feature not in self.final_df.columns:
                raise ValueError(f"數值特徵 {feature} 不存在於 final_df 中。")

        X_numeric = self.final_df[numeric_features].fillna(0).astype(float)
        if X_numeric.empty:
            raise ValueError("數值特徵提取結果為空，請檢查數據處理步驟。")

        X_text = self.final_df[text_features].fillna('')
        text_inputs = X_text.apply(lambda x: ' '.join(x), axis=1).tolist()
        tokenized = self.tokenizer(text_inputs, padding=True, truncation=True, return_tensors="pt", max_length=8)

        if 'LN_IC50' not in self.final_df.columns:
            raise ValueError("目標變數 'LN_IC50' 不存在於 final_df 中。")
        y = self.final_df['LN_IC50']

        X_train_numeric, X_test_numeric, y_train, y_test = train_test_split(
            X_numeric, y, test_size=0.2, random_state=42
        )
        input_ids_train, input_ids_test, attention_mask_train, attention_mask_test = train_test_split(
            tokenized['input_ids'], tokenized['attention_mask'], test_size=0.2, random_state=42
        )

        self.X_train_numeric, self.X_test_numeric = X_train_numeric, X_test_numeric
        self.y_train, self.y_test = y_train, y_test
        self.X_train_text = {"input_ids": input_ids_train, "attention_mask": attention_mask_train}
        self.X_test_text = {"input_ids": input_ids_test, "attention_mask": attention_mask_test}

        print("數值特徵與文本特徵分配完成")

    def get_data(self):
        # Return tensors in CPU initially
        X_train_numeric_tensor = torch.tensor(self.X_train_numeric.values).float()
        X_test_numeric_tensor = torch.tensor(self.X_test_numeric.values).float()
        y_train_tensor = torch.tensor(self.y_train.values).float()
        y_test_tensor = torch.tensor(self.y_test.values).float()
        
        X_train_text = self.X_train_text
        X_test_text = self.X_test_text

        return X_train_numeric_tensor, X_train_text, y_train_tensor, X_test_numeric_tensor, X_test_text, y_test_tensor


# Initialize model and only move data to GPU as needed
from transformers import BertModel
import torch
import torch.nn as nn

class BertForNumericPrediction(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", num_numeric_features=2):
        super(BertForNumericPrediction, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.numeric_embed = nn.Linear(num_numeric_features, self.bert.config.hidden_size)
        self.fc = nn.Linear(self.bert.config.hidden_size * 2, 1)

    def forward(self, input_ids, attention_mask, numeric_features):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]
        numeric_embedding = self.numeric_embed(numeric_features)
        combined = torch.cat([cls_output, numeric_embedding], dim=-1)
        output = self.fc(combined)
        return output


import kagglehub

# Download latest version
path = kagglehub.dataset_download("samiraalipour/genomics-of-drug-sensitivity-in-cancer-gdsc")

print("Path to dataset files:", path)

dataloader = DataLoader4BERT(path + '/GDSC_DATASET.csv',
                             path + '/Compounds-annotation.csv',
                             path + '/GDSC2-dataset.csv',
                             path + '/Cell_Lines_Details.xlsx')

X_train_numeric, X_train_text, y_train_tensor, X_test_numeric, X_test_text, y_test_tensor = dataloader.get_data()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move only necessary data to GPU
input_ids_train = X_train_text["input_ids"].to(device)
attention_mask_train = X_train_text["attention_mask"].to(device)
X_train_numeric = X_train_numeric.to(device)

model = BertForNumericPrediction(bert_model_name="bert-base-uncased", num_numeric_features=X_train_numeric.shape[1])
model = model.to(device)

# Perform forward pass
output_train = model(
    input_ids=input_ids_train,
    attention_mask=attention_mask_train,
    numeric_features=X_train_numeric
)

print("模型輸出維度：", output_train.shape)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/andrew-root/.cache/kagglehub/datasets/samiraalipour/genomics-of-drug-sensitivity-in-cancer-gdsc/versions/2


  warn(msg)


Loading Done!
Preprocess Done!
數值特徵與文本特徵分配完成


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.92 GiB. GPU 0 has a total capacity of 11.76 GiB of which 3.29 GiB is free. Including non-PyTorch memory, this process has 8.43 GiB memory in use. Of the allocated memory 8.26 GiB is allocated by PyTorch, and 55.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [1]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import numpy as np

class DataLoader4BERT:
    def __init__(self, gdsc_path, compounds_path, gdsc2_path, cell_lines_path, bert_model_name="bert-base-uncased"):
        self.gdsc_path = gdsc_path
        self.compounds_path = compounds_path
        self.gdsc2_path = gdsc2_path
        self.cell_lines_path = cell_lines_path
        self.bert_model_name = bert_model_name
        
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        
        self.gdsc_dataset = None
        self.compounds_annotation = None
        self.gdsc2_dataset = None
        self.cell_lines_details = None
        self.final_df = None
        self.X_train_text = None
        self.X_test_text = None
        self.X_train_numeric = None
        self.X_test_numeric = None
        self.y_train = None
        self.y_test = None
        
        self.load_data()
        self.preprocess_data()
        self.define_features_and_target()

    def load_data(self):
        self.gdsc_dataset = pd.read_csv(self.gdsc_path)
        self.compounds_annotation = pd.read_csv(self.compounds_path)
        self.gdsc2_dataset = pd.read_csv(self.gdsc2_path)
        self.cell_lines_details = pd.read_excel(self.cell_lines_path)
        print('Loading Done!')

    def preprocess_data(self):
        self.gdsc_dataset = self.gdsc_dataset.dropna()
        self.compounds_annotation = self.compounds_annotation.dropna()
        self.gdsc2_dataset = self.gdsc2_dataset.dropna()
        self.cell_lines_details = self.cell_lines_details.dropna()

        merged_df = pd.merge(self.gdsc2_dataset, self.cell_lines_details, left_on='COSMIC_ID', right_on='COSMIC identifier', how='left')
        self.final_df = pd.merge(merged_df, self.compounds_annotation, on='DRUG_ID', how='left')
        print('Preprocess Done!')

    def define_features_and_target(self):
        # 數值特徵
        numeric_features = ['AUC', 'Z_SCORE']
        # 文本特徵
        text_features = ['Cancer Type\n(matching TCGA label)', 
                        'GDSC\nTissue descriptor 1', 
                        'GDSC\nTissue\ndescriptor 2']
        
        # 檢查數值特徵是否存在於 DataFrame
        for feature in numeric_features:
            if feature not in self.final_df.columns:
                raise ValueError(f"數值特徵 {feature} 不存在於 final_df 中。")

        # 提取數值和文本特徵
        X_numeric = self.final_df[numeric_features].fillna(0).astype(float)
        if X_numeric.empty:
            raise ValueError("數值特徵提取結果為空，請檢查數據處理步驟。")

        # 文本特徵處理
        X_text = self.final_df[text_features].fillna('')
        text_inputs = X_text.apply(lambda x: ' '.join(x), axis=1).tolist()
        tokenized = self.tokenizer(text_inputs, padding=True, truncation=True, return_tensors="pt", max_length=64)

        # 目標變數
        if 'LN_IC50' not in self.final_df.columns:
            raise ValueError("目標變數 'LN_IC50' 不存在於 final_df 中。")
        y = self.final_df['LN_IC50']

        # 拆分數據集
        X_train_numeric, X_test_numeric, y_train, y_test = train_test_split(
            X_numeric, y, test_size=0.2, random_state=42
        )
        input_ids_train, input_ids_test, attention_mask_train, attention_mask_test = train_test_split(
            tokenized['input_ids'], tokenized['attention_mask'], test_size=0.2, random_state=42
        )

        # 分配數據
        self.X_train_numeric, self.X_test_numeric = X_train_numeric, X_test_numeric
        self.y_train, self.y_test = y_train, y_test
        self.X_train_text = {"input_ids": input_ids_train, "attention_mask": attention_mask_train}
        self.X_test_text = {"input_ids": input_ids_test, "attention_mask": attention_mask_test}

        print("數值特徵與文本特徵分配完成")

    def get_data(self):
        # 數值特徵轉換為 tensor
        X_train_numeric_tensor = torch.tensor(self.X_train_numeric.values).float()
        X_test_numeric_tensor = torch.tensor(self.X_test_numeric.values).float()
        y_train_tensor = torch.tensor(self.y_train.values).float()
        y_test_tensor = torch.tensor(self.y_test.values).float()
        
        # 文本特徵保持 tokenized 格式
        X_train_text = self.X_train_text
        X_test_text = self.X_test_text

        return X_train_numeric_tensor, X_train_text, y_train_tensor, X_test_numeric_tensor, X_test_text, y_test_tensor


from transformers import BertModel
import torch
import torch.nn as nn

class BertForNumericPrediction(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", num_numeric_features=2):
        super(BertForNumericPrediction, self).__init__()
        # 初始化 BERT 模型
        try:
            self.bert = BertModel.from_pretrained(bert_model_name)
        except Exception as e:
            raise RuntimeError(f"無法加載 BERT 模型 {bert_model_name}，錯誤訊息: {e}")
        
        # 數值特徵嵌入層
        self.numeric_embed = nn.Linear(num_numeric_features, self.bert.config.hidden_size)
        # 數值與文本特徵融合後的輸出層
        self.fc = nn.Linear(self.bert.config.hidden_size * 2, 1)  # 融合後的輸出層

    def forward(self, input_ids, attention_mask, numeric_features):
        # 檢查輸入尺寸是否正確
        if input_ids.size(0) != numeric_features.size(0):
            raise ValueError(
                f"input_ids 和 numeric_features 的 batch size 不匹配："
                f"{input_ids.size(0)} != {numeric_features.size(0)}"
            )
        if numeric_features.size(1) != self.numeric_embed.in_features:
            raise ValueError(
                f"numeric_features 的特徵數量錯誤："
                f"{numeric_features.size(1)} != {self.numeric_embed.in_features}"
            )

        try:
            # 通過 BERT 獲取文本特徵
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            cls_output = bert_outputs.last_hidden_state[:, 0, :]  # 提取 [CLS] 標記的輸出
        except RuntimeError as e:
            raise RuntimeError(f"BERT 模型在 forward 過程中發生錯誤: {e}")

        try:
            # 數值特徵嵌入
            numeric_embedding = self.numeric_embed(numeric_features)  # 將數值特徵嵌入到與文本特徵相同的維度
        except RuntimeError as e:
            raise RuntimeError(f"數值特徵嵌入層發生錯誤: {e}")

        try:
            # 特徵融合
            combined = torch.cat([cls_output, numeric_embedding], dim=-1)
            
            # 最終預測
            output = self.fc(combined)
        except RuntimeError as e:
            raise RuntimeError(f"特徵融合或輸出層計算時發生錯誤: {e}")

        return output

import kagglehub

# Download latest version
path = kagglehub.dataset_download("samiraalipour/genomics-of-drug-sensitivity-in-cancer-gdsc")

print("Path to dataset files:", path)

dataloader = DataLoader4BERT(path + '/GDSC_DATASET.csv',
                             path + '/Compounds-annotation.csv',
                             path + '/GDSC2-dataset.csv',
                             path + '/Cell_Lines_Details.xlsx')

# 獲取數據
X_train_numeric, X_train_text, y_train_tensor, X_test_numeric, X_test_text, y_test_tensor = dataloader.get_data()

# 初始化模型
model = BertForNumericPrediction(bert_model_name="bert-base-uncased", num_numeric_features=X_train_numeric.shape[1])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# 將數據移至設備
input_ids_train = X_train_text["input_ids"].to(device)
attention_mask_train = X_train_text["attention_mask"].to(device)
X_train_numeric = X_train_numeric.to(device)

# 將數據傳入模型
output_train = model(
    input_ids=input_ids_train,
    attention_mask=attention_mask_train,
    numeric_features=X_train_numeric
)

# 打印輸出維度
print("模型輸出維度：", output_train.shape)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/andrew-root/.cache/kagglehub/datasets/samiraalipour/genomics-of-drug-sensitivity-in-cancer-gdsc/versions/2


  warn(msg)


Loading Done!
Preprocess Done!
數值特徵與文本特徵分配完成


RuntimeError: BERT 模型在 forward 過程中發生錯誤: CUDA out of memory. Tried to allocate 11.75 GiB. GPU 0 has a total capacity of 11.76 GiB of which 11.10 GiB is free. Including non-PyTorch memory, this process has 638.00 MiB memory in use. Of the allocated memory 484.74 MiB is allocated by PyTorch, and 51.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)