In [1]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizerFast, BertModel

In [2]:
# 1. 데이터 로드
file_path = r'C:\Users\user\Desktop\WorkSpace\DACON_drugdevproject\data\train.csv'
train_data = pd.read_csv(file_path)

file_path = r'C:\Users\user\Desktop\WorkSpace\DACON_drugdevproject\data\test.csv'
test_data = pd.read_csv(file_path)

In [3]:
# 2. 필요한 열만 추출
dataset = train_data[['Smiles', 'IC50_nM']]

In [4]:
# 3. 토크나이저 로드
checkpoint = 'unikei/bert-base-smiles'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

In [5]:
# 4. SMILES 데이터를 토큰화
smiles_list = dataset['Smiles'].tolist()
tokenized_inputs = tokenizer(smiles_list, padding=True, truncation=True, return_tensors="pt")

In [6]:
# 5. IC50 값을 텐서로 변환
ic50_values = dataset['IC50_nM'].tolist()
labels = torch.tensor(ic50_values, dtype=torch.float)

In [7]:
# 6. 데이터셋 생성
train_dataset = TensorDataset(tokenized_inputs['input_ids'], tokenized_inputs['attention_mask'], labels)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
for batch in train_dataloader:
        input_ids, attention_mask, ic50_labels = batch
        print(f"input_ids shape: {input_ids.shape}")  # (batch_size, seq_len)
        print(f"attention_mask shape: {attention_mask.shape}")  # (batch_size, seq_len)
        print(f"labels shape: {ic50_labels.shape}")  # (batch_size,)
        break

input_ids shape: torch.Size([16, 96])
attention_mask shape: torch.Size([16, 96])
labels shape: torch.Size([16])


In [8]:
class EncoderOnlyTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=768, nhead=12, num_encoder_layers=12, dim_feedforward=3072, dropout=0.1):
        super(EncoderOnlyTransformer, self).__init__()
        # 토큰 ID를 임베딩 차원으로 변환하기 위한 임베딩 레이어
        self.embedding = nn.Embedding(vocab_size, d_model)
        
        
        # Transformer Encoder Layer
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        # Positional encoding
        self.positional_encoding = nn.Parameter(torch.zeros(1, 512, d_model))  # Assuming max seq_len = 512
        
        # Affine (Linear) layer for regression (output 1 dimension)
        self.regressor = nn.Linear(d_model, 1)
        
        # Activation
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask=None):
        # 임베딩 레이어 적용 (input_ids: [batch_size, seq_len] -> [batch_size, seq_len, d_model])
        embedded_input = self.embedding(input_ids)
        if torch.isnan(embedded_input).any():
            print("Embedding layer에서 NaN 발생")
        # Apply positional encoding
        seq_len = embedded_input.size(1)
        inputs_with_position = embedded_input + self.positional_encoding[:, :seq_len, :]

        # attention_mask를 bool 타입으로 변환 (0 -> False, 1 -> True)
        if attention_mask is not None:
            attention_mask = attention_mask.bool()

        # Transformer encoder expects input as [batch_size, seq_len, d_model]
        encoded_output = self.transformer_encoder(inputs_with_position)

        # Apply pooling on the sequence dimension (e.g., using the output of the [CLS] token or average pooling)
        pooled_output = torch.mean(encoded_output, dim=1)  # Simple average pooling

        # Apply ReLU activation
        pooled_output = self.relu(pooled_output)

        # Apply Affine layer for regression
        ic50_preds = self.regressor(pooled_output)
        
        return ic50_preds


In [9]:
print(torch.cuda.is_available())  # True가 나와야 정상
print(torch.cuda.get_device_name(0))  # 첫 번째 GPU 이름 출력

True
NVIDIA GeForce RTX 3050


In [14]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# 데이터 로더는 이미 만들어졌다고 가정합니다
# train_dataloader: 훈련 데이터로부터 가져온 배치
# test_dataloader: 테스트 데이터로부터 가져온 배치

# 모델 초기화
model = EncoderOnlyTransformer(vocab_size = tokenizer.vocab_size, d_model=768, nhead=6, num_encoder_layers=6, dim_feedforward=1024, dropout=0.1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 손실 함수 및 옵티마이저
loss_fn = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-2)

# 학습 설정
num_epochs = 20
train_loss_history = []

# 학습 루프
for epoch in range(num_epochs):
    model.train()  # 모델을 학습 모드로 설정
    epoch_loss = 0.0
    
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        # 모델 예측
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        outputs = outputs.squeeze()  # (batch_size, 1) -> (batch_size,)로 변경
        
        # 손실 계산
        loss = loss_fn(outputs, labels)
        
        # 역전파 및 옵티마이저 업데이트
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
    
    avg_epoch_loss = epoch_loss / len(train_dataloader)
    train_loss_history.append(avg_epoch_loss)
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_epoch_loss:.4f}")

print("학습 완료!")


Epoch [1/20], Loss: 7132199.6368
Epoch [2/20], Loss: 6974961.8545
Epoch [3/20], Loss: 6973385.3932
Epoch [4/20], Loss: 6972908.8961
Epoch [5/20], Loss: 6972466.8060
Epoch [6/20], Loss: 6973344.2765
Epoch [7/20], Loss: 6975956.5033
Epoch [8/20], Loss: 6974144.0517
Epoch [9/20], Loss: 6970526.6675
Epoch [10/20], Loss: 6970071.9188
Epoch [11/20], Loss: 6967670.2723
Epoch [12/20], Loss: 6971723.1785
Epoch [13/20], Loss: 6972133.3463
Epoch [14/20], Loss: 6972833.0236
Epoch [15/20], Loss: 6973010.1268
Epoch [16/20], Loss: 6975412.0237
Epoch [17/20], Loss: 6974001.7897
Epoch [18/20], Loss: 6974686.0300
Epoch [19/20], Loss: 6970714.4038
Epoch [20/20], Loss: 6972421.2377
학습 완료!


: 

In [None]:

# Test data tokenization
test_smiles_list = test_data['Smiles'].tolist()
test_tokenized_inputs = tokenizer(test_smiles_list, padding=True, truncation=True, return_tensors="pt")

# Load the trained model (assuming the model is already trained and available)
# model = EncoderOnlyTransformer(...)  # The model should be defined and trained earlier

# Put the model in evaluation mode
model.eval()

# Move the model and data to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
test_tokenized_inputs = {key: value.to(device) for key, value in test_tokenized_inputs.items()}

# Make predictions
with torch.no_grad():
    ic50_predictions = model(test_tokenized_inputs['input_ids'], test_tokenized_inputs['attention_mask'])

# Convert predictions to CPU and detach
ic50_predictions = ic50_predictions.cpu().numpy()

# Save the predictions to a CSV file
test_data['Predicted_IC50_nM'] = ic50_predictions
output_file = '/mnt/data/test_predictions.csv'
test_data.to_csv(output_file, index=False)

print(f"Predictions saved to {output_file}")
