# ChemBERTa (reproduce)


In [1]:
from pathlib import Path
import os, sys, json

# Project root (edit if your checkout lives somewhere else)
ROOT = Path('/home/yoo122333/capstone/chemberta_repro_final')
CODE_ROOT = ROOT / 'code' / 'bert-loves-chemistry'
RUNS_DIR = ROOT / 'runs'
DATA_DIR = CODE_ROOT / 'chemberta' / 'data'

# Environment variables. setdefault() only writes a value when the variable is
# not already set, so values exported by the shell take precedence.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '5') # CUDA device selection
os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false') # disable tokenizer parallelism
os.environ.setdefault('WANDB_DISABLED', 'true') # disable Weights & Biases

# Put CODE_ROOT at the front of sys.path so the `chemberta` package that lives
# under it can be imported by later cells.
sys.path.insert(0, str(CODE_ROOT))

print('ROOT:', ROOT)
print('CODE_ROOT:', CODE_ROOT)
print('RUNS_DIR:', RUNS_DIR)
print('DATA_DIR:', DATA_DIR)

ROOT: /home/yoo122333/capstone/chemberta_repro_final
CODE_ROOT: /home/yoo122333/capstone/chemberta_repro_final/code/bert-loves-chemistry
RUNS_DIR: /home/yoo122333/capstone/chemberta_repro_final/runs
DATA_DIR: /home/yoo122333/capstone/chemberta_repro_final/code/bert-loves-chemistry/chemberta/data


In [2]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset, DataLoader

from transformers import (
    RobertaTokenizerFast,
    RobertaConfig,
    DataCollatorForLanguageModeling,
)
from huggingface_hub import snapshot_download

# Module-level device used by every train/eval loop below: CUDA when available,
# otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)

  from .autonotebook import tqdm as notebook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


device: cuda


## 0) 토크나이저 + 설정
`seyonec/ChemBERTa-zinc-base-v1`을 기본으로 사용합니다.
로컬 캐시에 있으면 인터넷 없이 로드됩니다.

- 구조는 노트북 내부 구현을 사용합니다.
- pretrained weights는 **state_dict만 로드**하여 주입합니다.

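`USE_PRETRAINED_WEIGHTS=False`일 때는 **원 논문 설정**(RoBERTa base 스타일, `PAPER_CONFIG`)을 사용하고, `True`일 때는 weights와 config의 불일치를 피하기 위해 HF 체크포인트의 `config.json`을 사용합니다.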


In [3]:
# Hugging Face repo id of the checkpoint (the original note says the paper
# mentions the weights are uploaded to Hugging Face).
MODEL_NAME = 'seyonec/ChemBERTa-zinc-base-v1'
USE_PRETRAINED_WEIGHTS = True
LOCAL_ONLY = True  # use only the local Hugging Face cache

# RoBERTa config following the original paper (base style). Only used when
# USE_PRETRAINED_WEIGHTS is False; vocab_size is then overridden with the
# tokenizer's vocabulary size by the model-preparation cells.
PAPER_CONFIG = dict(
    vocab_size=52000,
    max_position_embeddings=512,
    num_hidden_layers=6,
    num_attention_heads=12,
    hidden_size=768,
    intermediate_size=3072,
    type_vocab_size=1,
    hidden_act='gelu',
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    layer_norm_eps=1e-5,
    pad_token_id=1,
)

try:
    tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME, local_files_only=LOCAL_ONLY)
except Exception as exc:
    # Deliberately best-effort: any failure of the cache-only load falls back to
    # a normal (network-enabled) load. The cause is printed so that a failure
    # other than a cache miss is not silently misreported.
    print(f'Local cache load failed ({type(exc).__name__}: {exc}); downloading...')
    tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME, local_files_only=False)
else:
    print('Loaded tokenizer from local cache')

print('vocab size:', tokenizer.vocab_size)

Loaded tokenizer from local cache
vocab size: 767


## 1) RoBERTa 구조 (노트북 내부 구현)
아래 클래스들이 바로 수정 가능한 **직접 구현 코드**입니다.
필요한 실험은 여기서 바로 수정하세요.


In [4]:
def _gelu_tanh(x):
    """Apply the tanh approximation of GELU."""
    return F.gelu(x, approximate='tanh')


def get_activation(name):
    """Return the activation callable for a ``hidden_act`` config name.

    Args:
        name: One of ``'gelu'``, ``'gelu_new'``, ``'gelu_fast'``, ``'relu'``
            or ``'tanh'``.

    Returns:
        A callable that maps a tensor to a tensor.

    Raises:
        ValueError: If ``name`` is not a supported activation.
    """
    if name == 'gelu':
        return F.gelu
    if name in ('gelu_new', 'gelu_fast'):
        # In Hugging Face configs these names denote the tanh approximation of
        # GELU rather than the exact erf form, so they must not alias F.gelu.
        return _gelu_tanh
    if name == 'relu':
        return F.relu
    if name == 'tanh':
        return torch.tanh
    raise ValueError(f'Unsupported activation: {name!r}')

class RobertaEmbeddings(nn.Module):
    """Sum of word, position and token-type embeddings, then LayerNorm and dropout.

    Position ids are derived from ``input_ids``: non-padding tokens are numbered
    consecutively starting at ``pad_token_id + 1``, and padding tokens receive
    ``pad_token_id`` itself.
    """

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        pad_id = config.pad_token_id
        self.word_embeddings = nn.Embedding(config.vocab_size, hidden, padding_idx=pad_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden, padding_idx=pad_id)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden)
        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.pad_token_id = pad_id

    def create_position_ids_from_input_ids(self, input_ids):
        """Return position ids computed along dim 1 of ``input_ids``."""
        not_pad = input_ids.ne(self.pad_token_id).int()
        # Running count of real tokens; multiplying by the mask zeroes the pads.
        running_count = torch.cumsum(not_pad, dim=1).type_as(not_pad)
        return (running_count * not_pad).long() + self.pad_token_id

    def forward(self, input_ids, token_type_ids=None):
        """Embed ``input_ids``; ``token_type_ids`` defaults to all zeros."""
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        summed = (
            self.word_embeddings(input_ids)
            + self.position_embeddings(self.create_position_ids_from_input_ids(input_ids))
            + self.token_type_embeddings(token_type_ids)
        )
        return self.dropout(self.LayerNorm(summed))

class RobertaSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    ``forward`` returns ``(context, attn_probs)``: the per-token context with
    the heads merged back into the last dimension, and the post-dropout
    attention probabilities.
    """

    def __init__(self, config):
        super().__init__()
        heads = config.num_attention_heads
        self.num_heads = heads
        self.head_dim = config.hidden_size // heads
        self.all_head_size = heads * self.head_dim

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dim into (heads, head_dim) and move heads to dim 1."""
        split_shape = tuple(x.shape[:-1]) + (self.num_heads, self.head_dim)
        return x.view(split_shape).permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        """Attend over ``hidden_states``; ``attention_mask`` is added to the raw scores."""
        q = self.transpose_for_scores(self.query(hidden_states))
        k = self.transpose_for_scores(self.key(hidden_states))
        v = self.transpose_for_scores(self.value(hidden_states))

        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            # Additive mask: large negative entries vanish after the softmax.
            scores = scores + attention_mask
        attn_probs = self.dropout(F.softmax(scores, dim=-1))

        merged = torch.matmul(attn_probs, v).permute(0, 2, 1, 3).contiguous()
        context = merged.view(tuple(merged.shape[:-2]) + (self.all_head_size,))
        return context, attn_probs

class RobertaSelfOutput(nn.Module):
    """Output projection of the attention sub-layer with a residual LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, apply dropout, add ``input_tensor`` and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)

class RobertaAttention(nn.Module):
    """Self-attention followed by its residual output projection."""

    def __init__(self, config):
        super().__init__()
        self.self = RobertaSelfAttention(config)
        self.output = RobertaSelfOutput(config)

    def forward(self, hidden_states, attention_mask=None):
        """Return ``(attention_output, attention_probs)`` for ``hidden_states``."""
        context, attn_probs = self.self(hidden_states, attention_mask=attention_mask)
        # The sub-layer input is passed as the residual branch.
        return self.output(context, hidden_states), attn_probs

class RobertaIntermediate(nn.Module):
    """First feed-forward projection (hidden -> intermediate) plus activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # Activation is looked up once from the config's ``hidden_act`` name.
        self.act = get_activation(config.hidden_act)

    def forward(self, hidden_states):
        """Apply the linear projection and then the configured activation."""
        expanded = self.dense(hidden_states)
        return self.act(expanded)

class RobertaOutput(nn.Module):
    """Second feed-forward projection (intermediate -> hidden) with residual LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, apply dropout, add ``input_tensor`` and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)

class RobertaLayer(nn.Module):
    """One encoder block: attention sub-layer followed by the feed-forward sub-layer."""

    def __init__(self, config):
        super().__init__()
        self.attention = RobertaAttention(config)
        self.intermediate = RobertaIntermediate(config)
        self.output = RobertaOutput(config)

    def forward(self, hidden_states, attention_mask=None):
        """Return the block output; attention probabilities are discarded."""
        attended, _ = self.attention(hidden_states, attention_mask=attention_mask)
        # The attention output is both the feed-forward input and its residual.
        return self.output(self.intermediate(attended), attended)

class RobertaEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` encoder blocks applied in order."""

    def __init__(self, config):
        super().__init__()
        blocks = []
        for _ in range(config.num_hidden_layers):
            blocks.append(RobertaLayer(config))
        self.layer = nn.ModuleList(blocks)

    def forward(self, hidden_states, attention_mask=None):
        """Feed ``hidden_states`` through every block with the same ``attention_mask``."""
        out = hidden_states
        for block in self.layer:
            out = block(out, attention_mask=attention_mask)
        return out

class RobertaModel(nn.Module):
    """Embeddings plus encoder stack; returns the final per-token hidden states."""

    def __init__(self, config):
        super().__init__()
        self.embeddings = RobertaEmbeddings(config)
        self.encoder = RobertaEncoder(config)
        self.config = config

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Encode ``input_ids``.

        ``attention_mask`` is a 2-D mask with 1 for positions to attend to and
        0 for positions to ignore; when omitted every position is attended to.
        """
        mask = torch.ones_like(input_ids) if attention_mask is None else attention_mask
        # Turn the 0/1 mask into an additive bias: 0 where kept, -10000 where
        # masked, with two singleton dims inserted after the batch dim.
        additive_mask = (1.0 - mask[:, None, None, :]) * -10000.0
        token_states = self.embeddings(input_ids, token_type_ids=token_type_ids)
        return self.encoder(token_states, attention_mask=additive_mask)

class RobertaLMHead(nn.Module):
    """Masked-LM head: dense -> GELU -> LayerNorm -> vocabulary projection."""

    def __init__(self, config):
        super().__init__()
        width, vocab = config.hidden_size, config.vocab_size
        self.dense = nn.Linear(width, width)
        self.layer_norm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.decoder = nn.Linear(width, vocab, bias=False)
        # The bias is a separate parameter that is also attached to the decoder,
        # so both ``bias`` and ``decoder.bias`` refer to the same tensor.
        self.bias = nn.Parameter(torch.zeros(vocab))
        self.decoder.bias = self.bias

    def forward(self, features):
        """Map hidden states to vocabulary logits."""
        transformed = self.layer_norm(F.gelu(self.dense(features)))
        return self.decoder(transformed)

class RobertaForMaskedLM(nn.Module):
    """Encoder with a masked-language-modelling head.

    ``forward`` returns ``(logits, loss)``; ``loss`` is ``None`` when no labels
    are supplied.
    """

    def __init__(self, config):
        super().__init__()
        self.roberta = RobertaModel(config)
        self.lm_head = RobertaLMHead(config)
        self.config = config

    def tie_weights(self):
        """Make the LM decoder share its weight with the word-embedding matrix."""
        shared = self.roberta.embeddings.word_embeddings.weight
        self.lm_head.decoder.weight = shared

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Compute vocabulary logits and, if ``labels`` is given, the MLM loss."""
        hidden = self.roberta(input_ids, attention_mask=attention_mask)
        logits = self.lm_head(hidden)
        if labels is None:
            return logits, None
        # Positions labelled -100 are excluded from the loss.
        criterion = nn.CrossEntropyLoss(ignore_index=-100)
        vocab = logits.size(-1)
        return logits, criterion(logits.view(-1, vocab), labels.view(-1))


## 2) Pretrained weights 로드 (모델 클래스 없이 state_dict만)


In [5]:
def load_state_dict_from_hf(model_name, local_only=True):
    """Fetch the trained weights of a Hugging Face repo as a plain state dict.

    Args:
        model_name: Hugging Face repo id passed to ``snapshot_download``.
        local_only: When True, only the local cache is consulted.

    Returns:
        The mapping loaded from ``model.safetensors`` when that file exists in
        the snapshot, otherwise from ``pytorch_model.bin``.

    Raises:
        FileNotFoundError: If the snapshot contains neither weight file.
    """
    snapshot_dir = Path(snapshot_download(repo_id=model_name, local_files_only=local_only))
    safetensors_path = snapshot_dir / 'model.safetensors'
    if safetensors_path.exists():
        from safetensors.torch import load_file
        return load_file(str(safetensors_path))

    bin_path = snapshot_dir / 'pytorch_model.bin'
    if not bin_path.exists():
        raise FileNotFoundError(
            f'No model.safetensors or pytorch_model.bin found in {snapshot_dir}'
        )
    # weights_only=True restricts unpickling to tensors and plain containers, so
    # a downloaded checkpoint cannot execute arbitrary code while loading.
    return torch.load(str(bin_path), map_location='cpu', weights_only=True)

def load_config_from_hf(model_name, local_only=True):
    """Build a ``RobertaConfig`` from the ``config.json`` of a Hugging Face repo.

    Args:
        model_name: Hugging Face repo id passed to ``snapshot_download``.
        local_only: When True, only the local cache is consulted.

    Returns:
        A ``RobertaConfig`` constructed from the keys of ``config.json``.
    """
    snapshot_dir = snapshot_download(repo_id=model_name, local_files_only=local_only)
    cfg_path = Path(snapshot_dir) / 'config.json'
    # Read as UTF-8 explicitly instead of relying on the locale's default encoding.
    with open(cfg_path, encoding='utf-8') as f:
        cfg_dict = json.load(f)
    return RobertaConfig(**cfg_dict)


## 3) MLM 파이프라인 (ZINC)
- 데이터 로드
- Dataset/Dataloader
- 모델 로드 (pretrained weights 주입 가능)
- 학습/평가 루프


In [6]:
# Pipeline for this cell: tokenize a ZINC SMILES text file, randomly mask 15% of
# the tokens and build batches. The dataset below only tokenizes; the masking is
# done by the DataCollatorForLanguageModeling configured later in this cell.

class MLMSmilesDataset(Dataset):
    """Custom dataset that turns SMILES strings into tokenized MLM inputs.

    Args:
        path: Text file with one SMILES string per line; blank lines are skipped.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``.
        max_len: Length every sample is padded or truncated to.
    """

    def __init__(self, path, tokenizer, max_len=128):
        # Context manager so the file handle is closed as soon as it is read.
        with open(path, encoding='utf-8') as handle:
            stripped = (line.strip() for line in handle)
            self.lines = [text for text in stripped if text]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for line ``idx`` with dim 0 squeezed away."""
        enc = self.tokenizer(
            self.lines[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        return {k: v.squeeze(0) for k, v in enc.items()}

# Prefer the ZINC data: take the first candidate file that exists on disk.
zinc_candidates = [
    DATA_DIR / '100k_rndm_zinc_drugs_clean.txt',
    DATA_DIR / '250k_rndm_zinc_drugs_clean_sorted copy.txt',
]
mlm_path = None
for p in zinc_candidates:
    if p.exists():
        mlm_path = p
        break
# Fall back to the PubChem sample file when no ZINC file is present.
if mlm_path is None:
    mlm_path = DATA_DIR / 'pubchem_1k_smiles.txt'

print('MLM dataset path:', mlm_path)
mlm_dataset = MLMSmilesDataset(mlm_path, tokenizer, max_len=128)
# The collator is configured for MLM with a masking probability of 0.15.
mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
mlm_loader = DataLoader(mlm_dataset, batch_size=8, shuffle=True, collate_fn=mlm_collator)

print('MLM samples:', len(mlm_dataset))


MLM dataset path: /home/yoo122333/capstone/chemberta_repro_final/code/bert-loves-chemistry/chemberta/data/100k_rndm_zinc_drugs_clean.txt
MLM samples: 100000


In [7]:
# Prepare the MLM model (the paper config is used unless pretrained weights are requested)
if USE_PRETRAINED_WEIGHTS:
    # Pretrained weights are loaded only as an HF state_dict
    state_dict = load_state_dict_from_hf(MODEL_NAME, local_only=LOCAL_ONLY)
    # Use the HF config to avoid a mismatch between weights and config
    config = load_config_from_hf(MODEL_NAME, local_only=LOCAL_ONLY)
else:
    cfg = dict(PAPER_CONFIG)
    # Match the tokenizer's vocabulary size
    cfg['vocab_size'] = tokenizer.vocab_size
    config = RobertaConfig(**cfg)
    state_dict = None

mlm_model = RobertaForMaskedLM(config)
# Tie the decoder to the word embeddings before any weights are loaded.
mlm_model.tie_weights()
if state_dict is not None:
    # strict=False: key mismatches are reported below instead of raising.
    missing, unexpected = mlm_model.load_state_dict(state_dict, strict=False)
    print('missing keys:', len(missing))
    print('unexpected keys:', len(unexpected))

# Last expression of the cell, so the notebook also displays the module repr.
mlm_model.to(device)


  return torch.load(str(bin_path), map_location='cpu')


  return torch.load(str(bin_path), map_location='cpu')


missing keys: 0
unexpected keys: 2


  return torch.load(str(bin_path), map_location='cpu')


missing keys: 0
unexpected keys: 2


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(767, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [8]:
# MLM training / evaluation loops
from torch.optim import AdamW

def train_mlm_epoch(model, loader, optimizer):
    """Run one optimization pass over ``loader`` and return the mean batch loss.

    Every tensor of a batch is moved to the module-level ``device`` and the
    batch is passed to ``model`` as keyword arguments. An empty loader yields 0.0.
    """
    model.train()
    running = 0.0
    for raw_batch in loader:
        on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        _, loss = model(**on_device)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        running += loss.item()
    n_batches = len(loader)
    return running / max(1, n_batches)

def eval_mlm_epoch(model, loader):
    """Return the mean batch loss over ``loader`` without updating the model.

    Runs in eval mode with gradient tracking disabled. An empty loader yields 0.0.
    """
    model.eval()
    running = 0.0
    with torch.no_grad():
        for raw_batch in loader:
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            _, loss = model(**on_device)
            running += loss.item()
    n_batches = len(loader)
    return running / max(1, n_batches)

# Optimizer for the MLM model. NOTE: the name `optimizer` is rebound by the
# regression and classification cells further down.
optimizer = AdamW(mlm_model.parameters(), lr=5e-5)

# for epoch in range(2):
#     train_loss = train_mlm_epoch(mlm_model, mlm_loader, optimizer)
#     eval_loss = eval_mlm_epoch(mlm_model, mlm_loader)
#     print(f'[MLM] epoch {epoch+1} | train_loss={train_loss:.4f} | eval_loss={eval_loss:.4f}')


## 4) Regression 파이프라인
- 데이터 로드
- Dataset/Dataloader
- 모델 로드 (pretrained weights 주입 가능)
- 학습/평가 루프


In [9]:
class SmilesRegressionDataset(Dataset):
    """SMILES strings paired with float regression targets read from a CSV file.

    The first CSV column holds the SMILES strings; all remaining columns are
    the targets. When ``norm_path`` is given, it must point to a JSON file with
    ``mean`` and ``std`` entries, and the targets are standardized with them
    (a zero ``std`` is replaced by 1).
    """

    def __init__(self, csv_path, tokenizer, max_len=128, norm_path=None):
        frame = pd.read_csv(csv_path)
        self.smiles = frame.iloc[:, 0].astype(str).tolist()
        targets = frame.iloc[:, 1:].astype(float).values
        self.tokenizer = tokenizer
        self.max_len = max_len

        if norm_path:
            with open(norm_path) as f:
                stats = json.load(f)
            mean = torch.tensor(stats['mean'], dtype=torch.float32)
            std = torch.tensor(stats['std'], dtype=torch.float32)
            # Guard against division by zero for constant columns.
            safe_std = torch.where(std == 0, torch.ones_like(std), std)
            scaled = (torch.tensor(targets, dtype=torch.float32) - mean) / safe_std
            targets = scaled.numpy()
        self.labels = targets

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return the tokenizer outputs (dim 0 squeezed) plus a float32 ``labels`` tensor."""
        encoded = self.tokenizer(
            self.smiles[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        sample = {}
        for key, value in encoded.items():
            sample[key] = value.squeeze(0)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

# Regression data: a CSV of SMILES + descriptor columns and the JSON file with
# the mean/std values used to standardize those columns.
reg_csv = DATA_DIR / 'pubchem_descriptors_sample_1k_clean.csv'
reg_norm = DATA_DIR / 'pubchem_descriptors_sample_1k_normalization_values_199.json'

reg_dataset = SmilesRegressionDataset(reg_csv, tokenizer, max_len=128, norm_path=reg_norm)
reg_loader = DataLoader(reg_dataset, batch_size=8, shuffle=True)

print('Regression samples:', len(reg_dataset))


Regression samples: 998


In [10]:
class RobertaRegressionHead(nn.Module):
    """Regression head applied to the hidden state of the first token.

    Pipeline: dropout -> dense -> ReLU -> dropout -> projection to
    ``config.num_labels`` outputs.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features):
        """Predict from ``features[:, 0, :]`` (the first position of each sequence)."""
        first_token = self.dropout(features[:, 0, :])
        hidden = torch.relu(self.dense(first_token))
        return self.out_proj(self.dropout(hidden))

class RobertaForRegression(nn.Module):
    """Encoder with a regression head trained with mean-squared error.

    ``forward`` returns ``(predictions, loss)``; ``loss`` is ``None`` when no
    labels are supplied.
    """

    def __init__(self, config):
        super().__init__()
        self.roberta = RobertaModel(config)
        self.regression = RobertaRegressionHead(config)
        self.loss_fn = nn.MSELoss()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Encode the inputs, predict the targets and optionally score them."""
        hidden = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.regression(hidden)
        if labels is None:
            return logits, None
        return logits, self.loss_fn(logits, labels)

# Prepare the regression model
if USE_PRETRAINED_WEIGHTS:
    config = load_config_from_hf(MODEL_NAME, local_only=LOCAL_ONLY)
    # Number of outputs = number of label values in one dataset sample.
    config.num_labels = reg_dataset[0]['labels'].numel()
    state_dict = load_state_dict_from_hf(MODEL_NAME, local_only=LOCAL_ONLY)
else:
    cfg = dict(PAPER_CONFIG)
    # Match the tokenizer's vocabulary size
    cfg['vocab_size'] = tokenizer.vocab_size
    cfg['num_labels'] = reg_dataset[0]['labels'].numel()
    config = RobertaConfig(**cfg)
    state_dict = None

reg_model = RobertaForRegression(config)
if state_dict is not None:
    # strict=False: key mismatches are reported below instead of raising.
    missing, unexpected = reg_model.load_state_dict(state_dict, strict=False)
    print('missing keys:', len(missing))
    print('unexpected keys:', len(unexpected))

# Last expression of the cell, so the notebook also displays the module repr.
reg_model.to(device)


  return torch.load(str(bin_path), map_location='cpu')


  return torch.load(str(bin_path), map_location='cpu')


missing keys: 4
unexpected keys: 9


  return torch.load(str(bin_path), map_location='cpu')


missing keys: 4
unexpected keys: 9


RobertaForRegression(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(767, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [11]:
from torch.optim import AdamW

def train_reg_epoch(model, loader, optimizer):
    """Train ``model`` for one pass over ``loader``; return the mean batch loss.

    Batches are moved to the module-level ``device`` and passed to the model
    as keyword arguments. An empty loader yields 0.0.
    """
    model.train()
    loss_sum = 0.0
    for cpu_batch in loader:
        gpu_batch = {}
        for key, value in cpu_batch.items():
            gpu_batch[key] = value.to(device)
        _, loss = model(**gpu_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_sum += loss.item()
    return loss_sum / max(1, len(loader))

def eval_reg_epoch(model, loader):
    """Return the mean batch loss of ``model`` over ``loader`` in eval mode.

    Gradient tracking is disabled for the whole pass. An empty loader yields 0.0.
    """
    model.eval()
    loss_sum = 0.0
    with torch.no_grad():
        for cpu_batch in loader:
            gpu_batch = {}
            for key, value in cpu_batch.items():
                gpu_batch[key] = value.to(device)
            _, loss = model(**gpu_batch)
            loss_sum += loss.item()
    return loss_sum / max(1, len(loader))

# Optimizer for the regression model. NOTE: this rebinds the module-level name
# `optimizer` that the MLM cell also assigns.
optimizer = AdamW(reg_model.parameters(), lr=5e-5)

# for epoch in range(2):
#     train_loss = train_reg_epoch(reg_model, reg_loader, optimizer)
#     eval_loss = eval_reg_epoch(reg_model, reg_loader)
#     print(f'[REG] epoch {epoch+1} | train_loss={train_loss:.4f} | eval_loss={eval_loss:.4f}')


## 5) Classification 파이프라인 (MolNet, multi-task 지원)
- MolNet(DeepChem) 로딩만 사용합니다.
- `DATASET_NAME`만 바꿔서 실험합니다. 예: `"bbbp"`, `"bace_classification"`, `"hiv"`, `"clintox"`.
- multi-task 데이터셋은 BCEWithLogits + 마스킹으로 처리합니다.


In [48]:
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np
from torch.optim import AdamW

# Split type per MoleculeNet dataset (original note: benchmark-standard splits,
# following the paper's table). Names not listed here fall back to "scaffold"
# where this mapping is looked up.
MOLNET_SPLITS = {
    "bace_classification": "scaffold",
    "bbbp": "scaffold",
    "clintox": "scaffold",
}

# Dataset configuration
DATASET_NAME = "clintox"
# Number of seeds to run. The original note says the ChemBERTa paper uses 5
# seeds (0-4) for mean±std; NOTE(review): the seed values actually used come
# from the SEEDS list in the training cell, not 0-4 — confirm which is intended.
NUM_SEEDS = 5

CLS_MAX_LEN = 128
CLS_BATCH_SIZE = 64  # original note: ChemBERTa paper uses 64
CLS_EPOCHS = 5


In [49]:
from chemberta.utils.molnet_dataloader import load_molnet_dataset

def _standardize_molnet_df(df, label_cols):
    df = df.copy()
    if "smiles" not in df.columns:
        df = df.rename(columns={df.columns[0]: "smiles"})
    cols = ["smiles"] + label_cols
    df = df[cols]
    for c in label_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df = df[df[label_cols].notna().any(axis=1)]
    return df

# Pick the split type for the chosen dataset (defaults to "scaffold" when the
# dataset name is not listed in MOLNET_SPLITS)
split_type = MOLNET_SPLITS.get(DATASET_NAME.lower(), "scaffold")
print(f"Dataset: {DATASET_NAME} | Split type: {split_type}")

# Load dataset using DeepChem
tasks, (train_df, valid_df, test_df), _ = load_molnet_dataset(
    DATASET_NAME, split=split_type, df_format="chemprop"
)
label_cols = list(tasks)

# Reduce every split to the "smiles" column plus numeric label columns.
train_df = _standardize_molnet_df(train_df, label_cols)
valid_df = _standardize_molnet_df(valid_df, label_cols)
test_df = _standardize_molnet_df(test_df, label_cols)

NUM_TASKS = len(label_cols)
print("train/valid/test:", len(train_df), len(valid_df), len(test_df))
print("tasks:", label_cols)


'split' is deprecated.  Use 'splitter' instead.


Dataset: clintox | Split type: scaffold
Using tasks ['CT_TOX'] from available tasks for clintox: ['FDA_APPROVED', 'CT_TOX']
train/valid/test: 1184 148 148
tasks: ['CT_TOX']


In [50]:
class SmilesClassificationDataset(Dataset):
    """SMILES strings with (possibly missing) multi-task labels.

    Missing labels (NaN) are stored as 0.0 and flagged in ``label_mask`` so a
    loss can ignore them. Each item carries the tokenizer outputs plus float32
    ``labels`` and ``label_mask`` tensors (mask is 1.0 where a label exists).
    """

    def __init__(self, df, tokenizer, label_cols, max_len=128):
        self.smiles = df["smiles"].astype(str).tolist()
        raw = df[label_cols].to_numpy(dtype=np.float32)
        # True where a label is present, False where it was NaN.
        self.label_mask = ~np.isnan(raw)
        self.labels = np.nan_to_num(raw, nan=0.0)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and attach its labels and label mask."""
        encoded = self.tokenizer(
            self.smiles[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        sample = {}
        for key, value in encoded.items():
            sample[key] = value.squeeze(0)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        sample["label_mask"] = torch.tensor(self.label_mask[idx], dtype=torch.float32)
        return sample


In [51]:
class RobertaClassificationHead(nn.Module):
    """Classification head applied to the hidden state of the first token.

    Pipeline: dropout -> dense -> tanh -> dropout -> projection to
    ``config.num_labels`` logits.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features):
        """Produce logits from ``features[:, 0, :]`` (the first position of each sequence)."""
        first_token = self.dropout(features[:, 0, :])
        hidden = torch.tanh(self.dense(first_token))
        return self.out_proj(self.dropout(hidden))

class RobertaForSequenceClassification(nn.Module):
    """Encoder with a multi-label classification head (BCE-with-logits loss).

    ``forward`` returns ``(logits, loss)``. With ``label_mask`` the loss is the
    sum of the element-wise losses at mask==1 positions divided by the number
    of such positions (at least 1); without it, the plain mean is used.
    """

    def __init__(self, config):
        super().__init__()
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)
        # reduction="none" keeps per-element losses so they can be masked.
        self.loss_fn = nn.BCEWithLogitsLoss(reduction="none")
        self.config = config

    def forward(self, input_ids, attention_mask=None, labels=None, label_mask=None):
        """Compute logits and, when ``labels`` is given, the (masked) loss."""
        hidden = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(hidden)
        if labels is None:
            return logits, None

        per_element = self.loss_fn(logits, labels)
        if label_mask is None:
            return logits, per_element.mean()

        # Clamp so a batch without any observed label does not divide by zero.
        n_observed = label_mask.sum().clamp(min=1.0)
        return logits, (per_element * label_mask).sum() / n_observed


In [52]:
def train_cls_epoch(model, loader, optimizer):
    """Train the classifier for one pass over ``loader``; return the mean batch loss.

    Each batch must contain ``"labels"`` and ``"label_mask"``; every other
    entry is forwarded to the model as a keyword argument. All tensors are
    moved to the module-level ``device``. An empty loader yields 0.0.
    """
    model.train()
    loss_sum = 0.0
    for batch in loader:
        moved = {key: value.to(device) for key, value in batch.items()}
        labels = moved.pop("labels")
        label_mask = moved.pop("label_mask")
        _, loss = model(**moved, labels=labels, label_mask=label_mask)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_sum += loss.item()
    return loss_sum / max(1, len(loader))

def _safe_mean(vals):
    vals = [v for v in vals if not np.isnan(v)]
    return float(np.mean(vals)) if vals else float("nan")

def eval_cls_epoch(model, loader, num_tasks):
    """Evaluate the classifier on ``loader``.

    Returns:
        ``(mean_batch_loss, metrics)`` where ``metrics`` has the keys
        ``"roc_auc"`` and ``"avg_precision"``, each averaged over the tasks
        for which it could be computed. A task is skipped (counted as NaN)
        when it has no observed label or only one class among its observed
        labels. An empty loader yields ``0.0`` and NaN metrics.
    """
    model.eval()
    loss_sum = 0.0
    true_chunks, prob_chunks, mask_chunks = [], [], []
    with torch.no_grad():
        for batch in loader:
            labels = batch["labels"].to(device)
            label_mask = batch["label_mask"].to(device)
            inputs = {
                key: value.to(device)
                for key, value in batch.items()
                if key not in ("labels", "label_mask")
            }
            logits, loss = model(**inputs, labels=labels, label_mask=label_mask)
            loss_sum += loss.item()
            true_chunks.append(labels.detach().cpu())
            prob_chunks.append(torch.sigmoid(logits).detach().cpu())
            mask_chunks.append(label_mask.detach().cpu())

    nan = float("nan")
    if not true_chunks:
        return 0.0, {"roc_auc": nan, "avg_precision": nan}

    y_true = torch.cat(true_chunks).numpy()
    y_prob = torch.cat(prob_chunks).numpy()
    y_mask = torch.cat(mask_chunks).numpy()

    roc_aucs, aps = [], []
    for task in range(num_tasks):
        observed = y_mask[:, task].astype(bool)
        task_true = y_true[observed, task]
        task_prob = y_prob[observed, task]
        # Both metrics need at least one observed label and both classes.
        if observed.sum() == 0 or len(np.unique(task_true)) < 2:
            roc_aucs.append(nan)
            aps.append(nan)
            continue
        roc_aucs.append(roc_auc_score(task_true, task_prob))
        aps.append(average_precision_score(task_true, task_prob))

    mean_loss = loss_sum / max(1, len(loader))
    metrics = {
        "roc_auc": _safe_mean(roc_aucs),
        "avg_precision": _safe_mean(aps),
    }
    return mean_loss, metrics


In [53]:
# One dataset per split, all sharing the tokenizer, label columns and max length.
train_dataset = SmilesClassificationDataset(train_df, tokenizer, label_cols, max_len=CLS_MAX_LEN)
valid_dataset = SmilesClassificationDataset(valid_df, tokenizer, label_cols, max_len=CLS_MAX_LEN)
test_dataset = SmilesClassificationDataset(test_df, tokenizer, label_cols, max_len=CLS_MAX_LEN)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=CLS_BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=CLS_BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CLS_BATCH_SIZE, shuffle=False)

# Classification config: the HF checkpoint config with one output per task.
config = load_config_from_hf(MODEL_NAME, local_only=LOCAL_ONLY)
config.num_labels = int(NUM_TASKS)


In [54]:
# Multi-seed fine-tuning: train a fresh classifier per seed and report the test
# metrics as mean ± std across seeds (the original note attributes the 5-seed
# protocol to the ChemBERTa paper).
# Use distributed seeds to avoid correlation between consecutive seeds
SEEDS = [42, 123, 1337, 2023, 9999]
# Seeds that will actually run; may be fewer than NUM_SEEDS if SEEDS is shorter.
run_seeds = SEEDS[:NUM_SEEDS]

# Read the checkpoint from disk once. load_state_dict() copies the values into
# each new model, so the same dict can be reused for every seed.
state_dict_pretrained = (
    load_state_dict_from_hf(MODEL_NAME, local_only=LOCAL_ONLY)
    if USE_PRETRAINED_WEIGHTS
    else None
)

all_test_results = []

for idx, seed in enumerate(run_seeds):
    print(f"{'='*60}")
    print(f"SEED {idx+1}/{len(run_seeds)} (seed={seed})")
    print(f"{'='*60}")

    # Set seed (before model initialization for reproducibility)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Reset model for each seed
    cls_model = RobertaForSequenceClassification(config).to(device)
    if state_dict_pretrained is not None:
        # strict=False: keys that do not match this model are skipped.
        cls_model.load_state_dict(state_dict_pretrained, strict=False)

    optimizer = AdamW(cls_model.parameters(), lr=5e-5)

    for epoch in range(CLS_EPOCHS):
        train_loss = train_cls_epoch(cls_model, train_loader, optimizer)
        val_loss, val_metrics = eval_cls_epoch(cls_model, valid_loader, NUM_TASKS)
        print(
            f"[CLS] epoch {epoch+1} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f} | "
            f"roc_auc={val_metrics['roc_auc']:.4f} | ap={val_metrics['avg_precision']:.4f}"
        )

    # The model after the last epoch is the one scored on the test split.
    test_loss, test_metrics = eval_cls_epoch(cls_model, test_loader, NUM_TASKS)
    print(
        f"[CLS] test_loss={test_loss:.4f} | roc_auc={test_metrics['roc_auc']:.4f} | ap={test_metrics['avg_precision']:.4f}"
    )
    all_test_results.append(test_metrics)

# Calculate mean and std across seeds. The NaN-aware reductions keep one seed
# with an undefined metric from turning the whole summary into NaN.
print(f"{'='*60}")
print(f"FINAL RESULTS (mean ± std over {len(all_test_results)} seeds)")
print(f"{'='*60}")
roc_aucs = [r['roc_auc'] for r in all_test_results]
aps = [r['avg_precision'] for r in all_test_results]
print(f"ROC-AUC: {np.nanmean(roc_aucs):.4f} ± {np.nanstd(roc_aucs):.4f}")
print(f"Avg Precision: {np.nanmean(aps):.4f} ± {np.nanstd(aps):.4f}")
print(f"Individual ROC-AUCs: {[f'{x:.4f}' for x in roc_aucs]}")


SEED 1/5 (seed=42)


  return torch.load(str(bin_path), map_location='cpu')


[CLS] epoch 1 | train_loss=0.2721 | val_loss=0.1791 | roc_auc=0.7523 | ap=0.9872
[CLS] epoch 2 | train_loss=0.1946 | val_loss=0.1724 | roc_auc=0.8040 | ap=0.9905
[CLS] epoch 3 | train_loss=0.1467 | val_loss=0.1573 | roc_auc=0.8850 | ap=0.9948
[CLS] epoch 4 | train_loss=0.0998 | val_loss=0.1682 | roc_auc=0.8815 | ap=0.9946
[CLS] epoch 5 | train_loss=0.0739 | val_loss=0.2025 | roc_auc=0.9108 | ap=0.9961
[CLS] test_loss=0.1745 | roc_auc=0.8841 | ap=0.9915
SEED 2/5 (seed=123)


  return torch.load(str(bin_path), map_location='cpu')


[CLS] epoch 1 | train_loss=0.2809 | val_loss=0.1745 | roc_auc=0.7195 | ap=0.9847
[CLS] epoch 2 | train_loss=0.1979 | val_loss=0.1654 | roc_auc=0.7653 | ap=0.9876
[CLS] epoch 3 | train_loss=0.1383 | val_loss=0.1517 | roc_auc=0.9120 | ap=0.9962
[CLS] epoch 4 | train_loss=0.0902 | val_loss=0.1687 | roc_auc=0.8873 | ap=0.9947
[CLS] epoch 5 | train_loss=0.0645 | val_loss=0.1650 | roc_auc=0.9378 | ap=0.9973
[CLS] test_loss=0.1939 | roc_auc=0.8881 | ap=0.9914
SEED 3/5 (seed=1337)


  return torch.load(str(bin_path), map_location='cpu')


[CLS] epoch 1 | train_loss=0.3206 | val_loss=0.1914 | roc_auc=0.6174 | ap=0.9694
[CLS] epoch 2 | train_loss=0.2190 | val_loss=0.2012 | roc_auc=0.6796 | ap=0.9794
[CLS] epoch 3 | train_loss=0.1629 | val_loss=0.1664 | roc_auc=0.7864 | ap=0.9879
[CLS] epoch 4 | train_loss=0.1140 | val_loss=0.1873 | roc_auc=0.8404 | ap=0.9920
[CLS] epoch 5 | train_loss=0.0855 | val_loss=0.1943 | roc_auc=0.9143 | ap=0.9962
[CLS] test_loss=0.1911 | roc_auc=0.8273 | ap=0.9864
SEED 4/5 (seed=2023)


  return torch.load(str(bin_path), map_location='cpu')


[CLS] epoch 1 | train_loss=0.2829 | val_loss=0.2131 | roc_auc=0.7265 | ap=0.9845
[CLS] epoch 2 | train_loss=0.1975 | val_loss=0.1590 | roc_auc=0.8392 | ap=0.9923
[CLS] epoch 3 | train_loss=0.1511 | val_loss=0.1553 | roc_auc=0.8908 | ap=0.9951
[CLS] epoch 4 | train_loss=0.1116 | val_loss=0.1724 | roc_auc=0.9002 | ap=0.9955
[CLS] epoch 5 | train_loss=0.0755 | val_loss=0.1371 | roc_auc=0.9026 | ap=0.9957
[CLS] test_loss=0.1665 | roc_auc=0.8633 | ap=0.9893
SEED 5/5 (seed=9999)


  return torch.load(str(bin_path), map_location='cpu')


[CLS] epoch 1 | train_loss=0.2918 | val_loss=0.1820 | roc_auc=0.6984 | ap=0.9804
[CLS] epoch 2 | train_loss=0.2081 | val_loss=0.1694 | roc_auc=0.7688 | ap=0.9874
[CLS] epoch 3 | train_loss=0.1510 | val_loss=0.1542 | roc_auc=0.8744 | ap=0.9942
[CLS] epoch 4 | train_loss=0.1001 | val_loss=0.1818 | roc_auc=0.9155 | ap=0.9961
[CLS] epoch 5 | train_loss=0.0773 | val_loss=0.1752 | roc_auc=0.9296 | ap=0.9969
[CLS] test_loss=0.2053 | roc_auc=0.8153 | ap=0.9861
FINAL RESULTS (mean ± std over 5 seeds)
ROC-AUC: 0.8556 ± 0.0295
Avg Precision: 0.9890 ± 0.0023
Individual ROC-AUCs: ['0.8841', '0.8881', '0.8273', '0.8633', '0.8153']
