In [1]:
!pip install transformers



In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv("/kaggle/input/neurips-open-polymer-prediction-2025/train.csv")  # replace with actual path
target_cols = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
print(df[target_cols].isnull().sum())

Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64


In [3]:
df = df[df[target_cols].notnull().any(axis=1)].reset_index(drop=True)

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[target_cols] = scaler.fit_transform(df[target_cols])

# Save the scaler to use it during inference
import joblib
joblib.dump(scaler, 'target_scaler.pkl')

['target_scaler.pkl']

In [5]:
df_clean = df[['SMILES'] + target_cols]
df_clean.to_csv("preprocessed_train.csv", index=False)

df_clean.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,SMILES,Tg,FFV,Tc,Density,Rg
0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.251068,-0.566261,,
1,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.108023,,,
2,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.393413,,,
3,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.679303,,,
4,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,-0.396594,,,


In [6]:
from transformers import AutoTokenizer

# Load ChemBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

# Tokenize the SMILES strings
# (pad to longest sequence in batch, truncate if needed)
tokenized = tokenizer(
    list(df_clean["SMILES"]),
    padding="max_length",
    truncation=True,
    max_length=256,
    return_tensors="pt"
)

# Check output
print(tokenized.keys())  # ['input_ids', 'attention_mask']
print(tokenized['input_ids'].shape)


tokenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/501 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/9.43k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

dict_keys(['input_ids', 'attention_mask'])
torch.Size([7973, 256])


In [7]:
import torch

# Get target tensor (float32)
targets = torch.tensor(df_clean[target_cols].values, dtype=torch.float32)


In [8]:
from torch.utils.data import Dataset

class PolymerDataset(Dataset):
    def __init__(self, encodings, targets):
        self.encodings = encodings
        self.targets = targets

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.targets[idx]
        return item

    def __len__(self):
        return len(self.targets)
dataset = PolymerDataset(tokenized, targets)


In [9]:
!pip install pytorch-lightning

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cusparse_cu12-12

In [10]:
import pytorch_lightning as pl
from transformers import AutoModel
import torch.nn as nn
import torch

class ChemBERTaRegressor(pl.LightningModule):
    def __init__(self, n_targets=5, lr=2e-5):
        super().__init__()
        self.save_hyperparameters()
        self.chemberta = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
        self.regressor = nn.Linear(self.chemberta.config.hidden_size, n_targets)
        self.loss_fn = nn.MSELoss(reduction='none')  # we'll manually mask it
        self.lr = lr

    def forward(self, input_ids, attention_mask):
        outputs = self.chemberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # CLS token
        return self.regressor(pooled_output)

    def compute_loss(self, preds, targets):
        mask = ~torch.isnan(targets)
        loss = self.loss_fn(preds, targets)
        masked_loss = loss[mask]
        return masked_loss.mean()

    def training_step(self, batch, batch_idx):
        preds = self(batch['input_ids'], batch['attention_mask'])
        loss = self.compute_loss(preds, batch['labels'])
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        preds = self(batch['input_ids'], batch['attention_mask'])
        loss = self.compute_loss(preds, batch['labels'])
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)


In [11]:
from torch.utils.data import DataLoader, random_split

# 90/10 split
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_ds, val_ds = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=16)

In [12]:
from transformers import AutoTokenizer, AutoModel

chemberta_path = "/kaggle/input/chemberta-offline-model/chemberta"

tokenizer = AutoTokenizer.from_pretrained(chemberta_path)
model = AutoModel.from_pretrained(chemberta_path)


2025-06-19 09:36:55.287457: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750325815.463150      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750325815.519109      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
import pytorch_lightning as pl
from transformers import AutoModel
import torch.nn as nn
import torch

class ChemBERTaRegressor(pl.LightningModule):
    def __init__(self, n_targets=5, lr=2e-5, model_path=None):
        super().__init__()
        self.save_hyperparameters()

        # Make sure model_path is passed
        if model_path is None:
            raise ValueError("You must pass model_path for offline ChemBERTa usage.")

        self.chemberta = AutoModel.from_pretrained(model_path)
        self.regressor = nn.Linear(self.chemberta.config.hidden_size, n_targets)
        self.loss_fn = nn.MSELoss(reduction='none')
        self.lr = lr

    def forward(self, input_ids, attention_mask):
        outputs = self.chemberta(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = outputs.last_hidden_state[:, 0, :]  # CLS token
        return self.regressor(cls_token)

    def compute_loss(self, preds, targets):
        mask = ~torch.isnan(targets)
        loss = self.loss_fn(preds, targets)
        masked_loss = loss[mask]
        return masked_loss.mean()

    def training_step(self, batch, batch_idx):
        preds = self(batch['input_ids'], batch['attention_mask'])
        loss = self.compute_loss(preds, batch['labels'])
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        preds = self(batch['input_ids'], batch['attention_mask'])
        loss = self.compute_loss(preds, batch['labels'])
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)


In [14]:
model_path = "/kaggle/input/chemberta-offline-model/chemberta"
model = ChemBERTaRegressor(model_path=model_path)
trainer = pl.Trainer(
    max_epochs=5,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    precision=16,
)

trainer.fit(model, train_loader, val_loader)


/usr/local/lib/python3.11/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [15]:
# Load test data
test_df = pd.read_csv("/kaggle/input/neurips-open-polymer-prediction-2025/test.csv")  # replace path

# Tokenize SMILES
test_tokenized = tokenizer(
    list(test_df["SMILES"]),
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Wrap in a Dataset
class TestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])

test_dataset = TestDataset(test_tokenized)
test_loader = DataLoader(test_dataset, batch_size=16)


In [16]:
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        preds = model(batch['input_ids'], batch['attention_mask'])
        predictions.append(preds.cpu())
all_preds = torch.cat(predictions, dim=0).numpy()


In [17]:
scaler = joblib.load("target_scaler.pkl")
final_preds = scaler.inverse_transform(all_preds)

In [18]:

target_cols = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
submission_df = test_df[["id"]].copy()
submission_df[target_cols] = final_preds


submission_df.to_csv("submission.csv", index=False)
submission_df.head()

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,160.099625,0.383385,0.362935,1.048936,16.710196
1,1422188626,99.689941,0.371782,0.295857,1.168385,12.914957
2,2032016830,80.613205,0.363323,0.303689,1.176921,12.100863
