# Packages

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
 
from sklearn.linear_model import ElasticNet 
from sklearn.model_selection import KFold, GridSearchCV  
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader   
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel, AutoModelForSequenceClassification

# Dataset and Pretrained Model

* The training dataset `train.csv` is loaded to `df_train`.
* The transformer model `ChemBerta` is available in Kaggle (https://www.kaggle.com/code/michaelrowen/opp2025-chemberta-pre-trained-base).

In [2]:
df_train = pd.read_csv('/kaggle/input/melting-point/train.csv')
chemberta_model = '/kaggle/input/c/transformers/default/1/ChemBERTa-77M-MLM'

In [130]:
from sklearn.preprocessing import StandardScaler
Tm_Scaler = StandardScaler()
Tm_Scaler.fit(df_train[['Tm']])
df_train['TmS'] = Tm_Scaler.transform(df_train[['Tm']]).flatten()

Define the data handler for fine-tuning ChemBerta with pytorch.

In [140]:
class ChemDataset(Dataset):  
    def __init__(self, df, tokenizer, max_length=128):  
        self.smiles = df['SMILES'].tolist()  
        self.labels = df['TmS'].tolist()  
        self.tokenizer = tokenizer  
        self.max_length = max_length  
  
    def __len__(self):  
        return len(self.labels)  
  
    def __getitem__(self, idx):  
        encoding = self.tokenizer(  
            self.smiles[idx],  
            truncation=True,  
            padding='max_length',  
            max_length=self.max_length,  
            return_tensors='pt'  
        )  
        item = {key: val.squeeze(0) for key, val in encoding.items()}  
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)  
        return item  

Define the transformer model ChemBerta.

In [141]:
# Model retrieved from https://www.kaggle.com/code/michaelrowen/opp2025-chemberta-pre-trained-base
class BERTEmbedder:
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(  
            model_name,
            num_labels = 1,  # Regression task  
            problem_type = 'regression'
        )  
        self.model.eval()

Load the transfomer model.

In [142]:
chemberta = BERTEmbedder(model_name=chemberta_model)
optimizer = AdamW(chemberta.model.parameters(), lr=5e-4)
loss_fn = nn.L1Loss()

dataset = ChemDataset(df_train, chemberta.tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/c/transformers/default/1/ChemBERTa-77M-MLM and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The unit test for creating and loading `ChemDataset` in development. 

In [143]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'  
chemberta.model.to(device)

n_epochs = 50
for epoch in range(n_epochs):
    chemberta.model.train()
    epoch_loss = 0
    epoch_size = 0
    for batch in dataloader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}  
        labels = batch['labels'].to(device).unsqueeze(1)  # shape [B,1]  
        outputs = chemberta.model(**inputs).logits  # shape [B,1] 
        loss = loss_fn(outputs, labels)  
        optimizer.zero_grad()  
        loss.backward()  
        optimizer.step()  
        epoch_loss += loss.item() * dataloader.batch_size * (Tm_Scaler.var_[0]**0.5)
        epoch_size += dataloader.batch_size
    
    print(f"Epoch {epoch + 1} done, last batch loss: {epoch_loss/epoch_size:.4f}")  

Epoch 1 done, last batch loss: 45.9048
Epoch 2 done, last batch loss: 39.5324
Epoch 3 done, last batch loss: 36.4212
Epoch 4 done, last batch loss: 34.5219
Epoch 5 done, last batch loss: 33.0328
Epoch 6 done, last batch loss: 32.2097
Epoch 7 done, last batch loss: 30.9195
Epoch 8 done, last batch loss: 30.4775
Epoch 9 done, last batch loss: 29.8145
Epoch 10 done, last batch loss: 29.5090
Epoch 11 done, last batch loss: 28.8498
Epoch 12 done, last batch loss: 28.0860
Epoch 13 done, last batch loss: 27.5519
Epoch 14 done, last batch loss: 26.8073
Epoch 15 done, last batch loss: 26.5618
Epoch 16 done, last batch loss: 26.5693
Epoch 17 done, last batch loss: 26.5914
Epoch 18 done, last batch loss: 26.1552
Epoch 19 done, last batch loss: 25.4796
Epoch 20 done, last batch loss: 25.3421
Epoch 21 done, last batch loss: 24.9859
Epoch 22 done, last batch loss: 24.2280
Epoch 23 done, last batch loss: 23.7546
Epoch 24 done, last batch loss: 23.7413
Epoch 25 done, last batch loss: 25.6481
Epoch 26 

In [144]:
chemberta.model.to('cpu')

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(600, 384, padding_idx=1)
      (position_embeddings): Embedding(515, 384, padding_idx=1)
      (token_type_embeddings): Embedding(1, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.144, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-2): 3 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.109, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
         

In [145]:
def extract_chembert_embeddings(smiles_list, embedder, n_data):
    n_latent = 384
    embeddings = np.zeros((n_data, n_latent))
    
    for i, smiles in enumerate(smiles_list):
        with torch.no_grad():
            # Getting the model output
            encoded_input = embedder.tokenizer(smiles, return_tensors='pt', padding=True, truncation=True)
            model_output = embedder.model(**encoded_input, output_hidden_states=True)
            
            # Getting the CLS token from model output
            embedding = model_output.hidden_states[3][:, 0, :]
            embeddings[i, :] = embedding.numpy()
    
    return pd.DataFrame(embeddings)

In [146]:
# Code retrieved from https://www.kaggle.com/code/michaelrowen/opp2025-chemberta-pre-trained-base
def extract_simple_molecular_features(smiles_list):
    features = []
    for smiles in smiles_list:
        feature_vector = [
            len(smiles),  # SMILES length
            smiles.count('C'),  # Carbon count
            smiles.count('N'),  # Nitrogen count
            smiles.count('O'),  # Oxygen count
            smiles.count('S'),  # Sulfur count
            smiles.count('P'),  # Phosphorus count
            smiles.count('F'),  # Fluorine count
            smiles.count('Cl'),  # Chlorine count
            smiles.count('Br'),  # Bromine count
            smiles.count('I'),  # Iodine count
            smiles.count('='),  # Double bonds
            smiles.count('#'),  # Triple bonds
            smiles.count('-'),  # Single bonds
            smiles.count('(') + smiles.count(')'),  # Branching
            smiles.count('[') + smiles.count(']'),  # Bracket atoms
            smiles.count('@'),  # Chirality centers
            smiles.count('c'),  # Aromatic carbon
            smiles.count('n'),  # Aromatic nitrogen
            smiles.count('o'),  # Aromatic oxygen
            smiles.count('s'),  # Aromatic sulfur
        ]
        features.append(feature_vector)
    
    return pd.DataFrame(features)

In [147]:
embeddings_train = extract_chembert_embeddings(df_train['SMILES'], chemberta, df_train.shape[0])

In [148]:
#embeddings_train = extract_chembert_embeddings(df_train['SMILES'], chemberta, df_train.shape[0])
molecular_features_train = extract_simple_molecular_features(df_train['SMILES'])

In [149]:
print(embeddings_train.shape, type(embeddings_train))
print(molecular_features_train.shape, type(molecular_features_train))

(2662, 384) <class 'pandas.core.frame.DataFrame'>
(2662, 20) <class 'pandas.core.frame.DataFrame'>


In [150]:
df_ttl = pd.concat([df_train, embeddings_train, molecular_features_train], axis=1)
y_train = df_ttl['TmS']
# X_train = df_ttl.drop(df_train.columns, axis=1)
X_train = df_ttl.drop(['id', 'Tm', 'TmS', 'SMILES'], axis=1)
X_train.columns = [str(colname) for colname in X_train.columns]

In [152]:
# from sklearn.ensemble import GradientBoostingRegressor
seed = 20251017

# Initialize the classifier  
#model = RandomForestRegressor(n_jobs=-1)
model = ElasticNet()

# Define hyperparameter grid for optimization  
param_grid = {
    'alpha' : [0.1, 0.5, 1, 5, 50, 100, 200],
    'l1_ratio': [0.25, 0.5, 0.75]
}  

# Set up cross-validation  
cv = KFold(n_splits=10, shuffle=True, random_state=seed)  
  
# Set up GridSearchCV  
gs = GridSearchCV(  
    estimator=model,  
    param_grid=param_grid,  
    cv=cv,  
    scoring='neg_mean_absolute_error',  
    verbose=1,  
    n_jobs=-1  
)  
  
# Fit the model with categorical feature information  
gs.fit(X_train, y_train)  
  
# Print the best hyperparameters and score  
print("Best hyperparameters:", gs.best_params_)  
print("Best CV accuracy:", gs.best_score_ * (Tm_Scaler.var_[0]**0.5))  
  
# Get the best model  
best_model = gs.best_estimator_

Fitting 10 folds for each of 21 candidates, totalling 210 fits
Best hyperparameters: {'alpha': 0.1, 'l1_ratio': 0.25}
Best CV accuracy: -26.33931911040452


In [154]:
best_model

In [161]:
df_test = pd.read_csv('/kaggle/input/melting-point/test.csv')
embeddings_test = extract_chembert_embeddings(df_test['SMILES'], chemberta, df_test.shape[0])
molecular_features_test = extract_simple_molecular_features(df_test['SMILES'])

df_ttl = pd.concat([df_test, embeddings_test, molecular_features_test], axis=1)
# X_test = df_ttl.drop(df_test.columns, axis=1)
X_test = df_ttl.drop(['id', 'SMILES'], axis=1)
X_test.columns = [str(colname) for colname in X_test.columns]

y_pred = Tm_Scaler.inverse_transform(np.expand_dims(best_model.predict(X_test), axis=1))

In [164]:
df_out = pd.DataFrame({'id': df_ttl['id'],'Tm': y_pred.flatten()})
df_out.to_csv('./submission.csv', index=False)