In [None]:
import re

import numpy as np
import pandas as pd
import torch

from sklearn.linear_model import ElasticNet
from sklearn.model_selection import KFold, GridSearchCV
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel

In [None]:
# Training table; later cells read its 'SMILES', 'id' and 'Tm' columns.
df_train = pd.read_csv('/kaggle/input/melting-point/train.csv')
# Path to the pretrained ChemBERTa checkpoint; handed to BERTEmbedder, which
# forwards it to the `from_pretrained` loaders.
chemberta_model = '/kaggle/input/c/transformers/default/1/ChemBERTa-77M-MLM'

In [None]:
# Model retrieved from https://www.kaggle.com/code/michaelrowen/opp2025-chemberta-pre-trained-base
class BERTEmbedder:
    """Pair a pretrained tokenizer with its encoder, ready for inference.

    Attributes:
        tokenizer: object returned by ``AutoTokenizer.from_pretrained(model_name)``.
        model: object returned by ``AutoModel.from_pretrained(model_name)``,
            switched to evaluation mode.
    """

    def __init__(self, model_name):
        """Load tokenizer and encoder; ``model_name`` is passed straight to ``from_pretrained``."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        encoder = AutoModel.from_pretrained(model_name)
        # Evaluation mode: the encoder is only used to produce embeddings.
        encoder.eval()

        self.tokenizer = tokenizer
        self.model = encoder

# Load the pretrained encoder once; later cells reuse the `chemberta` instance.
print('Loading ChemBERTa model...')
try:
    chemberta = BERTEmbedder(model_name=chemberta_model)
except Exception as e:
    print(f'Error loading ChemBERTa: {e}')
    # Re-raise instead of continuing: without `chemberta` every later cell
    # would fail with an unrelated-looking NameError.
    raise
else:
    print('ChemBERTa loaded successfully!')

In [None]:
def extract_chembert_embeddings(smiles_list, embedder, n_data=None):
    """Embed each SMILES string with the encoder held by ``embedder``.

    Every string is tokenized and encoded on its own; the hidden state at
    sequence position 0 of the model's first output is stored as that
    molecule's embedding.

    Args:
        smiles_list: iterable of SMILES strings (e.g. a pandas Series).
        embedder: object exposing ``tokenizer`` and ``model`` attributes,
            such as ``BERTEmbedder``.
        n_data: number of rows to allocate. Defaults to ``len(smiles_list)``.
            Rows beyond the number of inputs stay zero.

    Returns:
        pandas.DataFrame of shape ``(n_data, hidden_size)`` with a default
        integer index and integer column labels.

    Raises:
        IndexError: if ``n_data`` is smaller than the number of inputs.
    """
    smiles_seq = list(smiles_list)
    if n_data is None:
        n_data = len(smiles_seq)

    embeddings = None
    # Inference only: no gradients are needed for any of the forward passes.
    with torch.no_grad():
        for i, smiles in enumerate(smiles_seq):
            encoded_input = embedder.tokenizer(
                smiles, return_tensors='pt', padding=True, truncation=True
            )
            model_output = embedder.model(**encoded_input)

            # First output, first sequence position, flattened to 1-D.
            cls_vector = model_output[0][:, 0, :].squeeze(0).cpu().numpy()

            if embeddings is None:
                # Size the buffer from the model's actual output width rather
                # than a hard-coded constant, so any encoder can be plugged in.
                embeddings = np.zeros((n_data, cls_vector.shape[0]))
            embeddings[i, :] = cls_vector

    if embeddings is None:
        # No input was encoded: fall back to the configured hidden size, or to
        # the historical default of 384 when the model exposes no config.
        config = getattr(embedder.model, 'config', None)
        n_latent = getattr(config, 'hidden_size', 384)
        embeddings = np.zeros((n_data, n_latent))

    return pd.DataFrame(embeddings)

In [None]:
# Code adapted from https://www.kaggle.com/code/michaelrowen/opp2025-chemberta-pre-trained-base
# Matches one atom at a time. Group 1 is the element symbol of a bracket atom
# (leading isotope digits are skipped, the rest of the bracket is consumed);
# group 2 is an unbracketed organic-subset atom. Two-letter symbols are tried
# before one-letter ones so 'Cl' / 'Br' are never read as 'C' / 'B'.
_SMILES_ATOM_RE = re.compile(
    r'\[\d*(se|as|[bcnops]|[A-Z][a-z]?)[^\]]*\]'
    r'|(Cl|Br|[BCNOPSFI]|[bcnops])'
)
# A complete bracket atom; removed before counting '-' so charges are ignored.
_SMILES_BRACKET_RE = re.compile(r'\[[^\]]*\]')
# One chirality tag, written either '@' or '@@'.
_SMILES_CHIRAL_RE = re.compile(r'@@?')


def _smiles_feature_vector(smiles):
    """Return the 20 count features for a single SMILES string."""
    atoms = {}
    for match in _SMILES_ATOM_RE.finditer(smiles):
        symbol = match.group(1) or match.group(2)
        atoms[symbol] = atoms.get(symbol, 0) + 1

    outside_brackets = _SMILES_BRACKET_RE.sub('', smiles)

    return [
        len(smiles),  # SMILES length
        atoms.get('C', 0),  # Carbon count (aliphatic)
        atoms.get('N', 0),  # Nitrogen count (aliphatic)
        atoms.get('O', 0),  # Oxygen count (aliphatic)
        atoms.get('S', 0),  # Sulfur count (aliphatic)
        atoms.get('P', 0),  # Phosphorus count
        atoms.get('F', 0),  # Fluorine count
        atoms.get('Cl', 0),  # Chlorine count
        atoms.get('Br', 0),  # Bromine count
        atoms.get('I', 0),  # Iodine count
        smiles.count('='),  # Double bonds
        smiles.count('#'),  # Triple bonds
        outside_brackets.count('-'),  # Explicit single bonds (charges excluded)
        smiles.count('(') + smiles.count(')'),  # Branching parentheses
        smiles.count('[') + smiles.count(']'),  # Bracket characters
        len(_SMILES_CHIRAL_RE.findall(smiles)),  # Chirality tags ('@@' counts once)
        atoms.get('c', 0),  # Aromatic carbon
        atoms.get('n', 0),  # Aromatic nitrogen
        atoms.get('o', 0),  # Aromatic oxygen
        atoms.get('s', 0),  # Aromatic sulfur
    ]


def extract_simple_molecular_features(smiles_list):
    """Build simple count-based descriptors from SMILES strings.

    Atoms are counted per atom token instead of per character, so a symbol is
    never counted as part of another element: the 'C' of 'Cl' is not a carbon,
    '[Na+]' is not a nitrogen, '[Sn]' is not an aromatic nitrogen. A '-' inside
    a bracket atom (a charge) is not counted as a bond, and '@@' is one
    chirality tag rather than two.

    Args:
        smiles_list: iterable of SMILES strings.

    Returns:
        pandas.DataFrame with one row per input and 20 integer-labelled
        columns, in this order: length, C, N, O, S, P, F, Cl, Br, I, '=', '#',
        '-' outside brackets, parentheses, bracket characters, chirality tags,
        aromatic c, n, o, s.
    """
    features = [_smiles_feature_vector(smiles) for smiles in smiles_list]
    return pd.DataFrame(features)

In [None]:
# Two feature families for the training molecules: learned embeddings and
# hand-crafted SMILES counts.
embeddings_train = extract_chembert_embeddings(
    smiles_list=df_train['SMILES'],
    embedder=chemberta,
    n_data=len(df_train),
)
molecular_features_train = extract_simple_molecular_features(
    smiles_list=df_train['SMILES']
)

In [None]:
# Sanity check: report shape and container type of both feature frames.
for frame in (embeddings_train, molecular_features_train):
    print(frame.shape, type(frame))

In [None]:
# Assemble one wide training table: CSV columns + embeddings + SMILES counts,
# then discard the identifier and the raw SMILES text.
# NOTE: both feature frames carry default integer column labels, so after the
# str() conversion below the low-numbered labels ('0', '1', ...) occur twice.
df_ttl = pd.concat(
    [df_train, embeddings_train, molecular_features_train], axis=1
).drop(columns=['id', 'SMILES'])

# Target is the 'Tm' column; everything else is a predictor.
y_train = df_ttl['Tm']
X_train = df_ttl.drop(columns=['Tm'])
# Column labels must all be strings (the feature frames contribute integers).
X_train.columns = X_train.columns.map(str)

In [None]:
seed = 4

# Initialize the regressor (linear model with a combined L1/L2 penalty)
model = ElasticNet()

# Define hyperparameter grid for optimization
param_grid = {
    'alpha': [0.1, 0.5, 1, 5, 50, 100, 200],
    'l1_ratio': [0.25, 0.5, 0.75],
}

# Set up cross-validation (shuffled, seeded so the folds are reproducible)
cv = KFold(n_splits=10, shuffle=True, random_state=seed)

# Set up GridSearchCV
gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='neg_mean_absolute_error',
    verbose=1,
    n_jobs=-1,
)

# Run the search over every grid point
gs.fit(X_train, y_train)

# Print the best hyperparameters and score.
# The scorer is the *negated* MAE (higher is better), so flip the sign to
# report the mean absolute error itself.
print("Best hyperparameters:", gs.best_params_)
print("Best CV MAE:", -gs.best_score_)

# Get the best model
best_model = gs.best_estimator_

In [None]:
# Bare expression: displays the estimator taken from `gs.best_estimator_`.
best_model

In [None]:
# Rebuild the same feature layout as for training, this time from the test split.
df_test = pd.read_csv('/kaggle/input/melting-point/test.csv')
embeddings_test = extract_chembert_embeddings(
    smiles_list=df_test['SMILES'],
    embedder=chemberta,
    n_data=len(df_test),
)
molecular_features_test = extract_simple_molecular_features(
    smiles_list=df_test['SMILES']
)

# NOTE: `df_ttl` is rebound here (it previously held the training table);
# the submission cell reads its 'id' column.
df_ttl = pd.concat([df_test, embeddings_test, molecular_features_test], axis=1)
X_test = df_ttl.drop(columns=['id', 'SMILES'])
X_test.columns = X_test.columns.map(str)

y_pred = best_model.predict(X_test)

In [None]:
# Submission file: one row per test molecule with its id and predicted Tm.
df_out = df_ttl[['id']].assign(Tm=y_pred)
df_out.to_csv('./submission.csv', index=False)