Original Notebook: 

https://www.kaggle.com/code/samithsachidanandan/neurips-rdkit-multi-models-lb-0-033

# Import Dependencies 

In [1]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


## Importing Required Libraries

In [2]:
 # Let's begin by importing the essential Python libraries needed for data processing, visualization, and modeling.

import pandas as pd
import numpy as np


from sklearn.ensemble import HistGradientBoostingRegressor,ExtraTreesRegressor

from xgboost import XGBRegressor
from catboost import CatBoostRegressor



from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split, cross_val_score



import networkx as nx
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops
from rdkit import Chem

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [3]:
class CFG:
    TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
    SEED = 42
    FOLDS = 5

In [4]:
useless_cols = [   
    
    'MaxPartialCharge', 
    # Nan data
    'BCUT2D_MWHI',
    'BCUT2D_MWLOW',
    'BCUT2D_CHGHI',
    'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI',
    'BCUT2D_LOGPLOW',
    'BCUT2D_MRHI',
    'BCUT2D_MRLOW',

    # Constant data
    'NumRadicalElectrons',
    'SMR_VSA8',
    'SlogP_VSA9',
    'fr_barbitur',
    'fr_benzodiazepine',
    'fr_dihydropyridine',
    'fr_epoxide',
    'fr_isothiocyan',
    'fr_lactam',
    'fr_nitroso',
    'fr_prisulfonamd',
    'fr_thiocyan',

    # High correlated data >0.95
    'MaxEStateIndex',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumValenceElectrons',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Kappa1',
    'LabuteASA',
    'HeavyAtomCount',
    'MolMR',
    'Chi3n',
    'BertzCT',
    'Chi2v',
    'Chi4n',
    'HallKierAlpha',
    'Chi3v',
    'Chi4v',
    'MinAbsPartialCharge',
    'MinPartialCharge',
    'MaxAbsPartialCharge',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'Phi',
    'Kappa3',
    'fr_nitrile',
    'SlogP_VSA6',
    'NumAromaticCarbocycles',
    'NumAromaticRings',
    'fr_benzene',
    'VSA_EState6',
    'NOCount',
    'fr_C_O',
    'fr_C_O_noCOO',
    'NumHDonors',
    'fr_amide',
    'fr_Nhpyrrole',
    'fr_phenol',
    'fr_phenol_noOrthoHbond',
    'fr_COO2',
    'fr_halogen',
    'fr_diazo',
    'fr_nitro_arom',
    'fr_phos_ester'
]


# Read Files

### Main Files

In [5]:
#We will load both the training and test datasets using pandas, and store test IDs 
train=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
ss=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')
ID=test['id'].copy()

### Extra Files

In [6]:
tc_smiles =pd.read_csv('/kaggle/input/tc-smiles/Tc_SMILES.csv')
tg_smiles =pd.read_csv('/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv')
ktg_smiles =pd.read_excel('/kaggle/input/smiles-extra-data/data_tg3.xlsx')
de_smiles =pd.read_excel('/kaggle/input/smiles-extra-data/data_dnst1.xlsx')

# Preprocessing 

In [7]:
def make_smile_canonical(smile): # To avoid duplicates, for example: canonical '*C=C(*)C' == '*C(=C*)C'
    try:
        mol = Chem.MolFromSmiles(smile)
        canon_smile = Chem.MolToSmiles(mol, canonical=True)
        return canon_smile
    except:
        return np.nan

train['SMILES'] = train['SMILES'].apply(lambda s: make_smile_canonical(s))
test['SMILES'] = test['SMILES'].apply(lambda s: make_smile_canonical(s))

In [8]:
# Drop invalid rows from train and test
train = train[train['SMILES'].notnull()].reset_index(drop=True)
test = test[test['SMILES'].notnull()].reset_index(drop=True)


In [9]:
ktg_smiles.rename(columns={'Tg [K]': 'Tg'}, inplace=True)
tg_smiles.rename(columns={'Tg (C)': 'Tg'}, inplace=True)
tc_smiles.rename(columns={'TC_mean': 'Tc'}, inplace=True)
de_smiles.rename(columns={'density(g/cm3)': 'Density'}, inplace=True)

In [10]:
de_smiles['SMILES'] = de_smiles['SMILES'].apply(lambda s: make_smile_canonical(s))
de_smiles = de_smiles[(de_smiles['SMILES'].notnull())&(de_smiles['Density'].notnull())&(de_smiles['Density'] != 'nylon')]
de_smiles['Density'] = de_smiles['Density'].astype('float64')
de_smiles['Density'] -= 0.118

ktg_smiles['Tg'] = ktg_smiles['Tg'] - 273.15

[04:53:00] SMILES Parse Error: syntax error while parsing: *O[Si](*)([R])[R]
[04:53:00] SMILES Parse Error: check for mistakes around position 12:
[04:53:00] *O[Si](*)([R])[R]
[04:53:00] ~~~~~~~~~~~^
[04:53:00] SMILES Parse Error: Failed parsing SMILES '*O[Si](*)([R])[R]' for input: '*O[Si](*)([R])[R]'
[04:53:00] SMILES Parse Error: syntax error while parsing: *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
[04:53:00] SMILES Parse Error: check for mistakes around position 28:
[04:53:00] c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=
[04:53:00] ~~~~~~~~~~~~~~~~~~~~^
[04:53:00] SMILES Parse Error: Failed parsing SMILES '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4' for input: '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4'
[04:53:00] SMILES Parse Error: syntax error while parsing: O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[04:53:00] SMILES Parse Error: check for mistakes around position 7:
[04:53:00] O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[04:53:00] ~~~~~~^
[04:53:00] SMILES Parse Error: F

In [11]:
def preprocessing(df):
    desc_names = [desc[0] for desc in Descriptors.descList if desc[0] not in useless_cols]
    descriptors = [compute_all_descriptors(smi) for smi in df['SMILES'].to_list()]

    graph_feats = {'graph_diameter': [], 'avg_shortest_path': [], 'num_cycles': []}
    for smile in df['SMILES']:
         compute_graph_features(smile, graph_feats)
        
    result = pd.concat(
        [
            pd.DataFrame(descriptors, columns=desc_names),
            pd.DataFrame(graph_feats)
        ],
        axis=1
    )

    result = result.replace([-np.inf, np.inf], np.nan)
    return result

# Feature Extraction 

In [12]:

def add_extra_data(df_train, df_extra, target):
    n_samples_before = len(df_train[df_train[target].notnull()])
    
    df_extra['SMILES'] = df_extra['SMILES'].apply(lambda s: make_smile_canonical(s))
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()
    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])
    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])

    # Make priority target value from competition's df
    for smile in df_train[df_train[target].notnull()]['SMILES'].tolist():
        if smile in cross_smiles:
            cross_smiles.remove(smile)

    # Imput missing values for competition's SMILES
    for smile in cross_smiles:
        df_train.loc[df_train['SMILES']==smile, target] = df_extra[df_extra['SMILES']==smile][target].values[0]
    
    df_train = pd.concat([df_train, df_extra[df_extra['SMILES'].isin(unique_smiles_extra)]], axis=0).reset_index(drop=True)

    n_samples_after = len(df_train[df_train[target].notnull()])
    print(f'\nFor target "{target}" added {n_samples_after-n_samples_before} new samples!')
    print(f'New unique SMILES: {len(unique_smiles_extra)}')
    return df_train

train = add_extra_data(train, tc_smiles, 'Tc')
train = add_extra_data(train, tg_smiles, 'Tg')
train = add_extra_data(train, ktg_smiles, 'Tg')
train = add_extra_data(train, de_smiles, 'Density')


For target "Tc" added 129 new samples!
New unique SMILES: 129

For target "Tg" added 151 new samples!
New unique SMILES: 136

For target "Tg" added 499 new samples!
New unique SMILES: 499

For target "Density" added 634 new samples!
New unique SMILES: 524


In [13]:
def compute_all_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(desc_names)
    return [desc[1](mol) for desc in Descriptors.descList if desc[0] not in useless_cols]

def compute_graph_features(smiles, graph_feats):
    mol = Chem.MolFromSmiles(smiles)
    adj = rdmolops.GetAdjacencyMatrix(mol)
    G = nx.from_numpy_array(adj)

    graph_feats['graph_diameter'].append(nx.diameter(G) if nx.is_connected(G) else 0)
    graph_feats['avg_shortest_path'].append(nx.average_shortest_path_length(G) if nx.is_connected(G) else 0)
    graph_feats['num_cycles'].append(len(list(nx.cycle_basis(G))))

train = pd.concat([train, preprocessing(train)], axis=1)
test = pd.concat([test, preprocessing(test)], axis=1)

# Find constant columns for each target
all_features = train.columns[7:].tolist()
features = {}
for target in CFG.TARGETS:
    const_descs = []
    for col in train.columns.drop(CFG.TARGETS):
        if train[train[target].notnull()][col].nunique() == 1:
            const_descs.append(col)
    features[target] = [f for f in all_features if f not in const_descs]

print(train.shape)
train['Ipc']=np.log10(train['Ipc'])  
for n in train.columns[7:]:
    train[n]=train[n].replace(-np.inf,np.nan)
    train[n]=train[n].replace(np.inf,np.nan)    
    train[n].fillna(train[n].mean())
  
print(train.shape)
test['Ipc']=np.log10(test['Ipc'])
for n in test.columns[7:]:
    train[n]=train[n].replace(-np.inf,np.nan)
    train[n]=train[n].replace(np.inf,np.nan)      
    test[n].fillna(train[n].mean())

(9261, 158)
(9261, 158)


# Data Preparation For Model Training 

In [14]:
# We'll separate train to be one model for each target variable.
t_1=train[['SMILES','Tg']].copy()
t_2=train[['SMILES','FFV']].copy()
t_3=train[['SMILES','Tc']].copy()
t_4=train[['SMILES','Density']].copy()
t_5=train[['SMILES','Rg']].copy()

# We will drop the rows with missing values related to that target after separation.
#This is important , dropping them beforehand would result Null for all data.
t_1.dropna(inplace=True)
t_2.dropna(inplace=True)
t_3.dropna(inplace=True)
t_4.dropna(inplace=True)
t_5.dropna(inplace=True)

In [15]:
train=train.drop(['id','Tg','FFV','Tc','Density','Rg'],axis=1)
test=test.drop(['id','SMILES'],axis=1)

In [16]:
tg=t_1.merge(train,on='SMILES',how='left')
ffv=t_2.merge(train,on='SMILES',how='left')
tc=t_3.merge(train,on='SMILES',how='left')
density=t_4.merge(train,on='SMILES',how='left')
rg=t_5.merge(train,on='SMILES',how='left')

In [17]:
for i in (tg,tc,density,ffv,rg):
    i.drop('SMILES',axis=1,inplace=True)
    i.dropna(inplace=True)

# Model 

In [18]:


def train_and_evaluate(train_d, test_d, model_class, target, submission=False, model_params=None, cv=False):
    X = train_d.drop(target, axis=1)
    y = train_d[target].copy()
    
    if model_params is None:
        model_params = {}

    model = model_class(**model_params)

    if not submission:
        if cv:
            scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=5)
            return -scores.mean()
        else:
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            return mean_absolute_error(y_val, y_pred)
    else:
        model.fit(X, y)
        return model.predict(test_d)


# Model Evaluation

In [19]:
print("Tg MAE:", train_and_evaluate(tg, test, HistGradientBoostingRegressor, 'Tg'))
print("FFV MAE:", train_and_evaluate(ffv, test, ExtraTreesRegressor, 'FFV'))
print("Tc MAE:", train_and_evaluate(tc, test, XGBRegressor, 'Tc'))
print("Density MAE:", train_and_evaluate(density, test, ExtraTreesRegressor, 'Density'))
print("Rg MAE:", train_and_evaluate(rg, test, ExtraTreesRegressor, 'Rg'))


Tg MAE: 35.40659970616041
FFV MAE: 0.006684088847030577
Tc MAE: 0.03570747889537072
Density MAE: 0.03939028978212005
Rg MAE: 1.7970949344762608


# Final Model For Submission

In [20]:
sub = {
    'id': ID,

    'Tg':   0.3 * train_and_evaluate(tg, test, ExtraTreesRegressor, 'Tg', submission=True) +
            0.3 * train_and_evaluate(tg, test, XGBRegressor, 'Tg', submission=True) + 
            0.4 * train_and_evaluate(tg, test, CatBoostRegressor, 'Tg', submission=True),

    'FFV': 0.3 * train_and_evaluate(ffv, test, ExtraTreesRegressor, 'FFV', submission=True) +
           0.3 * train_and_evaluate(ffv, test, XGBRegressor, 'FFV', submission=True)+
           0.4 * train_and_evaluate(ffv, test, CatBoostRegressor, 'FFV', submission=True),

    'Tc': 0.3 * train_and_evaluate(tc, test, ExtraTreesRegressor, 'Tc', submission=True) +
          0.3 * train_and_evaluate(tc, test, XGBRegressor, 'Tc', submission=True) + 
          0.4 * train_and_evaluate(tc, test, CatBoostRegressor, 'Tc', submission=True),

    'Density': train_and_evaluate(density, test, ExtraTreesRegressor, 'Density', submission=True),

    'Rg': train_and_evaluate(rg, test, ExtraTreesRegressor, 'Rg', submission=True),
}



Learning rate set to 0.041914
0:	learn: 119.9584337	total: 61.6ms	remaining: 1m 1s
1:	learn: 117.2674750	total: 67.3ms	remaining: 33.6s
2:	learn: 114.4613405	total: 73ms	remaining: 24.3s
3:	learn: 111.8487907	total: 78.6ms	remaining: 19.6s
4:	learn: 109.4261930	total: 84ms	remaining: 16.7s
5:	learn: 106.8803364	total: 90ms	remaining: 14.9s
6:	learn: 104.5482567	total: 95.6ms	remaining: 13.6s
7:	learn: 102.3481366	total: 102ms	remaining: 12.6s
8:	learn: 100.1395327	total: 107ms	remaining: 11.8s
9:	learn: 98.0239020	total: 113ms	remaining: 11.2s
10:	learn: 96.2951582	total: 119ms	remaining: 10.7s
11:	learn: 94.3171449	total: 124ms	remaining: 10.2s
12:	learn: 92.6619335	total: 130ms	remaining: 9.87s
13:	learn: 90.9837088	total: 135ms	remaining: 9.54s
14:	learn: 89.0933494	total: 141ms	remaining: 9.27s
15:	learn: 87.6248926	total: 146ms	remaining: 9.01s
16:	learn: 86.1622678	total: 153ms	remaining: 8.82s
17:	learn: 84.6350953	total: 159ms	remaining: 8.65s
18:	learn: 83.2105142	total: 164ms

In [21]:
submission=pd.DataFrame(sub)

In [22]:
submission

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,168.235597,0.375068,0.190117,1.1335,21.411298
1,1422188626,190.200663,0.379833,0.233836,1.111195,19.920059
2,2032016830,117.766927,0.352973,0.262746,1.084946,20.361165


In [23]:
submission.to_csv('submission.csv',index=False)

Acknowledgement: 
- [Baseline_For_Beginners](https://www.kaggle.com/code/adamlogman/baseline-for-beginners)
- [Dmitry Uarov](https://www.kaggle.com/dmitryuarov)
- [NeurIPS | Baseline + External data](https://www.kaggle.com/code/dmitryuarov/neurips-baseline-external-data)
- [neurips-rdkit-multi-models](https://www.kaggle.com/code/adamlogman/neurips-rdkit-multi-models)


Result:

Score: 0.036

Rank: 51 (2025-07130-14:03, JST)

Your Best Entry!
Your submission scored 0.036, which is not an improvement of your previous score. Keep trying!