Original Notobook:

https://www.kaggle.com/code/adamlogman/neurips-rdkit-multi-models

This notebook is based on my previous notebook [Baseline_For_Beginners](https://www.kaggle.com/code/adamlogman/baseline-for-beginners),and [Dmitry Uarov](https://www.kaggle.com/dmitryuarov) 's [NeurIPS | Baseline + External data](https://www.kaggle.com/code/dmitryuarov/neurips-baseline-external-data) notebook with modifications to the model

# Import Dependencies 

In [1]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [2]:
 # Importing Required Libraries\nLet's begin by importing the essential Python libraries needed for data processing, visualization, and modeling.

import pandas as pd
import numpy as np


from sklearn.ensemble import HistGradientBoostingRegressor,ExtraTreesRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


import networkx as nx
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops
from rdkit import Chem

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [3]:
class CFG:
    TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
    SEED = 42
    FOLDS = 5

In [4]:
useless_cols = [   
    
    'MaxPartialCharge', 
    # Nan data
    'BCUT2D_MWHI',
    'BCUT2D_MWLOW',
    'BCUT2D_CHGHI',
    'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI',
    'BCUT2D_LOGPLOW',
    'BCUT2D_MRHI',
    'BCUT2D_MRLOW',

    # Constant data
    'NumRadicalElectrons',
    'SMR_VSA8',
    'SlogP_VSA9',
    'fr_barbitur',
    'fr_benzodiazepine',
    'fr_dihydropyridine',
    'fr_epoxide',
    'fr_isothiocyan',
    'fr_lactam',
    'fr_nitroso',
    'fr_prisulfonamd',
    'fr_thiocyan',

    # High correlated data >0.95
    'MaxEStateIndex',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumValenceElectrons',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Kappa1',
    'LabuteASA',
    'HeavyAtomCount',
    'MolMR',
    'Chi3n',
    'BertzCT',
    'Chi2v',
    'Chi4n',
    'HallKierAlpha',
    'Chi3v',
    'Chi4v',
    'MinAbsPartialCharge',
    'MinPartialCharge',
    'MaxAbsPartialCharge',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'Phi',
    'Kappa3',
    'fr_nitrile',
    'SlogP_VSA6',
    'NumAromaticCarbocycles',
    'NumAromaticRings',
    'fr_benzene',
    'VSA_EState6',
    'NOCount',
    'fr_C_O',
    'fr_C_O_noCOO',
    'NumHDonors',
    'fr_amide',
    'fr_Nhpyrrole',
    'fr_phenol',
    'fr_phenol_noOrthoHbond',
    'fr_COO2',
    'fr_halogen',
    'fr_diazo',
    'fr_nitro_arom',
    'fr_phos_ester'
]


# Read Files

### Main Files

In [5]:
 #We will load both the training and test datasets using pandas, and store test IDs 
train=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
ss=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')
ID=test['id'].copy()

### Extra Files

In [6]:
tc_smiles =pd.read_csv('/kaggle/input/tc-smiles/Tc_SMILES.csv')
tg_smiles =pd.read_csv('/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv')
ktg_smiles =pd.read_excel('/kaggle/input/smiles-extra-data/data_tg3.xlsx')
de_smiles =pd.read_excel('/kaggle/input/smiles-extra-data/data_dnst1.xlsx')

# Preprocessing 

In [7]:
def make_smile_canonical(smile): # To avoid duplicates, for example: canonical '*C=C(*)C' == '*C(=C*)C'
    try:
        mol = Chem.MolFromSmiles(smile)
        canon_smile = Chem.MolToSmiles(mol, canonical=True)
        return canon_smile
    except:
        return np.nan

train['SMILES'] = train['SMILES'].apply(lambda s: make_smile_canonical(s))
test['SMILES'] = test['SMILES'].apply(lambda s: make_smile_canonical(s))

In [8]:
ktg_smiles.rename(columns={'Tg [K]': 'Tg'}, inplace=True)
tg_smiles.rename(columns={'Tg (C)': 'Tg'}, inplace=True)
tc_smiles.rename(columns={'TC_mean': 'Tc'}, inplace=True)
de_smiles.rename(columns={'density(g/cm3)': 'Density'}, inplace=True)

In [9]:
de_smiles['SMILES'] = de_smiles['SMILES'].apply(lambda s: make_smile_canonical(s))
de_smiles = de_smiles[(de_smiles['SMILES'].notnull())&(de_smiles['Density'].notnull())&(de_smiles['Density'] != 'nylon')]
de_smiles['Density'] = de_smiles['Density'].astype('float64')
de_smiles['Density'] -= 0.118

ktg_smiles['Tg'] = ktg_smiles['Tg'] - 273.15

[04:28:57] SMILES Parse Error: syntax error while parsing: *O[Si](*)([R])[R]
[04:28:57] SMILES Parse Error: check for mistakes around position 12:
[04:28:57] *O[Si](*)([R])[R]
[04:28:57] ~~~~~~~~~~~^
[04:28:57] SMILES Parse Error: Failed parsing SMILES '*O[Si](*)([R])[R]' for input: '*O[Si](*)([R])[R]'
[04:28:57] SMILES Parse Error: syntax error while parsing: *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
[04:28:57] SMILES Parse Error: check for mistakes around position 28:
[04:28:57] c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=
[04:28:57] ~~~~~~~~~~~~~~~~~~~~^
[04:28:57] SMILES Parse Error: Failed parsing SMILES '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4' for input: '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4'
[04:28:57] SMILES Parse Error: syntax error while parsing: O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[04:28:57] SMILES Parse Error: check for mistakes around position 7:
[04:28:57] O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[04:28:57] ~~~~~~^
[04:28:57] SMILES Parse Error: F

In [10]:
def preprocessing(df):
    desc_names = [desc[0] for desc in Descriptors.descList if desc[0] not in useless_cols]
    descriptors = [compute_all_descriptors(smi) for smi in df['SMILES'].to_list()]

    graph_feats = {'graph_diameter': [], 'avg_shortest_path': [], 'num_cycles': []}
    for smile in df['SMILES']:
         compute_graph_features(smile, graph_feats)
        
    result = pd.concat(
        [
            pd.DataFrame(descriptors, columns=desc_names),
            pd.DataFrame(graph_feats)
        ],
        axis=1
    )

    result = result.replace([-np.inf, np.inf], np.nan)
    return result

# Feature Extraction 

In [11]:

def add_extra_data(df_train, df_extra, target):
    n_samples_before = len(df_train[df_train[target].notnull()])
    
    df_extra['SMILES'] = df_extra['SMILES'].apply(lambda s: make_smile_canonical(s))
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()
    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])
    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])

    # Make priority target value from competition's df
    for smile in df_train[df_train[target].notnull()]['SMILES'].tolist():
        if smile in cross_smiles:
            cross_smiles.remove(smile)

    # Imput missing values for competition's SMILES
    for smile in cross_smiles:
        df_train.loc[df_train['SMILES']==smile, target] = df_extra[df_extra['SMILES']==smile][target].values[0]
    
    df_train = pd.concat([df_train, df_extra[df_extra['SMILES'].isin(unique_smiles_extra)]], axis=0).reset_index(drop=True)

    n_samples_after = len(df_train[df_train[target].notnull()])
    print(f'\nFor target "{target}" added {n_samples_after-n_samples_before} new samples!')
    print(f'New unique SMILES: {len(unique_smiles_extra)}')
    return df_train

train = add_extra_data(train, tc_smiles, 'Tc')
train = add_extra_data(train, tg_smiles, 'Tg')
train = add_extra_data(train, ktg_smiles, 'Tg')
train = add_extra_data(train, de_smiles, 'Density')


For target "Tc" added 129 new samples!
New unique SMILES: 129

For target "Tg" added 151 new samples!
New unique SMILES: 136

For target "Tg" added 499 new samples!
New unique SMILES: 499

For target "Density" added 634 new samples!
New unique SMILES: 524


In [12]:
def compute_all_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(desc_names)
    return [desc[1](mol) for desc in Descriptors.descList if desc[0] not in useless_cols]

def compute_graph_features(smiles, graph_feats):
    mol = Chem.MolFromSmiles(smiles)
    adj = rdmolops.GetAdjacencyMatrix(mol)
    G = nx.from_numpy_array(adj)

    graph_feats['graph_diameter'].append(nx.diameter(G) if nx.is_connected(G) else 0)
    graph_feats['avg_shortest_path'].append(nx.average_shortest_path_length(G) if nx.is_connected(G) else 0)
    graph_feats['num_cycles'].append(len(list(nx.cycle_basis(G))))

train = pd.concat([train, preprocessing(train)], axis=1)
test = pd.concat([test, preprocessing(test)], axis=1)

# Find constant columns for each target
all_features = train.columns[7:].tolist()
features = {}
for target in CFG.TARGETS:
    const_descs = []
    for col in train.columns.drop(CFG.TARGETS):
        if train[train[target].notnull()][col].nunique() == 1:
            const_descs.append(col)
    features[target] = [f for f in all_features if f not in const_descs]

print(train.shape)
train['Ipc']=np.log10(train['Ipc'])  
for n in train.columns[7:]:
    train[n]=train[n].replace(-np.inf,np.nan)
    train[n]=train[n].replace(np.inf,np.nan)    
    train[n].fillna(train[n].mean())
  
print(train.shape)
test['Ipc']=np.log10(test['Ipc'])
for n in test.columns[7:]:
    train[n]=train[n].replace(-np.inf,np.nan)
    train[n]=train[n].replace(np.inf,np.nan)      
    test[n].fillna(train[n].mean())

(9261, 158)
(9261, 158)


# Data Preparation For Model Training 

In [13]:
# We'll separate train to be one model for each target variable.
t_1=train[['SMILES','Tg']].copy()
t_2=train[['SMILES','FFV']].copy()
t_3=train[['SMILES','Tc']].copy()
t_4=train[['SMILES','Density']].copy()
t_5=train[['SMILES','Rg']].copy()

# We will drop the rows with missing values related to that target after separation.
#This is important , dropping them beforehand would result Null for all data.
t_1.dropna(inplace=True)
t_2.dropna(inplace=True)
t_3.dropna(inplace=True)
t_4.dropna(inplace=True)
t_5.dropna(inplace=True)

In [14]:
train=train.drop(['id','Tg','FFV','Tc','Density','Rg'],axis=1)
test=test.drop(['id','SMILES'],axis=1)

In [15]:
tg=t_1.merge(train,on='SMILES',how='left')
ffv=t_2.merge(train,on='SMILES',how='left')
tc=t_3.merge(train,on='SMILES',how='left')
density=t_4.merge(train,on='SMILES',how='left')
rg=t_5.merge(train,on='SMILES',how='left')

In [16]:
for i in (tg,tc,density,ffv,rg):
    i.drop('SMILES',axis=1,inplace=True)
    i.dropna(inplace=True)

# Model 

In [17]:
# Let’s define a reusable function to train and evaluate our machine learning model.

def model(train_d,test_d,model,target,submission=False):
    # We divide the data into training and validation sets for model evaluation
    X=train_d.drop(target,axis=1)
    y=train_d[target].copy()
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)

    Model=model()
    if submission==False:
       Model.fit(X_train,y_train)
       y_pred=Model.predict(X_test)
       return mean_absolute_error(y_pred,y_test)         # We assess our model performance using MAE metric
    if submission==True:
       Model.fit(X,y)
       submission=Model.predict(test_d)
       return submission
        

# Model Evaluation

In [18]:
model(tg,test,HistGradientBoostingRegressor,'Tg',submission=False)

35.05668069826185

In [19]:
model(ffv,test,ExtraTreesRegressor,'FFV',submission=False)

0.006700057849999993

In [20]:
model(tc,test,CatBoostRegressor,'Tc',submission=False)

Learning rate set to 0.038629
0:	learn: 0.1023207	total: 60.3ms	remaining: 1m
1:	learn: 0.1005159	total: 66.5ms	remaining: 33.2s
2:	learn: 0.0986318	total: 72.5ms	remaining: 24.1s
3:	learn: 0.0970325	total: 77.8ms	remaining: 19.4s
4:	learn: 0.0953473	total: 83.2ms	remaining: 16.5s
5:	learn: 0.0937419	total: 88.2ms	remaining: 14.6s
6:	learn: 0.0922845	total: 93.5ms	remaining: 13.3s
7:	learn: 0.0906400	total: 98.8ms	remaining: 12.3s
8:	learn: 0.0890473	total: 104ms	remaining: 11.5s
9:	learn: 0.0876884	total: 110ms	remaining: 10.9s
10:	learn: 0.0864398	total: 115ms	remaining: 10.3s
11:	learn: 0.0852053	total: 121ms	remaining: 9.93s
12:	learn: 0.0839479	total: 126ms	remaining: 9.57s
13:	learn: 0.0827617	total: 131ms	remaining: 9.24s
14:	learn: 0.0815484	total: 137ms	remaining: 8.97s
15:	learn: 0.0805342	total: 142ms	remaining: 8.72s
16:	learn: 0.0794173	total: 147ms	remaining: 8.51s
17:	learn: 0.0785678	total: 152ms	remaining: 8.31s
18:	learn: 0.0776779	total: 158ms	remaining: 8.13s
19:	le

0.0345094396198343

In [21]:
model(density,test,ExtraTreesRegressor,'Density',submission=False)

0.039362036337160046

In [22]:
model(rg,test,ExtraTreesRegressor,'Rg',submission=False)

1.7868833344265052

# Final Model For Submission

In [23]:
 # Finally, we use the model to predict on the test set and prepare the submission file.

sub={'id':ID,'Tg':model(tg,test,ExtraTreesRegressor,'Tg',submission=True),
     'FFV':model(ffv,test,ExtraTreesRegressor,'FFV',submission=True),
     'Tc':model(tc,test,CatBoostRegressor,'Tc',submission=True),
     'Density':model(density,test,ExtraTreesRegressor,'Density',submission=True),
     'Rg':model(rg,test,ExtraTreesRegressor,'Rg',submission=True)}

Learning rate set to 0.040023
0:	learn: 0.0993555	total: 9.97ms	remaining: 9.96s
1:	learn: 0.0975259	total: 16.5ms	remaining: 8.25s
2:	learn: 0.0958017	total: 22.2ms	remaining: 7.37s
3:	learn: 0.0939474	total: 27.6ms	remaining: 6.88s
4:	learn: 0.0922230	total: 33.2ms	remaining: 6.61s
5:	learn: 0.0906151	total: 38.8ms	remaining: 6.42s
6:	learn: 0.0891982	total: 44.3ms	remaining: 6.28s
7:	learn: 0.0875773	total: 49.8ms	remaining: 6.18s
8:	learn: 0.0861490	total: 55.4ms	remaining: 6.1s
9:	learn: 0.0847840	total: 61.2ms	remaining: 6.06s
10:	learn: 0.0834475	total: 66.6ms	remaining: 5.99s
11:	learn: 0.0823576	total: 72.2ms	remaining: 5.94s
12:	learn: 0.0810865	total: 77.7ms	remaining: 5.9s
13:	learn: 0.0799694	total: 83.2ms	remaining: 5.86s
14:	learn: 0.0787994	total: 88.9ms	remaining: 5.84s
15:	learn: 0.0778174	total: 94.5ms	remaining: 5.81s
16:	learn: 0.0767702	total: 99.9ms	remaining: 5.78s
17:	learn: 0.0758928	total: 106ms	remaining: 5.76s
18:	learn: 0.0749596	total: 111ms	remaining: 5.

In [24]:
submission=pd.DataFrame(sub)

In [25]:
submission

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,161.527389,0.373557,0.210824,1.127298,20.585337
1,1422188626,167.297298,0.380426,0.236521,1.118246,20.212037
2,2032016830,96.975036,0.350746,0.24914,1.079441,20.547315


In [26]:
submission.to_csv('submission.csv',index=False)

Result: 

Score: 0.33

Rank: 16 (2025-07-09-13:51, JST)

Your Best Entry!
Your most recent submission scored 0.033, which is an improvement of your previous score of 0.037. Great job!

Up to rank 16. Now accepting donations for more RAM. #kaggle - https://kaggle.com/competitions/neurips-open-polymer-prediction-2025 