Original Notebook:

https://www.kaggle.com/code/yutongzhang20080108/ensemble-of-extratree-with-different-seeds

# Import Dependencies 

In [1]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [2]:
# Importing Required Libraries
# Let's begin by importing the essential Python libraries needed for data processing, visualization, and modeling.

import pandas as pd
import numpy as np

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


import networkx as nx
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops
from rdkit import Chem

import warnings

# Notebook-wide display settings: silence every warning category and let
# pandas render all columns of a frame instead of truncating wide tables.
warnings.simplefilter("ignore")
pd.set_option('display.max_columns', None)

In [3]:
# Names of RDKit descriptors that are excluded from the feature set.
# Both preprocessing() and compute_all_descriptors() skip every entry of
# Descriptors.descList whose name appears in this list.
# The group labels below are the author's; the analysis that produced them
# is not part of this notebook.
useless_cols = [   
    
    # NOTE(review): no reason recorded for this entry — presumably belongs to the NaN group below; confirm
    'MaxPartialCharge', 
    # NaN data
    'BCUT2D_MWHI',
    'BCUT2D_MWLOW',
    'BCUT2D_CHGHI',
    'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI',
    'BCUT2D_LOGPLOW',
    'BCUT2D_MRHI',
    'BCUT2D_MRLOW',

    # Constant data
    'NumRadicalElectrons',
    'SMR_VSA8',
    'SlogP_VSA9',
    'fr_barbitur',
    'fr_benzodiazepine',
    'fr_dihydropyridine',
    'fr_epoxide',
    'fr_isothiocyan',
    'fr_lactam',
    'fr_nitroso',
    'fr_prisulfonamd',
    'fr_thiocyan',

    # Highly correlated data (> 0.95)
    'MaxEStateIndex',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumValenceElectrons',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Kappa1',
    'LabuteASA',
    'HeavyAtomCount',
    'MolMR',
    'Chi3n',
    'BertzCT',
    'Chi2v',
    'Chi4n',
    'HallKierAlpha',
    'Chi3v',
    'Chi4v',
    'MinAbsPartialCharge',
    'MinPartialCharge',
    'MaxAbsPartialCharge',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'Phi',
    'Kappa3',
    'fr_nitrile',
    'SlogP_VSA6',
    'NumAromaticCarbocycles',
    'NumAromaticRings',
    'fr_benzene',
    'VSA_EState6',
    'NOCount',
    'fr_C_O',
    'fr_C_O_noCOO',
    'NumHDonors',
    'fr_amide',
    'fr_Nhpyrrole',
    'fr_phenol',
    'fr_phenol_noOrthoHbond',
    'fr_COO2',
    'fr_halogen',
    'fr_diazo',
    'fr_nitro_arom',
    'fr_phos_ester'
]


# Read Files

In [4]:
# One training table per target. Each frame is later handed to the
# model_seed_* helpers together with the name of its target column
# ('Tg', 'Rg', 'Tc', 'FFV', 'Density'); every other column is used as a feature.
tg=pd.read_csv('/kaggle/input/neurips-dataset/tg.csv')
rg=pd.read_csv('/kaggle/input/neurips-dataset/rg.csv')
tc=pd.read_csv('/kaggle/input/neurips-dataset/tc.csv')
ffv=pd.read_csv('/kaggle/input/neurips-dataset/ffv.csv')
density=pd.read_csv('/kaggle/input/neurips-dataset/density.csv')
# Competition test set; its 'SMILES' column is canonicalised and featurised further down.
test=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
# Keep the ids aside: 'id' is dropped from `test` before prediction and is
# needed again when the submission table is assembled.
ID=test['id'].copy()

# Preprocessing 

In [5]:
def make_smile_canonical(smile): # To avoid duplicates, for example: canonical '*C=C(*)C' == '*C(=C*)C'
    """Return the RDKit canonical form of a SMILES string.

    Parameters
    ----------
    smile : str
        SMILES string to canonicalise.

    Returns
    -------
    str or float
        The canonical SMILES, or ``np.nan`` when the input is not a string,
        cannot be parsed, or RDKit fails while writing it back out.
    """
    # Missing values (None / NaN) can never be parsed; answer without calling RDKit.
    if not isinstance(smile, str):
        return np.nan
    try:
        mol = Chem.MolFromSmiles(smile)
        # MolFromSmiles reports an unparsable string by returning None, not by raising.
        if mol is None:
            return np.nan
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best effort: any RDKit failure maps to NaN. Catching Exception rather than
        # using a bare `except` lets KeyboardInterrupt / SystemExit propagate.
        return np.nan
# Replace every test SMILES by its canonical form (NaN where canonicalisation fails).
test['SMILES'] = test['SMILES'].apply(make_smile_canonical)

In [6]:
def preprocessing(df):
    """Build the feature table for the molecules in ``df['SMILES']``.

    The result has one row per input row (fresh 0..n-1 index) and holds
    every RDKit descriptor whose name is not in ``useless_cols``, followed by
    the three graph features 'graph_diameter', 'avg_shortest_path' and
    'num_cycles'. Infinite values are replaced by NaN.
    """
    smiles_list = df['SMILES'].to_list()

    # Column labels for the descriptor part, in Descriptors.descList order.
    kept_names = [entry[0] for entry in Descriptors.descList if entry[0] not in useless_cols]
    descriptor_rows = [compute_all_descriptors(smi) for smi in smiles_list]

    # compute_graph_features appends one value per molecule to each list.
    graph_feats = {'graph_diameter': [], 'avg_shortest_path': [], 'num_cycles': []}
    for smi in smiles_list:
        compute_graph_features(smi, graph_feats)

    descriptor_frame = pd.DataFrame(descriptor_rows, columns=kept_names)
    graph_frame = pd.DataFrame(graph_feats)
    combined = pd.concat([descriptor_frame, graph_frame], axis=1)

    return combined.replace([-np.inf, np.inf], np.nan)

# Feature Extraction 

In [7]:
def compute_all_descriptors(smiles):
    """Evaluate every retained RDKit descriptor for one molecule.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. Non-string values (such as NaN) are
        treated like an unparsable SMILES.

    Returns
    -------
    list
        One value per entry of ``Descriptors.descList`` whose name is not in
        ``useless_cols``, in ``descList`` order. When the molecule cannot be
        built the list has the same length but contains only ``None``.
    """
    # Same filter preprocessing() applies when it builds the column names,
    # so the length and order of the returned row match those columns.
    active = [desc for desc in Descriptors.descList if desc[0] not in useless_cols]

    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # The placeholder length is derived locally. The previous code used
        # `desc_names`, which is a local of preprocessing() and therefore
        # raised NameError here.
        return [None] * len(active)
    return [desc[1](mol) for desc in active]

def compute_graph_features(smiles, graph_feats):
    """Append graph-topology features of one molecule to ``graph_feats``.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    graph_feats : dict
        Must contain the list-valued keys 'graph_diameter',
        'avg_shortest_path' and 'num_cycles'. Exactly one value is appended
        to each list per call, so the lists stay aligned with the input rows.

    Returns
    -------
    None
        The function works by mutating ``graph_feats``.

    Notes
    -----
    * Molecule cannot be built (non-string or unparsable SMILES): NaN is
      appended to all three lists instead of raising.
    * Bond graph is disconnected or has no nodes: diameter and average
      shortest path are recorded as 0.
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Keep one entry per input row so the feature lists stay aligned.
        for key in ('graph_diameter', 'avg_shortest_path', 'num_cycles'):
            graph_feats[key].append(np.nan)
        return

    adj = rdmolops.GetAdjacencyMatrix(mol)
    G = nx.from_numpy_array(adj)

    # Evaluate connectivity once; is_connected() raises on a graph without
    # nodes, so that case is treated as "not connected".
    connected = G.number_of_nodes() > 0 and nx.is_connected(G)

    graph_feats['graph_diameter'].append(nx.diameter(G) if connected else 0)
    graph_feats['avg_shortest_path'].append(nx.average_shortest_path_length(G) if connected else 0)
    graph_feats['num_cycles'].append(len(nx.cycle_basis(G)))

# Turn the raw test frame into the model input matrix:
#   1. append the engineered features next to the original columns,
#   2. put the 'Ipc' descriptor on a log10 scale
#      (NOTE(review): presumably the training tables store Ipc the same way — confirm),
#   3. drop the identifier and SMILES columns, which are not features.
test = (
    pd.concat([test, preprocessing(test)], axis=1)
    .assign(Ipc=lambda frame: np.log10(frame['Ipc']))
    .drop(columns=['id', 'SMILES'])
)

# Model 

In [8]:
# Let’s define a reusable function to train and evaluate our machine learning model.

def _run_seeded_model(train_d, test_d, model, target, seed, submission):
    """Shared implementation behind the ``model_seed_*`` functions.

    Parameters
    ----------
    train_d : pandas.DataFrame
        Training table; column ``target`` is the label, all other columns
        are used as features.
    test_d : pandas.DataFrame
        Feature table to predict on (only used when ``submission`` is truthy).
    model : callable
        Estimator class/factory; called as ``model(random_state=seed)`` and
        expected to offer ``fit`` and ``predict``.
    target : str
        Name of the label column in ``train_d``.
    seed : int
        ``random_state`` handed to the estimator.
    submission : bool
        Falsy: fit on an 80 % split and return the MAE on the held-out 20 %.
        Truthy: fit on all rows and return the predictions for ``test_d``.
    """
    X = train_d.drop(target, axis=1)
    y = train_d[target].copy()
    estimator = model(random_state=seed)

    if submission:
        # Final fit uses every labelled row; no hold-out split is needed.
        estimator.fit(X, y)
        return estimator.predict(test_d)

    # The split seed is fixed (10) so every model seed is scored on the same hold-out rows.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    estimator.fit(X_train, y_train)
    # We assess our model performance using MAE metric
    return mean_absolute_error(y_val, estimator.predict(X_val))


def model_seed_1(train_d,test_d,model,target,submission=False):
    """Train/evaluate ``model`` with ``random_state=21``; see ``_run_seeded_model``."""
    return _run_seeded_model(train_d, test_d, model, target, 21, submission)


def model_seed_2(train_d,test_d,model,target,submission=False):
    """Train/evaluate ``model`` with ``random_state=42``; see ``_run_seeded_model``."""
    return _run_seeded_model(train_d, test_d, model, target, 42, submission)


def model_seed_3(train_d,test_d,model,target,submission=False):
    """Train/evaluate ``model`` with ``random_state=100``; see ``_run_seeded_model``."""
    return _run_seeded_model(train_d, test_d, model, target, 100, submission)


def model_seed_4(train_d,test_d,model,target,submission=False):
    """Train/evaluate ``model`` with ``random_state=456``; see ``_run_seeded_model``."""
    return _run_seeded_model(train_d, test_d, model, target, 456, submission)


def model_seed_5(train_d,test_d,model,target,submission=False):
    """Train/evaluate ``model`` with ``random_state=666``; see ``_run_seeded_model``."""
    return _run_seeded_model(train_d, test_d, model, target, 666, submission)

# Model Evaluation

# Final Model For Submission

In [9]:
# Average the test-set predictions of the five seeded models, one target at a time.
def _seed_average(train_d, target):
    """Mean of the five ``model_seed_*`` submission predictions for ``target``.

    Each seeded function is fitted on ``train_d`` with ExtraTreesRegressor and
    predicts on the global ``test`` feature frame; the five prediction arrays
    are added in order (seed 1 to seed 5) and divided by 5.
    """
    seeded_models = (model_seed_1, model_seed_2, model_seed_3, model_seed_4, model_seed_5)
    predictions = [
        run(train_d, test, ExtraTreesRegressor, target, submission=True)
        for run in seeded_models
    ]
    total = predictions[0]
    for extra in predictions[1:]:
        total = total + extra
    return total / 5


tg_result = _seed_average(tg, 'Tg')

ffv_result = _seed_average(ffv, 'FFV')

tc_result = _seed_average(tc, 'Tc')

density_result = _seed_average(density, 'Density')

rg_result = _seed_average(rg, 'Rg')


In [10]:
 # Finally, we use the model to predict on the test set and prepare the submission file.

# Column layout of the submission: the saved ids followed by the
# seed-averaged prediction for each of the five targets.
sub = {
    'id': ID,
    'Tg': tg_result,
    'FFV': ffv_result,
    'Tc': tc_result,
    'Density': density_result,
    'Rg': rg_result,
}

In [11]:
# Assemble the submission table from the id / prediction mapping built above.
submission=pd.DataFrame(sub)

In [12]:
# Display the assembled table as the cell output for a quick visual check.
submission

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,159.367858,0.37283,0.183653,1.145713,20.42981
1,1422188626,163.984013,0.374475,0.235749,1.112386,19.758896
2,2032016830,95.520068,0.350464,0.26878,1.085902,20.639774


In [13]:
# Write the predictions to 'submission.csv' in the working directory, without the row index.
submission.to_csv('submission.csv',index=False)

Result:

Score: 

Rank: 

Runtime: 5min (Kaggle editor)

Your Best Entry!
Your most recent submission scored 0.032, which is an improvement of your previous score of 0.033. Great job!

Moving up to rank 73. rising like my electricity bill. #kaggle - https://kaggle.com/competitions/neurips-open-polymer-prediction-2025 