This notebook is based on my previous notebook [Baseline_For_Beginners](https://www.kaggle.com/code/adamlogman/baseline-for-beginners),and [Dmitry Uarov](https://www.kaggle.com/dmitryuarov) 's [NeurIPS | Baseline + External data](https://www.kaggle.com/code/dmitryuarov/neurips-baseline-external-data) notebook with modifications to the model

# Import Dependencies 

In [1]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [2]:
 # Importing Required Libraries\nLet's begin by importing the essential Python libraries needed for data processing, visualization, and modeling.

import pandas as pd
import numpy as np


from sklearn.ensemble import HistGradientBoostingRegressor,ExtraTreesRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error, make_scorer


import networkx as nx
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops
from rdkit import Chem

import os, math, time
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler


import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [3]:
class CFG:
    TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
    SEED = 42
    FOLDS = 10

In [4]:
useless_cols = [   
    
    'MaxPartialCharge', 
    # Nan data
    'BCUT2D_MWHI',
    'BCUT2D_MWLOW',
    'BCUT2D_CHGHI',
    'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI',
    'BCUT2D_LOGPLOW',
    'BCUT2D_MRHI',
    'BCUT2D_MRLOW',

    # Constant data
    'NumRadicalElectrons',
    'SMR_VSA8',
    'SlogP_VSA9',
    'fr_barbitur',
    'fr_benzodiazepine',
    'fr_dihydropyridine',
    'fr_epoxide',
    'fr_isothiocyan',
    'fr_lactam',
    'fr_nitroso',
    'fr_prisulfonamd',
    'fr_thiocyan',

    # High correlated data >0.95
    'MaxEStateIndex',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumValenceElectrons',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Kappa1',
    'LabuteASA',
    'HeavyAtomCount',
    'MolMR',
    'Chi3n',
    'BertzCT',
    'Chi2v',
    'Chi4n',
    'HallKierAlpha',
    'Chi3v',
    'Chi4v',
    'MinAbsPartialCharge',
    'MinPartialCharge',
    'MaxAbsPartialCharge',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'Phi',
    'Kappa3',
    'fr_nitrile',
    'SlogP_VSA6',
    'NumAromaticCarbocycles',
    'NumAromaticRings',
    'fr_benzene',
    'VSA_EState6',
    'NOCount',
    'fr_C_O',
    'fr_C_O_noCOO',
    'NumHDonors',
    'fr_amide',
    'fr_Nhpyrrole',
    'fr_phenol',
    'fr_phenol_noOrthoHbond',
    'fr_COO2',
    'fr_halogen',
    'fr_diazo',
    'fr_nitro_arom',
    'fr_phos_ester'
]

SEED = 42

# Read Files

### Main Files

In [5]:
 #We will load both the training and test datasets using pandas, and store test IDs 
train=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
ss=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv')
ID=test['id'].copy()

In [6]:
train.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,


In [7]:
train.shape, test.shape

((7973, 7), (3, 2))

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7973 entries, 0 to 7972
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       7973 non-null   int64  
 1   SMILES   7973 non-null   object 
 2   Tg       511 non-null    float64
 3   FFV      7030 non-null   float64
 4   Tc       737 non-null    float64
 5   Density  613 non-null    float64
 6   Rg       614 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 436.2+ KB


### Extra Files

In [9]:
tc_smiles =pd.read_csv('/kaggle/input/tc-smiles/Tc_SMILES.csv')
tg_smiles =pd.read_csv('/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv')
ktg_smiles =pd.read_excel('/kaggle/input/smiles-extra-data/data_tg3.xlsx')
de_smiles =pd.read_excel('/kaggle/input/smiles-extra-data/data_dnst1.xlsx')
ds1_smiles=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv')
ds2_smiles=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset2.csv')
ds3_smiles=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv')
ds4_smiles=pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv')

# Preprocessing 

In [10]:
def make_smile_canonical(smile): # To avoid duplicates, for example: canonical '*C=C(*)C' == '*C(=C*)C'
    try:
        mol = Chem.MolFromSmiles(smile)
        canon_smile = Chem.MolToSmiles(mol, canonical=True)
        return canon_smile
    except:
        return np.nan

train['SMILES'] = train['SMILES'].apply(lambda s: make_smile_canonical(s))
test['SMILES'] = test['SMILES'].apply(lambda s: make_smile_canonical(s))

In [11]:
ktg_smiles.rename(columns={'Tg [K]': 'Tg'}, inplace=True)
tg_smiles.rename(columns={'Tg (C)': 'Tg'}, inplace=True)
tc_smiles.rename(columns={'TC_mean': 'Tc'}, inplace=True)
de_smiles.rename(columns={'density(g/cm3)': 'Density'}, inplace=True)
ds1_smiles.rename(columns={'TC_mean': 'Tc'}, inplace=True)

In [12]:
de_smiles['SMILES'] = de_smiles['SMILES'].apply(lambda s: make_smile_canonical(s))
de_smiles = de_smiles[(de_smiles['SMILES'].notnull())&(de_smiles['Density'].notnull())&(de_smiles['Density'] != 'nylon')]
de_smiles['Density'] = de_smiles['Density'].astype('float64')
de_smiles['Density'] -= 0.118

ktg_smiles['Tg'] = ktg_smiles['Tg'] - 273.15

[15:50:33] SMILES Parse Error: syntax error while parsing: *O[Si](*)([R])[R]
[15:50:33] SMILES Parse Error: check for mistakes around position 12:
[15:50:33] *O[Si](*)([R])[R]
[15:50:33] ~~~~~~~~~~~^
[15:50:33] SMILES Parse Error: Failed parsing SMILES '*O[Si](*)([R])[R]' for input: '*O[Si](*)([R])[R]'
[15:50:33] SMILES Parse Error: syntax error while parsing: *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
[15:50:33] SMILES Parse Error: check for mistakes around position 28:
[15:50:33] c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=
[15:50:33] ~~~~~~~~~~~~~~~~~~~~^
[15:50:33] SMILES Parse Error: Failed parsing SMILES '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4' for input: '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4'
[15:50:33] SMILES Parse Error: syntax error while parsing: O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[15:50:33] SMILES Parse Error: check for mistakes around position 7:
[15:50:33] O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[15:50:33] ~~~~~~^
[15:50:33] SMILES Parse Error: F

In [13]:
def preprocessing(df):
    desc_names = [desc[0] for desc in Descriptors.descList if desc[0] not in useless_cols]
    descriptors = [compute_all_descriptors(smi) for smi in df['SMILES'].to_list()]

    graph_feats = {'graph_diameter': [], 'avg_shortest_path': [], 'num_cycles': []}
    for smile in df['SMILES']:
         compute_graph_features(smile, graph_feats)
        
    result = pd.concat(
        [
            pd.DataFrame(descriptors, columns=desc_names),
            pd.DataFrame(graph_feats)
        ],
        axis=1
    )

    result = result.replace([-np.inf, np.inf], np.nan)
    return result

# Feature Extraction 

In [14]:
# SMILESを正規化（canonical化）
# 重複する SMILES を集約して平均 target 値に
# df_train に存在するが欠損している target に、df_extra から値を補完

def add_extra_data(df_train, df_extra, target):
    n_samples_before = len(df_train[df_train[target].notnull()])
    
    df_extra['SMILES'] = df_extra['SMILES'].apply(lambda s: make_smile_canonical(s))
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()
    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])
    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])

    # Make priority target value from competition's df
    for smile in df_train[df_train[target].notnull()]['SMILES'].tolist():
        if smile in cross_smiles:
            cross_smiles.remove(smile)

    # Imput missing values for competition's SMILES
    for smile in cross_smiles:
        df_train.loc[df_train['SMILES']==smile, target] = df_extra[df_extra['SMILES']==smile][target].values[0]
    
    df_train = pd.concat([df_train, df_extra[df_extra['SMILES'].isin(unique_smiles_extra)]], axis=0).reset_index(drop=True)

    n_samples_after = len(df_train[df_train[target].notnull()])
    print(f'\nFor target "{target}" added {n_samples_after-n_samples_before} new samples!')
    print(f'New unique SMILES: {len(unique_smiles_extra)}')
    return df_train

train = add_extra_data(train, tc_smiles, 'Tc')
train = add_extra_data(train, tg_smiles, 'Tg')
train = add_extra_data(train, ktg_smiles, 'Tg')
train = add_extra_data(train, de_smiles, 'Density')

train = add_extra_data(train, ds1_smiles, 'Tc')
#train = add_extra_data(train, ds2_smiles, 'Density')
train = add_extra_data(train, ds3_smiles, 'Tg')
train = add_extra_data(train, ds4_smiles, 'FFV')


For target "Tc" added 129 new samples!
New unique SMILES: 129

For target "Tg" added 151 new samples!
New unique SMILES: 136

For target "Tg" added 499 new samples!
New unique SMILES: 499

For target "Density" added 634 new samples!
New unique SMILES: 524

For target "Tc" added 0 new samples!
New unique SMILES: 0

For target "Tg" added 0 new samples!
New unique SMILES: 0

For target "FFV" added 862 new samples!
New unique SMILES: 819


In [15]:
def compute_all_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(desc_names)
    return [desc[1](mol) for desc in Descriptors.descList if desc[0] not in useless_cols]

def compute_graph_features(smiles, graph_feats):
    mol = Chem.MolFromSmiles(smiles)
    adj = rdmolops.GetAdjacencyMatrix(mol)
    G = nx.from_numpy_array(adj)

    graph_feats['graph_diameter'].append(nx.diameter(G) if nx.is_connected(G) else 0)
    graph_feats['avg_shortest_path'].append(nx.average_shortest_path_length(G) if nx.is_connected(G) else 0)
    graph_feats['num_cycles'].append(len(list(nx.cycle_basis(G))))

train = pd.concat([train, preprocessing(train)], axis=1)
test = pd.concat([test, preprocessing(test)], axis=1)

# Find constant columns for each target
all_features = train.columns[7:].tolist()
features = {}
for target in CFG.TARGETS:
    const_descs = []
    for col in train.columns.drop(CFG.TARGETS):
        if train[train[target].notnull()][col].nunique() == 1:
            const_descs.append(col)
    features[target] = [f for f in all_features if f not in const_descs]

print(train.shape)
train['Ipc']=np.log10(train['Ipc'])  
for n in train.columns[7:]:
    train[n]=train[n].replace(-np.inf,np.nan)
    train[n]=train[n].replace(np.inf,np.nan)    
    train[n].fillna(train[n].mean())
  
print(train.shape)
test['Ipc']=np.log10(test['Ipc'])
for n in test.columns[7:]:
    train[n]=train[n].replace(-np.inf,np.nan)
    train[n]=train[n].replace(np.inf,np.nan)      
    test[n].fillna(train[n].mean())

(10080, 158)
(10080, 158)


In [16]:
train

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,FpDensityMorgan1,AvgIpc,BalabanJ,Ipc,Kappa2,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA7,SlogP_VSA8,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,NHOHCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAmideBonds,NumAromaticHeterocycles,NumAtomStereoCenters,NumBridgeheadAtoms,NumHAcceptors,NumHeteroatoms,NumHeterocycles,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,NumSpiroAtoms,NumUnspecifiedAtomStereoCenters,RingCount,MolLogP,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_N,fr_Ar_NH,fr_Ar_OH,fr_COO,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_bicyclic,fr_ester,fr_ether,fr_furan,fr_guanido,fr_hdrzine,fr_hdrzone,fr_imidazole,fr_imide,fr_isocyan,fr_ketone,fr_ketone_Topliss,fr_lactone,fr_methoxy,fr_morpholine,fr_nitro,fr_nitro_arom_nonortho,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phos_acid,fr_piperdine,fr_piperzine,fr_priamide,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea,graph_diameter,avg_shortest_path,num_cycles
0,87817.0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,0.932,,12.144536,0.105927,-0.105927,0.500278,13.705882,232.323,1.411765,2.456411,2.563477,4.261290,5.852071,0.000000,0.000000,0.0,0.000000,0.000000,103.451541,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,9.531400,5.969305,0.0,0.000000,0.000000,45.951583,6.606882,35.392371,0.000000,0.000000,0.000000,0.000000,0.000000,12.576187,4.736863,0.000000,54.949285,6.923737,0.000000,26.30,0.000000,4.794537,0.0,5.969305,6.606882,24.825916,25.328832,0.000000,12.132734,19.056471,4.736863,6.120445,0.000000,12.144536,0.000000,2.105648,-0.105927,5.344954,4.148954,0.000000,0.533333,0,0,0,0,0,0,1,0,2,4,0,8,0,0,0,0,1,1,3.98170,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,12,4.736842,1
1,106919.0,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.370410,,,,3.523412,0.098918,0.098918,0.125364,16.777778,598.919,0.577778,3.135512,1.451540,10.636376,14.715582,0.000000,0.000000,0.0,0.000000,0.000000,222.907896,0.000000,0.000000,0.0,0.0,32.607024,18.759549,0.000000,0.000000,0.000000,11.374773,0.0,0.000000,5.917906,115.071874,11.467335,130.442582,0.000000,11.467335,11.374773,0.000000,0.000000,0.000000,5.414990,5.917906,143.037592,0.000000,0.000000,24.06,0.000000,0.000000,0.0,5.414990,11.835812,5.917906,121.805341,0.000000,0.000000,129.300420,0.000000,0.326442,0.000000,0.000000,0.000000,18.579143,1.794202,15.456662,6.971589,0.000000,0.441860,2,1,0,1,0,0,2,0,2,4,0,16,1,0,1,0,0,5,12.35960,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,22,8.364477,5
2,388772.0,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.378860,,,,13.714745,0.107441,-3.829434,0.092387,16.301370,1003.207,0.397260,3.440031,0.743739,16.833145,19.899082,9.473726,22.998047,0.0,9.837253,0.000000,222.531582,0.000000,8.417797,0.0,0.0,43.527933,96.764951,5.414990,9.790967,40.947247,43.676159,0.0,0.000000,0.000000,76.363497,0.000000,227.523761,40.246583,25.383483,0.000000,40.246583,0.000000,22.618839,29.884034,0.000000,73.620379,0.000000,12.152040,122.27,19.674506,21.630131,0.0,30.780169,40.246583,67.218863,23.614092,97.061873,60.682977,48.530937,19.317116,72.241292,0.000000,14.038931,5.415568,5.844813,4.224702,11.839224,0.000000,-7.512616,0.145161,0,2,0,2,0,0,0,0,9,13,0,15,2,0,2,0,0,10,14.21700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,45,15.419820,10
3,519416.0,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,,3.978671,0.054569,-0.202102,0.209590,11.523810,542.726,0.333333,3.051372,1.888931,10.012433,10.392685,0.000000,0.000000,0.0,0.000000,0.000000,249.752719,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,11.374773,0.0,0.000000,0.000000,27.694949,11.467335,143.581147,55.634515,11.467335,11.374773,0.000000,0.000000,0.000000,0.000000,27.694949,22.253806,0.000000,55.634515,24.06,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,89.263093,0.000000,0.000000,160.489625,0.000000,-0.147532,0.000000,0.000000,0.000000,27.622010,0.000000,0.000000,8.669733,0.000000,0.100000,2,0,0,0,0,0,0,0,2,4,0,7,0,0,0,0,0,6,11.00768,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,6.004228,6
4,539187.0,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.355470,,,,13.703218,0.068062,-0.686332,0.014164,15.885714,965.154,0.557143,4.071253,0.862097,15.917240,27.216224,19.273545,0.000000,0.0,0.000000,11.374773,208.369254,20.228637,0.000000,0.0,0.0,0.000000,56.369576,61.943518,35.144068,48.226539,35.005011,0.0,0.000000,0.000000,127.658471,62.530624,116.284678,22.998047,29.116936,22.749545,22.998047,0.000000,86.916574,9.473726,20.228637,136.290767,0.000000,0.000000,182.28,5.969305,29.817711,0.0,39.112847,55.281356,159.002350,0.000000,60.663671,24.265468,9.799819,28.790842,29.433681,0.000000,52.672699,27.589702,2.927775,1.302622,18.810591,5.473191,0.000000,0.518519,0,0,2,2,0,0,2,0,14,18,2,34,0,2,2,0,2,6,11.84500,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,5,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,43,15.644757,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10075,,*c1cccc(OCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5...,,0.349095,,,,13.535196,0.183519,-0.654316,0.180649,14.571429,558.590,0.595238,3.414785,1.069058,10.039516,10.302518,0.000000,0.000000,0.0,0.000000,0.000000,236.890600,0.000000,0.000000,0.0,0.0,0.000000,6.066367,0.000000,0.000000,28.651875,35.382472,0.0,5.316789,0.000000,25.683286,18.113674,107.182945,22.625927,20.440003,5.687386,11.499024,0.000000,42.159271,0.000000,0.000000,67.115241,0.000000,11.126903,93.22,11.814359,19.178149,0.0,17.377811,52.467919,36.332708,11.383156,54.597304,12.132734,18.199101,9.473726,11.334820,0.000000,53.433918,1.330932,2.943064,0.167546,3.930859,1.239711,0.000000,0.176471,0,0,2,2,4,0,0,0,6,10,2,11,0,0,0,0,0,6,5.40730,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,24,9.179704,6
10076,,*c1cccc(OCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C...,,0.350892,,,,13.523115,0.170148,-0.654527,0.205731,14.634146,544.563,0.609756,3.397762,1.101853,9.828335,9.745139,0.000000,0.000000,0.0,0.000000,0.000000,236.536146,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,28.651875,35.382472,0.0,5.316789,0.000000,19.262465,18.113674,107.182945,22.625927,20.440003,5.687386,11.499024,0.000000,42.159271,0.000000,0.000000,60.694420,0.000000,11.126903,93.22,11.814359,19.178149,0.0,17.377811,52.467919,29.911886,11.383156,54.597304,18.199101,12.132734,9.473726,11.295157,0.000000,53.390206,1.329432,2.925357,0.127349,2.737915,1.179723,0.000000,0.151515,0,0,2,2,4,0,0,0,6,10,2,10,0,0,0,0,0,6,5.01720,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,23,8.818383,6
10077,,*c1cccc(OCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C(...,,0.345386,,,,13.509907,0.153775,-0.654822,0.231529,14.700000,530.536,0.625000,3.381024,1.136990,9.617263,9.198655,0.000000,0.000000,0.0,0.000000,0.000000,230.115324,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,28.651875,35.382472,0.0,5.316789,0.000000,12.841643,18.113674,107.182945,22.625927,20.440003,5.687386,11.499024,0.000000,42.159271,0.000000,0.000000,54.273598,0.000000,11.126903,93.22,11.814359,19.178149,0.0,17.377811,52.467919,23.491065,11.383156,60.663671,12.132734,12.132734,9.473726,11.242988,0.000000,53.341945,1.327747,2.903519,0.074564,1.634625,1.090070,0.000000,0.125000,0,0,2,2,4,0,0,0,6,10,2,9,0,0,0,0,0,6,4.62710,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,22,8.461092,6
10078,,*c1cccc(Oc2cccc(Oc3cccc(N4C(=O)c5ccc(Oc6ccc(Sc...,,0.362224,,,,13.691748,0.060361,-0.578421,0.118817,13.543860,766.787,0.403509,3.610901,0.821127,13.461585,13.589371,4.736863,11.499024,0.0,0.000000,11.814359,240.022275,9.589074,0.000000,0.0,0.0,0.000000,42.464569,0.000000,11.126903,38.125601,47.144357,0.0,5.316789,0.000000,9.790967,4.899910,179.979350,45.996095,29.913729,5.687386,45.996095,11.761885,28.945508,0.000000,0.000000,41.431955,0.000000,0.000000,111.68,11.814359,19.178149,0.0,22.941262,57.060872,20.440389,11.383156,78.491923,78.862772,12.132734,18.947452,23.761187,1.559725,55.003001,1.343194,1.747344,3.180401,0.000000,0.000000,0.000000,0.000000,0,0,2,2,4,0,0,0,9,13,2,11,0,0,0,0,0,9,10.04250,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,4,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,33,12.172998,9


# Data Preparation For Model Training 

In [17]:
# We'll separate train to be one model for each target variable.
t_1=train[['SMILES','Tg']].copy()
t_2=train[['SMILES','FFV']].copy()
t_3=train[['SMILES','Tc']].copy()
t_4=train[['SMILES','Density']].copy()
t_5=train[['SMILES','Rg']].copy()

# We will drop the rows with missing values related to that target after separation.
#This is important , dropping them beforehand would result Null for all data.
t_1.dropna(inplace=True)
t_2.dropna(inplace=True)
t_3.dropna(inplace=True)
t_4.dropna(inplace=True)
t_5.dropna(inplace=True)

In [18]:
train=train.drop(['id','Tg','FFV','Tc','Density','Rg'],axis=1)
test=test.drop(['id','SMILES'],axis=1)

In [19]:
tg=t_1.merge(train,on='SMILES',how='left')
ffv=t_2.merge(train,on='SMILES',how='left')
tc=t_3.merge(train,on='SMILES',how='left')
density=t_4.merge(train,on='SMILES',how='left')
rg=t_5.merge(train,on='SMILES',how='left')

In [20]:
for i in (tg,tc,density,ffv,rg):
    i.drop('SMILES',axis=1,inplace=True)
    i.dropna(inplace=True)

In [21]:
tg.head()

Unnamed: 0,Tg,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,FpDensityMorgan1,AvgIpc,BalabanJ,Ipc,Kappa2,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA7,SlogP_VSA8,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,NHOHCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAmideBonds,NumAromaticHeterocycles,NumAtomStereoCenters,NumBridgeheadAtoms,NumHAcceptors,NumHeteroatoms,NumHeterocycles,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,NumSpiroAtoms,NumUnspecifiedAtomStereoCenters,RingCount,MolLogP,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_N,fr_Ar_NH,fr_Ar_OH,fr_COO,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_bicyclic,fr_ester,fr_ether,fr_furan,fr_guanido,fr_hdrzine,fr_hdrzone,fr_imidazole,fr_imide,fr_isocyan,fr_ketone,fr_ketone_Topliss,fr_lactone,fr_methoxy,fr_morpholine,fr_nitro,fr_nitro_arom_nonortho,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phos_acid,fr_piperdine,fr_piperzine,fr_priamide,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea,graph_diameter,avg_shortest_path,num_cycles
0,208.639749,11.248449,0.023312,-0.334079,0.435492,13.615385,185.183,1.384615,2.210448,3.709964,3.151338,3.993205,0.0,0.0,0.0,0.0,0.0,74.906467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.383612,18.10052,0.0,16.367245,0.0,12.965578,13.089513,0.0,0.0,16.367245,0.0,0.0,0.0,37.231874,14.383612,0.0,6.923737,0.0,0.0,87.3,0.0,14.383612,0.0,37.231874,0.0,0.0,0.0,6.923737,0.0,16.367245,0.0,-0.357391,0.0,33.004449,4.941399,3.000789,-0.488941,0.133398,1.818204,0.0,0.571429,3,0,0,0,2,0,1,0,4,8,0,6,0,0,0,0,1,0,-2.2629,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,4.285714,0
1,-41.266724,2.142025,0.580086,0.580086,0.318231,12.571429,268.538,0.714286,2.306121,2.828262,3.566187,10.902614,0.0,0.0,0.0,0.0,0.0,106.749724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.042496,0.0,0.0,0.0,45.448667,17.258561,0.0,0.0,0.0,0.0,0.0,44.042496,17.258561,0.0,0.0,38.52493,6.923737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.783491,6.923737,0.0,44.042496,0.0,1.27736,2.142025,0.0,0.0,0.0,4.148997,8.837643,1.611538,6.315771,1.0,0,0,0,0,0,0,0,0,4,6,0,13,0,0,0,0,0,0,5.5217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,15,5.666667,0
2,-17.282022,2.408148,0.540428,0.540428,0.44803,13.6,138.254,1.0,2.098391,2.813864,2.689242,5.519191,0.0,0.0,0.0,0.0,0.0,64.524444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.869489,0.0,12.654956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.945751,6.923737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.52493,13.344559,0.0,12.654956,0.0,1.25228,0.0,0.0,0.0,0.0,0.0,12.721612,4.026108,0.0,0.8,0,0,0,0,0,0,0,0,0,2,0,8,0,0,0,0,0,0,3.7477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,11,4.333333,0
3,4.250403,12.365137,0.240279,-0.478718,0.243344,11.111111,494.588,0.611111,2.952224,1.61227,8.341744,14.6582,0.0,0.0,0.0,0.0,0.0,167.198088,4.794537,4.794537,0.0,0.0,0.0,24.265468,5.563451,5.563451,29.021539,23.75297,0.0,10.633577,0.0,58.29031,19.696395,70.784743,0.0,10.633577,0.0,0.0,0.0,43.449365,9.84339,0.0,92.798528,6.923737,0.0,110.8,0.0,19.178149,0.0,36.842483,28.860688,12.841643,45.448667,48.530937,0.0,10.633577,9.84339,5.466202,0.0,48.36181,10.341911,1.665014,-1.343049,9.864351,2.409757,0.0,0.428571,2,0,0,0,2,0,0,0,6,10,0,17,0,0,0,0,0,2,4.7127,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,29,10.064011,2
4,168.526313,6.354825,0.021602,-0.021602,0.20235,12.3125,466.672,0.75,3.387866,1.626829,7.802378,9.754142,0.0,0.0,0.0,0.0,0.0,198.888781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.473726,54.683719,0.0,9.967957,0.0,65.214047,13.213764,24.265468,22.070099,21.050632,0.0,11.499024,22.673572,23.181721,0.0,0.0,65.214047,0.0,31.004316,44.24,0.0,0.0,0.0,0.0,0.0,59.158907,59.501676,11.336786,11.336786,38.112943,19.441683,15.210866,3.584144,9.835445,1.019112,4.614546,2.029405,9.718671,6.050587,0.0,0.461538,0,0,0,0,0,2,0,0,6,8,2,13,0,0,0,0,0,4,6.8404,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,6,0,17,6.800357,4


# Model 

In [22]:
# Let’s define a reusable function to train and evaluate our machine learning model.

def model(train_d,test_d,model,target,submission=False):
    # We divide the data into training and validation sets for model evaluation
    X=train_d.drop(target,axis=1)
    y=train_d[target].copy()
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)

    Model=model()
    if submission==False:
       Model.fit(X_train,y_train)
       y_pred=Model.predict(X_test)
       return mean_absolute_error(y_pred,y_test)         # We assess our model performance using MAE metric
    if submission==True:
       Model.fit(X,y)
       submission=Model.predict(test_d)
       return submission
        

# **Model Evaluation

In [23]:
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [24]:
from sklearn.preprocessing import StandardScaler

SEED = 42
N_ITER = 25   # reduce/increase depending on time
CV = 3
N_JOBS = -1

# --- models (callable constructors) ---
models_to_compare = {
    'RandomForest': lambda: RandomForestRegressor(random_state=SEED, n_jobs=1),
    'ExtraTrees': lambda: ExtraTreesRegressor(random_state=SEED, n_jobs=1),
    'GradientBoosting': lambda: GradientBoostingRegressor(random_state=SEED),
    'HistGradientBoosting': lambda: HistGradientBoostingRegressor(random_state=SEED),
    'XGBoost': lambda: XGBRegressor(random_state=SEED, verbosity=0, n_jobs=1),
    'LightGBM': lambda: LGBMRegressor(random_state=SEED, n_jobs=1, verbose=-1),
    'CatBoost': lambda: CatBoostRegressor(random_seed=SEED, verbose=0)
}

# --- parameter distributions for RandomizedSearchCV ---
# Defaults (the tuned values you used earlier)
default_params = {
    'RandomForest':      {'n_estimators': 500,  'max_depth': 12, 'min_samples_leaf': 2, 'max_features': 'sqrt'},
    'ExtraTrees':        {'n_estimators': 500,  'max_depth': 12, 'min_samples_leaf': 2, 'max_features': 'sqrt'},
    'GradientBoosting':  {'n_estimators': 1000, 'learning_rate': 0.05, 'max_depth': 6, 'subsample': 0.8, 'max_features': 'sqrt'},
    'HistGradientBoosting': {'max_iter': 1000, 'learning_rate': 0.05, 'max_leaf_nodes': 31},
    'XGBoost':           {'n_estimators': 1000, 'learning_rate': 0.05, 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.0, 'reg_lambda': 1.0},
    'LightGBM':          {'n_estimators': 1000, 'learning_rate': 0.05, 'num_leaves': 64, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.0, 'reg_lambda': 1.0},
    'CatBoost':          {'iterations': 1000, 'learning_rate': 0.05, 'depth': 6, 'l2_leaf_reg': 3},
}

# Parameter distributions — *default value placed first in each list*
param_distributions = {
    'RandomForest': {
        'n_estimators': [default_params['RandomForest']['n_estimators'], 200, 400, 600, 800, 1000],
        'max_depth':    [default_params['RandomForest']['max_depth'], 6, 10, 16, 24, None],
        'min_samples_leaf': [default_params['RandomForest']['min_samples_leaf'], 1, 2, 4, 8],
        'max_features': [default_params['RandomForest']['max_features'], 'log2', 0.3, 0.5]
    },
    'ExtraTrees': {
        'n_estimators': [default_params['ExtraTrees']['n_estimators'], 200, 400, 600, 800],
        'max_depth':    [default_params['ExtraTrees']['max_depth'], 6, 10, 16, None],
        'min_samples_leaf': [default_params['ExtraTrees']['min_samples_leaf'], 1, 2, 4],
        'max_features': [default_params['ExtraTrees']['max_features'], 'log2', 0.3]
    },
    'GradientBoosting': {
        'n_estimators': [default_params['GradientBoosting']['n_estimators'], 200, 500, 800, 1000],
        'learning_rate': [default_params['GradientBoosting']['learning_rate'], 0.01, 0.03, 0.05, 0.1],
        'max_depth': [default_params['GradientBoosting']['max_depth'], 3, 5, 6, 8],
        'subsample': [default_params['GradientBoosting']['subsample'], 0.6, 0.8, 1.0],
        'max_features': [default_params['GradientBoosting']['max_features'], 0.3, 0.5]
    },
    'HistGradientBoosting': {
        'max_iter': [default_params['HistGradientBoosting']['max_iter'], 200, 500, 800, 1000],
        'learning_rate': [default_params['HistGradientBoosting']['learning_rate'], 0.01, 0.03, 0.05],
        'max_leaf_nodes': [default_params['HistGradientBoosting']['max_leaf_nodes'], 15, 31, 63, 127]
    },
    'XGBoost': {
        'n_estimators': [default_params['XGBoost']['n_estimators'], 200, 500, 800, 1000],
        'learning_rate': [default_params['XGBoost']['learning_rate'], 0.01, 0.03, 0.05, 0.1],
        'max_depth': [default_params['XGBoost']['max_depth'], 3, 5, 6, 8],
        'subsample': [default_params['XGBoost']['subsample'], 0.6, 0.8, 1.0],
        'colsample_bytree': [default_params['XGBoost']['colsample_bytree'], 0.5, 0.7, 0.8, 1.0],
        'reg_alpha': [default_params['XGBoost']['reg_alpha'], 1e-3, 1e-2, 0.1],
        'reg_lambda': [default_params['XGBoost']['reg_lambda'], 0.1, 1, 10]
    },
    'LightGBM': {
        'n_estimators': [default_params['LightGBM']['n_estimators'], 200, 500, 800, 1000],
        'learning_rate': [default_params['LightGBM']['learning_rate'], 0.01, 0.03, 0.05, 0.1],
        'num_leaves': [default_params['LightGBM']['num_leaves'], 31, 64, 127],
        'subsample': [default_params['LightGBM']['subsample'], 0.6, 0.8, 1.0],
        'colsample_bytree': [default_params['LightGBM']['colsample_bytree'], 0.5, 0.7, 0.8, 1.0],
        'reg_alpha': [default_params['LightGBM']['reg_alpha'], 1e-3, 1e-2],
        'reg_lambda': [default_params['LightGBM']['reg_lambda'], 0.1, 1, 10]
    },
    'CatBoost': {
        'iterations': [default_params['CatBoost']['iterations'], 200, 500, 800, 1000],
        'learning_rate': [default_params['CatBoost']['learning_rate'], 0.01, 0.03, 0.05, 0.1],
        'depth': [default_params['CatBoost']['depth'], 4, 6, 8, 10],
        'l2_leaf_reg': [default_params['CatBoost']['l2_leaf_reg'], 1, 3, 5, 10]
    }
}

In [25]:
# --- helper: find train/test dataframes in globals ---
def find_train_df_for_target(target):
    # prefer per-target DF like `tg`, `ffv`, `tc`, `density`, `rg`
    name_variants = [target.lower(), target.upper(), target.capitalize()]
    for n in name_variants + ['train', 'df_train', 'train_df']:
        if n in globals() and isinstance(globals()[n], pd.DataFrame):
            return globals()[n].copy()
    raise RuntimeError(f"No training DataFrame found for target '{target}'. Place either a dataframe named '{target.lower()}' or a global 'train' DataFrame containing the column '{target}'.")

def find_test_df():
    for n in ['test', 'df_test', 'test_df']:
        if n in globals() and isinstance(globals()[n], pd.DataFrame):
            return globals()[n].copy()
    return None

# --- tuning function per model ---
scorer = make_scorer(mean_absolute_error, greater_is_better=False)  # we want to maximize negative MAE -> minimize MAE

def tune_model_for_target(model_name, model_callable, X, y, n_iter=N_ITER, cv=CV):
    params = param_distributions.get(model_name, None)
    estimator = model_callable()
    # For safety set n_jobs=1 inside estimator constructors earlier; RandomizedSearchCV uses n_jobs param
    if params is None:
        # no tuning grid, just return base estimator trained by CV via cross_val_score not needed — return fitted estimator on full data
        estimator.fit(X, y)
        return estimator, None
    rnd = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=params,
        n_iter=n_iter,
        cv=cv,
        scoring='neg_mean_absolute_error',  # sklearn accepts this direct string
        random_state=SEED,
        n_jobs=N_JOBS,
        verbose=1
    )
    rnd.fit(X, y)
    return rnd.best_estimator_, rnd.best_params_

In [26]:
# ---------------- main loop: per-target tuning & final train ----------------
if 'CFG' in globals() and hasattr(CFG, 'TARGETS'):
    targets = list(CFG.TARGETS)
elif 'TARGET_COLS' in globals():
    targets = list(TARGET_COLS)
else:
    targets = ['Tg','FFV','Tc','Density','Rg']

test_df = find_test_df()
predictions = {}
best_summary = {}

for target in targets:
    print("\n" + "#"*80)
    print(f"Processing target: {target}")
    try:
        train_df = find_train_df_for_target(target)
    except Exception as e:
        print("Skipping target due to:", e)
        continue

    if target not in train_df.columns:
        print(f"Target {target} not in chosen train dataframe columns; skipping.")
        continue

    # prepare X,y (drop other target columns to avoid leakage)
    X = train_df.drop(columns=[c for c in (set(train_df.columns) & set(targets)) if c==target], errors='ignore')
    # ensure we don't have the target in features accidentally
    if target in X.columns:
        X = X.drop(columns=[target])

    y = train_df[target].astype(float).copy()
    # drop rows with NaN target
    valid_idx = ~y.isna()
    X = X.loc[valid_idx].reset_index(drop=True)
    y = y.loc[valid_idx].reset_index(drop=True)
    if len(y) == 0:
        print(f"No non-null target values for {target}; skipping.")
        continue

    # simple feature sanity: if there remain any object columns try to convert or drop
    X = X.copy()
    for col in X.select_dtypes(include=['object','category']).columns:
        try:
            X[col] = X[col].astype(float)
        except:
            X = X.drop(columns=[col])
    print("Train shape (after cleaning):", X.shape, "n_targets:", y.shape)

    # iterate models and tune
    best_mae = math.inf
    best_model = None
    best_name = None
    best_params = None

    for name, model_callable in models_to_compare.items():
        try:
            print(f"\nTuning model: {name} for target {target} ...")
            t0 = time.time()
            fitted_est, params = tune_model_for_target(name, model_callable, X.values, y.values, n_iter=N_ITER, cv=CV)
            t1 = time.time()
            # cross-validated result not directly returned here for base estimators; but RandomizedSearchCV stored best_score_ on object if used
            # we can estimate CV MAE by running cross_val_score quickly (but it repeats work). Instead, we'll compute OOF-like MAE via small CV for the fitted estimator:
            from sklearn.model_selection import cross_val_predict
            preds_cv = cross_val_predict(fitted_est, X.values, y.values, cv=CV, n_jobs=N_JOBS)
            mae_cv = mean_absolute_error(y.values, preds_cv)
            print(f"  [{name}] CV MAE = {mae_cv:.6f} (took {t1-t0:.1f}s). Params: {params}")
            if mae_cv < best_mae:
                best_mae = mae_cv
                best_model = fitted_est
                best_name = name
                best_params = params
        except Exception as e:
            print(f"  {name} failed: {e}")

    if best_model is None:
        print(f"No working model for {target}; skipping.")
        continue

    print(f"\nBest model for {target}: {best_name} with CV MAE = {best_mae:.6f}. Params={best_params}")
    best_summary[target] = {'model': best_name, 'mae': best_mae, 'params': best_params}

    # retrain best_model on full X,y
    print("Retraining best model on full data...")
    try:
        best_model.fit(X.values, y.values)
    except Exception as e:
        print("Refit on full data failed; attempting to call fit with pandas inputs:", e)
        best_model.fit(X, y)

    # predict on test set if available
    if test_df is not None and len(test_df)>0:
        # prepare test features same way as train (drop other target columns and object cols)
        X_test = test_df.copy()
        # drop other target columns from features if they exist
        for c in set(X_test.columns) & set(targets):
            if c == target:
                X_test = X_test.drop(columns=[c])
        # drop object columns we couldn't convert earlier
        for col in X_test.select_dtypes(include=['object','category']).columns:
            try:
                X_test[col] = X_test[col].astype(float)
            except:
                X_test = X_test.drop(columns=[col])
        # ensure columns align; if X_test missing some columns, add zeros
        for col in X.columns:
            if col not in X_test.columns:
                X_test[col] = 0.0
        # keep only train feature columns order
        X_test = X_test[X.columns]
        preds_test = best_model.predict(X_test.values)
        predictions[target] = preds_test
        print(f"Predicted test shape for {target}:", preds_test.shape)
    else:
        predictions[target] = None


################################################################################
Processing target: Tg
Train shape (after cleaning): (1160, 151) n_targets: (1160,)

Tuning model: RandomForest for target Tg ...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
  [RandomForest] CV MAE = 47.136283 (took 146.1s). Params: {'n_estimators': 800, 'min_samples_leaf': 1, 'max_features': 0.3, 'max_depth': 16}

Tuning model: ExtraTrees for target Tg ...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
  [ExtraTrees] CV MAE = 45.676753 (took 47.7s). Params: {'n_estimators': 800, 'min_samples_leaf': 1, 'max_features': 0.3, 'max_depth': None}

Tuning model: GradientBoosting for target Tg ...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
  [GradientBoosting] CV MAE = 43.667767 (took 130.1s). Params: {'subsample': 0.6, 'n_estimators': 1000, 'max_features': 0.3, 'max_depth': 5, 'learning_rate': 0.03}

Tuning model: HistGradientBoosting for target Tg ...
Fitting 3 f

In [27]:
# --- Summarize best models ---
print("\n=== Best models summary ===")
for t, info in best_summary.items():
    print(f"{t}: {info['model']}  CV_MAE={info['mae']:.6f}  params={info['params']}")


=== Best models summary ===
Tg: CatBoost  CV_MAE=42.369263  params={'learning_rate': 0.03, 'l2_leaf_reg': 3, 'iterations': 1000, 'depth': 6}
FFV: XGBoost  CV_MAE=0.006229  params={'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0.0, 'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
Tc: LightGBM  CV_MAE=0.032098  params={'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0.001, 'num_leaves': 64, 'n_estimators': 500, 'learning_rate': 0.01, 'colsample_bytree': 0.5}
Density: GradientBoosting  CV_MAE=0.065598  params={'subsample': 0.8, 'n_estimators': 800, 'max_features': 'sqrt', 'max_depth': 3, 'learning_rate': 0.1}
Rg: GradientBoosting  CV_MAE=1.717341  params={'subsample': 0.8, 'n_estimators': 1000, 'max_features': 'sqrt', 'max_depth': 5, 'learning_rate': 0.01}


# Final Model For Submission

In [28]:
# sanity checks
if test_df is None or len(test_df) == 0:
    print("\nNo test DataFrame found; skipping submission write.")
else:
    samp_path = "sample_submission.csv"
    samp_exists = os.path.exists(samp_path)

    # decide id column and id values to put into submission
    chosen_id_col = None
    ids = None

    if samp_exists:
        samp = pd.read_csv(samp_path)
        samp_id_col = samp.columns[0]
        # Prefer to use the ID column that actually exists in test_df (so IDs are the real ones)
        if samp_id_col in test_df.columns:
            chosen_id_col = samp_id_col
            ids = test_df[chosen_id_col].values
            source_for_ids = f"test_df['{chosen_id_col}'] (matches sample_submission column '{samp_id_col}')"
        else:
            # look for common id names in test_df
            for cand in ['polymer_id', 'polymerId', 'id', 'ID']:
                if cand in test_df.columns:
                    chosen_id_col = cand
                    ids = test_df[chosen_id_col].values
                    source_for_ids = f"test_df['{chosen_id_col}'] (preferred over sample_submission)"
                    break
            else:
                # fallback: use sample_submission id column values (assume sample order is canonical)
                chosen_id_col = samp_id_col
                ids = samp[samp_id_col].values
                source_for_ids = f"sample_submission['{samp_id_col}'] (fallback; test_df has no obvious id column)"
    else:
        # no sample_submission — pick sensible id column from test_df
        for cand in ['polymer_id', 'polymerId', 'id', 'ID']:
            if cand in test_df.columns:
                chosen_id_col = cand
                ids = test_df[chosen_id_col].values
                source_for_ids = f"test_df['{chosen_id_col}']"
                break
        if ids is None:
            # final fallback: use test_df index
            chosen_id_col = None
            ids = test_df.index.values
            source_for_ids = "test_df.index (no id column found)"

    # Now validate lengths: predictions should correspond to test_df order
    n_test = len(test_df)
    print(f"\nChosen id source: {source_for_ids}; n_test_rows={n_test}")

    # Check prediction lengths and warn/fix if needed
    # Predictions dict expected keys = targets list
    sane_preds = {}
    for t in targets:
        arr = predictions.get(t)
        if arr is None:
            # no prediction for this target
            print(f"Warning: no predictions found for target '{t}'. Filling with NaN.")
            sane_preds[t] = np.full((n_test,), np.nan, dtype=float)
        else:
            arr = np.asarray(arr)
            if arr.shape[0] != n_test:
                # attempt to align: common sources of mismatch:
                #  - predictions produced from a different filtered/test set
                #  - predictions length equals number of sample_submission rows
                if samp_exists and arr.shape[0] == len(samp):
                    print(f"Note: predictions for '{t}' match sample_submission length ({len(samp)}). Using sample_submission order for ids if lengths match.")
                    # if ids was taken from sample_submission earlier, it's OK; otherwise we still use arr as-is but must ensure id ordering
                    # prefer to keep arr but warn
                    sane_preds[t] = arr
                else:
                    # try to broadcast/truncate/pad
                    print(f"Warning: prediction length mismatch for '{t}': got {arr.shape[0]}, expected {n_test}. Attempting to fix by truncation/padding with NaN.")
                    if arr.shape[0] > n_test:
                        sane_preds[t] = arr[:n_test]
                    else:
                        pad = np.full((n_test - arr.shape[0],), np.nan, dtype=float)
                        sane_preds[t] = np.concatenate([arr, pad], axis=0)
            else:
                sane_preds[t] = arr

    # Build submission DataFrame
    # Use the id column name from sample_submission if that is the expected header; otherwise fall back to 'id' or the chosen column
    if samp_exists:
        out_id_col = samp.columns[0]
    else:
        out_id_col = chosen_id_col if chosen_id_col is not None else 'id'

    sub = pd.DataFrame()
    sub[out_id_col] = ids

    # Put prediction columns in the same order as 'targets'
    for t in targets:
        sub[t] = sane_preds.get(t, np.full((n_test,), np.nan))

    # If sample_submission exists, reorder to match its columns (keeps canonical order)
    if samp_exists:
        desired_cols = [samp.columns[0]] + [c for c in samp.columns[1:] if c in sub.columns]
        # If sample_submission had extra columns unknown to us, we only include ones we have
        sub = sub.reindex(columns=desired_cols)
        # Try to detect the ID column from test_df
    if "id" in test_df.columns:
        id_col = "id"
    elif "polymer_id" in test_df.columns:
        id_col = "polymer_id"
    else:
        id_col = None  # fallback
    
    # Build submission DataFrame
    sub = pd.DataFrame()
    
    if id_col is not None:
        sub[id_col] = test_df[id_col].values
    else:
        # fallback: use row index as id
        sub["id"] = test_df.index.values

    # Add predictions
    for t in targets:
        arr = predictions.get(t)
        sub[t] = arr if arr is not None else np.nan

    # If sample_submission exists, reorder columns
    if os.path.exists("sample_submission.csv"):
        samp = pd.read_csv("sample_submission.csv")
        sub = sub[samp.columns]  # enforce exact same order

    out_path = "submission.csv"
    sub.to_csv(out_path, index=False)
    print("\n✅ Saved submission with correct test_df IDs to", out_path)


Chosen id source: test_df.index (no id column found); n_test_rows=3

✅ Saved submission with correct test_df IDs to submission.csv


In [29]:
submission=pd.DataFrame(sub)

In [30]:
submission

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,0,168.301324,0.372571,0.176028,1.221852,20.278054
1,1,194.978956,0.376339,0.23517,1.072437,20.371656
2,2,121.170833,0.351574,0.253476,1.066255,21.030346


In [31]:
submission.to_csv('submission.csv',index=False)