In [None]:
!pip install torch_molecule



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

csv_path = 'train.csv'
train_df = pd.read_csv(csv_path)

# Step 1: hold out 20% of all rows as the dev test set (fixed seed for reproducibility).
temp_df, dev_test = train_test_split(train_df, test_size=0.2, random_state=42, shuffle=True)

# Step 2: carve the validation set out of the remaining 80%.
# 25% of that remainder is 20% of the original, which leaves 60% for training.
dev_train, dev_val = train_test_split(temp_df, test_size=0.25, random_state=42, shuffle=True)

# Report the size of each split and its share of the full data set.
n_total = len(train_df)
n_train, n_val, n_test = len(dev_train), len(dev_val), len(dev_test)
print(f"Total rows:   {n_total}")
print(f"Dev train:    {n_train} ({n_train/n_total:.2%})")
print(f"Dev valid:    {n_val} ({n_val/n_total:.2%})")
print(f"Dev test:     {n_test} ({n_test/n_total:.2%})")

# Show a few example inputs and the available columns.
print(f"Polymer example:{dev_train['SMILES'].head(3).to_list()}")
print(f"Columns:{dev_train.columns}")

In [None]:
# Swap the module-level tqdm entry points for their notebook-widget versions.
# trange(n) is shorthand for tqdm(range(n)), so it must be replaced by the
# notebook trange; pointing it at the tqdm class would make trange(10) try to
# iterate over the integer itself.
from tqdm.notebook import tqdm as notebook_tqdm, trange as notebook_trange
import tqdm
tqdm.tqdm = notebook_tqdm
tqdm.trange = notebook_trange

from torch_molecule import LSTMMolecularPredictor
from torch_molecule.utils.search import ParameterType, ParameterSpec

# Hyper-parameter ranges explored during the automatic search.
search_parameters = {
    "output_dim": ParameterSpec(param_type=ParameterType.INTEGER, value_range=(8, 32)),
    "LSTMunits": ParameterSpec(param_type=ParameterType.INTEGER, value_range=(30, 120)),
    "learning_rate": ParameterSpec(param_type=ParameterType.LOG_FLOAT, value_range=(1e-4, 1e-2)),
}

# One regression output per target property (five in total).
lstm = LSTMMolecularPredictor(task_type="regression", num_task=5, batch_size=192, epochs=200, verbose=True)

print("Model initialized successfully")

# SMILES strings are the model inputs; the five property columns are the targets.
target_cols = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
X_train, y_train = dev_train['SMILES'].to_list(), dev_train[target_cols].to_numpy()
X_val, y_val = dev_val['SMILES'].to_list(), dev_val[target_cols].to_numpy()

# Search the ranges above for 10 trials, scoring each on the validation split.
lstm.autofit(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    search_parameters=search_parameters,
    n_trials=10,
)

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def smiles_to_fp(smiles, radius=2, nBits=1024):
    """Convert a SMILES string into a Morgan fingerprint array.

    Args:
        smiles: SMILES string of the molecule to featurise.
        radius: Morgan fingerprint radius passed to RDKit.
        nBits: Size of the fingerprint bit vector requested from RDKit.

    Returns:
        A NumPy array built from the RDKit bit vector.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending string instead of an opaque error further down.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits))

# Featurise every split with Morgan fingerprints, one row per molecule.
X_train_feats = np.vstack(list(map(smiles_to_fp, X_train)))
X_val_feats = np.vstack(list(map(smiles_to_fp, X_val)))
X_test = dev_test['SMILES'].to_list()
X_test_feats = np.vstack(list(map(smiles_to_fp, X_test)))

# The baseline is fitted on train + validation together; the held-out test
# split is only used for scoring.
X_dev_feats = np.concatenate([X_train_feats, X_val_feats], axis=0)
y_dev = np.concatenate([y_train, y_val], axis=0)

task_names = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
y_test = dev_test[task_names].to_numpy()

models = {}
y_pred = np.zeros_like(y_test)

# Fit an independent forest per property, using only the rows where that
# property is actually labelled.
for col, task in enumerate(task_names):
    print('Training random forest for the task:', task)
    targets = y_dev[:, col]
    labelled = ~np.isnan(targets)
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X_dev_feats[labelled], targets[labelled])
    models[task] = forest
    # Predictions are stored for every test row; unlabelled rows are masked at scoring time.
    y_pred[:, col] = forest.predict(X_test_feats)

# Score each property on the test rows that have a ground-truth value.
mse_per_task = {}
for col, task in enumerate(task_names):
    print('Predicting for the task:', task)
    observed = ~np.isnan(y_test[:, col])
    if observed.any():
        mse_per_task[task] = mean_squared_error(y_test[observed, col], y_pred[observed, col])
    else:
        mse_per_task[task] = np.nan

print("MSE per task:")
for task, task_mse in mse_per_task.items():
    print(f"  {task}: {task_mse:.4f}")

# Pool every labelled (row, property) cell into a single overall MSE.
mask_all = ~np.isnan(y_test)
y_true_flat = y_test[mask_all]
y_pred_flat = y_pred[mask_all]
mse_overall = mean_squared_error(y_true_flat, y_pred_flat)
print(f"Overall MSE: {mse_overall:.4f}")

In [None]:
# Swap the module-level tqdm entry points for their notebook-widget versions.
# trange(n) is shorthand for tqdm(range(n)), so it must be replaced by the
# notebook trange; pointing it at the tqdm class would make trange(10) try to
# iterate over the integer itself.
from tqdm.notebook import tqdm as notebook_tqdm, trange as notebook_trange
import tqdm
tqdm.tqdm = notebook_tqdm
tqdm.trange = notebook_trange

from torch_molecule import GNNMolecularPredictor
from torch_molecule.utils.search import ParameterType, ParameterSpec
import numpy as np
from sklearn.metrics import mean_squared_error

# Hyper-parameter ranges explored during the automatic search.
search_parameters = {
    'num_layer': ParameterSpec(param_type=ParameterType.INTEGER, value_range=(2, 5)),
    'hidden_size': ParameterSpec(param_type=ParameterType.INTEGER, value_range=(64, 512)),
    'learning_rate': ParameterSpec(param_type=ParameterType.LOG_FLOAT, value_range=(1e-4, 1e-2)),
}

# One regression output per target property (five in total).
gnn = GNNMolecularPredictor(task_type="regression", num_task=5, batch_size=192, epochs=200, verbose=True)

print("Model initialized successfully")

# SMILES strings are the model inputs; the five property columns are the targets.
target_cols = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
X_train, y_train = dev_train['SMILES'].to_list(), dev_train[target_cols].to_numpy()
X_val, y_val = dev_val['SMILES'].to_list(), dev_val[target_cols].to_numpy()

# Search the ranges above for 10 trials, scoring each on the validation split.
gnn.autofit(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    search_parameters=search_parameters,
    n_trials=10,
)

In [None]:
task_names = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Score the fitted GNN on the held-out dev test split.
X_test = dev_test['SMILES'].to_list()
y_test = dev_test[task_names].to_numpy()
y_predict = gnn.predict(X_test)['prediction']

# Per-property MSE, restricted to the rows that have a ground-truth value.
mse_per_task = {}
for col, task in enumerate(task_names):
    observed = ~np.isnan(y_test[:, col])
    if observed.any():
        mse_per_task[task] = mean_squared_error(y_test[observed, col], y_predict[observed, col])
    else:
        # No labelled rows for this property in the test split.
        mse_per_task[task] = np.nan

print("MSE per task:")
for task, task_mse in mse_per_task.items():
    print(f"  {task}: {task_mse:.4f}")

# Pool every labelled (row, property) cell into a single overall MSE.
mask_all = ~np.isnan(y_test)
y_true_flat = y_test[mask_all]
y_pred_flat = y_predict[mask_all]
mse_overall = mean_squared_error(y_true_flat, y_pred_flat)

print(f"Overall MSE: {mse_overall:.4f}")

In [None]:
# Build the submission from scratch so this cell survives Restart & Run All:
# neither the test frame nor its predictions are created by any earlier cell.
# NOTE(review): assumes test.csv sits next to train.csv and has `id` and
# `SMILES` columns — confirm the path and schema for your environment.
test_df = pd.read_csv('test.csv')

# NOTE(review): the fitted GNN is used as the final model here — swap in
# another predictor if a different one should produce the submission.
test_preds = gnn.predict(test_df['SMILES'].to_list())['prediction']

submission_df = pd.DataFrame(
    test_preds,
    columns=["Tg", "FFV", "Tc", "Density", "Rg"]
)
# Insert ids positionally so the rows cannot be misaligned by index labels.
submission_df.insert(0, "id", test_df["id"].to_numpy())
submission_df.to_csv("submission.csv", index=False)