# Model Training, Validation and Testing on OE62

In [None]:
import torch
import torch_geometric
import logging
from pathlib import Path
from tqdm import tqdm
import os

from ocpmodels import models
from ocpmodels.common import logger
from ocpmodels.common.utils import setup_logging, load_config
from ocpmodels.datasets import LmdbDataset
from ocpmodels.common.registry import registry
from ocpmodels.trainers import EnergyTrainer, ForcesTrainer


# Configure logging via the helper imported from ocpmodels.common.utils.
setup_logging()

# IPython magics: load the autoreload extension and switch it to mode 2, which
# reloads imported modules before each cell is executed, so edits to the
# package source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Define model variant by choosing a config file
For each model, the following variants exist: **baseline**, **variant with Ewald message passing**, **increased cutoff** and **increased embedding size**.

Configs to choose from [and approximate training times in brackets]: 
- schnet_oe62_baseline.yml [~ 1h]
- schnet_oe62_ewald.yml    [~ 4h]
- schnet_oe62_cutoff.yml   [~ 1h]
- schnet_oe62_embeddings.yml [~ 1.5h]
----------------------------
- painn_oe62_baseline.yml  [~ 10h]
- painn_oe62_ewald.yml     [~ 14.5h]
- painn_oe62_cutoff.yml    [~ 12h]
- painn_oe62_embeddings.yml [~ 10h]
----------------------------
- dpp_oe62_baseline.yml [~ 16h]
- dpp_oe62_ewald.yml [~ 22h]
- dpp_oe62_cutoff.yml [~ 22h]
- dpp_oe62_embeddings.yml [~ 19h]
----------------------------
- gemnet_oe62_baseline.yml [~ 1d 8h]
- gemnet_oe62_ewald.yml [~ 1d 18h]
- gemnet_oe62_cutoff.yml [~ 1d 18h]
- gemnet_oe62_embeddings.yml [~ 1d 8h]
----------------------------

The above training times are based on our experience using a single `Nvidia A100` GPU. We measured them with the intermediate evaluation runs on the test set disabled during training. These evaluation runs are enabled here, so training might take slightly longer.

In [None]:
# Directory holding the OE62 config files listed above.
config_dir = "configs_oe62"
# ----------- Put your model variant here -----------
config_file = "schnet_oe62_baseline.yml"
# Full path of the selected config; it is handed to load_config further down.
config_path = os.path.join(config_dir, config_file)

### Parse config file and initialize `EnergyTrainer` object for OE62

In [None]:
# Free cached, currently unused GPU memory held by PyTorch before building a
# new trainer.
torch.cuda.empty_cache()

# Only the first element of what load_config returns is used here.
conf = load_config(config_path)[0]

# Read the trainer settings from the "fixed" section of the config; the keys
# are listed in the same order as the names they are assigned to.
# NOTE: `logger` rebinds the `logger` name imported from ocpmodels.common at
# the top of the notebook.
task, model, optimizer, name, logger, dataset = (
    conf["fixed"][key]
    for key in ("task", "model", "optimizer", "name", "logger", "dataset")
)

trainer = EnergyTrainer(
    # --- what to train, and on which data ---
    task=task,
    model=model,
    dataset=dataset,
    optimizer=optimizer,
    # --- bookkeeping ---
    identifier=name,
    run_dir="./",
    logger=logger,  # logger of choice (tensorboard and wandb supported)
    is_debug=False,  # if True, do not save checkpoint, logs, or results
    print_every=5000,
    # --- runtime settings ---
    seed=0,  # random seed to use
    local_rank=0,
    amp=False,  # whether to use PyTorch Automatic Mixed Precision
)

### Train model

In [None]:
# Start training with the EnergyTrainer built above. That trainer was created
# with is_debug=False, i.e. checkpoints, logs and results are saved.
trainer.train()

### Load best checkpoint from training

In [None]:
# Path of the best checkpoint inside the training run's checkpoint directory.
# It has to be read from the training trainer before `trainer` is rebound
# to the evaluation trainer below.
best_ckpt_dir = trainer.config["cmd"]["checkpoint_dir"]
checkpoint_path = os.path.join(best_ckpt_dir, "best_checkpoint.pt")

# Evaluation trainer: same task/model/dataset/optimizer settings as for
# training, but created in debug mode.
trainer = EnergyTrainer(
    task=task,
    model=model,
    dataset=dataset,
    optimizer=optimizer,
    # NOTE(review): the label is hard-coded to "schnet" regardless of the
    # config selected above - confirm whether `name` was intended.
    identifier="schnet",
    # directory to save results if is_debug=False. Prediction files are saved
    # here so be careful not to override!
    run_dir="./",
    is_debug=True,  # if True, do not save checkpoint, logs, or results
    logger=logger,  # logger of choice (tensorboard and wandb supported)
    print_every=5000,
    seed=0,  # random seed to use
    local_rank=0,
    # use PyTorch Automatic Mixed Precision (faster training and less memory usage)
    amp=False,
)

# Restore the checkpoint located above into the new trainer.
trainer.load_checkpoint(checkpoint_path=checkpoint_path)

### Validate or test model
Replace the argument below with `split="val"` to use the OE62 validation split instead.

In [None]:
# Evaluate on the test split and keep, for every reported metric, only its
# "metric" entry.
metrics = trainer.validate(split="test")
results = dict(
    (metric_name, entry["metric"]) for metric_name, entry in metrics.items()
)
print(f"Results for configuration {name}: {results}")

# Model Training and Validation on OC20

On OC20, only validation can be done locally. To generate results on the test set, follow the instructions on https://github.com/Open-Catalyst-Project/ocp to obtain files for submission on eval.ai.

### Define model variant by choosing a config file
For each model, the following variants exist: **baseline**, **variant with Ewald message passing**, **increased cutoff**.

Configs to choose from [and approximate training times in brackets]: 
- schnet_oc20_baseline.yml [~ 1d 4h]
- schnet_oc20_ewald.yml [~ 2d 18h]
- schnet_oc20_cutoff.yml [~ 2d 22h]
----------------------------
- painn_oc20_baseline.yml [~ 2d 11h]
- painn_oc20_ewald.yml [~ 2d 17h]
- painn_oc20_cutoff.yml [~ 3d 7h]
----------------------------
- dpp_oc20_baseline.yml [~ 6d]
- dpp_oc20_ewald.yml [~ 6d 12h]
- dpp_oc20_cutoff.yml [~ 6d 15h]
----------------------------
- gemnet_oc20_baseline.yml [~ 10d 2h]
- gemnet_oc20_ewald.yml [~ 11d 2h]
- gemnet_oc20_cutoff.yml [~ 11d 4h]
----------------------------
The above training times are based on our experience using a single `Nvidia A100` GPU.

In [None]:
# Directory holding the OC20 config files listed above.
config_dir = "configs_oc20"
# ----------- Put your model variant here -----------
config_file = "schnet_oc20_baseline.yml"
# Full path of the selected config; it is handed to load_config further down.
config_path = os.path.join(config_dir, config_file)

### Parse config file and initialize `ForcesTrainer` object for OC20

In [None]:
# Free cached, currently unused GPU memory held by PyTorch before building a
# new trainer.
torch.cuda.empty_cache()

# Only the first element of what load_config returns is used here.
conf = load_config(config_path)[0]

# General trainer settings from the "fixed" section of the config; the keys
# are listed in the same order as the names they are assigned to.
# NOTE: `logger` rebinds the `logger` name imported from ocpmodels.common at
# the top of the notebook.
task, model, optimizer, name, logger = (
    conf["fixed"][key]
    for key in ("task", "model", "optimizer", "name", "logger")
)

# Dataset entries, in order:
#   dataset          - training set plus the combination of all four
#                      validation splits
#   dataset_id       - validation split, in-distribution composition
#   dataset_ood_ads  - validation split, out-of-distribution adsorbate
#   dataset_ood_cat  - validation split, out-of-distribution catalyst
#   dataset_ood_both - validation split, both out-of-distribution
dataset, dataset_id, dataset_ood_ads, dataset_ood_cat, dataset_ood_both = (
    conf["fixed"][key]
    for key in (
        "dataset_train",
        "dataset_id",
        "dataset_ood_ads",
        "dataset_ood_cat",
        "dataset_ood_both",
    )
)

trainer = ForcesTrainer(
    # --- what to train, and on which data ---
    task=task,
    model=model,
    dataset=dataset,
    optimizer=optimizer,
    # --- bookkeeping ---
    identifier=name,
    run_dir="./",
    logger=logger,  # logger of choice (tensorboard and wandb supported)
    is_debug=False,  # if True, do not save checkpoint, logs, or results
    print_every=5000,
    # --- runtime settings ---
    seed=0,  # random seed to use
    local_rank=0,
    amp=False,  # whether to use PyTorch Automatic Mixed Precision
)

### Train model

In [None]:
# Start training with the ForcesTrainer built above. That trainer was created
# with is_debug=False, i.e. checkpoints, logs and results are saved.
trainer.train()

### Load best checkpoint from training

In [None]:
# Path of the best checkpoint inside the training run's checkpoint directory.
# Run this cell while `trainer` still refers to the training trainer: the
# validation cells below rebind `trainer`.
best_ckpt_dir = trainer.config["cmd"]["checkpoint_dir"]
checkpoint_path = os.path.join(best_ckpt_dir, "best_checkpoint.pt")

### Validate on val-id
Validation split where both the adsorbate and catalyst compositions are in-distribution.

In [None]:
# Build a debug-mode trainer on the val-id split, restore the best checkpoint
# and report its validation metrics.
trainer = ForcesTrainer(
    task=task,
    model=model,
    dataset=dataset_id,
    optimizer=optimizer,
    identifier="validate_id",
    # directory to save results if is_debug=False. Prediction files are saved
    # here so be careful not to override!
    run_dir="./",
    # if True, do not save checkpoint, logs, or results; set to False if you
    # want to test rather than validate (results file needed)
    is_debug=True,
    print_every=5000,
    seed=0,  # random seed to use
    logger=logger,  # logger of choice (tensorboard and wandb supported)
    local_rank=0,
    amp=False,  # use PyTorch Automatic Mixed Precision (faster training and less memory usage)
)

trainer.load_checkpoint(checkpoint_path=checkpoint_path)
metrics = trainer.validate()
# Keep only the "metric" entry of every reported metric.
results_id = {key: val["metric"] for key, val in metrics.items()}
# Print the metrics of this split. The previous code printed `results`, which
# is either undefined here or left over from the OE62 section.
print(f"val-id results for configuration {name}: {results_id}")

### Validate on val-ood-ads
Validation split where the adsorbate compositions are out-of-distribution.

In [None]:
# Build a debug-mode trainer on the val-ood-ads split, restore the best
# checkpoint and report its validation metrics.
trainer = ForcesTrainer(
    task=task,
    model=model,
    dataset=dataset_ood_ads,
    optimizer=optimizer,
    identifier="validate_ood_ads",
    # directory to save results if is_debug=False. Prediction files are saved
    # here so be careful not to override!
    run_dir="./",
    # if True, do not save checkpoint, logs, or results; set to False if you
    # want to test rather than validate (results file needed)
    is_debug=True,
    print_every=5000,
    seed=0,  # random seed to use
    logger=logger,  # logger of choice (tensorboard and wandb supported)
    local_rank=0,
    amp=False,  # use PyTorch Automatic Mixed Precision (faster training and less memory usage)
)

trainer.load_checkpoint(checkpoint_path=checkpoint_path)
metrics = trainer.validate()
# Keep only the "metric" entry of every reported metric. Stored under its own
# name so the val-id results in `results_id` are not overwritten.
results_ood_ads = {key: val["metric"] for key, val in metrics.items()}
# Print the metrics of this split. The previous code printed `results`, which
# is either undefined here or left over from the OE62 section.
print(f"val-ood-ads results for configuration {name}: {results_ood_ads}")

### Validate on val-ood-cat
Validation split where the catalyst compositions are out-of-distribution.

In [None]:
# Build a debug-mode trainer on the val-ood-cat split, restore the best
# checkpoint and report its validation metrics.
trainer = ForcesTrainer(
    task=task,
    model=model,
    dataset=dataset_ood_cat,
    optimizer=optimizer,
    identifier="validate_ood_cat",
    # directory to save results if is_debug=False. Prediction files are saved
    # here so be careful not to override!
    run_dir="./",
    # if True, do not save checkpoint, logs, or results; set to False if you
    # want to test rather than validate (results file needed)
    is_debug=True,
    print_every=5000,
    seed=0,  # random seed to use
    logger=logger,  # logger of choice (tensorboard and wandb supported)
    local_rank=0,
    amp=False,  # use PyTorch Automatic Mixed Precision (faster training and less memory usage)
)

trainer.load_checkpoint(checkpoint_path=checkpoint_path)
metrics = trainer.validate()
# Keep only the "metric" entry of every reported metric. Stored under its own
# name so the val-id results in `results_id` are not overwritten.
results_ood_cat = {key: val["metric"] for key, val in metrics.items()}
# Print the metrics of this split. The previous code printed `results`, which
# is either undefined here or left over from the OE62 section.
print(f"val-ood-cat results for configuration {name}: {results_ood_cat}")

### Validate on val-ood-both
Validation split where both the adsorbate and catalyst compositions are out-of-distribution.

In [None]:
# Build a debug-mode trainer on the val-ood-both split, restore the best
# checkpoint and report its validation metrics.
trainer = ForcesTrainer(
    task=task,
    model=model,
    dataset=dataset_ood_both,
    optimizer=optimizer,
    identifier="validate_ood_both",
    # directory to save results if is_debug=False. Prediction files are saved
    # here so be careful not to override!
    run_dir="./",
    # if True, do not save checkpoint, logs, or results; set to False if you
    # want to test rather than validate (results file needed)
    is_debug=True,
    print_every=5000,
    seed=0,  # random seed to use
    logger=logger,  # logger of choice (tensorboard and wandb supported)
    local_rank=0,
    amp=False,  # use PyTorch Automatic Mixed Precision (faster training and less memory usage)
)

trainer.load_checkpoint(checkpoint_path=checkpoint_path)
metrics = trainer.validate()
# Keep only the "metric" entry of every reported metric. Stored under its own
# name so the val-id results in `results_id` are not overwritten.
results_ood_both = {key: val["metric"] for key, val in metrics.items()}
# Print the metrics of this split. The previous code printed `results`, which
# is either undefined here or left over from the OE62 section.
print(f"val-ood-both results for configuration {name}: {results_ood_both}")