# `Protein Workshop` Tutorial, Part 5 - Adding a New Task
![Tasks](../docs/source/_static/box_downstream_tasks.png)

## Add a custom task to the `Protein Workshop`

In [None]:
%load_ext autoreload
%autoreload 2
#%load_ext blackcellmagic

### Create a new subclass of the `torch_geometric.transforms.BaseTransform` class

#### Reference the `SequenceNoiseTransform` below (i.e., `src/tasks/sequence_denoising.py`) to fill out a custom `src/tasks/my_new_task.py`

In [None]:
"""
class SequenceNoiseTransform(BaseTransform):
    def __init__(
        self, corruption_rate: float, corruption_strategy: Literal["mutate", "mask"]
    ):
        self.corruption_rate = corruption_rate
        self.corruption_strategy = corruption_strategy

    @property
    def required_attributes(self) -> Set[str]:
        return {"residue_type"}

    @beartype
    def __call__(self, x: Union[Data, Protein]) -> Union[Data, Protein]:
        x.residue_type_uncorrupted = copy.deepcopy(x.residue_type)
        # Get indices of residues to corrupt
        indices = torch.randint(
            0,
            x.residue_type.shape[0],
            (int(x.residue_type.shape[0] * self.corruption_rate),),
            device=x.residue_type.device,
        ).long()

        # Apply corruption
        if self.corruption_strategy == "mutate":
            # Set indices to random residue type
            x.residue_type[indices] = torch.randint(
                0,
                23,  # TODO: probably best to not hardcode this
                (indices.shape[0],),
                device=x.residue_type.device,
            )
        elif self.corruption_strategy == "mask":
            # Set indices to 23 -> "UNK"
            x.residue_type[indices] = 23  # TODO: probably best to not hardcode this
        else:
            raise NotImplementedError(
                f"Corruption strategy: {self.corruption_strategy} not supported."
            )
        # Get indices of applied corruptions
        index = torch.zeros(x.residue_type.shape[0])
        index[indices] = 1
        x.sequence_corruption_mask = index.bool()

        return x

    def __repr__(self) -> str:
        return f"{self.__class__}(corruption_strategy: {self.corruption_strategy} corruption_rate: {self.corruption_rate})"
"""

### Create a new task config file to accompany the custom `MyNewTask`

#### Reference the `sequence_denoising` config below (i.e., `configs/task/sequence_denoising.yaml`) to fill out a custom `configs/task/my_new_task.yaml`

In [None]:
"""
# @package _global_

defaults:
  - override /metrics:
      - accuracy
      - f1_score
      - perplexity
  - override /decoder:
      - residue_type
  - override /transforms:
      - remove_missing_ca
      - sequence_denoising

dataset:
  num_classes: 23

callbacks:
  early_stopping:
    monitor: val/residue_type/accuracy
    mode: "max"
  model_checkpoint:
    monitor: val/residue_type/accuracy
    mode: "max"

task:
  task: "sequence_denoising"
  classification_type: "multiclass"
  metric_average: "micro"

  losses:
    residue_type: cross_entropy
  label_smoothing: 0.0

  output:
    - residue_type
  supervise_on:
    - residue_type
"""

### Use the new task in either a pre-training or fine-tuning regime, with or without full-atom context

In [None]:
# Compose the Hydra training config for the custom task using the
# `ca_angles` feature set, and register it as the active Hydra config.

# Misc. tools
import os

# Hydra tools
import hydra

# `GlobalHydra` is defined in `hydra.core.global_hydra`; import it from there
# instead of relying on `hydra.compose` happening to import the same name.
from hydra.core.global_hydra import GlobalHydra
from hydra.core.hydra_config import HydraConfig

from proteinworkshop.constants import HYDRA_CONFIG_PATH
from proteinworkshop.utils.notebook import init_hydra_singleton

version_base = "1.2"  # Note: Need to update whenever Hydra is upgraded
init_hydra_singleton(reload=True, version_base=version_base)

# `hydra.initialize` is given a relative config path, so express the
# project's config directory relative to the current directory.
path = HYDRA_CONFIG_PATH
rel_path = os.path.relpath(path, start=".")

# Clear any Hydra state left over from a previous run of this cell, since
# Hydra can only be initialized once per global instance.
GlobalHydra.instance().clear()
hydra.initialize(config_path=rel_path, version_base=version_base)

cfg = hydra.compose(
    config_name="train",
    overrides=[
        "encoder=schnet",
        "task=my_new_task",
        "dataset=afdb_swissprot_v4",
        "features=ca_angles",
        "+aux_task=none",
    ],
    return_hydra_config=True,
)

# Note: Customize as needed e.g., when running a sweep
cfg.hydra.job.num = 0
cfg.hydra.job.id = 0
cfg.hydra.hydra_help.hydra_help = False
cfg.hydra.runtime.output_dir = "outputs"

# Make the composed config the one returned by `HydraConfig` lookups
HydraConfig.instance().set_config(cfg)

### Either pre-train or fine-tune a model using the new task and an existing dataset

In [None]:
# Validate the composed config; the launch calls are left commented out so
# that running this cell does not start a training job by itself.
from proteinworkshop.configs import config
from proteinworkshop.finetune import finetune
from proteinworkshop.train import train_model

# Rebind `cfg` to the value returned by `validate_config`
cfg = config.validate_config(cfg)

# Uncomment one of the calls below to start a run with `cfg`
# train_model(cfg)  # Pre-train a model using the selected data
# finetune(cfg)  # Fine-tune a model using the selected data

### Reconfigure the custom task to incorporate side-chain atom context

In [None]:
# Recompose the training config for the custom task, this time selecting the
# `ca_sc` feature set, then register and validate it.
# Relies on the imports made by the earlier cells of this notebook.
version_base = "1.2"  # Note: Need to update whenever Hydra is upgraded
init_hydra_singleton(version_base=version_base, reload=True)

# Config directory expressed relative to the current directory
path = HYDRA_CONFIG_PATH
rel_path = os.path.relpath(path, ".")

# Clear the previous cell's Hydra state before initializing again
GlobalHydra.instance().clear()
hydra.initialize(config_path=rel_path, version_base=version_base)

cfg = hydra.compose(
    config_name="train",
    return_hydra_config=True,
    overrides=[
        "encoder=schnet",
        "task=my_new_task",
        "dataset=afdb_swissprot_v4",
        "features=ca_sc",
        "+aux_task=none",
    ],
)

# Note: Customize as needed e.g., when running a sweep
cfg.hydra.job.num = 0
cfg.hydra.job.id = 0
cfg.hydra.hydra_help.hydra_help = False
cfg.hydra.runtime.output_dir = "outputs"

# Register the composed config with Hydra, then validate it
HydraConfig.instance().set_config(cfg)
cfg = config.validate_config(cfg)

# Uncomment one of the calls below to start a run with `cfg`
# train_model(cfg)  # Pre-train a model using the selected data
# finetune(cfg)  # Fine-tune a model using the selected data