# Development Script

### Imports

In [3]:
import pathlib
from dataclasses import dataclass
from typing import Optional

import torch

### Configuration

In [None]:
# NOTE(review): this is the only dataclass in this file declared with
# slots=True (needs Python 3.10+); the other argument classes use plain
# @dataclass. Confirm the difference is intentional.
@dataclass(slots=True)
class EnvironmentArguments(_ABCDataClass):
    """Run-level settings: debug/strict flags, seed, port, device and loggers.

    All fields have defaults, so the class can be instantiated without
    arguments.
    """

    # General behaviour switches (how they are consumed is not visible here).
    DEBUG: bool = False
    STRICT: bool = True

    SEED: int = 1234  # presumably the RNG seed - confirm against the caller
    # NOTE(review): 6006 is TensorBoard's default port - presumably related
    # to LOG_TENSORBOARD below; confirm.
    PORT: int = 6006

    # Evaluated once, when this class body executes: "cuda" if torch reports
    # CUDA as available at that moment, otherwise "cpu".
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"

    # One on/off flag per logging backend.
    LOG_TENSORBOARD: bool = True
    LOG_WANDB: bool = False
    LOG_MLFLOW: bool = False


def _default_home_dir() -> str:
    """Return the resolved directory used as the default ``HOME_DIR``.

    This is the directory containing this file. When ``__file__`` is not
    defined (interactive sessions, notebook cells) the current working
    directory is used instead of failing with ``NameError``.
    """
    try:
        anchor = pathlib.Path(__file__).parent
    except NameError:
        # __file__ only exists for code loaded from a file on disk.
        anchor = pathlib.Path.cwd()
    return str(anchor.resolve())


@dataclass
class DataArguments(_ABCDataClass):
    """Data-loading and preprocessing settings.

    The three directory defaults are computed once, when the class body
    executes, and are all rooted at the same resolved home directory.
    """

    # Loader settings - the names match torch DataLoader keyword arguments;
    # presumably forwarded there (confirm against the caller).
    NUM_WORKERS: int = 8
    PERSISTENT_WORKERS: bool = True
    REDO_DATA_PROCESSING: bool = False

    # Directory layout: <home>, <home>/data, <home>/data/saved.
    HOME_DIR: Optional[str] = _default_home_dir()
    DATA_DIR: Optional[str] = f"{HOME_DIR}/data"
    DATA_LOAD_DIR: Optional[str] = f"{HOME_DIR}/data/saved"

    # Cleaning / augmentation switches (consumed outside this block).
    REMOVE_MISSING: bool = True
    REMOVE_DUPLICATES: bool = True
    CANONICALIZATION: bool = True
    ENUMERATION: int = 10
    ENUM_OVERSAMPLE: int = 15


@dataclass
class ModelArguments(_ABCDataClass):
    """Settings shared by every model configuration.

    Subclassed in this file by ``Seq2SeqArguments``, which overrides ``NAME``
    and adds its own fields.
    """

    NAME: str = "model"

    NUM_EPOCHS: int = 100
    BATCH_SIZE: int = 128
    # Checkpoint handling: no path is configured by default (None).
    # NOTE(review): presumably MODEL_LOAD_PATH is only read when
    # LOAD_PREVIOUS is True - confirm against the loading code.
    LOAD_PREVIOUS: bool = False
    MODEL_LOAD_PATH: Optional[str] = None


@dataclass
class Seq2SeqArguments(ModelArguments):
    """Sequence-to-sequence model settings, layered on ``ModelArguments``.

    Inherits the fields of ``ModelArguments``, overrides ``NAME`` with
    ``"seq2seq"`` and adds the hyperparameters below.
    """

    NAME: str = "seq2seq"

    SHARE_WEIGHT: bool = False

    EMB_SIZE: int = 512

    # NOTE(review): these names mirror torch.nn.Transformer hyperparameters
    # (heads, dropout, feed-forward width, layer counts) - presumably passed
    # to such a model; confirm where the model is built.
    NHEAD: int = 8
    DROPOUT: float = 0.1
    FFN_HID_DIM: int = 512
    NUM_ENCODER_LAYERS: int = 3
    NUM_DECODER_LAYERS: int = 3


@dataclass
class TokenArguments(_ABCDataClass):
    """Special-token indices, the character vocabulary and the length limit."""

    # Indices reserved for the special symbols.
    PAD_IDX: int = 0  # padding
    BOS_IDX: int = 1  # beginning of sequence
    EOS_IDX: int = 2  # end of sequence
    UNK_IDX: int = 3  # unknown value
    MSK_IDX: Optional[int] = None  # mask (no index assigned by default)

    # Character vocabulary, written as adjacent literals that concatenate to
    # a single string. The first four characters sit at positions 0-3, the
    # same values as PAD/BOS/EOS/UNK above - presumably their placeholder
    # symbols (confirm with the tokenizer).
    VOCAB: str = (
        " ^$?"
        "#%()+-./"
        "0123456789"
        "=@"
        "ABCDEFGHIKLMNOPRSTVXYZ"
        "[\\]"
        "abcdefgilmnoprstuy"
    )
    MAX_SEQ_LEN: int = 110


### Data Preprocessing

In [None]:
# Data cleaning
# Data analysis

### Data Engineering

In [None]:
# Enumeration
# Outlier removal
# Imbalance fix (sequence length; classes; activity)

### Model Creation

In [None]:
# Create model
# Analyze metrics
# Visualize parts (embeddings, loss landscape, etc.)
# Hyperparameter optimization

### Model Validation

In [None]:
# Test model

### Model Deployment

In [None]:
# Predict (something)
# Interpret model