# PyTorch: Classify Multi-Label

![iris](../images/iris.png)

In [2]:
import torch
import torch.nn as nn
from torch import optim
import torchmetrics

from sklearn.preprocessing import OrdinalEncoder, PowerTransformer

import aiqc
from aiqc import datum

---

## Example Data

Reference [Example Datasets](example_datasets.ipynb) for more information.

In [3]:
# Load the bundled iris example file into a DataFrame.
df = datum.to_pandas('iris.tsv')

In [4]:
# Preview the first rows: four measurement columns plus the `species` label.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
# Check column dtypes; the feature encoder defined later filters on 'float64'.
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species           int64
dtype: object

---

## a) High-Level API

Reference [High-Level API Docs](api_high_level.ipynb) for more information including how to work with non-tabular data.

In [6]:
# Build the whole tabular pipeline (label, features, splits, encoders) in one call.
splitset = aiqc.Pipeline.Tabular.make(
    dataFrame_or_filePath=df,
    label_column='species',
    # Hold-out fractions for the test and validation splits.
    size_test=0.22,
    size_validation=0.12,
    label_encoder=OrdinalEncoder(),
    # One encoder applied to every column whose dtype is float64.
    feature_encoders=[
        {
            "sklearn_preprocess": PowerTransformer(method='box-cox', copy=False),
            "dtypes": ['float64'],
        }
    ],
    # Optional arguments left unset.
    dtype=None,
    features_excluded=None,
    fold_count=None,
    bin_count=None,
)


___/ featurecoder_index: 0 \_________

=> The column(s) below matched your filter(s) and were ran through a test-encoding successfully.
['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

=> Nice! Now all feature column(s) have encoder(s) associated with them.
No more Featurecoders can be added to this Encoderset.



Note that `num_classes` is unique to PyTorch multi-classification.

In [7]:
def fn_build(features_shape, num_classes, **hp):
    """Build the feed-forward classifier for the tabular features.

    Args:
        features_shape: Shape of a single sample's features; only the first
            entry (the number of input columns) is used.
        num_classes: Number of output units, one per class.
        **hp: Hyperparameters for the current run (not read here).

    Returns:
        An `nn.Sequential` that maps a batch of features to a softmax
        distribution over `num_classes`.
    """
    model = nn.Sequential(
        nn.Linear(features_shape[0], 12),
        # Pass only `num_features`: the second positional argument of
        # `BatchNorm1d` is `eps`, not a size, so `(12, 12)` would set eps=12.
        nn.BatchNorm1d(12),
        nn.ReLU(),
        nn.Dropout(p=0.5),

        nn.Linear(12, num_classes),
        # NOTE(review): `nn.CrossEntropyLoss` applies log-softmax internally, so
        # pairing it with this layer applies softmax twice. Kept because code
        # consuming the model's output may expect probabilities - confirm.
        nn.Softmax(dim=1)
    )
    return model

In [8]:
def fn_train(model, loser, optimizer, samples_train, samples_evaluate, **hp):
    """Train `model` on mini-batches and record per-epoch loss and accuracy.

    Args:
        model: Network to train.
        loser: Loss callable invoked as `loser(predictions, flat_long_labels)`.
        optimizer: Optimizer whose `zero_grad()`/`step()` are called per batch.
        samples_train: Dict with 'features' and 'labels' used for training and
            for the `loss`/`accuracy` history.
        samples_evaluate: Dict with 'features' and 'labels' used for the
            `val_loss`/`val_accuracy` history.
        **hp: Hyperparameters. `batch_size` is required; `epochs` is optional
            and defaults to 100.

    Returns:
        Tuple `(model, history)` where `history` maps 'loss', 'accuracy',
        'val_loss' and 'val_accuracy' to lists with one float per epoch.
    """
    ## --- Prepare mini batches for analysis ---
    batched_features, batched_labels = aiqc.torch_batcher(
        samples_train['features'], samples_train['labels'],
        batch_size=hp['batch_size'], enforce_sameSize=False, allow_1Sample=False
    )

    ## --- Metrics ---
    acc = torchmetrics.Accuracy()
    # Modeled after `keras.model.History.history` object.
    history = {
        'loss':list(), 'accuracy': list(), 
        'val_loss':list(), 'val_accuracy':list()
    }

    # Remember the mode the model arrived in so it can be restored on return.
    was_training = model.training

    ## --- Training loop ---
    epochs = hp.get('epochs', 100)
    for _ in range(epochs):
        # --- Batch training ---
        # Re-enable dropout / batch statistics after the previous epoch's evaluation.
        model.train()
        for batch_features, batch_labels in zip(batched_features, batched_labels):
            # Make raw (unlabeled) predictions.
            batch_probability = model(batch_features)
            batch_flat_labels = batch_labels.flatten().to(torch.long)
            batch_loss = loser(batch_probability, batch_flat_labels)
            # Backpropagation.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        ## --- Epoch metrics ---
        # Evaluate with dropout disabled and normalization statistics frozen,
        # and without building an autograd graph for these forward passes.
        model.eval()
        with torch.no_grad():
            # Overall performance on training data.
            train_probability = model(samples_train['features'])
            train_flat_labels = samples_train['labels'].flatten().to(torch.long)
            train_loss = loser(train_probability, train_flat_labels)
            train_acc = acc(train_probability, samples_train['labels'].to(torch.short))
            history['loss'].append(float(train_loss))
            history['accuracy'].append(float(train_acc))
            # Performance on evaluation data.
            eval_probability = model(samples_evaluate['features'])
            eval_flat_labels = samples_evaluate['labels'].flatten().to(torch.long)
            eval_loss = loser(eval_probability, eval_flat_labels)
            eval_acc = acc(eval_probability, samples_evaluate['labels'].to(torch.short))
            history['val_loss'].append(float(eval_loss))
            history['val_accuracy'].append(float(eval_acc))

    model.train(was_training)
    return model, history

Defining a loss function is optional: if `fn_lose` is left as `None`, a loss function is selected automatically based on `analysis_type`.

In [9]:
def fn_lose(**hp):
    """Return a cross-entropy loss using the `reduction` hyperparameter."""
    return nn.CrossEntropyLoss(reduction=hp['reduction'])

In [10]:
# Candidate values per hyperparameter.
hyperparameters = dict(
    reduction=['mean', 'sum'],
    batch_size=[3, 5],
)

In [11]:
# Assemble the experiment: algorithm functions + data splits + hyperparameters.
queue = aiqc.Experiment.make(
    library = "pytorch"
    , analysis_type = "classification_multi"
    , fn_build = fn_build
    , fn_train = fn_train
    , splitset_id = splitset.id
    # Pass the encoderset's id (not the object), matching `splitset_id` above.
    , encoderset_id = splitset.encodersets[0].id
    , repeat_count = 1
    , hide_test = False
    , hyperparameters = hyperparameters
    
    # NOTE(review): in the code shown here, `reduction` is only read by a custom
    # `fn_lose`; with `fn_lose = None` it appears to have no effect - confirm.
    , fn_lose = None #automated
    , fn_optimize = None #automated
    , fn_predict = None #automated
    , foldset_id = None
)

In [12]:
# Run the training jobs in the queue.
queue.run_jobs()

🔮 Training Models 🔮: 100%|██████████████████████████████████████████| 4/4 [00:14<00:00,  3.56s/it]


For more information on visualization of performance metrics, reference the [Visualization & Metrics](visualization.html) documentation.

---

## b) Low-Level API

Reference [Low-Level API Docs](api_low_level.ipynb) for more information, including how to work with non-tabular data and how to define an optimizer.

In [13]:
def fn_optimize(model, **hp):
    """Create the optimizer that updates `model`'s parameters.

    Args:
        model: The network whose parameters are optimized. PyTorch optimizers
            must be given the parameters they update at construction time.
        **hp: Hyperparameters for the current run (not read here).

    Returns:
        A `torch.optim.Adamax` instance with default settings.
    """
    # Use the PyTorch optimizer; `keras` is not imported in this notebook.
    optimizer = optim.Adamax(model.parameters())
    return optimizer

In [14]:
# Create a tabular Dataset from the in-memory DataFrame.
dataset = aiqc.Dataset.Tabular.from_pandas(df)

In [15]:
# Column used as the label; it is excluded from the features below.
label_column = 'species'

In [16]:
# Define the label from the single `label_column`.
label = dataset.make_label(columns=[label_column])

In [17]:
# Define the features as every column except the label column.
featureset = dataset.make_featureset(exclude_columns=[label_column])

In [18]:
# Split the samples, pairing the features with the label; same sizes as section a).
splitset = featureset.make_splitset(
    label_id=label.id,
    size_test=0.22,
    size_validation=0.12,
)

In [19]:
# Container that the label/feature encoders below are attached to.
encoderset = splitset.make_encoderset()

In [20]:
# Encode the label column with an ordinal encoder.
labelcoder = encoderset.make_labelcoder(sklearn_preprocess=OrdinalEncoder())

In [21]:
# Apply a power transform to every float64 feature column.
featurecoder_0 = encoderset.make_featurecoder(
    sklearn_preprocess=PowerTransformer(method='yeo-johnson', copy=False),
    dtypes=['float64'],
)


___/ featurecoder_index: 0 \_________

=> The column(s) below matched your filter(s) and were ran through a test-encoding successfully.
['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

=> Nice! Now all feature column(s) have encoder(s) associated with them.
No more Featurecoders can be added to this Encoderset.



Note that `num_classes` is unique to PyTorch multi-classification.

In [22]:
def fn_build(features_shape, num_classes, **hp):
    """Build the feed-forward classifier for the tabular features.

    Args:
        features_shape: Shape of a single sample's features; only the first
            entry (the number of input columns) is used.
        num_classes: Number of output units, one per class.
        **hp: Hyperparameters for the current run (not read here).

    Returns:
        An `nn.Sequential` that maps a batch of features to a softmax
        distribution over `num_classes`.
    """
    model = nn.Sequential(
        nn.Linear(features_shape[0], 12),
        # Pass only `num_features`: the second positional argument of
        # `BatchNorm1d` is `eps`, not a size, so `(12, 12)` would set eps=12.
        nn.BatchNorm1d(12),
        nn.ReLU(),
        nn.Dropout(p=0.5),

        nn.Linear(12, num_classes),
        # NOTE(review): `nn.CrossEntropyLoss` applies log-softmax internally, so
        # pairing it with this layer applies softmax twice. Kept because code
        # consuming the model's output may expect probabilities - confirm.
        nn.Softmax(dim=1)
    )
    return model

In [23]:
def fn_train(model, loser, optimizer, samples_train, samples_evaluate, **hp):
    """Train `model` on mini-batches and record per-epoch loss and accuracy.

    Args:
        model: Network to train.
        loser: Loss callable invoked as `loser(predictions, flat_long_labels)`.
        optimizer: Optimizer whose `zero_grad()`/`step()` are called per batch.
        samples_train: Dict with 'features' and 'labels' used for training and
            for the `loss`/`accuracy` history.
        samples_evaluate: Dict with 'features' and 'labels' used for the
            `val_loss`/`val_accuracy` history.
        **hp: Hyperparameters. `batch_size` is required; `epochs` is optional
            and defaults to 100.

    Returns:
        Tuple `(model, history)` where `history` maps 'loss', 'accuracy',
        'val_loss' and 'val_accuracy' to lists with one float per epoch.
    """
    ## --- Prepare mini batches for analysis ---
    batched_features, batched_labels = aiqc.torch_batcher(
        samples_train['features'], samples_train['labels'],
        batch_size=hp['batch_size'], enforce_sameSize=False, allow_1Sample=False
    )

    ## --- Metrics ---
    acc = torchmetrics.Accuracy()
    # Modeled after `keras.model.History.history` object.
    history = {
        'loss':list(), 'accuracy': list(), 
        'val_loss':list(), 'val_accuracy':list()
    }

    # Remember the mode the model arrived in so it can be restored on return.
    was_training = model.training

    ## --- Training loop ---
    epochs = hp.get('epochs', 100)
    for _ in range(epochs):
        # --- Batch training ---
        # Re-enable dropout / batch statistics after the previous epoch's evaluation.
        model.train()
        for batch_features, batch_labels in zip(batched_features, batched_labels):
            # Make raw (unlabeled) predictions.
            batch_probability = model(batch_features)
            batch_flat_labels = batch_labels.flatten().to(torch.long)
            batch_loss = loser(batch_probability, batch_flat_labels)
            # Backpropagation.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        ## --- Epoch metrics ---
        # Evaluate with dropout disabled and normalization statistics frozen,
        # and without building an autograd graph for these forward passes.
        model.eval()
        with torch.no_grad():
            # Overall performance on training data.
            train_probability = model(samples_train['features'])
            train_flat_labels = samples_train['labels'].flatten().to(torch.long)
            train_loss = loser(train_probability, train_flat_labels)
            train_acc = acc(train_probability, samples_train['labels'].to(torch.short))
            history['loss'].append(float(train_loss))
            history['accuracy'].append(float(train_acc))
            # Performance on evaluation data.
            eval_probability = model(samples_evaluate['features'])
            eval_flat_labels = samples_evaluate['labels'].flatten().to(torch.long)
            eval_loss = loser(eval_probability, eval_flat_labels)
            eval_acc = acc(eval_probability, samples_evaluate['labels'].to(torch.short))
            history['val_loss'].append(float(eval_loss))
            history['val_accuracy'].append(float(eval_acc))

    model.train(was_training)
    return model, history

Defining a loss function is optional: if `fn_lose` is left as `None`, a loss function is selected automatically based on `analysis_type`.

In [24]:
def fn_lose(**hp):
    """Return a cross-entropy loss using the `reduction` hyperparameter."""
    return nn.CrossEntropyLoss(reduction=hp['reduction'])

In [25]:
# Candidate values per hyperparameter.
hyperparameters = dict(
    reduction=['mean', 'sum'],
    batch_size=[3, 5],
)

In [26]:
# Register the build/train functions as a PyTorch multi-class algorithm.
algorithm = aiqc.Algorithm.make(
    library="pytorch",
    analysis_type="classification_multi",
    fn_build=fn_build,
    fn_train=fn_train,
)

In [27]:
# Attach the hyperparameter candidates to the algorithm.
hyperparamset = algorithm.make_hyperparamset(hyperparameters=hyperparameters)

In [28]:
# Combine the algorithm with the splits, hyperparameters and encoders into a queue.
queue = algorithm.make_queue(
    splitset_id=splitset.id,
    hyperparamset_id=hyperparamset.id,
    encoderset_id=encoderset.id,
    repeat_count=1,
)

In [29]:
# Run the training jobs in the queue.
queue.run_jobs()

🔮 Training Models 🔮: 100%|██████████████████████████████████████████| 4/4 [00:14<00:00,  3.73s/it]


For more information on visualization of performance metrics, reference the [Visualization & Metrics](visualization.html) documentation.