# PyTorch: Regression

![houses](../images/houses.png)

In [2]:
import torch
import torch.nn as nn
from torch import optim
import torchmetrics

from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder

import aiqc
from aiqc import datum

---

## Example Data

Reference [Example Datasets](example_datasets.ipynb) for more information.

In [3]:
# Load the 'houses.csv' example file through AIQC's `datum` helper.
df = datum.to_pandas('houses.csv')

In [4]:
# Preview the first rows; `price` is the column used as the label below.
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


---

## a) High-Level API

Reference [High-Level API Docs](api_high_level.ipynb) for more information including how to work with non-tabular data.

In [5]:
# High-level pipeline: one call that takes the DataFrame, the label column,
# the test/validation sizes, and the label/feature encoders.
splitset = aiqc.Pipeline.Tabular.make(
    dataFrame_or_filePath=df,
    label_column='price',
    size_test=0.18,
    size_validation=0.12,
    label_encoder=PowerTransformer(method='box-cox', copy=False),
    feature_encoders=[
        # Encoder filtered to float64 columns.
        dict(sklearn_preprocess=StandardScaler(copy=False), dtypes=['float64']),
        # Encoder filtered to int64 columns.
        dict(sklearn_preprocess=OrdinalEncoder(), dtypes=['int64']),
    ],
    # Optional arguments, spelled out explicitly as `None`.
    dtype=None,
    features_excluded=None,
    fold_count=None,
    bin_count=None,
)


___/ featurecoder_index: 0 \_________

=> The column(s) below matched your filter(s) and were ran through a test-encoding successfully.
['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'ptratio', 'lstat']

=> The remaining column(s) and dtype(s) can be used in downstream Featurecoder(s):
{'chas': 'int64', 'rad': 'int64', 'tax': 'int64'}


___/ featurecoder_index: 1 \_________

=> The column(s) below matched your filter(s) and were ran through a test-encoding successfully.
['chas', 'rad', 'tax']

=> Nice! Now all feature column(s) have encoder(s) associated with them.
No more Featurecoders can be added to this Encoderset.



In [6]:
def fn_build(features_shape, labels_shape, **hp):
    """Build a feed-forward regression network.

    The network is two hidden blocks (Linear -> BatchNorm1d -> ReLU -> Dropout)
    followed by a linear output layer.

    Args:
        features_shape: indexable; element 0 is used as the input width.
        labels_shape: indexable; element 0 is used as the output width.
        **hp: hyperparameters; must contain 'neuron_count' (hidden layer width).

    Returns:
        nn.Sequential: the untrained model.
    """
    # Just giving hyperparameter a shorter reference.
    nc = hp['neuron_count']

    model = nn.Sequential(
        nn.Linear(features_shape[0], nc),
        # Only `num_features` is passed: the 2nd positional argument of
        # BatchNorm1d is `eps`, so `BatchNorm1d(nc, nc)` set eps=nc.
        nn.BatchNorm1d(nc),
        nn.ReLU(),
        nn.Dropout(p=0.4),

        nn.Linear(nc, nc),
        nn.BatchNorm1d(nc),
        nn.ReLU(),
        nn.Dropout(p=0.4),

        nn.Linear(nc, labels_shape[0])
    )
    return model

In [7]:
def fn_train(model, loser, optimizer, samples_train, samples_evaluate, **hp):
    """Train `model` on mini-batches and record per-epoch metrics.

    Args:
        model: network to optimize; called directly on feature tensors.
        loser: loss callable, invoked as `loser(predictions, labels)` on
            flattened tensors.
        optimizer: object exposing `zero_grad()` and `step()`.
        samples_train: dict with 'features' and 'labels' used for training.
        samples_evaluate: dict with 'features' and 'labels' used for the
            per-epoch evaluation metrics.
        **hp: hyperparameters (not read by this function).

    Returns:
        tuple: `(model, history)`, where `history` maps 'loss', 'expVar',
        'val_loss' and 'val_expVar' to lists holding one float per epoch.
    """
    # Function-scope imports, in keeping with the existing torchmetrics import.
    import torch
    from torchmetrics.functional import explained_variance as expVar

    ## --- Prepare mini batches for analysis ---
    batched_features, batched_labels = aiqc.torch_batcher(
        samples_train['features'], samples_train['labels'],
        batch_size=5, enforce_sameSize=False, allow_1Sample=False
    )

    # Modeled after `keras.model.History.history` object.
    history = {
        'loss':list(), 'expVar': list(),
        'val_loss':list(), 'val_expVar':list()
    }

    # Remember the incoming mode so it is restored after every metrics pass.
    was_training = model.training

    ## --- Training loop ---
    epochs = 75
    for epoch in range(epochs):
        # --- Batch training ---
        for i, batch_features in enumerate(batched_features):
            # Make raw (unlabeled) predictions.
            batch_probability = model(batch_features)
            batch_flat_labels = batched_labels[i].flatten()
            batch_loss = loser(batch_probability.flatten(), batch_flat_labels)
            # Backpropagation.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        ## --- Epoch metrics ---
        # Metrics are measured in eval mode so Dropout is disabled and
        # BatchNorm uses (and does not update) its running statistics.
        # `no_grad` avoids recording autograd history for these forward passes.
        model.eval()
        with torch.no_grad():
            # Overall performance on training data.
            train_probability = model(samples_train['features'])
            train_flat_labels = samples_train['labels'].flatten()
            train_loss = loser(train_probability.flatten(), train_flat_labels)
            train_expVar = expVar(train_probability, samples_train['labels'])

            # Performance on evaluation data.
            eval_probability = model(samples_evaluate['features'])
            eval_flat_labels = samples_evaluate['labels'].flatten()
            eval_loss = loser(eval_probability.flatten(), eval_flat_labels)
            eval_expVar = expVar(eval_probability, samples_evaluate['labels'])
        # Put the model back in the mode it arrived in before the next epoch.
        model.train(was_training)

        history['loss'].append(float(train_loss))
        history['expVar'].append(float(train_expVar))
        history['val_loss'].append(float(eval_loss))
        history['val_expVar'].append(float(eval_expVar))
    return model, history

The loss function `fn_lose` is optional; if it is left as `None`, a loss is selected automatically based on the `analysis_type`.

In [8]:
def fn_lose(**hp):
    """Return the loss function named by the 'loss_type' hyperparameter.

    Args:
        **hp: hyperparameters; must contain 'loss_type', either 'mae' or 'mse'.

    Returns:
        nn.L1Loss for 'mae' (mean absolute error), nn.MSELoss for 'mse'
        (mean squared error).

    Raises:
        ValueError: if 'loss_type' is any other value. Previously this case
            fell through to `return loser` and raised UnboundLocalError.
    """
    loss_type = hp['loss_type']
    if loss_type == 'mae':
        return nn.L1Loss()  # mean absolute error.
    if loss_type == 'mse':
        return nn.MSELoss()  # mean squared error.
    raise ValueError(
        f"Unsupported loss_type {loss_type!r}; expected 'mae' or 'mse'."
    )

In [9]:
# Candidate values per hyperparameter: `neuron_count` is read by `fn_build`
# and `loss_type` by `fn_lose`.
hyperparameters = dict(
    neuron_count=[22, 24],
    loss_type=["mae", "mse"],
)

In [10]:
# Bundle the model functions, hyperparameters, and prepared data into a queue.
queue = aiqc.Experiment.make(
    library = "pytorch"
    , analysis_type = "regression"
    , fn_build = fn_build
    , fn_train = fn_train
    , splitset_id = splitset.id
    # Pass the encoderset's id (not the object itself), consistent with
    # `splitset_id` above and with the low-level `make_queue` call.
    , encoderset_id = splitset.encodersets[0].id
    , repeat_count = 1
    , hide_test = False
    , hyperparameters = hyperparameters

    , fn_lose = fn_lose #optional/ automated
    , fn_optimize = None #optional/ automated
    , fn_predict = None #optional/ automated
    , foldset_id = None
)

In [11]:
# Execute the queued training jobs; a progress bar is printed while they run.
queue.run_jobs()

🔮 Training Models 🔮: 100%|██████████████████████████████████████████| 4/4 [00:41<00:00, 10.47s/it]


For more information on visualization of performance metrics, reference the [Visualization & Metrics](visualization.html) documentation.

---

## b) Low-Level API

Reference [Low-Level API Docs](api_low_level.ipynb) for more information including how to work with non-tabular data, and defining an optimizer.

In [12]:
# Create an AIQC tabular Dataset from the DataFrame loaded earlier.
dataset = aiqc.Dataset.Tabular.from_pandas(df)

In [13]:
# Name of the target column; reused below for both the label and the feature exclusion.
label_column = 'price'

In [14]:
# The label is made from the single target column.
label = dataset.make_label(columns=[label_column])

In [15]:
# The featureset is made from the dataset with the target column excluded.
featureset = dataset.make_featureset(exclude_columns=[label_column])

In [16]:
# Create the splitset from the featureset, linked to the label by id, with the
# same test/validation sizes as the high-level example.
splitset = featureset.make_splitset(
    label_id=label.id,
    size_test=0.18,
    size_validation=0.12,
)

In [17]:
# Create the encoderset that the label coder and feature coder below are added to.
encoderset = splitset.make_encoderset()

In [18]:
# Label encoder: a Box-Cox PowerTransformer, as in the high-level example.
labelcoder = encoderset.make_labelcoder(
    sklearn_preprocess=PowerTransformer(method='box-cox', copy=False),
)

In [19]:
# Feature encoder filtered to float64 columns.
# NOTE(review): the high-level example also applies an OrdinalEncoder to the
# int64 columns; no such featurecoder is added here -- confirm this is intended.
featurecoder_0 = encoderset.make_featurecoder(
    sklearn_preprocess=StandardScaler(copy=False),
    dtypes=['float64'],
)


___/ featurecoder_index: 0 \_________

=> The column(s) below matched your filter(s) and were ran through a test-encoding successfully.
['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'ptratio', 'lstat']

=> The remaining column(s) and dtype(s) can be used in downstream Featurecoder(s):
{'chas': 'int64', 'rad': 'int64', 'tax': 'int64'}



In [20]:
def fn_build(features_shape, labels_shape, **hp):
    """Build a feed-forward regression network.

    The network is two hidden blocks (Linear -> BatchNorm1d -> ReLU -> Dropout)
    followed by a linear output layer.

    Args:
        features_shape: indexable; element 0 is used as the input width.
        labels_shape: indexable; element 0 is used as the output width.
        **hp: hyperparameters; must contain 'neuron_count' (hidden layer width).

    Returns:
        nn.Sequential: the untrained model.
    """
    # Just giving hyperparameter a shorter reference.
    nc = hp['neuron_count']

    model = nn.Sequential(
        nn.Linear(features_shape[0], nc),
        # Only `num_features` is passed: the 2nd positional argument of
        # BatchNorm1d is `eps`, so `BatchNorm1d(nc, nc)` set eps=nc.
        nn.BatchNorm1d(nc),
        nn.ReLU(),
        nn.Dropout(p=0.4),

        nn.Linear(nc, nc),
        nn.BatchNorm1d(nc),
        nn.ReLU(),
        nn.Dropout(p=0.4),

        nn.Linear(nc, labels_shape[0])
    )
    return model

In [21]:
def fn_train(model, loser, optimizer, samples_train, samples_evaluate, **hp):
    """Train `model` on mini-batches and record per-epoch metrics.

    Args:
        model: network to optimize; called directly on feature tensors.
        loser: loss callable, invoked as `loser(predictions, labels)` on
            flattened tensors.
        optimizer: object exposing `zero_grad()` and `step()`.
        samples_train: dict with 'features' and 'labels' used for training.
        samples_evaluate: dict with 'features' and 'labels' used for the
            per-epoch evaluation metrics.
        **hp: hyperparameters (not read by this function).

    Returns:
        tuple: `(model, history)`, where `history` maps 'loss', 'expVar',
        'val_loss' and 'val_expVar' to lists holding one float per epoch.
    """
    # Function-scope imports, in keeping with the existing torchmetrics import.
    import torch
    from torchmetrics.functional import explained_variance as expVar

    ## --- Prepare mini batches for analysis ---
    batched_features, batched_labels = aiqc.torch_batcher(
        samples_train['features'], samples_train['labels'],
        batch_size=5, enforce_sameSize=False, allow_1Sample=False
    )

    # Modeled after `keras.model.History.history` object.
    history = {
        'loss':list(), 'expVar': list(),
        'val_loss':list(), 'val_expVar':list()
    }

    # Remember the incoming mode so it is restored after every metrics pass.
    was_training = model.training

    ## --- Training loop ---
    epochs = 75
    for epoch in range(epochs):
        # --- Batch training ---
        for i, batch_features in enumerate(batched_features):
            # Make raw (unlabeled) predictions.
            batch_probability = model(batch_features)
            batch_flat_labels = batched_labels[i].flatten()
            batch_loss = loser(batch_probability.flatten(), batch_flat_labels)
            # Backpropagation.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        ## --- Epoch metrics ---
        # Metrics are measured in eval mode so Dropout is disabled and
        # BatchNorm uses (and does not update) its running statistics.
        # `no_grad` avoids recording autograd history for these forward passes.
        model.eval()
        with torch.no_grad():
            # Overall performance on training data.
            train_probability = model(samples_train['features'])
            train_flat_labels = samples_train['labels'].flatten()
            train_loss = loser(train_probability.flatten(), train_flat_labels)
            train_expVar = expVar(train_probability, samples_train['labels'])

            # Performance on evaluation data.
            eval_probability = model(samples_evaluate['features'])
            eval_flat_labels = samples_evaluate['labels'].flatten()
            eval_loss = loser(eval_probability.flatten(), eval_flat_labels)
            eval_expVar = expVar(eval_probability, samples_evaluate['labels'])
        # Put the model back in the mode it arrived in before the next epoch.
        model.train(was_training)

        history['loss'].append(float(train_loss))
        history['expVar'].append(float(train_expVar))
        history['val_loss'].append(float(eval_loss))
        history['val_expVar'].append(float(eval_expVar))
    return model, history

The loss function `fn_lose` is optional; if it is left as `None`, a loss is selected automatically based on the `analysis_type`.

In [22]:
def fn_lose(**hp):
    """Return the loss function named by the 'loss_type' hyperparameter.

    Args:
        **hp: hyperparameters; must contain 'loss_type', either 'mae' or 'mse'.

    Returns:
        nn.L1Loss for 'mae' (mean absolute error), nn.MSELoss for 'mse'
        (mean squared error).

    Raises:
        ValueError: if 'loss_type' is any other value. Previously this case
            fell through to `return loser` and raised UnboundLocalError.
    """
    loss_type = hp['loss_type']
    if loss_type == 'mae':
        return nn.L1Loss()  # mean absolute error.
    if loss_type == 'mse':
        return nn.MSELoss()  # mean squared error.
    raise ValueError(
        f"Unsupported loss_type {loss_type!r}; expected 'mae' or 'mse'."
    )

In [23]:
# Register the build, train, and loss functions as a PyTorch regression algorithm.
algorithm = aiqc.Algorithm.make(
    library="pytorch",
    analysis_type="regression",
    fn_build=fn_build,
    fn_train=fn_train,
    fn_lose=fn_lose,
)

In [24]:
# Candidate values per hyperparameter: `neuron_count` is read by `fn_build`
# and `loss_type` by `fn_lose`.
hyperparameters = dict(
    neuron_count=[22, 24],
    loss_type=["mae", "mse"],
)

In [25]:
# Attach the hyperparameter candidates to the algorithm.
hyperparamset = algorithm.make_hyperparamset(hyperparameters=hyperparameters)

In [26]:
# Build the queue from the splitset, hyperparamset, and encoderset, each referenced by id.
queue = algorithm.make_queue(
    splitset_id=splitset.id,
    hyperparamset_id=hyperparamset.id,
    encoderset_id=encoderset.id,
    repeat_count=1,
)

In [27]:
# Execute the queued training jobs; a progress bar is printed while they run.
queue.run_jobs()

🔮 Training Models 🔮: 100%|██████████████████████████████████████████| 4/4 [00:41<00:00, 10.32s/it]


For more information on visualization of performance metrics, reference the [Visualization & Metrics](visualization.html) documentation.