In [1]:
from dataclasses import dataclass
import numpy as np
import torch as t
import torch.nn.functional as F
import sklearn.datasets
from haikunator import Haikunator
import mlflow
import mlflow.pytorch
import os.path as path

In [2]:
# Size and noise level of the synthetic regression problem.
NUM_SAMPLES = 100_000
NUM_FEATURES = 5
NOISE = 0.5

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same dataset,
# the same initial weights and the same shuffling order.
SEED = 42
# scikit-learn generators called without random_state draw from NumPy's global RNG.
np.random.seed(SEED)
# Seeds torch's default generator (used for weight init and DataLoader shuffling).
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reproducibility on CUDA matters.
t.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = t.device("cuda" if t.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [3]:
# Point MLflow at an "mlruns" directory under the current user's home directory;
# "~" is expanded here so MLflow receives an absolute path.
mlruns_dir = path.expanduser("~/mlruns")
mlflow.set_tracking_uri(mlruns_dir)

In [4]:
def _as_float_dataset(features, targets):
    """Wrap a NumPy feature/target pair in a float32 ``TensorDataset``."""
    return t.utils.data.TensorDataset(
        t.from_numpy(features).float(),
        t.from_numpy(targets).float(),
    )


# Generate the full synthetic regression problem in one call.
all_X, all_y = sklearn.datasets.make_regression(
    n_samples=NUM_SAMPLES, n_features=NUM_FEATURES, noise=NOISE
)

# 70% train / 20% validation / remainder test, taken in generation order.
train_size = int(NUM_SAMPLES * 0.7)
val_size = int(NUM_SAMPLES * 0.2)
test_size = NUM_SAMPLES - train_size - val_size

# Cut both arrays at the same two row offsets so features and targets stay aligned.
split_points = [train_size, train_size + val_size]
train_X, val_X, test_X = np.split(all_X, split_points)
train_y, val_y, test_y = np.split(all_y, split_points)

trainset = _as_float_dataset(train_X, train_y)
valset = _as_float_dataset(val_X, val_y)
testset = _as_float_dataset(test_X, test_y)

In [5]:
class SimpleLR(t.nn.Module):
    """Small regressor: Linear(NUM_FEATURES, 3) -> ReLU -> Linear(3, 1)."""

    def __init__(self):
        super().__init__()
        self.fc1 = t.nn.Linear(NUM_FEATURES, 3)
        self.fc2 = t.nn.Linear(3, 1)

    def forward(self, batch_x):
        """Predict one scalar per input row.

        Args:
            batch_x: float tensor whose last dimension is NUM_FEATURES.

        Returns:
            The fc2 output with its trailing size-1 dimension removed, so a
            ``(batch, NUM_FEATURES)`` input yields a ``(batch,)`` output.
        """
        x = F.relu(self.fc1(batch_x))
        batch_y_hat = self.fc2(x)
        # Remove only the trailing output dimension. A bare squeeze() would also
        # drop the batch dimension of a single-sample batch, producing a 0-d
        # tensor that no longer lines up with a (1,)-shaped target.
        return batch_y_hat.squeeze(-1)

In [6]:
@dataclass
class Hyperparams:
    """Tunable settings for one training run."""

    batch_size: int = 10
    epochs: int = 10
    learning_rate: float = 0.0001

    def to_dict(self):
        """Return the three settings as a plain dict keyed by field name."""
        field_names = ("batch_size", "epochs", "learning_rate")
        return {name: getattr(self, name) for name in field_names}

In [7]:
# Name of the MLflow experiment that train() selects before starting a run.
experiment_name = "LearnMLFlow"
# Run name produced by Haikunator. It is generated once, when this cell runs,
# so every train() call made before the cell is re-run uses the same name.
run_name = Haikunator().haikunate()

def _run_epoch(model, loader, loss_fn, optim=None):
    """Make one pass over ``loader`` and return ``(mean_batch_loss, rmse)``.

    When ``optim`` is given, the model is switched to training mode and its
    weights are updated after every batch. Otherwise the model is switched to
    eval mode and the pass runs without gradient tracking.

    Returns:
        A pair of Python floats: the unweighted mean of the per-batch losses,
        and the RMSE computed once over every prediction made in the pass.
        Both are NaN when the loader yields no batches.
    """
    training = optim is not None
    if training:
        model.train()
    else:
        model.eval()

    batch_losses = []
    outputs = []
    targets = []
    with (t.enable_grad() if training else t.no_grad()):
        for batch_X, batch_y in loader:
            batch_X = batch_X.to(DEVICE)
            batch_y = batch_y.to(DEVICE)

            if training:
                optim.zero_grad()
            # Call the module itself rather than .forward() so hooks still run.
            batch_y_hat = model(batch_X)
            loss = loss_fn(batch_y_hat, batch_y)
            if training:
                loss.backward()
                optim.step()

            batch_losses.append(loss.item())
            # Collect per-batch tensors and concatenate once after the loop;
            # growing a tensor with cat() every iteration re-copies all earlier
            # predictions each time. Flattening keeps a 0-d output concatenable.
            outputs.append(batch_y_hat.detach().reshape(-1))
            targets.append(batch_y.reshape(-1))

    if not batch_losses:
        return float("nan"), float("nan")

    mean_loss = float(np.mean(batch_losses))
    rmse = t.sqrt(F.mse_loss(t.cat(outputs), t.cat(targets))).item()
    return mean_loss, rmse


def train(model, optim, loss_fn, epochs, trainloader, valloader, hparams):
    """Train ``model`` and record the run in MLflow.

    Selects the experiment named by the module-level ``experiment_name``,
    starts a run named by the module-level ``run_name``, logs
    ``hparams.to_dict()`` as run parameters, then for each epoch makes one
    training pass and one validation pass, logging and printing the loss and
    RMSE of both. The model is logged as the ``"model"`` artifact at the end.

    Args:
        model: module to train; it is moved to ``DEVICE`` first.
        optim: optimizer over ``model``'s parameters.
        loss_fn: callable ``(predictions, targets) -> scalar loss tensor``.
        epochs: number of passes over ``trainloader``.
        trainloader: iterable of ``(features, targets)`` training batches.
        valloader: iterable of ``(features, targets)`` validation batches.
        hparams: object whose ``to_dict()`` gives the parameters to log.
    """
    mlflow.set_experiment(experiment_name)
    model = model.to(DEVICE)

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(hparams.to_dict())
        for epoch in range(epochs):
            train_loss, train_rmse = _run_epoch(model, trainloader, loss_fn, optim)
            val_loss, val_rmse = _run_epoch(model, valloader, loss_fn)

            epoch_metrics = {
                "train_loss": train_loss,
                "val_loss": val_loss,
                "train_rmse": train_rmse,
                "val_rmse": val_rmse,
            }
            for metric_name, metric_value in epoch_metrics.items():
                # Values are plain Python floats, rounded to 3 decimals as before.
                mlflow.log_metric(metric_name, round(metric_value, 3), step=epoch)

            print(f"\nEpoch {epoch}:")
            print(f"Loss: train={train_loss:.3f}, validation={val_loss:.3f}")
            print(f"RMSE: train={train_rmse:.3f}, validation={val_rmse:.3f}")

        mlflow.pytorch.log_model(model, "model")

In [8]:
# Settings for this particular run.
hparams = Hyperparams(batch_size=32, epochs=5, learning_rate=0.005)

# Model, optimizer and loss function.
model = SimpleLR()
optim = t.optim.Adam(params=model.parameters(), lr=hparams.learning_rate)
loss_fn = t.nn.MSELoss(reduction="mean")

# Training batches are shuffled; validation is read in order in batches of 5000.
trainloader = t.utils.data.DataLoader(
    dataset=trainset,
    batch_size=hparams.batch_size,
    shuffle=True,
)
valloader = t.utils.data.DataLoader(dataset=valset, batch_size=5000)

In [9]:
# Run training; per-epoch loss and RMSE are printed below and logged to MLflow.
train(model, optim, loss_fn, hparams.epochs, trainloader, valloader, hparams)


Epoch 0:
Loss: train=15221.317, validation=12371.591
RMSE: train=123.382, validaiton=111.228

Epoch 1:
Loss: train=11429.276, validation=11162.140
RMSE: train=106.909, validaiton=105.651

Epoch 2:
Loss: train=10300.358, validation=10062.487
RMSE: train=101.496, validaiton=100.312

Epoch 3:
Loss: train=9270.617, validation=9056.615
RMSE: train=96.280, validaiton=95.166

Epoch 4:
Loss: train=8326.144, validation=8137.731
RMSE: train=91.255, validaiton=90.209


  "type " + obj.__name__ + ". It won't be checked "


In [10]:
# Display the parameters of the model that was just trained.
list(model.parameters())

[Parameter containing:
 tensor([[-2.6699, -4.8544, -4.8110, -3.6846, -5.7742],
         [-2.6518, -4.8366, -4.7337, -3.6286, -5.7307],
         [-2.5713, -4.6761, -4.6174, -3.5305, -5.5658]], device='cuda:0',
        requires_grad=True), Parameter containing:
 tensor([3.1071, 3.0833, 3.0081], device='cuda:0', requires_grad=True), Parameter containing:
 tensor([[-5.4482, -5.6179, -5.5708]], device='cuda:0', requires_grad=True), Parameter containing:
 tensor([51.0001], device='cuda:0', requires_grad=True)]

In [11]:
# Load the model logged by the most recent run of this notebook's experiment.
# The run is looked up through the tracking store rather than a hard-coded
# absolute artifact path and run id, which only exist on one machine and go
# stale as soon as training is re-run.
_experiment = mlflow.get_experiment_by_name(experiment_name)
if _experiment is None:
    raise RuntimeError(
        f"MLflow experiment {experiment_name!r} does not exist; run train() first"
    )
_latest_runs = mlflow.search_runs(
    experiment_ids=[_experiment.experiment_id],
    order_by=["attributes.start_time DESC"],
    max_results=1,
)
if _latest_runs.empty:
    raise RuntimeError(
        f"MLflow experiment {experiment_name!r} has no runs; run train() first"
    )
# "model" is the artifact path that train() passes to mlflow.pytorch.log_model.
saved_model = mlflow.pytorch.load_model(f"runs:/{_latest_runs.iloc[0]['run_id']}/model")

In [12]:
# Display the reloaded model's parameters, to compare with the trained model's.
list(saved_model.parameters())

[Parameter containing:
 tensor([[-2.6699, -4.8544, -4.8110, -3.6846, -5.7742],
         [-2.6518, -4.8366, -4.7337, -3.6286, -5.7307],
         [-2.5713, -4.6761, -4.6174, -3.5305, -5.5658]], device='cuda:0',
        requires_grad=True), Parameter containing:
 tensor([3.1071, 3.0833, 3.0081], device='cuda:0', requires_grad=True), Parameter containing:
 tensor([[-5.4482, -5.6179, -5.5708]], device='cuda:0', requires_grad=True), Parameter containing:
 tensor([51.0001], device='cuda:0', requires_grad=True)]

In [13]:
# Score the reloaded model on the held-out test split, read as one batch.
testloader = t.utils.data.DataLoader(testset, batch_size=len(testset))
X, y = (tensor.to(DEVICE) for tensor in next(iter(testloader)))

saved_model.eval()
with t.no_grad():
    y_hat = saved_model(X)
    test_rmse = F.mse_loss(y_hat, y).sqrt().cpu()

print(f"Test RMSE={test_rmse:.3f}")

# First five (target, prediction) pairs.
first_targets = y[:5].cpu().numpy()
first_predictions = y_hat[:5].cpu().numpy()
print(list(zip(first_targets, first_predictions)))

Test RMSE=90.546
[(62.163883, 51.000103), (-32.936485, -32.043056), (128.3833, 51.000103), (29.020142, 29.346474), (218.82535, 51.000103)]
