In [10]:
import os
from functools import partial
import numpy as np
import torch
import torch.nn as nn
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from torch.utils.data import DataLoader, Dataset
from torchmetrics import MeanAbsolutePercentageError

In [2]:
train_features = np.load("Train(Features).npy")
train_features = np.squeeze(train_features, axis=2)
train_targets = np.load("Train(Targets).npy")
train_targets = np.squeeze(train_targets, axis=2)
val_features = np.load("Val(Features).npy")
val_features = np.squeeze(val_features, axis=2)
val_targets = np.load("Val(Targets).npy")
val_targets = np.squeeze(val_targets, axis=2)

In [3]:
class dataset_for_loading(Dataset):
    def __init__(self, features, targets):
        self.tensorfeatures = torch.FloatTensor(features)
        self.tensortargets = torch.FloatTensor(targets)

    def __len__(self):
        return self.tensorfeatures.shape[0]

    def __getitem__(self, idx):
        feature = self.tensorfeatures[idx]
        target = self.tensortargets[idx]
        return feature, target

In [4]:
class NN(nn.Module):
    def __init__(self, n_layers, hidden_dim, batch_size, l1, l2):
        super().__init__()
        self.n_layers = n_layers
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.l1 = l1
        self.l2 = l2
        self.fc1 = nn.Linear(self.hidden_dim, self.l1)
        self.fc2 = nn.Linear(self.l1, self.l2)
        self.fc3 = nn.Linear(self.l2, 1)

    def forward2(self, inp):
        pred = self.fc1(inp)
        pred = self.fc2(nn.functional.softmax(pred))
        pred = self.fc3(nn.functional.softmax(pred))
        #if recc==True:
            #pred = pred.reshape((self.batch_size, 1))
        return pred

In [5]:
class MLP(NN):
    def __init__(self, n_layers, hidden_dim, batch_size, l1, l2):
        NN.__init__(self, n_layers, hidden_dim, batch_size, l1, l2)
        self.MLP = nn.Linear(5, self.hidden_dim)

    def forward1(self, x_in):
        MLPpred = self.MLP(x_in)
        return MLPpred

In [6]:
class RNN(NN):
    def __init__(self, n_layers, hidden_dim, batch_size, l1, l2):
        NN.__init__(self, n_layers, hidden_dim, batch_size, l1, l2)
        self.recurrent_layers = nn.RNN(
            input_size=5,
            hidden_size=self.hidden_dim,
            num_layers=self.n_layers,
            nonlinearity="tanh",
            batch_first=True
        )

    def forward1(self, x_in):
        RNNpred = self.recurrent_layers(x_in)
        output, hidden_cell_states = RNNpred
        return output

In [7]:
class LSTM(NN):
    def __init__(self, n_layers, hidden_dim, batch_size, l1, l2):
        NN.__init__(self, n_layers, hidden_dim, batch_size, l1, l2)
        self.recurrent_layers = nn.LSTM(
            hidden_size=self.hidden_dim,
            input_size=5,
            num_layers=self.n_layers,
            batch_first=True
        )

    def forward1(self, x_in):
        LSTMpred = self.recurrent_layers(x_in)
        output, hidden_cell_states = LSTMpred
        return output

In [8]:
class GRU(NN):
    def __init__(self, n_layers, hidden_dim, batch_size, l1, l2):
        NN.__init__(self, n_layers, hidden_dim, batch_size, l1, l2)
        self.recurrent_layers = nn.GRU(
            hidden_size=self.hidden_dim,
            num_layers=self.n_layers,
            batch_first=True,
            input_size=5
        )

    def forward1(self, x_in):
        GRUpred = self.recurrent_layers(x_in)
        output, hidden_cell_states = GRUpred
        return output

In [11]:
def train_mlp(config, checkpoint_dir=None):
    device = "cpu"

    # initialize class
    mlp = MLP(
        l1=config["l1"],
        l2=config["l2"],
        batch_size=config["batch_size"],
        n_layers=config["n_layers"],
        hidden_dim=config["hidden_dim"],
    )

    criterion = MeanAbsolutePercentageError()
    optimizer = torch.optim.SGD(mlp.parameters(), lr=config["lr"], momentum=0.9)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint")
        )
        mlp.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainloader = DataLoader(
        dataset_for_loading(train_features, train_targets),
        batch_size=config["batch_size"],
    )
    valloader = DataLoader(
        dataset_for_loading(val_features, val_targets), batch_size=config["batch_size"]
    )

    for epoch in range(100):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = mlp.forward1(inputs)
            outputs = mlp.forward2(outputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f"%(epoch + 1, i + 1, running_loss / epoch_steps))
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = mlp.forward1(inputs)
                outputs = mlp.forward2(outputs)

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((mlp.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps))
    print("Finished Training")

In [12]:
def train_recc(config, model='rnn', checkpoint_dir=None):
    device = "cpu"

    # initialize class
    if model=='rnn':
        rnn = RNN(
            l1=config["l1"],
            l2=config["l2"],
            batch_size=config["batch_size"],
            n_layers=config["n_layers"],
            hidden_dim=config["hidden_dim"],
        )
        
    elif model=='lstm':
        lstm = LSTM(
            l1=config["l1"],
            l2=config["l2"],
            batch_size=config["batch_size"],
            n_layers=config["n_layers"],
            hidden_dim=config["hidden_dim"],
        )
    
    elif model=='gru':
        gru = GRU(
            l1=config["l1"],
            l2=config["l2"],
            batch_size=config["batch_size"],
            n_layers=config["n_layers"],
            hidden_dim=config["hidden_dim"],
        )

    criterion = MeanAbsolutePercentageError()
    if model=='rnn':
        optimizer = torch.optim.SGD(rnn.parameters(), lr=config["lr"], momentum=0.9)
    elif model=='lstm':
        optimizer = torch.optim.SGD(lstm.parameters(), lr=config["lr"], momentum=0.9)
    elif model=='gru':
        optimizer = torch.optim.SGD(gru.parameters(), lr=config["lr"], momentum=0.9)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint")
        )
        if model=='rnn':
            rnn.load_state_dict(model_state)
        elif model=='lstm':
            lstm.load_state_dict(model_state)
        elif model=='gru':
            gru.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainloader = DataLoader(
        dataset_for_loading(train_features, train_targets),
        batch_size=config["batch_size"],
    )
    valloader = DataLoader(
        dataset_for_loading(val_features, val_targets), batch_size=config["batch_size"]
    )

    for epoch in range(100):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            if model=='rnn':
                outputs = rnn.forward1(inputs)
                outputs = rnn.forward2(outputs)
            elif model=='lstm':
                outputs = lstm.forward1(inputs)
                outputs = lstm.forward2(outputs)
            elif model=='gru':
                outputs = gru.forward1(inputs)
                outputs = gru.forward2(outputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f"%(epoch + 1, i + 1, running_loss / epoch_steps))
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                if model=='rnn':
                    outputs = rnn.forward1(inputs)
                    outputs = rnn.forward2(outputs)
                elif model=='lstm':
                    outputs = lstm.forward1(inputs)
                    outputs = lstm.forward2(outputs)
                elif model=='gru':
                    outputs = gru.forward1(inputs)
                    outputs = gru.forward2(outputs)

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            if model=='rnn':
                torch.save((rnn.state_dict(), optimizer.state_dict()), path)
            elif model=='lstm':
                torch.save((lstm.state_dict(), optimizer.state_dict()), path)
            elif model=='gru':
                torch.save((gru.state_dict(), optimizer.state_dict()), path)
        tune.report(loss=(val_loss / val_steps))
    print("Finished Training")

In [13]:
def main(num_samples=10, max_num_epochs=10, recurrent=False, model='rnn'):
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16]),
        "hidden_dim": tune.choice([5, 6, 7, 8, 9, 10]),
        "n_layers": tune.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=5,
        reduction_factor=2,
    )
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size". "hidden_dim"],
        metric_columns=["loss", "training_iteration"]
    )
    
    if recurrent==False:
        result = tune.run(
            partial(train_mlp),
            resources_per_trial={"cpu": int(16)},
            config=config,
            num_samples=num_samples,
            scheduler=scheduler,
            progress_reporter=reporter,
        )
    
    elif recurrent==True:
        result = tune.run(
            partial(train_recc, model=model),
            resources_per_trial={"cpu": int(16)},
            config=config,
            num_samples=num_samples,
            scheduler=scheduler,
            progress_reporter=reporter,
        )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
    
    if recurrent==False:
        best_trained_model = MLP(
            l1=best_trial.config["l1"],
            l2=best_trial.config["l2"],
            batch_size=best_trial.config["batch_size"],
            n_layers=best_trial.config["n_layers"],
            hidden_dim=best_trial.config["hidden_dim"],
        )
    
    elif recurrent==True:
        if model=='rnn':
            best_trained_model = RNN(
            l1=best_trial.config["l1"],
            l2=best_trial.config["l2"],
            batch_size=best_trial.config["batch_size"],
            n_layers=best_trial.config["n_layers"],
            hidden_dim=best_trial.config["hidden_dim"],
        )
        
        elif model=='lstm':
            best_trained_model = LSTM(
            l1=best_trial.config["l1"],
            l2=best_trial.config["l2"],
            batch_size=best_trial.config["batch_size"],
            n_layers=best_trial.config["n_layers"],
            hidden_dim=best_trial.config["hidden_dim"],
        )
            
        elif model=='gru':
            best_trained_model = GRU(
            l1=best_trial.config["l1"],
            l2=best_trial.config["l2"],
            batch_size=best_trial.config["batch_size"],
            n_layers=best_trial.config["n_layers"],
            hidden_dim=best_trial.config["hidden_dim"],
        )

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    #best_checkpoint_dir = best_trial.checkpoint.value
    #model_state, optimizer_state = torch.load(os.path.join(best_checkpoint_dir, "checkpoint"))
    #best_trained_model.load_state_dict(model_state)
    if recurrent==False:
        filename = "Bestmlp.pt"
    else:
        filename = "Best{modeltype}.pt".format(modeltype=model)
    
    torch.save(best_trained_model.state_dict(), filename)

In [18]:
if __name__ == "__main__":
    main(num_samples=200, max_num_epochs=100, recurrent=True, model='gru')

== Status ==
Current time: 2022-10-30 18:13:05 (running for 00:00:00.25)
Memory usage on this node: 12.7/31.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 80.000: None | Iter 40.000: None | Iter 20.000: None | Iter 10.000: None | Iter 5.000: None
Resources requested: 16.0/16 CPUs, 0/0 GPUs, 0.0/12.06 GiB heap, 0.0/6.03 GiB objects
Result logdir: C:\Users\theod\ray_results\train_recc_2022-10-30_18-13-05
Number of trials: 17/200 (16 PENDING, 1 RUNNING)
+------------------------+----------+-----------------+--------------+--------------+------+------+-------------+------------+
| Trial name             | status   | loc             |   batch_size |   hidden_dim |   l1 |   l2 |          lr |   n_layers |
|------------------------+----------+-----------------+--------------+--------------+------+------+-------------+------------|
| train_recc_71d29_00000 | RUNNING  | 127.0.0.1:24464 |            2 |            7 |  128 |    8 | 0.000709056 |          3 |
| train_recc_71d29_00001 | P



== Status ==
Current time: 2022-10-30 18:13:10 (running for 00:00:05.30)
Memory usage on this node: 12.9/31.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 80.000: None | Iter 40.000: None | Iter 20.000: None | Iter 10.000: -94.09835662841797 | Iter 5.000: -94.09831130981445
Resources requested: 16.0/16 CPUs, 0/0 GPUs, 0.0/12.06 GiB heap, 0.0/6.03 GiB objects
Result logdir: C:\Users\theod\ray_results\train_recc_2022-10-30_18-13-05
Number of trials: 18/200 (17 PENDING, 1 RUNNING)
+------------------------+----------+-----------------+--------------+--------------+------+------+-------------+------------+---------+----------------------+
| Trial name             | status   | loc             |   batch_size |   hidden_dim |   l1 |   l2 |          lr |   n_layers |    loss |   training_iteration |
|------------------------+----------+-----------------+--------------+--------------+------+------+-------------+------------+---------+----------------------|
| train_recc_71d29_00000 | R

2022-10-30 18:46:02,249	INFO tune.py:758 -- Total run time: 1976.83 seconds (1976.61 seconds for the tuning loop).


Best trial config: {'l1': 8, 'l2': 32, 'lr': 0.00033825890048956196, 'batch_size': 8, 'hidden_dim': 5, 'n_layers': 6}
Best trial final validation loss: 0.23235508905989782
