In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os
import seaborn as sns
from DataModel import DataModel, TensorDataSet, TensorTabDataSet
from sklearn import metrics
from torch.utils.data import DataLoader
from datetime import datetime
import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer, FTTransformer
# from tensorboard import SummaryWriter
from torch.utils.tensorboard import SummaryWriter

# Locate the shared dataset subset next to the notebook's working directory.
working_directory = os.getcwd()
file_path = os.path.join(working_directory, 'final_dataset.pickle')
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only open pickle files that come from a trusted source.
with open(file_path, 'rb') as f:
    data = pickle.load(f)



The dataset was obtained from the shipping and logistics company DFDS and is confidential. Therefore, we only share a subset of the data: approx. 350 of the original 43,890 rows. Thus, it is not possible to reproduce the results in the report with the following script. However, the script shows the model implementation and the training process.

The class `DataModel` handles the data processing. 

## DL models


### FT-Transformer

In [None]:
# Encoding configuration handed to DataModel here and reused by the dataset
# wrappers further down.
data_encoding_type = 'label'
cols_to_ohe = [
    'priority',
    'deck_on_vessel',
    'is_reefer',
    'is_hazardous',
    'unitype_id',
]

# DataModel handles the data processing; keep the processed frame and the
# (inputs, targets) pair it exposes.
dm = DataModel(
    encoding_type=data_encoding_type,
    cols_to_ohe=cols_to_ohe,
)
df = dm.get_df()
X, y = dm.get_inputs_targets()
def get_cols_unique_num(features=None, num_leading_skipped=2):
    """Count the distinct values of each column after the leading ones.

    Args:
        features: DataFrame-like object exposing ``.columns`` and column
            lookup by label. When omitted, the notebook-level ``X`` is used,
            which keeps the original zero-argument call working.
        num_leading_skipped: Number of leading columns to ignore. The default
            of 2 reproduces the original behaviour (presumably the two
            continuous features, matching ``num_continuous = 2`` used for the
            model -- confirm against DataModel's column order).

    Returns:
        A tuple with one integer per remaining column, in column order,
        giving the number of unique values in that column.
    """
    if features is None:
        features = X
    remaining_cols = features.columns[num_leading_skipped:]
    return tuple(len(features[col].unique()) for col in remaining_cols)

In [None]:
# Mini-batch size; the FNN loaders later in the notebook reuse this value.
batch_size = 64

# Train / validation datasets for the FT-Transformer. Each item's inputs are
# later unpacked as (categorical, numerical) in the training loop.
tr_tab_ds = TensorTabDataSet(
    data_type='train',
    normalize_num=False,
    encoding_type=data_encoding_type,
    cols_to_ohe=cols_to_ohe,
)
vl_tab_ds = TensorTabDataSet(
    data_type='valid',
    normalize_num=False,
    encoding_type=data_encoding_type,
    cols_to_ohe=cols_to_ohe,
)

# Both loaders iterate in dataset order (no shuffling is requested).
train_tab_loader = DataLoader(tr_tab_ds, batch_size=batch_size)
valid_tab_loader = DataLoader(vl_tab_ds, batch_size=batch_size)

In [None]:
# Per-column count of unique values for the categorical inputs.
unique_cat_num_tuple = get_cols_unique_num()

# FT-Transformer regressor: one scalar output per row.
model = FTTransformer(
    categories=unique_cat_num_tuple,  # unique-value count of each categorical column
    num_continuous=2,                 # number of continuous input columns
    dim=64,
    dim_out=1,                        # single regression target
    depth=6,
    heads=8,
    attn_dropout=0.2,
    ff_dropout=0.1,
)

# The training loops below do not move batches to a device, so stay on CPU
# unless those loops are adapted as well.
device = torch.device('cpu')  # use cuda or cpu
model.to(device)

In [None]:
# Regression objective and optimiser for the model built above.
learning_rate = 0.0001
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


def accuracy(target, pred):
    """Score predictions against targets with scikit-learn's R^2.

    Both tensors are detached, moved to the CPU and converted to NumPy arrays
    before scoring.

    Args:
        target: Tensor of ground-truth values.
        pred: Tensor of model outputs.

    Returns:
        The value returned by ``metrics.r2_score(target, pred)``.
    """
    y_true = target.detach().cpu().numpy()
    y_pred = pred.detach().cpu().numpy()
    return metrics.r2_score(y_true, y_pred)

In [None]:
# One entry per epoch: mean of the per-batch R^2 scores.
train_accuracies = []
valid_accuracies = []


def train_one_epoch(epoch_index, tb_writer):
    """Run one optimisation pass of the global ``model`` over ``train_tab_loader``.

    Side effects:
        * updates the model parameters through the global ``optimizer``;
        * appends the epoch's mean per-batch R^2 to ``train_accuracies``;
        * prints and logs ``'Loss/train'`` to ``tb_writer`` every 100 batches.

    Args:
        epoch_index: Zero-based epoch number, used to compute the global
            step for the logged scalar.
        tb_writer: Writer exposing ``add_scalar(tag, value, step)``.

    Returns:
        The mean loss of the most recent complete 100-batch window. When the
        epoch has fewer than 100 batches (so no window ever completes), the
        mean loss over all batches of the epoch is returned instead of a
        misleading 0. An empty loader yields 0.
    """
    running_loss = 0.
    last_loss = 0.
    window_reported = False
    # Collected over the WHOLE epoch. Creating this list inside the loop
    # would discard every batch but the last one.
    batch_accuracies = []

    for i, batch in enumerate(train_tab_loader):
        # Every batch is an (inputs, targets) pair; inputs hold the
        # categorical and numerical parts separately.
        inputs, targets = batch
        cat_inputs, num_inputs = inputs[0], inputs[1]
        # Match the (batch, 1) float output of the model.
        targets = targets.type(torch.FloatTensor).unsqueeze(dim=1)

        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()

        outputs = model(cat_inputs, num_inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        batch_accuracies.append(accuracy(targets, outputs))
        running_loss += loss.item()

        if i % 100 == 99:
            last_loss = running_loss / 100  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_tab_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
            window_reported = True

    # Short epochs never reach the 100-batch report; fall back to the mean
    # over the batches that were actually seen.
    if not window_reported and batch_accuracies:
        last_loss = running_loss / len(batch_accuracies)

    train_accuracies.append(
        np.mean(batch_accuracies) if batch_accuracies else float('nan')
    )

    return last_loss


In [None]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/TabData(dropout){}'.format(timestamp))
epoch_number = 0
# Validation outputs/targets of every batch of every epoch, in visiting order.
valid_preds = []
valid_targs = []
EPOCHS = 65

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Training mode (dropout active), then one pass over the training data.
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer)

    # Evaluation mode for reporting.
    model.eval()

    running_vloss = 0.0
    # Collected over the whole validation pass. Creating this list inside the
    # batch loop would keep only the last batch's score.
    valid_accuracies_batches = []
    # No gradients are needed for validation; this also keeps the accumulated
    # loss from holding on to autograd graphs.
    with torch.no_grad():
        for vdata in valid_tab_loader:
            vinputs, vtargets = vdata
            vcat_inputs, vnum_inputs = vinputs[0], vinputs[1]
            vtargets = vtargets.type(torch.FloatTensor)
            vtargets = vtargets.unsqueeze(dim=1)
            voutputs = model(vcat_inputs, vnum_inputs)
            valid_preds.append(voutputs.detach().cpu().numpy())
            valid_targs.append(vtargets.detach().cpu().numpy())
            vloss = criterion(voutputs, vtargets)
            running_vloss += vloss.item()  # plain float, not a tensor
            valid_accuracies_batches.append(accuracy(vtargets, voutputs))

    num_vbatches = len(valid_accuracies_batches)
    valid_accuracies.append(
        np.mean(valid_accuracies_batches) if num_vbatches else float('nan')
    )
    avg_vloss = running_vloss / max(num_vbatches, 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the per-batch average loss for both training and validation.
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state.
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

# Release the event file handle now that this run is finished.
writer.close()
print("=== Done ===")

In [None]:
# One point per epoch: mean per-batch R^2 on the training and validation sets.
epoch = np.arange(len(train_accuracies))
plt.figure(figsize=(10,10))
plt.plot(epoch, train_accuracies, 'r', epoch, valid_accuracies, 'b')
plt.yticks([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
plt.legend(['Train Accuracy','Validation Accuracy'])
# The x-axis counts epochs (one appended value per epoch), not optimiser updates.
plt.xlabel('Epoch')
plt.ylabel('Acc')
print("train acc: ",np.mean(train_accuracies))
print("validation acc: ", np.mean(valid_accuracies))

### FNN

In [None]:
# Train / validation datasets for the feed-forward network. Each item's
# inputs are fed to the model as a single tensor in the FNN training loop.
tr_ff_ds = TensorDataSet(
    data_type='train',
    normalize_num=False,
    encoding_type=data_encoding_type,
    cols_to_ohe=cols_to_ohe,
)
vl_ff_ds = TensorDataSet(
    data_type='valid',
    normalize_num=False,
    encoding_type=data_encoding_type,
    cols_to_ohe=cols_to_ohe,
)

# Same batch size as the FT-Transformer loaders; no shuffling is requested.
train_ff_loader = DataLoader(tr_ff_ds, batch_size=batch_size)
valid_ff_loader = DataLoader(vl_ff_ds, batch_size=batch_size)

In [None]:

class FFModel(nn.Module):
    """Fully connected regressor with ReLU activations and dropout.

    The five hidden layers have widths of 2x, 10x, 20x, 50x and 55x
    ``input_size``; the final layer produces one output value per row.
    Dropout (p=0.30) follows the activation of the first four hidden layers.

    Args:
        input_size: Number of input features per row.
    """

    def __init__(self, input_size):
        super(FFModel, self).__init__()
        self.input_size = input_size
        # Attribute names l1..l6 are kept so saved state dicts stay loadable.
        self.l1 = nn.Linear(self.input_size, self.input_size * 2)
        self.l2 = nn.Linear(self.input_size * 2, self.input_size * 10)
        self.l3 = nn.Linear(self.input_size * 10, self.input_size * 20)
        self.l4 = nn.Linear(self.input_size * 20, self.input_size * 50)
        self.l5 = nn.Linear(self.input_size * 50, self.input_size * 55)
        self.l6 = nn.Linear(self.input_size * 55, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.30)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, 1)`` predictions."""
        out = x
        for layer in (self.l1, self.l2, self.l3, self.l4):
            # nn.Dropout is not in-place: its return value must be used,
            # otherwise dropout is silently never applied.
            out = self.dropout(self.relu(layer(out)))
        out = self.relu(self.l5(out))
        return self.l6(out)
# Encoding configuration for the feed-forward network's data.
data_encoding_type = 'label'
cols_to_ohe = ['priority', 'deck_on_vessel', 'is_reefer', 'is_hazardous', 'unitype_id']
dm = DataModel(encoding_type=data_encoding_type, normalize_num=False, cols_to_ohe=cols_to_ohe)
df = dm.get_df()
# Number of input features = number of columns of the inputs.
input_size = dm.get_inputs_targets()[0].shape[1]
model = FFModel(input_size)
# `model` now refers to the FNN, so the optimiser must be rebuilt around its
# parameters; the previous optimiser still points at the FT-Transformer's
# weights and would leave the FNN untrained.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

#### Training loop 

Running the following cells starts the training loop. After each epoch, the model parameters are saved if the validation loss has improved. We select the model that performs best on the validation set.

In [None]:
# One entry per epoch: mean of the per-batch R^2 scores.
train_accuracies = []
valid_accuracies = []


def train_one_epochFFM(epoch_index, tb_writer):
    """Run one optimisation pass of the global ``model`` over ``train_ff_loader``.

    Side effects:
        * updates the model parameters through the global ``optimizer``;
        * appends the epoch's mean per-batch R^2 to ``train_accuracies``;
        * prints and logs ``'Loss/train'`` to ``tb_writer`` every 100 batches.

    Args:
        epoch_index: Zero-based epoch number, used to compute the global
            step for the logged scalar.
        tb_writer: Writer exposing ``add_scalar(tag, value, step)``.

    Returns:
        The mean loss of the most recent complete 100-batch window. When the
        epoch has fewer than 100 batches (so no window ever completes), the
        mean loss over all batches of the epoch is returned instead of a
        misleading 0. An empty loader yields 0.
    """
    running_loss = 0.
    last_loss = 0.
    window_reported = False
    # Collected over the WHOLE epoch. Creating this list inside the loop
    # would discard every batch but the last one.
    batch_accuracies = []

    for i, batch in enumerate(train_ff_loader):
        inputs, targets = batch
        # Match the (batch, 1) float output of the model.
        targets = targets.type(torch.FloatTensor).unsqueeze(dim=1)

        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Gather data and report.
        running_loss += loss.item()
        batch_accuracies.append(accuracy(targets, outputs))

        if i % 100 == 99:
            last_loss = running_loss / 100  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_ff_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
            window_reported = True

    # Short epochs never reach the 100-batch report; fall back to the mean
    # over the batches that were actually seen.
    if not window_reported and batch_accuracies:
        last_loss = running_loss / len(batch_accuracies)

    train_accuracies.append(
        np.mean(batch_accuracies) if batch_accuracies else float('nan')
    )

    return last_loss

In [None]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/FNN{}'.format(timestamp))
epoch_number = 0

EPOCHS = 65

best_vloss = 1_000_000.
# Start from empty histories so re-running this cell does not mix runs.
train_accuracies = []
valid_accuracies = []
for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Training mode (dropout active), then one pass over the training data.
    model.train(True)
    avg_loss = train_one_epochFFM(epoch_number, writer)

    # Evaluation mode for reporting.
    model.eval()

    running_vloss = 0.0
    # Collected over the whole validation pass. Creating this list inside the
    # batch loop would keep only the last batch's score.
    valid_accuracies_batches = []
    # No gradients are needed for validation; this also keeps the accumulated
    # loss from holding on to autograd graphs.
    with torch.no_grad():
        for vdata in valid_ff_loader:
            vinputs, vtargets = vdata
            vtargets = vtargets.type(torch.FloatTensor)
            vtargets = vtargets.unsqueeze(dim=1)
            outputs = model(vinputs)
            vloss = criterion(outputs, vtargets)
            running_vloss += vloss.item()  # plain float, not a tensor
            valid_accuracies_batches.append(accuracy(vtargets, outputs))

    num_vbatches = len(valid_accuracies_batches)
    valid_accuracies.append(
        np.mean(valid_accuracies_batches) if num_vbatches else float('nan')
    )
    avg_vloss = running_vloss / max(num_vbatches, 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the per-batch average loss for both training and validation.
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state.
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

# Release the event file handle now that this run is finished.
writer.close()
# One point per epoch: mean per-batch R^2 on the training and validation sets.
epoch = np.arange(len(train_accuracies))
plt.figure()
plt.plot(epoch, train_accuracies, 'r', epoch, valid_accuracies, 'b')
plt.legend(['Train Accuracy','Validation Accuracy'])
# The x-axis counts epochs (one appended value per epoch), not optimiser updates.
plt.xlabel('Epoch')
plt.ylabel('Acc')

### Testing the model on test dataset

For testing the model on test dataset, check `test_model.ipynb`