It is recommended to read the explanation in README.md first

In [None]:
import sys
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import logit
from scipy.stats import norm

import tensorflow as tf
from keras import layers, models
from keras.datasets import mnist

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from torchinfo import summary

from sklearn.model_selection import train_test_split, LeaveOneOut, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, log_loss, accuracy_score

Inspired by https://arxiv.org/abs/1702.08591 (looks-linear init), https://arxiv.org/abs/1711.04735 (dynamical isometry), and https://openreview.net/forum?id=Fp7__phQszn (inductive bias of preserving original orientation of features with almost linear input-output mapping)

In [None]:
class CustomLinearLayer(nn.Module):
    """Wrap ``nn.Linear`` with a zero bias and a selectable weight initialisation.

    Args:
        input_size: Number of input features.
        output_size: Number of output features.
        mode: Weight initialisation scheme.
            ``"init_zero"`` sets every weight to zero,
            ``"splits_inputs"`` applies :meth:`splits_inputs`,
            ``"looks_linear_init"`` applies :meth:`looks_linear_init`.
            Any other value (including ``"default"``) keeps the weights
            produced by ``nn.Linear``.

    Raises:
        ValueError: If the layer shape is incompatible with the chosen mode.
    """

    def __init__(self, input_size, output_size, mode="default"):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size, bias=True)
        nn.init.zeros_(self.linear.bias)

        if mode == "init_zero":
            nn.init.zeros_(self.linear.weight)
        elif mode == "splits_inputs":
            self.splits_inputs()
        elif mode == "looks_linear_init":
            self.looks_linear_init()
        # Any other mode deliberately keeps nn.Linear's own weight initialisation.

    def looks_linear_init(self):
        """Overwrite the weight with 2x2 blocks ``[[1, -1], [-1, 1]]`` on the diagonal.

        Example for a 4x4 layer::

            [[ 1, -1,  0,  0],
             [-1,  1,  0,  0],
             [ 0,  0,  1, -1],
             [ 0,  0, -1,  1]]

        Raises:
            ValueError: If the layer is not square or its size is odd, since
                the 2x2 blocks cannot tile such a matrix.
        """
        out_features, in_features = self.linear.weight.shape
        if out_features != in_features or out_features % 2 != 0:
            raise ValueError(
                "looks_linear_init requires a square layer with an even size, "
                f"got in_features={in_features}, out_features={out_features}"
            )

        with torch.no_grad():
            # zeros_like keeps the dtype/device of the parameter being replaced.
            weight = torch.zeros_like(self.linear.weight)

            even = torch.arange(0, out_features, step=2)
            odd = even + 1

            weight[even, even] = 1
            weight[even, odd] = -1
            weight[odd, even] = -1
            weight[odd, odd] = 1

            self.linear.weight.copy_(weight)

    def splits_inputs(self):
        """Overwrite the weight so input ``i`` feeds output ``2i`` with +1 and ``2i + 1`` with -1.

        Example for 3 inputs and 6 outputs::

            [[ 1,  0,  0],
             [-1,  0,  0],
             [ 0,  1,  0],
             [ 0, -1,  0],
             [ 0,  0,  1],
             [ 0,  0, -1]]

        Output rows beyond ``2 * in_features`` (if any) are left at zero.

        Raises:
            ValueError: If ``out_features < 2 * in_features``, because every
                input needs two output rows.
        """
        out_features, in_features = self.linear.weight.shape
        if out_features < 2 * in_features:
            raise ValueError(
                "splits_inputs requires out_features >= 2 * in_features, "
                f"got in_features={in_features}, out_features={out_features}"
            )

        with torch.no_grad():
            weight = torch.zeros_like(self.linear.weight)

            cols = torch.arange(in_features)
            weight[2 * cols, cols] = 1
            weight[2 * cols + 1, cols] = -1

            self.linear.weight.copy_(weight)

    def forward(self, x):
        """Apply the wrapped linear layer to ``x``."""
        return self.linear(x)

Similar in logic to DenseNet's idea "of connecting each layer to every other layer in a feed-forward fashion", where "for each layer, the feature-maps of all preceding layers are used as inputs, and its own feature-maps are used as inputs into all subsequent layers" (https://arxiv.org/abs/1608.06993), but applied to MLPs, i.e. fully connected neural networks. Current naive implementations of DenseNet consume memory that grows quadratically with depth, but since feature maps are reused almost everywhere, DenseNet can also be implemented in linear memory through some implementation tricks (https://arxiv.org/abs/1707.06990). This will be worked on later.

In [None]:
class TabularDenseNet(nn.Module):
    """Densely connected MLP: every layer sees all earlier activations.

    The input is first mapped to ``2 * input_size`` features. Each of the
    ``num_layers`` hidden layers is square and consumes the concatenation
    (along ``dim=1``) of every activation produced so far, so the hidden
    width doubles after each layer. The output layer reads the final
    concatenation and maps it to ``output_size`` values.

    Args:
        input_size: Number of input features.
        num_layers: Number of densely connected hidden layers.
        output_size: Number of output values.
    """

    def __init__(self, input_size, num_layers, output_size):
        super().__init__()
        self.activation = nn.Softplus()

        # Width of the running concatenation that the next layer will consume.
        width = input_size * 2
        self.res_layer = CustomLinearLayer(input_size, width, mode="splits_inputs")

        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            dense = CustomLinearLayer(width, width, mode="looks_linear_init")
            self.layers.append(dense)
            # Appending `width` new features to `width` existing ones doubles it.
            width = width * 2

        self.last_layer = CustomLinearLayer(width, output_size, mode="init_zero")

    def forward(self, x):
        """Run ``x`` through the dense stack and return the output layer's result."""
        feature_maps = [self.activation(self.res_layer(x))]

        for dense in self.layers:
            stacked = torch.cat(feature_maps, dim=1)
            feature_maps.append(self.activation(dense(stacked)))

        return self.last_layer(torch.cat(feature_maps, dim=1))

The custom loss below, with L1 and L2 regularization, is only recommended for use with the SGD optimizer. There are issues with adaptive gradient algorithms when doing naive L1/L2 regularization, as mentioned in this paper: https://arxiv.org/abs/1711.05101. I'll find some time to address this issue later. L1 regularization is particularly beneficial as it is not rotationally invariant and is robust against uninformative features, which are also the inductive biases that contribute to tree-based models' strong performance on tabular data (https://openreview.net/forum?id=Fp7__phQszn).

In [None]:
class CustomLoss(nn.Module):
    """Base criterion plus L1 and L2 penalties on a model's non-bias parameters.

    Args:
        criterion: Callable ``criterion(outputs, labels)`` returning the base loss.
        l1_lambda: Coefficient of the L1 penalty (sum of absolute values).
        l2_lambda: Coefficient of the L2 penalty (sum of squares).
    """

    def __init__(self, criterion, l1_lambda, l2_lambda):
        super().__init__()
        self.criterion = criterion
        self.l1_lambda = l1_lambda
        self.l2_lambda = l2_lambda

    def forward(self, outputs, labels, model):
        """Return ``criterion(outputs, labels)`` plus the weighted penalties.

        Args:
            outputs: Predictions passed to the criterion.
            labels: Targets passed to the criterion.
            model: Module whose parameters are regularised; parameters whose
                name contains ``'bias'`` are excluded.

        Returns:
            The regularised loss as a new tensor.
        """
        loss = self.criterion(outputs, labels)

        # Walk the parameters once and reuse the list for both penalties.
        weights = [p for name, p in model.named_parameters() if 'bias' not in name]
        l1_norm = sum(p.abs().sum() for p in weights)
        l2_norm = sum(p.pow(2.0).sum() for p in weights)

        penalty = self.l1_lambda * l1_norm + self.l2_lambda * l2_norm
        # Add out of place: an in-place `+=` would overwrite the tensor returned
        # by the criterion (which it may share or cache) and fails outright when
        # that tensor is a leaf requiring grad.
        return loss + penalty