Alternate architecture built from Gated Linear Units (GLUs). It should also be able to achieve dynamical isometry, because at initialization every GLU layer computes an identity input-output mapping.

In [None]:
class CustomGLU(nn.Module):
    """Gated linear unit that starts out as an identity mapping.

    Two parallel linear projections are applied to the input: a "main"
    projection and a "gate" projection. The output is
    ``main(x) * sigmoid(gate(x)) * 2``.

    Initialisation:
        * the main weight is a (possibly rectangular) identity matrix with a
          zero bias, so ``main(x)`` copies the leading input features;
        * the gate weight and bias are all zero, so ``sigmoid(gate(x))`` is
          0.5 and the trailing factor of 2 makes the gate exactly 1.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output features.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear_main = nn.Linear(input_dim, output_dim)
        self.linear_gate = nn.Linear(input_dim, output_dim)

        # Main branch: identity weight, no offset.
        nn.init.eye_(self.linear_main.weight)
        nn.init.zeros_(self.linear_main.bias)

        # Gate branch: all zeros, i.e. sigmoid(...) == 0.5 everywhere.
        nn.init.zeros_(self.linear_gate.weight)
        nn.init.zeros_(self.linear_gate.bias)

    def forward(self, x):
        """Return the main projection scaled by twice the sigmoid gate."""
        gate = torch.sigmoid(self.linear_gate(x))
        gated = self.linear_main(x) * gate
        # The factor 2 compensates for sigmoid(0) == 0.5 at initialisation.
        return gated * 2

In [None]:
class TabularDenseNet(nn.Module):
    """Densely connected stack of ``CustomGLU`` layers with a linear head.

    Every GLU layer receives the concatenation of the network input and the
    outputs of all previous layers, and produces an output of the same width
    as what it received. The feature width therefore doubles after each
    layer, and the final linear layer sees
    ``input_size * 2 ** num_layers`` features.

    The final linear layer is initialised with zero weight and zero bias, so
    a freshly constructed network outputs zeros.

    Args:
        input_size: Number of features in the network input.
        num_layers: Number of ``CustomGLU`` layers (0 gives a plain linear
            head on the raw input).
        output_size: Number of features produced by the final linear layer.
    """

    def __init__(self, input_size, num_layers, output_size):
        super().__init__()
        self.num_layers = num_layers
        self.layers = nn.ModuleList()

        # Each layer maps its (already concatenated) input to an output of
        # equal width; concatenating the two doubles the running width.
        width = input_size
        for _ in range(num_layers):
            self.layers.append(CustomGLU(width, width))
            width *= 2

        self.last_layer = nn.Linear(width, output_size)
        with torch.no_grad():
            nn.init.zeros_(self.last_layer.weight)
            nn.init.zeros_(self.last_layer.bias)

    def forward(self, x):
        """Run the dense GLU stack and the linear head.

        Args:
            x: Tensor whose last dimension has ``input_size`` features. Any
                number of leading dimensions (including none) is accepted.

        Returns:
            Tensor with the same leading dimensions as ``x`` and
            ``output_size`` features in the last dimension.
        """
        # Running concatenation of the input and every layer output so far.
        features = x
        for layer in self.layers:
            # Concatenate along the feature axis (the last one, which is the
            # axis nn.Linear operates on) rather than a hard-coded dim=1, so
            # unbatched and higher-rank inputs work too.
            features = torch.cat((features, layer(features)), dim=-1)

        return self.last_layer(features)