# Effective Theory of Deep Linear Networks at Initialization

In this notebook we will test the Effective Theory from the third chapter of the book. More specifically we will look at linear neural networks and their output distribution. That means that the activation function will be linear.

It is highly recommended to read the book or atleast the third chapter.

## $p(z^{(L)} | x)$ is Gaussian
(3.48)

![a](../pictures/1.PNG)


We will test it empirically. Linear neural networks with infinite width have output distribution precisly Gaussian. Because it is impossible to have infinite width we will look at how the sample connected 4-point correlator change with scaling width with respect to ensemble of networks. Gaussian distribution has connected correlators (4 point and higher) equal to 0. For nearly Gaussian distribution the connected correlators should be small. (1.54) (4.77) We will also try the Henze-Zirkler test for multivariate normality on outputs of the ensemble of networks.


We will use critical initialization hyperparameters($C_w = 1$) and set fixed L.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import math

import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

import pingouin as pg

import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import random
random.seed(69)
random.seed(69)
torch.manual_seed(69)

In [2]:
#parameters
n = 12288
L = 3
inp_shape = 12288
out_shape = 12288

In [3]:
def try_gpu(i=0):
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')
device = try_gpu()

In [4]:
#device = torch.device('cpu')

In [5]:
device

device(type='cuda', index=0)

In [6]:
def count_para(net):
    return sum([p.numel() for p in net.parameters() if p.requires_grad])

In [7]:
def critical_initialization_linear(tensor):
    if type(tensor) == nn.Linear:
        with torch.no_grad():
            #return tensor.normal_(0, 1)
            return torch.nn.init.normal_(tensor.weight, mean=0.0, std=1.0)

In [48]:
def create_net(n,l,inp_shape=n,out_shape=n):
    blks = []
    n = n
    L = l
    for _ in range(L-2):
        blks.append( nn.Sequential(nn.Linear(n, n,bias=False)))
    inp = nn.Sequential(nn.Linear(inp_shape, n,bias=False))
    mid = nn.Sequential(*blks)
    top = nn.Sequential(nn.Linear(n, out_shape,bias=False))
    net = nn.Sequential(inp, mid, top)
    net.apply(critical_initialization_linear)
    net = net.eval()
    return net

In [9]:
net = create_net(n,L)
count_para(net)

452984832

In [10]:
x = torch.rand((1, inp_shape), dtype=torch.float)
x = (x - x.mean(dim=1))/x.std(dim=1)
x = x.to(device)

In [11]:
x.shape

torch.Size([1, 12288])

In [12]:
net

Sequential(
  (0): Sequential(
    (0): Linear(in_features=12288, out_features=12288, bias=False)
  )
  (1): Sequential(
    (0): Sequential(
      (0): Linear(in_features=12288, out_features=12288, bias=False)
    )
  )
  (2): Sequential(
    (0): Linear(in_features=12288, out_features=12288, bias=False)
  )
)

In [21]:
num_nets = 1000
z_Ls = torch.tensor([])
z_Ls = z_Ls.to(device)
for _ in range(num_nets):
    with torch.no_grad():
        net = create_net(n,L)
        net.to(device)
        z = net(x)
        z_Ls = torch.cat((z_Ls, z), dim=0)
        torch.cuda.empty_cache()

KeyboardInterrupt: 

In [9]:
PATH = "12288-3-677nets.pt"

# torch.save({
#             'outputs': z_Ls
#             }, PATH)

In [10]:
d = torch.load(PATH)

In [11]:
z_Ls = d['outputs']

In [12]:
z_Ls.shape

torch.Size([677, 12288])

In [22]:
pvals = []
for _ in range(100):
    z = z_Ls.cpu().detach()
    index = np.random.choice(z.shape[1], 666, replace=False)
    z = z[:,index]
    #z = np.concatenate([z for _ in range(2)])
    z.shape
    df = pg.multivariate_ttest(z)
    pvals.append(df['pval'].item())
    #print(df['pval'].item())
pvals = np.array(pvals)

In [23]:
pvals.mean()

0.5147019851929099

In [24]:
pg.multivariate_ttest(z)

Unnamed: 0,T2,F,df1,df2,pval
hotelling,50418.612695,1.231864,666,11,0.371646


In [25]:
k = 666
n = 667
a = np.random.multivariate_normal(np.zeros(k), np.ones((k,k)), size=(n))
pg.multivariate_ttest(a)

Unnamed: 0,T2,F,df1,df2,pval
hotelling,9.61822,2.2e-05,666,1,1.0


In [26]:
def sample_con_cor(zs, a_1, a_2, a_3, a_4):
    exp_all = (zs[:,a_1] * zs[:,a_2] * zs[:,a_3] * zs[:,a_4]).mean()
    exp_12 = (zs[:,a_1] * zs[:,a_2]).mean()
    exp_34 = (zs[:,a_3] * zs[:,a_4]).mean()
    exp_13 = (zs[:,a_1] * zs[:,a_3]).mean()
    exp_24 = (zs[:,a_2] * zs[:,a_4]).mean()
    exp_14 = (zs[:,a_1] * zs[:,a_4]).mean()
    exp_23 = (zs[:,a_2] * zs[:,a_3]).mean()
    #print(exp_all,exp_12, exp_34,exp_13,exp_24,exp_14,exp_23)
    con_cor = exp_all - exp_12*exp_34 - exp_13*exp_24 - exp_14*exp_23
    #print(exp_12*exp_34,exp_13*exp_24,exp_14*exp_23)
    return con_cor

In [32]:
count = 0
samples = 1000
con_cs = 0
Ks = 0
for _ in range(samples):
    i=random.randint(0,out_shape-1)
    idxs = random.choices(list(range(out_shape)),k=4)
    con_c = sample_con_cor(z_Ls, *idxs)
    K = (z_Ls[:,i] * z_Ls[:,i]).mean() - z_Ls[:,i].mean()*z_Ls[:,i].mean()
    count+= abs(con_c)/(K*K)
    con_cs += abs(con_c)
    Ks += K*K
count/samples

tensor(0.0299, device='cuda:0')

In [40]:
i=random.randint(0,out_shape-1)
idxs = random.choices(list(range(out_shape)),k=4)
con_c = sample_con_cor(z_Ls, *idxs)
K = (z_Ls[:,i] * z_Ls[:,i]).mean() - z_Ls[:,i].mean()*z_Ls[:,i].mean()
count= abs(con_c)/(K*K)
count

tensor(0.0074, device='cuda:0')

# Chaos

(3.50)

![a](../pictures/chaos.PNG)

In [80]:
#parameters
n = 16
L = 25#1e5 -- 25_992_704
inp_shape = 16
out_shape = 16

def create_net(n,l,inp_shape=n,out_shape=n):
    blks = []
    n = n
    L = l
    for _ in range(L-2):
        blks.append( nn.Sequential(nn.Linear(n, n,bias=False)))
    inp = nn.Sequential(nn.Linear(inp_shape, n,bias=False))
    mid = nn.Sequential(*blks)
    top = nn.Sequential(nn.Linear(n, out_shape,bias=False))
    net = nn.Sequential(inp, mid, top)
    net.apply(critical_initialization_linear)
    net = net.eval()
    return net

In [81]:
net = create_net(n,L)
count_para(net)

6400

In [82]:
x = torch.rand((1, inp_shape), dtype=torch.float)
x = (x - x.mean(dim=1))/x.std(dim=1)
x = x.to(device)

In [83]:
x.shape

torch.Size([1, 16])

In [84]:
def sample_con_cor(zs, a_1, a_2, a_3, a_4):
    exp_all = (zs[:,a_1] * zs[:,a_2] * zs[:,a_3] * zs[:,a_4]).mean()
    exp_12 = (zs[:,a_1] * zs[:,a_2]).mean()
    exp_34 = (zs[:,a_3] * zs[:,a_4]).mean()
    exp_13 = (zs[:,a_1] * zs[:,a_3]).mean()
    exp_24 = (zs[:,a_2] * zs[:,a_4]).mean()
    exp_14 = (zs[:,a_1] * zs[:,a_4]).mean()
    exp_23 = (zs[:,a_2] * zs[:,a_3]).mean()
    print(exp_all,exp_12, exp_34,exp_13,exp_24,exp_14,exp_23)
    con_cor = exp_all - exp_12*exp_34 - exp_13*exp_24 - exp_14*exp_23
    print(exp_12*exp_34,exp_13*exp_24,exp_14*exp_23)
    return con_cor

In [85]:
num_nets = 10
z_Ls = torch.tensor([])
z_Ls = z_Ls.to(device)
for _ in range(num_nets):
    with torch.no_grad():
        net = create_net(n,L)
        net.to(device)
        z = net(x)
        z_Ls = torch.cat((z_Ls, z), dim=0)
        torch.cuda.empty_cache()

In [86]:
z_Ls.mean(dim=0)

tensor([-2.3946e+14, -3.1475e+13, -4.4411e+14, -3.3243e+13,  1.8281e+14,
        -5.7144e+14,  2.4473e+14, -5.0012e+13, -1.1685e+14,  9.7677e+13,
        -5.3157e+13,  1.5606e+14, -7.2179e+12,  2.3542e+14,  1.8082e+14,
         2.5545e+13], device='cuda:0')

In [87]:
count = 0
samples = 2
con_cs = 0
Ks = 0
for _ in range(samples):
    i=random.randint(0,out_shape-1)
    idxs = random.choices(list(range(out_shape)),k=4)
    con_c = sample_con_cor(z_Ls, *idxs)
    K = (z_Ls[:,i] * z_Ls[:,i]).mean() - z_Ls[:,i].mean()*z_Ls[:,i].mean()
    count+= abs(con_c)/(K*K)
    print(abs(con_c), K*K)
    con_cs += abs(con_c)
    Ks += K*K
count/samples

tensor(nan, device='cuda:0') tensor(6.6265e+27, device='cuda:0') tensor(1.1043e+29, device='cuda:0') tensor(2.3241e+28, device='cuda:0') tensor(1.0967e+29, device='cuda:0') tensor(2.3241e+28, device='cuda:0') tensor(1.0967e+29, device='cuda:0')
tensor(inf, device='cuda:0') tensor(inf, device='cuda:0') tensor(inf, device='cuda:0')
tensor(nan, device='cuda:0') tensor(inf, device='cuda:0')
tensor(nan, device='cuda:0') tensor(-6.5300e+27, device='cuda:0') tensor(6.6265e+27, device='cuda:0') tensor(7.4421e+28, device='cuda:0') tensor(1.5528e+28, device='cuda:0') tensor(7.1754e+28, device='cuda:0') tensor(9.0916e+29, device='cuda:0')
tensor(-inf, device='cuda:0') tensor(inf, device='cuda:0') tensor(inf, device='cuda:0')
tensor(nan, device='cuda:0') tensor(inf, device='cuda:0')


tensor(nan, device='cuda:0')

In [88]:
i=random.randint(0,out_shape-1)
idxs = random.choices(list(range(out_shape)),k=4)
con_c = sample_con_cor(z_Ls, *idxs)
K = (z_Ls[:,i] * z_Ls[:,i]).mean() - z_Ls[:,i].mean()*z_Ls[:,i].mean()
count= abs(con_c)/(K*K)
count

tensor(nan, device='cuda:0') tensor(6.8775e+28, device='cuda:0') tensor(5.3220e+29, device='cuda:0') tensor(1.0967e+29, device='cuda:0') tensor(2.9983e+29, device='cuda:0') tensor(6.8775e+28, device='cuda:0') tensor(5.3220e+29, device='cuda:0')
tensor(inf, device='cuda:0') tensor(inf, device='cuda:0') tensor(inf, device='cuda:0')


tensor(nan, device='cuda:0')