This notebook is used to observe the difference between the estimated gradient and the true gradient.

In [1]:
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10, MNIST
from torch.utils.data import DataLoader, TensorDataset

import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
import os

# Restrict this process to the first GPU. The variable is set before the first
# CUDA query below so that it is already in place when the CUDA runtime is probed.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Fall back to the CPU so the notebook still runs on machines without a usable GPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
def switch_to_device(dataset, device=None):
    """Materialise ``dataset`` as a single ``TensorDataset``, optionally on ``device``.

    Args:
        dataset: Iterable yielding ``(x, y)`` pairs, where every ``x`` is a
            tensor of the same shape and every ``y`` is a value accepted by
            ``torch.tensor`` when collected in a list.
        device: Device to move both tensors to; ``None`` leaves them where
            they were created.

    Returns:
        A ``TensorDataset`` holding the stacked inputs and the label tensor.
    """
    # Iterate the dataset exactly once; per-item transforms run during this pass.
    pairs = list(dataset)
    inputs = torch.stack([sample for sample, _ in pairs])
    targets = torch.tensor([label for _, label in pairs])
    if device is None:
        return TensorDataset(inputs, targets)
    return TensorDataset(inputs.to(device), targets.to(device))

In [3]:
def get_Cifar10_dl(batch_size_train=256, batch_size_eval=1024, device=DEVICE):
    """Build CIFAR-10 train / validation / test loaders with the data preloaded on ``device``.

    The 50,000 training images are split at random into 45,000 training and
    5,000 validation samples. Data is downloaded to ``./datasets`` if missing.

    Args:
        batch_size_train: Batch size of the (shuffled) training loader.
        batch_size_eval: Batch size of the validation and test loaders.
        device: Device the whole dataset is moved to up front.

    Returns:
        Tuple ``(train_dl, valid_dl, test_dl)``.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])

    full_train = switch_to_device(
        CIFAR10('./datasets', train=True, download=True, transform=to_tensor),
        device=device,
    )
    train_split, valid_split = torch.utils.data.random_split(full_train, [45000, 5000])

    test_set = switch_to_device(
        CIFAR10('./datasets', train=False, download=True, transform=to_tensor),
        device=device,
    )

    # Only the training loader shuffles; evaluation order stays fixed.
    return (
        DataLoader(train_split, batch_size=batch_size_train, shuffle=True),
        DataLoader(valid_split, batch_size=batch_size_eval, shuffle=False),
        DataLoader(test_set, batch_size=batch_size_eval, shuffle=False),
    )

In [4]:
def get_mnist_dl(batch_size_train=1024, batch_size_eval=1024, device=torch.device('cuda')):
    """Build MNIST train / validation / test loaders with the data preloaded on ``device``.

    The 60,000 training images are split at random into 55,000 training and
    5,000 validation samples. Data is downloaded to ``./datasets`` if missing.

    Args:
        batch_size_train: Batch size of the (shuffled) training loader.
        batch_size_eval: Batch size of the validation and test loaders.
        device: Device the whole dataset is moved to up front.

    Returns:
        Tuple ``(train_dl, valid_dl, test_dl)``.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])

    full_train = switch_to_device(
        MNIST('./datasets', train=True, download=True, transform=to_tensor),
        device=device,
    )
    train_split, valid_split = torch.utils.data.random_split(full_train, [55000, 5000])

    test_set = switch_to_device(
        MNIST('./datasets', train=False, download=True, transform=to_tensor),
        device=device,
    )

    # Only the training loader shuffles; evaluation order stays fixed.
    return (
        DataLoader(train_split, batch_size=batch_size_train, shuffle=True),
        DataLoader(valid_split, batch_size=batch_size_eval, shuffle=False),
        DataLoader(test_set, batch_size=batch_size_eval, shuffle=False),
    )

In [5]:
def count_parameters(model):
    """Return the total number of scalar entries in the trainable parameters of ``model``.

    Parameters with ``requires_grad`` set to ``False`` are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [6]:
@torch.no_grad()
def get_acc(model, dl):
  """Return the top-1 accuracy of ``model`` over every batch in ``dl``.

  The model is put into eval mode for the duration of the pass and is then
  returned to whatever mode it was in on entry (train or eval), even if the
  forward pass raises.

  Args:
    model: Module mapping a batch ``X`` to per-class scores; the prediction
      is the arg-max over dimension 1.
    dl: Iterable of ``(X, y)`` batches, with ``y`` holding the class indices.

  Returns:
    Fraction of correctly classified samples as a Python float.
  """
  was_training = model.training
  model.eval()
  try:
    hits = [torch.argmax(model(X), dim=1) == y for X, y in dl]
  finally:
    # Restore the caller's mode instead of unconditionally forcing train mode.
    model.train(was_training)
  hits = torch.cat(hits)
  return (torch.sum(hits) / len(hits)).item()

In [7]:
#stats = {'train-loss' : [], 'valid-acc' : [], 'diff_mean' : [], 'diff_variance' : [], 'cosine' : [], 'scale_diff' : []}
def print_stats_full(stats):
  """Plot loss, validation accuracy and four gradient-comparison series on a 3x2 grid.

  Args:
    stats: Dict of lists of ``(iteration, value)`` pairs. The keys read are
      'train-loss', 'valid-acc', 'diff_mean', 'diff_variance', 'cosine' and
      'scale_diff'. Values of the first two are plotted as they are; values
      of the other four are tensors and are moved to the CPU first.
      'same_sign_perc' is not plotted.

  Side effects:
    Saves the figure as 'testing.jpg' in the current working directory.
  """
  fig, axs= plt.subplots(3, 2, figsize=(7,9), dpi=110)

  axs[0, 0].set_title("ERM loss")
  axs[0, 1].set_title("Valid Acc")
  axs[1, 0].set_title("diff mean")
  axs[1, 1].set_title("diff variance")
  axs[2, 0].set_title("cosine")
  axs[2, 1].set_title("scale_diff")

  for i in range(3):
    for j in range(2):
      axs[i, j].set_xlabel("iterations")
      axs[i, j].grid()

  itrs = [x[0] for x in stats['train-loss']]
  loss = [x[1] for x in stats['train-loss']]
  axs[0, 0].set_ylim(0.0, max(loss))
  axs[0, 0].plot(itrs, loss)

  itrs = [x[0] for x in stats['valid-acc']]
  acc = [x[1] for x in stats['valid-acc']]
  axs[0, 1].set_ylim(0.0, 1.05)
  axs[0, 1].plot(itrs, acc)

  itrs = [x[0] for x in stats['diff_mean']]
  diff_mean = [x[1].cpu() for x in stats['diff_mean']]
  axs[1, 0].set_ylim(min(diff_mean), max(diff_mean))
  axs[1, 0].plot(itrs, diff_mean)

  # Read the variance series from its own key; this panel previously re-plotted
  # 'diff_mean' under the "diff variance" title.
  itrs = [x[0] for x in stats['diff_variance']]
  diff_variance = [x[1].cpu() for x in stats['diff_variance']]
  axs[1, 1].set_ylim(0.0, max(diff_variance))
  axs[1, 1].plot(itrs, diff_variance)

  itrs = [x[0] for x in stats['cosine']]
  cosine = [x[1].cpu() for x in stats['cosine']]
  axs[2, 0].set_ylim(min(cosine), max(cosine))
  axs[2, 0].plot(itrs, cosine)

  itrs = [x[0] for x in stats['scale_diff']]
  scale_diff = [x[1].cpu() for x in stats['scale_diff']]
  axs[2, 1].set_ylim(min(scale_diff), max(scale_diff))
  axs[2, 1].plot(itrs, scale_diff)

  plt.tight_layout()
  fig.savefig('testing.jpg', bbox_inches = 'tight')

In [8]:
def print_stats(stats):
  """Plot training loss and validation accuracy side by side and save the figure.

  Args:
    stats: Dict whose 'train-loss' and 'valid-acc' entries are lists of
      ``(iteration, value)`` pairs.

  Side effects:
    Saves the figure as 'testing.jpg' in the current working directory.
  """
  fig, (loss_ax, acc_ax) = plt.subplots(1,2,figsize=(7,3), dpi=110)

  for axis, title in ((loss_ax, "ERM loss"), (acc_ax, "Valid Acc")):
    axis.grid()
    axis.set_title(title)
    axis.set_xlabel("iterations")

  loss_steps = [entry[0] for entry in stats['train-loss']]
  loss_values = [entry[1] for entry in stats['train-loss']]
  loss_ax.plot(loss_steps, loss_values)

  acc_steps = [entry[0] for entry in stats['valid-acc']]
  acc_values = [entry[1] for entry in stats['valid-acc']]
  acc_ax.plot(acc_steps, acc_values)

  # Loss axis runs from zero to the largest observed loss; accuracy is a fraction.
  loss_ax.set_ylim(0.0, max(loss_values))
  acc_ax.set_ylim(0.0, 1.05)
  fig.savefig('testing.jpg', bbox_inches = 'tight')

In [9]:
class VGG16(nn.Module):
    """Truncated VGG-style CNN: seven 3x3 conv blocks followed by a three-layer dense head.

    Only ``layer1`` .. ``layer7`` are built; the deeper 512-channel blocks
    (``layer8`` .. ``layer13``) are left out in this notebook. Three of the conv blocks
    end in a 2x2 max-pool, and the head expects ``4*4*256`` flattened features
    (presumably 3x32x32 inputs such as CIFAR-10).

    Args:
        num_classes: Width of the final linear layer.
    """

    # (in_channels, out_channels, pool_after) for layer1 .. layer7, in order.
    _CONV_SPECS = (
        (3, 64, False),
        (64, 64, True),
        (64, 128, False),
        (128, 128, True),
        (128, 256, False),
        (256, 256, False),
        (256, 256, True),
    )

    def __init__(self, num_classes=10):
        super().__init__()
        # Blocks are created and registered in order so parameter order and
        # state_dict keys (layerN.0.weight, ...) match the hand-written layout.
        for index, (c_in, c_out, pool) in enumerate(self._CONV_SPECS, start=1):
            setattr(self, f"layer{index}", self._conv_block(c_in, c_out, pool))
        self.fc = self._dense_block(4*4*256, 1024)
        self.fc1 = self._dense_block(1024, 1024)
        self.fc2 = nn.Sequential(nn.Linear(1024, num_classes))

    @staticmethod
    def _conv_block(c_in, c_out, pool):
        """Conv3x3 -> BatchNorm -> ReLU, optionally followed by a 2x2 max-pool."""
        stages = [
            nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(c_out),
            nn.ReLU(),
        ]
        if pool:
            stages.append(nn.MaxPool2d(kernel_size=2, stride=2))
        return nn.Sequential(*stages)

    @staticmethod
    def _dense_block(n_in, n_out):
        """Linear -> BatchNorm1d -> ReLU."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.ReLU(),
        )

    def forward(self, x):
        features = x
        conv_stack = (self.layer1, self.layer2, self.layer3, self.layer4,
                      self.layer5, self.layer6, self.layer7)
        for block in conv_stack:
            features = block(features)
        # Flatten everything except the batch dimension for the dense head.
        flat = features.reshape(features.size(0), -1)
        return self.fc2(self.fc1(self.fc(flat)))

In [10]:
class LeNet(nn.Module):
  """Fully connected classifier for flattened 28x28 inputs.

  Three 1024-unit linear layers, each followed by ``nn.Hardtanh`` (the
  activation attributes are named ``Relu1`` .. ``Relu3`` but are not ReLUs),
  and a final linear layer producing ``num_classes`` raw scores.
  """

  def __init__(self, num_classes=10) -> None:
    super().__init__()
    width = 1024
    self.flatten = nn.Flatten()
    self.fc1 = nn.Linear(28*28, width)
    self.Relu1 = nn.Hardtanh()
    self.fc2 = nn.Linear(width, width)
    self.Relu2 = nn.Hardtanh()
    self.fc3 = nn.Linear(width, width)
    self.Relu3 = nn.Hardtanh()
    self.fc4 = nn.Linear(width, num_classes)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    hidden = self.flatten(x)
    stages = (
      (self.fc1, self.Relu1),
      (self.fc2, self.Relu2),
      (self.fc3, self.Relu3),
    )
    for dense, activation in stages:
      hidden = activation(dense(hidden))
    # No softmax here: the raw scores of the last layer are returned.
    return self.fc4(hidden)

In [11]:
def naive_forward(model, normalize, num_dir):
    """Sample Gaussian directions and project the stored gradients onto them.

    For each parameter (which must already have ``.grad`` populated) a
    ``(num_dir, numel)`` matrix of standard-normal directions is drawn.

    Args:
        model: Module whose parameters carry gradients.
        normalize: If true, every full direction (concatenated over all
            parameters) is scaled to unit length and the projections are
            multiplied by the total parameter count.
        num_dir: Number of random directions.

    Returns:
        ``(random_dir, da)``: a dict mapping parameter index to its direction
        matrix, and a ``(num_dir, 1)`` tensor with the dot product of each
        full direction with the full gradient.
    """
    directions = {}

    if not normalize:
        da = torch.zeros((num_dir, 1), device = DEVICE)
        for idx, param in enumerate(model.parameters()):
            grad = param.grad.view(-1)
            directions[idx] = torch.randn(num_dir, len(grad), device = DEVICE)
            da = da + (directions[idx] @ grad).view(num_dir, 1)
        return directions, da

    # First pass: draw every direction and accumulate squared norms across parameters.
    total_dim = 0
    squared_norm = 0
    for idx, param in enumerate(model.parameters()):
        grad = param.grad.view(-1)
        sample = torch.randn(num_dir, len(grad), device = DEVICE)
        directions[idx] = sample
        squared_norm = squared_norm + (sample.norm(dim = 1)**2).view(-1, 1)
        total_dim += len(grad)

    # Second pass: rescale in place to unit length, then project.
    full_norm = torch.sqrt(squared_norm)
    da = torch.zeros((num_dir, 1), device = DEVICE)
    for idx, param in enumerate(model.parameters()):
        grad = param.grad.view(-1)
        directions[idx] /= full_norm
        da = da + (directions[idx] @ grad).view(num_dir, 1)

    da *= total_dim
    return directions, da


In [12]:
def Normalize(input):
    """Standardise ``input`` along dimension 1: subtract the mean, divide by the std.

    The statistics are reshaped to a column, so this is meant for 2-D input
    where each row is standardised independently. The standard deviation is
    ``torch.std``'s default estimator.
    """
    row_mean = input.mean(dim=1).view(-1, 1)
    row_std = input.std(dim=1).view(-1, 1)
    centered = input - row_mean
    return centered / row_std

In [13]:
def cal_proj_matrix(A):
    """Return the orthogonal projector onto the column space of ``A``.

    Computed as ``A @ pinv(A)``. When ``A`` has full column rank this equals
    ``A (A^T A)^{-1} A^T``. Unlike the explicit inverse of the Gram matrix,
    the pseudo-inverse is also defined when the columns are linearly dependent
    (for instance duplicated columns, or more columns than rows), where
    ``A^T A`` is singular.

    Args:
        A: 2-D floating-point tensor of shape ``(n, k)``.

    Returns:
        ``(n, n)`` projection matrix.
    """
    return A @ torch.linalg.pinv(A)

In [14]:
def forward_using_projection(model, num_dir, input_hook):
    """Draw rank-one weight directions whose input factor lies in the span of the layer inputs.

    Only ``torch.nn.Linear`` modules are visited. For each one, a Gaussian
    vector is projected with ``cal_proj_matrix(input.T)`` and combined as an
    outer product with a second Gaussian vector to form the weight direction;
    the bias direction is plain Gaussian.

    Args:
        model: Module whose linear layers have ``weight.grad`` and ``bias.grad`` set.
        num_dir: Number of random directions.
        input_hook: Mapping from each linear module to the input tensor it
            last received.

    Returns:
        ``(random_dir, da)``: directions keyed ``0, 1, 2, ...`` in the order
        weight, bias, weight, bias, ... over the linear layers, and a
        ``(num_dir, 1)`` tensor of accumulated dot products with the gradients.
    """
    directions = {}
    da = torch.zeros((num_dir, 1), device = DEVICE)
    slot = 0

    for layer in model.modules():
        if not isinstance(layer, torch.nn.Linear):
            continue

        projector = cal_proj_matrix(input_hook[layer].T)
        out_features, in_features = layer.weight.grad.shape

        # Input-side factor: a Gaussian column per direction, projected.
        in_factor = torch.randn(num_dir, in_features, device = DEVICE).view(num_dir, in_features, 1)
        in_factor = (projector @ in_factor).view(num_dir, 1, -1)

        # Output-side factor and bias direction are unconstrained Gaussians.
        out_factor = torch.randn(num_dir, out_features, device = DEVICE).view(num_dir, out_features, 1)
        bias_dir = torch.randn(num_dir, layer.bias.grad.shape[0], device = DEVICE)

        # Batched outer product, flattened to match weight.grad.view(-1).
        weight_dir = (out_factor @ in_factor).view(num_dir, -1)

        directions[slot] = weight_dir
        da += (weight_dir @ layer.weight.grad.view(-1)).view(num_dir, 1)
        directions[slot + 1] = bias_dir
        da += (bias_dir @ layer.bias.grad.view(-1)).view(num_dir, 1)
        slot += 2

    return directions, da

In [15]:
def forward_using_input(model, num_dir, input_hook):
    """Draw rank-one weight directions of the form (Gaussian vector) x (normalised layer input).

    Only ``torch.nn.Linear`` modules are visited. Every input sample is scaled
    to unit length and paired with a Gaussian output-side vector; the bias
    direction is plain Gaussian.

    NOTE(review): the batched product pairs directions with input samples
    along the leading dimension, so it only broadcasts when ``num_dir`` equals
    the batch size of the recorded input (or one of the two is 1). Confirm
    against the intended use.

    Args:
        model: Module whose linear layers have ``weight.grad`` and ``bias.grad`` set.
        num_dir: Number of random directions.
        input_hook: Mapping from each linear module to the 2-D input tensor
            it last received.

    Returns:
        ``(random_dir, da)``: directions keyed ``0, 1, 2, ...`` in the order
        weight, bias, weight, bias, ... over the linear layers, and a
        ``(num_dir, 1)`` tensor of accumulated dot products with the gradients.
    """
    random_dir = {}
    da = torch.zeros((num_dir, 1), device = DEVICE)
    i = 0

    for module in model.modules():
        if not isinstance(module, torch.nn.Linear):
            continue

        layer_input = input_hook[module]
        batch = len(layer_input)

        # Unit-normalise each sample. The clamp keeps an all-zero sample at
        # zero instead of turning it into NaN through 0/0.
        row_norm = layer_input.norm(dim = 1).view(-1, 1)
        row_norm = row_norm.clamp(min = torch.finfo(layer_input.dtype).tiny)
        unit_input = (layer_input / row_norm).view(batch, 1, -1)

        random_w = torch.randn(num_dir, module.weight.grad.shape[0], device = DEVICE).view(num_dir, module.weight.grad.shape[0], 1)
        random_b = torch.randn(num_dir, module.bias.grad.shape[0], device = DEVICE)
        # Batched outer product, flattened to match weight.grad.view(-1).
        random_w = (random_w @ unit_input).view(num_dir, -1)

        random_dir[i] = random_w
        da += (random_dir[i] @ module.weight.grad.view(-1)).view(num_dir, 1)
        i += 1
        random_dir[i] = random_b
        da += (random_dir[i] @ module.bias.grad.view(-1)).view(num_dir, 1)
        i += 1

    return random_dir, da




In [16]:
def forward_orth(model, surrogate_dir, random_dir_orth, num_dir):
    """Project the gradients onto a surrogate direction and onto a second random direction.

    For each parameter, the surrogate direction (taken from ``surrogate_dir``
    if that dict was non-empty on entry, otherwise freshly sampled) is scaled
    to unit row norm in place. A Gaussian direction then has its component
    along the surrogate removed and is passed through ``Normalize``.

    Args:
        model: Module whose parameters carry gradients.
        surrogate_dir: Dict of per-parameter ``(num_dir, numel)`` directions;
            updated in place.
        random_dir_orth: Dict that receives the second set of directions.
        num_dir: Number of directions.

    Returns:
        ``(surrogate_dir, random_dir_orth, da, da_orth)``, where ``da`` is
        scaled by (total parameter count / number of parameter tensors).
    """
    da = torch.zeros((num_dir, 1), device=DEVICE)
    da_orth = torch.zeros((num_dir, 1), device=DEVICE)
    # Decided once up front: the dict is filled during the loop below.
    reuse_surrogate = bool(surrogate_dir)
    sizes = []

    for idx, param in enumerate(model.parameters()):
        grad = param.grad.view(-1)
        sizes.append(len(grad))

        if reuse_surrogate:
            main_dir = surrogate_dir[idx]
        else:
            main_dir = torch.randn(num_dir, len(grad), device = DEVICE)
        main_dir /= main_dir.norm(dim = 1).view(-1, 1)

        side_dir = torch.randn(num_dir, len(grad), device = DEVICE)
        # Diagonal of the Gram matrix = dot product of matching rows.
        overlap = torch.diag(main_dir @ side_dir.T).view(num_dir, 1)
        side_dir -= overlap * main_dir
        side_dir = Normalize(side_dir)

        random_dir_orth[idx] = side_dir
        surrogate_dir[idx] = main_dir
        da += (main_dir @ grad).view(num_dir, 1)
        da_orth += (side_dir @ grad).view(num_dir, 1)

    da *= sum(sizes)/len(sizes)

    return surrogate_dir, random_dir_orth, da, da_orth

In [17]:
def forward_momentum_orth(model, momentum, num_dir):
    """Sample Gaussian directions with the component along the momentum removed.

    If ``momentum`` is non-empty, each sampled direction has its projection
    onto the unit-normalised ``momentum[i]`` subtracted; otherwise the
    directions are plain Gaussian.

    Args:
        model: Module whose parameters carry gradients.
        momentum: Dict mapping parameter index to a flat momentum vector
            (may be empty).
        num_dir: Number of directions.

    Returns:
        ``(random_dir, da)``: per-parameter ``(num_dir, numel)`` directions and
        a ``(num_dir, 1)`` tensor of accumulated dot products with the gradients.
    """
    directions = {}
    da = torch.zeros((num_dir, 1), device = DEVICE)
    has_momentum = bool(momentum)

    for idx, param in enumerate(model.parameters()):
        grad = param.grad.view(-1)
        probe = torch.randn(num_dir, len(grad), device = DEVICE)

        if has_momentum:
            # Same unit momentum vector repeated for every direction.
            anchor = momentum[idx].expand(probe.shape)
            anchor = anchor / anchor.norm(dim = 1).view(-1, 1)
            # Diagonal of the Gram matrix = dot product of matching rows.
            overlap = torch.diag(anchor @ probe.T).view(num_dir, 1)
            probe -= overlap * anchor

        directions[idx] = probe
        da += (probe @ grad).view(num_dir, 1)

    return directions, da

In [18]:
def run_experiment(model, opt, schedular, criterion, train_dl, valid_dl, test_dl, max_epochs, method, normalize = False, num_dir = 0, momentum_coef = 0.9, use_sign = False):
    """Train ``model`` and, for non-backprop methods, log how the estimated gradient compares to the true one.

    Every step runs a normal backward pass. For any method other than
    "backprop", the true gradients are then replaced by an estimate built from
    random directions before the optimizer step, and comparison statistics
    between estimate and true gradient are recorded.

    Args:
        model: Network to train.
        opt: Optimizer over ``model.parameters()``.
        schedular: Accepted for interface compatibility; it is not stepped.
        criterion: Loss called as ``criterion(model(x), y)``.
        train_dl, valid_dl, test_dl: Iterables of ``(x, y)`` batches.
        max_epochs: Number of passes over ``train_dl``.
        method: One of "backprop", "forward", "forward_orth",
            "forward_momentum_orth", "forward_using_input",
            "forward_using_projection".
        normalize: Forwarded to ``naive_forward`` (method "forward" only).
        num_dir: Number of random directions per step.
        momentum_coef: Decay of the running estimate for "forward_momentum_orth".
        use_sign: If true, the optimizer receives the sign of the (thresholded)
            estimate, or the sign of the true gradient for "backprop".

    Returns:
        Dict of lists of ``(iteration, value)`` pairs. 'train-loss' is filled
        every step and 'valid-acc' every 100 steps; 'diff_mean',
        'diff_variance', 'cosine', 'scale_diff' and 'same_sign_perc' are
        filled every step for non-backprop methods only.

    Raises:
        ValueError: If ``method`` is not one of the names listed above.
    """
    known_methods = ("backprop", "forward", "forward_orth", "forward_momentum_orth",
                     "forward_using_input", "forward_using_projection")
    if method not in known_methods:
        # Fail early instead of hitting an undefined variable deep in the loop.
        raise ValueError(f"unknown method {method!r}; expected one of {known_methods}")

    itr = -1
    stats = {'train-loss' : [], 'valid-acc' : [], 'diff_mean' : [], 'diff_variance' : [], 'cosine' : [], 'scale_diff' : [], 'same_sign_perc' : []}
    # Handles of the hooks installed below, so they can be removed on exit.
    hook_handles = []

    if method != "backprop":
        random_dir = {}
        if method == "forward_orth":
            random_dir_orth = {}
            surrogate_dir = {}
        if method == "forward_momentum_orth":
            momentum = {}
        if method in ("forward_using_input", "forward_using_projection"):
            layer_inputs = {}
            def hook(mod, input):
                layer_inputs[mod] = input[0].detach()
            # Only linear layers are read back by the estimators, so only they are hooked.
            for module in model.modules():
                if isinstance(module, torch.nn.Linear):
                    hook_handles.append(module.register_forward_pre_hook(hook))

    try:
        for epoch in range(max_epochs):
            for x, y in train_dl:
                itr += 1
                opt.zero_grad()
                loss = criterion(model(x), y)
                loss.backward()

                if method != "backprop":
                    with torch.no_grad():
                        if method == "forward":
                            random_dir, da = naive_forward(model, normalize, num_dir)
                        elif method == "forward_orth":
                            surrogate_dir, random_dir_orth, da, da_orth = forward_orth(model, surrogate_dir, random_dir_orth, num_dir)
                        elif method == "forward_momentum_orth":
                            random_dir, da = forward_momentum_orth(model, momentum, num_dir)
                        elif method == "forward_using_input":
                            random_dir, da = forward_using_input(model, num_dir, layer_inputs)
                        elif method == "forward_using_projection":
                            random_dir, da = forward_using_projection(model, num_dir, layer_inputs)

                        estimation = torch.tensor([]).to(DEVICE)
                        true_gradient = torch.tensor([]).to(DEVICE)
                        for i, p in enumerate(model.parameters()):
                            if method == "forward_orth":
                                g = da*surrogate_dir[i] + da_orth*random_dir_orth[i]
                                # The per-direction estimate becomes the next surrogate.
                                surrogate_dir[i] = g
                                g = g.mean(dim = 0)
                            else:
                                g = (da * random_dir[i]).mean(dim = 0)
                                if method == "forward_momentum_orth":
                                    if itr != 0:
                                        momentum[i] = momentum[i]*momentum_coef + g
                                    else:
                                        # Clone: g is later thresholded in place and its storage
                                        # becomes p.grad, which must not alias the momentum.
                                        momentum[i] = g.clone()
                            true_gradient = torch.cat((true_gradient, p.grad.view(-1)), dim = 0)
                            if method != "forward_momentum_orth":
                                estimation = torch.cat((estimation, g), dim = 0)
                            else:
                                estimation = torch.cat((estimation, momentum[i]), dim = 0)
                            if use_sign:
                                # Zero small entries before taking the sign so they yield no update.
                                g[torch.abs(g) < torch.min(torch.abs(torch.max(g)), torch.abs(torch.min(g)))/1.5] = 0.0
                                p.grad = g.view(p.grad.shape).sign()
                            else:
                                p.grad = g.view(p.grad.shape)
                if use_sign and method == "backprop":
                    for i, p in enumerate(model.parameters()):
                        p.grad = p.grad.sign()
                opt.step()
                #schedular.step()
                stats['train-loss'].append((itr, loss.item()))

                if method != "backprop":
                    same_sign_perc = torch.sum(true_gradient.sign() == estimation.sign())/torch.numel(true_gradient)
                    scale_true_gradient = true_gradient.norm()
                    scale_estimation = estimation.norm()
                    stats['diff_mean'].append((itr, torch.mean(estimation - true_gradient)))
                    stats['diff_variance'].append((itr, torch.var(estimation - true_gradient)))
                    stats['cosine'].append((itr, torch.sum(true_gradient*estimation)/(scale_true_gradient*scale_estimation)))
                    stats['scale_diff'].append((itr, scale_estimation - scale_true_gradient))
                    stats['same_sign_perc'].append((itr, same_sign_perc))

                if itr % 100 == 0:
                    valid_acc = get_acc(model, valid_dl)
                    stats['valid-acc'].append((itr, valid_acc))
                    s = f"{epoch}:{itr} [train] loss:{loss.item():.3f}, [valid] acc:{valid_acc:.3f}"
                    print(s)

        test_acc = get_acc(model, test_dl)
        print(f"[test] acc:{test_acc:.3f}")
    finally:
        # Remove the hooks even on interruption so repeated runs do not stack them.
        for handle in hook_handles:
            handle.remove()

    return stats

        

In [19]:
# Model under test. LeNet takes flattened 28x28 inputs, so it is paired with the
# MNIST loaders below; the commented-out lines switch to VGG16 on CIFAR-10.
#model = VGG16().to(DEVICE)
model = LeNet().to(DEVICE)
print(count_parameters(model))

train_batch_size = 128
test_batch_size = 1024

# The whole dataset is moved to DEVICE once, inside the loader helper.
#train_dl, valid_dl, test_dl = get_Cifar10_dl(train_batch_size, test_batch_size, device = DEVICE)
train_dl, valid_dl, test_dl = get_mnist_dl(train_batch_size, test_batch_size, device = DEVICE)

# Handed to run_experiment, which reads it only for the "forward_momentum_orth" method;
# the active optimizer below is plain SGD without momentum.
momentum_coef = 0.9
#opt = torch.optim.SGD(model.parameters(), lr = 1e-3, momentum=momentum_coef)
opt = torch.optim.SGD(model.parameters(), lr = 1e-3)
#opt = torch.optim.Adam(model.parameters(), lr = 5e-4)
# NOTE: run_experiment receives this scheduler but its step() call is commented out there,
# so the learning rate stays at its initial value.
scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2000, gamma=0.5)
criterion =nn.CrossEntropyLoss()
max_epochs = 20

# Gradient-estimation settings passed to run_experiment.
method = "forward_using_projection"
normalize = False  # only consulted by the "forward" method
num_dir = 20  # number of random directions averaged per step
use_sign = False  # if True, the optimizer is fed the sign of the gradient (estimate)

2913290


In [20]:
# Run the configured experiment and plot the collected statistics.
stats = run_experiment(
    model, opt, scheduler, criterion, train_dl, valid_dl, test_dl, max_epochs, method,
    normalize=normalize, num_dir=num_dir, momentum_coef=momentum_coef, use_sign=use_sign,
)
# Backprop runs record no gradient-comparison series, so they get the two-panel plot.
if method == "backprop":
    print_stats(stats)
else:
    print_stats_full(stats)

tensor([[-0.0160],
        [-0.4240],
        [-0.9324],
        [ 0.0588],
        [ 0.0364],
        [-0.1514],
        [ 0.0515],
        [-0.2381],
        [ 0.0700],
        [ 0.0586],
        [ 0.1658],
        [-0.4242],
        [ 0.3769],
        [-0.3174],
        [-0.5235],
        [ 0.1870],
        [ 0.1519],
        [-0.3536],
        [ 0.2937],
        [-0.2334]], device='cuda:0')
tensor([[-0.0186],
        [-0.4349],
        [-0.9272],
        [ 0.0755],
        [ 0.0439],
        [-0.1596],
        [ 0.0527],
        [-0.2614],
        [ 0.0636],
        [ 0.0626],
        [ 0.1758],
        [-0.4163],
        [ 0.3684],
        [-0.2940],
        [-0.5080],
        [ 0.2003],
        [ 0.1503],
        [-0.3511],
        [ 0.2897],
        [-0.2290]], device='cuda:0')
tensor([[ 0.2167],
        [-0.5209],
        [-0.8428],
        [-0.0843],
        [ 0.1561],
        [ 0.1025],
        [-0.4353],
        [-0.1883],
        [-0.2283],
        [ 0.0884],
        [ 0.01

KeyboardInterrupt: 

In [None]:
# Scratch comparison of two single-direction estimators of a rank-one matrix.
# `true` is the 1024x1024 outer product u^T v that both estimators try to recover.
v = torch.randn(1, 1024, device = DEVICE)
u = torch.randn(1, 1024, device = DEVICE)
true = u.T@v

print("mean and variance of difference between true value and estimation")
# Estimator 1: an unconstrained Gaussian direction with one entry per matrix element.
# The estimate is (<true, direction>) * direction.
rand_1 = torch.randn(1024, 1024, device = DEVICE)
est_1 = torch.sum((u.T@v) * rand_1) * rand_1
print(torch.mean(est_1 - true), torch.var(est_1 - true))

# Estimator 2: the direction is itself rank one, a Gaussian column vector times the
# unit-normalised v, so every row of the direction is a multiple of v_norm.
rand_2 = torch.randn(1, 1024, device = DEVICE)
#v_norm = v.sign()
v_norm = v/v.norm()
rand_2 = rand_2.T@v_norm
#rand_2 = Normalize(rand_2)
est_2 = torch.sum((u.T@v) * rand_2) * rand_2
print(torch.mean(est_2 - true), torch.var(est_2 - true))
print()

print("The norm of the true value, and two estimations")
print(true.norm())
print(est_1.norm())
print(est_2.norm())
print()

print("The cosine similarity between the estimation and the true value")
# dim = 0 because the matrices are flattened to 1-D vectors before comparison.
cos = nn.CosineSimilarity(dim = 0)
print(cos(est_1.view(-1), true.view(-1)))
print(cos(est_2.view(-1), true.view(-1)))
print()

print("The norm of 2 random vectors are")
print(rand_1.norm())
print(rand_2.norm())
print()

print("number of elements with the same sign")
# Fraction of the 1024*1024 entries whose sign matches the true matrix.
print(torch.sum(true.sign() == est_1.sign())/(1024*1024))
print(torch.sum(true.sign() == est_2.sign())/(1024*1024))

mean and variance of difference between true value and estimation
tensor(1.5104, device='cuda:0') tensor(3797856., device='cuda:0')
tensor(-0.0031, device='cuda:0') tensor(198.9366, device='cuda:0')

The norm of the true value, and two estimations
tensor(1051.3427, device='cuda:0')
tensor(1995581.6250, device='cuda:0')
tensor(14419.1113, device='cuda:0')

The cosine similarity between the estimation and the true value
tensor(0.0018, device='cuda:0')
tensor(0.0137, device='cuda:0')

The norm of 2 random vectors are
tensor(1023.9437, device='cuda:0')
tensor(31.6129, device='cuda:0')

number of elements with the same sign
tensor(0.5005, device='cuda:0')
tensor(0.4941, device='cuda:0')
