<a href="https://colab.research.google.com/github/Zhuoyue-Huang/urop_2022_ml/blob/main/Try_and_prove.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
"""try and prove different results"""
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
import statistics
import numpy as np
import pandas as pd 
import scipy.stats as stats
from copy import deepcopy

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Examine the effect of detach

In [21]:
# Linear auto-encoder model
class LAE(nn.Module):
    def __init__(self, n, p):
        super(LAE, self).__init__()
        self.n = n
        self.p = p
        self.w1 = nn.Linear(n, p, bias=False)
        self.w2 = nn.Linear(p, n, bias=False)

    def forward(self, y):
        y = self.w1(y)
        y = self.w2(y)
        return y

class FE_Net(nn.Module):
    def __init__(self, in_dim, out_dim):
        super(FE_Net, self).__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.theta = nn.Linear(in_dim, out_dim, bias=False)

    def forward(self, W):
        return self.theta(W)

In [None]:
def train_loop(data_dict, model, criterion, optimizer, type, epochs=10, sample_average=10, record=True):
    train_loss = []
    val_loss = []
    if type=='encoder':
        train_inputs = data_dict['train_inputs']
        train_targets = train_inputs
    elif type=='fe':
        train_inputs = data_dict['train_inputs']
        train_targets = data_dict['train_targets']
        sample_average = 1

    for epoch in range(epochs+1):
        loss_total = 0
        optimizer.zero_grad()
        for i in range(sample_average):
            train_outputs = model(train_inputs)
            loss = criterion(train_outputs, train_targets)
            loss_total += loss
        loss_total /= sample_average
        loss_total.backward()
        optimizer.step()
        if record:
          train_loss.append(loss_total.item())
          if epochs>=5 and epoch%(epochs//5)==0:
              v_loss = test_loop(data_dict, model, criterion, type)
              val_loss.append(v_loss)
              print('epoch: ', epoch, ', train loss: ', loss.item(), ', val loss', v_loss)
    if record:
        return {'train_loss': train_loss, 'val_loss': val_loss}
    else:
        return loss_total.item()

def test_loop(data_dict, model, criterion, type):
    if type=='encoder':
        val_inputs = data_dict['val_inputs']
        val_targets = val_inputs
    elif type=='fe':
        val_inputs = data_dict['val_inputs']
        val_targets = data_dict['val_targets']

    with torch.no_grad():
        val_outputs = model(val_inputs)
        loss = criterion(val_outputs, val_targets)
    return loss.item()

In [None]:
# feature extraction
def feature_extraction_test(data_dict, model_parameters, criterion, type, epochs=200, device='cuda'):
    train_inputs, train_targets, val_inputs, val_targets = data_dict.values()

    params = list(model_parameters)
    W1 = params[0]
    W10 = deepcopy(W1)
    W10 = W10.clone().detach()
    print(W10)
    train_inputs_fe0 = train_inputs @ W10.T
    val_inputs_fe0 = val_inputs @ W10.T

    W11 = deepcopy(W1)
    W11 = W11.clone()
    print(W11)
    train_inputs_fe1 = (train_inputs @ W11.T).detach()
    val_inputs_fe1 = (val_inputs @ W11.T).detach()

    reduction_dim = train_inputs_fe1.shape[1]
    target_dim = train_targets.shape[1]

    if type=='ls':
        param_fe0 = (torch.inverse(train_inputs_fe0.T@train_inputs_fe0) @ train_inputs_fe0.T @ train_targets).T
        loss0 = criterion(val_targets, val_inputs_fe0 @ param_fe0.T)
        print('test0', loss0.item())

        param_fe1 = (torch.inverse(train_inputs_fe1.T@train_inputs_fe1) @ train_inputs_fe1.T @ train_targets).T
        loss1 = criterion(val_targets, val_inputs_fe1 @ param_fe1.T)
        print('test1', loss1.item())
        #return loss.item()
    elif type=='gd':
        net_fe = FE_Net(reduction_dim, target_dim).to(device)
        net_fe0 = deepcopy(net_fe)
        net_fe1 = deepcopy(net_fe)

        data_dict_fe0 = {'train_inputs': train_inputs_fe0, 'train_targets': train_targets,
                        'val_inputs': val_inputs_fe0, 'val_targets': val_targets}
        param_fe0 = list(net_fe0.parameters())
        optimizer0 = optim.Adam(param_fe0, lr=0.0001)
        ### TRAINING ###
        loss_fe0 = train_loop(data_dict_fe0, net_fe0, criterion, optimizer0, epochs=epochs, record=False, type='fe')
        print('test0', loss_fe0)
        data_dict_fe1 = {'train_inputs': train_inputs_fe1, 'train_targets': train_targets,
                        'val_inputs': val_inputs_fe1, 'val_targets': val_targets}
        param_fe1 = list(net_fe1.parameters())
        optimizer1 = optim.Adam(param_fe1, lr=0.0001)
        ### TRAINING ###
        loss_fe1 = train_loop(data_dict_fe1, net_fe1, criterion, optimizer1, epochs=epochs, record=False, type='fe')
        print('test1', loss_fe1)
        #return loss_fe

In [None]:
train_num = 50
val_num = 20
H = 8
W = 8
sample_dim = torch.tensor([H, W])
feature_num = H * W
reduction_dim = feature_num
target_dim = feature_num

prob = 0.75
prob_list = torch.rand(feature_num)*0.2 + 0.65
patch_size = torch.div(sample_dim, 2, rounding_mode='floor')

train_inputs = torch.rand(train_num, feature_num) * 2
train_inputs = train_inputs.to(device)
train_targets = torch.rand(train_num, target_dim)
train_targets = train_targets.to(device)
val_inputs = torch.rand(val_num, feature_num) * 2
val_inputs = val_inputs.to(device)
val_targets = torch.rand(val_num, target_dim)
val_targets = val_targets.to(device)
data_dict = {'train_inputs': train_inputs, 'train_targets': train_targets,
             'val_inputs': val_inputs, 'val_targets': val_targets}

In [None]:
net_LAE = LAE(feature_num, reduction_dim).to(device)

params = list(net_LAE.parameters())
criterion = nn.MSELoss()
optimizer = optim.Adam(params, lr=0.001)

### TRAINING ###
train_loop(data_dict, net_LAE, criterion, optimizer, epochs=500, record=False, type='encoder')

# feature extraction
loss_ls = feature_extraction_test(data_dict, net_LAE.parameters(), criterion, type='ls', epochs=500)
loss_gd = feature_extraction_test(data_dict, net_LAE.parameters(), criterion, type='gd', epochs=500)

tensor([[-0.1336,  0.0542,  0.0287,  ..., -0.1842,  0.0315,  0.1009],
        [-0.0978, -0.0365, -0.1108,  ..., -0.0541, -0.1404,  0.0120],
        [ 0.1117,  0.0700,  0.0037,  ...,  0.0655,  0.0070, -0.1101],
        ...,
        [ 0.1055,  0.0057, -0.0793,  ..., -0.0329,  0.1920,  0.0874],
        [-0.1235, -0.0956,  0.1405,  ..., -0.0077,  0.0426, -0.1306],
        [ 0.1496, -0.1518, -0.1922,  ..., -0.0546, -0.0099, -0.0903]],
       device='cuda:0')
tensor([[-0.1336,  0.0542,  0.0287,  ..., -0.1842,  0.0315,  0.1009],
        [-0.0978, -0.0365, -0.1108,  ..., -0.0541, -0.1404,  0.0120],
        [ 0.1117,  0.0700,  0.0037,  ...,  0.0655,  0.0070, -0.1101],
        ...,
        [ 0.1055,  0.0057, -0.0793,  ..., -0.0329,  0.1920,  0.0874],
        [-0.1235, -0.0956,  0.1405,  ..., -0.0077,  0.0426, -0.1306],
        [ 0.1496, -0.1518, -0.1922,  ..., -0.0546, -0.0099, -0.0903]],
       device='cuda:0', grad_fn=<CloneBackward0>)
test0 606.8356323242188
test1 606.8356323242188
tensor([[-

### Least square vs gradient decent

##### check whether least square estimator has zero gradient

In [22]:
def train_loop(data_dict, model, criterion, optimizer, type, epochs=10, sample_average=10, record=True):
    train_loss = []
    val_loss = []
    if type=='encoder':
        train_x = data_dict['train_x']
        train_y = train_x
    elif type=='fe':
        train_x = data_dict['train_x']
        train_y = data_dict['train_y']
        sample_average = 1

    for epoch in range(epochs+1):
        loss_total = 0
        optimizer.zero_grad()
        for i in range(sample_average):
            outputs = model(train_x)
            loss = criterion(outputs, train_y)
            loss_total += loss
        loss_total /= sample_average
        loss_total.backward()
        optimizer.step()
        if record:
          train_loss.append(loss_total.item())
          if epochs>=5 and epoch%(epochs//5)==0:
              v_loss = test_loop(data_dict, model, criterion, type)
              val_loss.append(v_loss)
              print('epoch: ', epoch, ', train loss: ', loss.item(), ', val loss', v_loss)
    if record:
        return {'train_loss': train_loss, 'val_loss': val_loss}
    else:
        return loss_total.item()

def test_loop(data_dict, model, criterion, type):
    if type=='encoder':
        val_x = data_dict['val_x']
        val_y = val_x
    elif type=='fe':
        val_x = data_dict['val_x']
        val_y = data_dict['val_y']

    with torch.no_grad():
        outputs = model(val_x)
        loss = criterion(outputs, val_y)
    return loss.item()

In [41]:
class FE_Net(nn.Module):
    def __init__(self, z_dim, y_dim):
        super(FE_Net, self).__init__()
        self.theta = nn.Linear(z_dim, y_dim, bias=False)

    def forward(self, W):
        return self.theta(W)

def feature_extraction(data_dict, model_parameters, criterion, type, epochs=200, device='cuda'):
    train_x, train_y, val_x, val_y, _, _ = data_dict.values()

    params = list(model_parameters)
    W1 = params[0].clone().detach()
    train_z = train_x @ W1.T
    val_z = val_x @ W1.T
    z_dim = train_z.shape[1]
    y_dim = train_y.shape[1]

    if type=='ls':
        theta = (torch.inverse(train_z.T @ train_z) @ train_z.T @ train_y).T
        print(-train_z.T @ train_y + train_z.T @ train_z @ theta.T)
        loss = criterion(val_y, val_z @ theta.T)
        return loss.item()
    elif type=='gd':
        data_dict_fe = {'train_x': train_z, 'train_y': train_y,
                        'val_x': val_z, 'val_y': val_y}
        net_fe = FE_Net(z_dim, y_dim).to(device)
        theta = list(net_fe.parameters())
        optimizer = optim.Adam(theta, lr=0.00001)
        ### TRAINING ###
        loss_fe = train_loop(data_dict_fe, net_fe, criterion, optimizer, epochs=epochs, record=False, type='fe')
        return loss_fe

In [42]:
train_num, val_num, test_num = (60, 20, 20)
sample_num_split = (train_num, train_num+val_num)
total = train_num+val_num+test_num

# need to consider x_dim < and > z_dim
z_dim = 10 # dimension of z
H = 8
W = 8
sample_dim = torch.tensor([H, W])
x_dim = H * W
y_dim = z_dim // 2

prob = 0.75
prob_list = torch.rand(x_dim)*0.2 + 0.65
patch_size = torch.tensor([2, 2])

In [43]:
z = torch.normal(mean=0, std=1, size=(total,z_dim)) # here distribution is high dimensional guassian
z = z.to(device)

U = torch.rand(x_dim, z_dim)
U = U.to(device)
V = torch.rand(y_dim, z_dim)
V = V.to(device)

x = z @ U.T
y = z @ V.T
train_x, val_x, test_x = torch.tensor_split(x, sample_num_split)
train_y, val_y, test_y = torch.tensor_split(y, sample_num_split)
data_dict = {'train_x': train_x, 'train_y': train_y, 'val_x': val_x, 'val_y': val_y, 'test_x': test_x, 'test_y': test_y}
fe_loss_dict = {'LAE': [], 'MLAE_basic': [], 'MLAE_probs': [], 'MLAE_patches': []}

In [44]:
learning_rate = 0.01
epochs = 500
epochs_fe = 250

In [45]:
net_LAE = LAE(x_dim, z_dim).to(device)

params = list(net_LAE.parameters())
criterion = nn.MSELoss()
optimizer = optim.Adam(params, lr=learning_rate)

### TRAINING ###
train_loop(data_dict, net_LAE, criterion, optimizer, epochs=epochs, record=False, type='encoder')

# feature extraction
loss_ls = feature_extraction(data_dict, net_LAE.parameters(), criterion, type='ls')
loss_gd = feature_extraction(data_dict, net_LAE.parameters(), criterion, type='gd', epochs=50000)
print('least square loss', loss_ls)
print('gradient decent loss', loss_gd)

tensor([[-7.6294e-05, -3.0518e-05,  0.0000e+00, -4.5776e-05, -3.0518e-05],
        [-1.3733e-04, -1.2207e-04, -1.2207e-04, -1.3733e-04, -1.3733e-04],
        [-2.4414e-04, -2.4414e-04, -1.8311e-04, -2.1362e-04, -2.4414e-04],
        [ 3.0518e-05,  3.0518e-05,  3.0518e-05,  4.5776e-05,  2.2888e-05],
        [ 4.2439e-05,  4.1962e-05,  4.2915e-05,  3.9101e-05,  4.1008e-05],
        [-1.6785e-04, -1.2207e-04, -1.0681e-04, -1.5259e-04, -1.3733e-04],
        [ 7.6294e-05,  4.5776e-05,  4.1962e-05,  4.5776e-05,  6.1035e-05],
        [ 4.5776e-05,  4.5776e-05,  3.0518e-05,  4.1962e-05,  4.5776e-05],
        [-5.0545e-05, -5.7220e-05, -5.1498e-05, -4.1962e-05, -5.4359e-05],
        [ 4.5776e-05,  3.8147e-05,  3.0518e-05,  3.0518e-05,  3.8147e-05]],
       device='cuda:0')
least square loss 2.0417222600099816e-12
gradient decent loss 0.009000109508633614
