In [1]:
import os
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


import matplotlib.pyplot as plt
from PIL import Image

from torchvision import datasets, transforms
from numpy.linalg import svd
from scipy.linalg import subspace_angles

from sklearn.decomposition import PCA
from numpy import linalg as LA
from scipy.linalg import sqrtm



# When a CUDA device is available, switch cuDNN to its deterministic mode.
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True

In [3]:
# ---- Experiment configuration ----

# NOTE(review): RANDOM_SEED is not applied by any cell visible here -- confirm
# that it is passed to torch / numpy somewhere else.
RANDOM_SEED = 1

# Value embedded in the checkpoint directory name
# (presumably the label-noise ratio -- TODO confirm).
C = 0.2
save_dir = f'MNIST_labelnoise{C}'

# Checkpoints '<save_dir>/<epoch + 1>.pt' are read for epoch in
# range(START_EPOCH, END_EPOCH).
START_EPOCH = 0
END_EPOCH = 40

# reduced dimensions
n_components = 1

# Number of epochs run by each fine-tuning loop.
NUM_EPOCHS_FINE_TUNE = 40

In [4]:
def get_model_param_vec(model):
    """
    Flatten every named parameter of ``model`` into one 1-D NumPy array.

    Parameters are visited in ``model.named_parameters()`` order; each one is
    detached, moved to the CPU, converted to NumPy and flattened before the
    pieces are concatenated.
    """
    flat_chunks = [
        tensor.detach().cpu().numpy().reshape(-1)
        for _, tensor in model.named_parameters()
    ]
    return np.concatenate(flat_chunks, 0)

def get_model_grad_vec(model):
    """Return the model's gradients as one flattened 1-D tensor.

    Parameters are visited in ``model.named_parameters()`` order. Parameters
    whose ``.grad`` is ``None`` (frozen, or not reached by the last backward
    pass) are skipped instead of raising ``AttributeError``.

    Raises:
        RuntimeError: from ``torch.cat`` if no parameter has a gradient.
    """
    grads = [
        param.grad.detach().reshape(-1)
        for _, param in model.named_parameters()
        if param.grad is not None
    ]
    return torch.cat(grads, 0)

def update_grad(model, grad_vec):
    """Overwrite the model's gradients with slices of a flat gradient vector.

    ``grad_vec`` is consumed in ``model.named_parameters()`` order. Parameters
    whose ``.grad`` is ``None`` are skipped and consume no entries, so the
    layout matches a vector built only from the existing gradients.

    Args:
        model: module whose ``.grad`` tensors are replaced.
        grad_vec: 1-D tensor holding the new gradient values.
    """
    idx = 0
    for _, param in model.named_parameters():
        if param.grad is None:
            continue
        size = param.grad.numel()
        param.grad.data = grad_vec[idx:idx + size].reshape(param.grad.shape)
        idx += size

def update_param(model, param_vec):
    """Load a flat parameter vector back into ``model``.

    ``param_vec`` is consumed in ``model.named_parameters()`` order. It may be
    a NumPy array or a tensor. Values are copied into the existing parameter
    storage, so each parameter keeps its own dtype and device and never
    aliases ``param_vec``.

    Args:
        model: module whose parameters are overwritten in place.
        param_vec: 1-D array or tensor with at least as many entries as the
            model has parameters.
    """
    flat = torch.as_tensor(param_vec).reshape(-1)
    idx = 0
    with torch.no_grad():
        for _, param in model.named_parameters():
            size = param.numel()
            # copy_ converts dtype/device and leaves param's storage separate.
            param.copy_(flat[idx:idx + size].reshape(param.shape))
            idx += size

In [5]:
def get_model_grad_vec(model):
    """Concatenate every available parameter gradient into one 1-D tensor.

    Parameters without a gradient (``.grad is None``) contribute nothing.
    """
    flat_grads = [
        p.grad.detach().reshape(-1)
        for p in model.parameters()
        if p.grad is not None
    ]
    return torch.cat(flat_grads, 0)

def update_grad(model, grad_vec):
    """Replace each existing parameter gradient with its slice of ``grad_vec``.

    Slices are taken in ``model.parameters()`` order; parameters whose
    ``.grad`` is ``None`` are skipped and consume no entries.
    """
    offset = 0
    for p in model.parameters():
        if p.grad is None:
            continue
        shape = p.grad.shape
        count = p.grad.numel()
        p.grad.data = grad_vec[offset:offset + count].reshape(shape)
        offset += count
def load_saved_parameters(save_dir, start_epoch, end_epoch, net=None):
    """Collect flattened parameter vectors from saved per-epoch checkpoints.

    For each ``epoch`` in ``range(start_epoch, end_epoch)`` the state dict in
    ``<save_dir>/<epoch + 1>.pt`` is loaded into the network and its
    parameters are flattened with ``get_model_param_vec``. Missing files are
    reported and skipped.

    Args:
        save_dir: directory containing the ``<n>.pt`` checkpoints.
        start_epoch: first (0-based) epoch index to load.
        end_epoch: one past the last epoch index to load.
        net: network the checkpoints are loaded into. When omitted, the
            notebook-level ``model`` is used, as before.

    Returns:
        ``np.ndarray`` with one row per checkpoint found. If none is found
        the array is empty with shape ``(0,)``.

    Side effects:
        The network is left holding the last checkpoint that was loaded.
    """
    W = []
    for epoch in range(start_epoch, end_epoch):
        param_filename = os.path.join(save_dir, f'{epoch + 1}.pt')
        if os.path.exists(param_filename):
            # Resolve the fallback lazily so the global is only needed when a
            # checkpoint actually has to be loaded.
            target = net if net is not None else model
            # map_location='cpu' lets GPU-saved checkpoints load on any host;
            # load_state_dict copies the values onto the network's own device.
            target.load_state_dict(torch.load(param_filename, map_location='cpu'))
            W.append(get_model_param_vec(target))
        else:
            print(f'File not found: {param_filename}')
    W = np.array(W)
    print(f'Loaded {len(W)} parameter vectors with shape: {W.shape}')
    return W


def get_model_param_vec(model):
    """Return all model parameters concatenated into one flat NumPy vector.

    Each parameter is detached, moved to the CPU and flattened, following
    ``model.named_parameters()`` order.
    """
    pieces = []
    for _name, weights in model.named_parameters():
        as_numpy = weights.detach().cpu().numpy()
        pieces.append(as_numpy.reshape(-1))
    return np.concatenate(pieces, 0)


In [6]:
# PSGD: fine-tune with gradients split into a component inside the PCA
# subspace of the saved trajectory and a down-weighted residual.
# Relies on `model`, `DEVICE`, `train_loader`, `test_loader` and
# `compute_accuracy` being defined by an earlier cell.

W = load_saved_parameters(save_dir, START_EPOCH, END_EPOCH)

# Obtain the basis of the reduced space through PCA on the trajectory.
pca = PCA(n_components=n_components)
pca.fit(W)
P = np.array(pca.components_)
print ('ratio:', pca.explained_variance_ratio_)
print ('P:', P.shape)
print(P.dtype)

# Gradients are float32 and live on DEVICE, so P must match both; otherwise
# torch.mm below fails on a dtype or device mismatch.
P = torch.from_numpy(P.astype(np.float32)).to(DEVICE)

optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss().to(DEVICE)

# Relative step size of the residual (out-of-subspace) gradient.
alpha = 0.1

for epoch in range(NUM_EPOCHS_FINE_TUNE):
    model.train()

    for batch_idx, (features, targets) in enumerate(train_loader):
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)

        # Forward pass
        logits, probas = model(features)
        cost = criterion(logits, targets)

        # Backward pass to compute gradients
        optimizer.zero_grad()
        cost.backward()

        # Full gradient as one flat vector.
        grad_vec = get_model_grad_vec(model)

        # Component of the gradient inside the reduced space.
        gk = torch.mm(P, grad_vec.reshape(-1, 1))
        grad_proj = torch.mm(P.T, gk).reshape(-1)

        # Component orthogonal to the reduced space.
        grad_res = grad_vec - grad_proj

        # Apply both components in ONE optimizer step. Two separate steps
        # with momentum=0.9 advance the momentum buffer twice per batch, so
        # the projected gradient was re-applied during the residual step.
        update_grad(model, grad_proj + alpha * grad_res)
        optimizer.step()

    # Evaluate the model after each epoch
    model.eval()
    train_acc = compute_accuracy(model, train_loader, DEVICE)
    test_acc = compute_accuracy(model, test_loader, DEVICE)
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS_FINE_TUNE} | Train Acc: {train_acc:.2f}% | Test Acc: {test_acc:.2f}%')

   

NameError: name 'model' is not defined

In [None]:
# TME: subspace from an M-estimator of the trajectory covariance, then projected fine-tuning


def m_estimator(X, tol=1e-12, max_iter=1000, eps=1e-10):
    """Iteratively reweighted, trace-normalised covariance estimate of ``X``.

    Starting from the identity, each iteration weights every row ``x`` of
    ``X`` by ``1 / (x @ inv(cov) @ conj(x) + eps)``, rebuilds the weighted
    scatter matrix and rescales it to unit trace.

    Args:
        X: ``(N, D)`` array with one sample per row.
        tol: stop once the Frobenius norm of the change in ``cov`` is at most
            this value.
        max_iter: iteration counter limit; the counter starts at 1, so at
            most ``max_iter - 1`` updates are performed.
        eps: regulariser added to ``cov`` before inversion and to the
            per-sample distances before taking reciprocals.

    Returns:
        ``(D, D)`` array with trace 1.
    """
    N, D = X.shape
    identity = np.eye(D)  # hoisted: reused by every iteration
    cov = identity
    oldcov = identity - 1  # any matrix different from cov, so the loop is entered
    iter_count = 1

    while np.linalg.norm(oldcov - cov, 'fro') > tol and iter_count < max_iter:
        whitened = X @ np.linalg.inv(cov + eps * identity)
        d = np.sum(whitened * np.conjugate(X), axis=1)
        oldcov = cov

        weights = 1.0 / (np.real(d) + eps)

        # Same as X.T @ diag(weights) @ X without building the N x N diagonal.
        cov = (X.T * weights) @ X / (N * D)
        cov = cov / np.trace(cov)
        iter_count += 1

    return cov



# Relies on `model`, `DEVICE`, `train_loader`, `test_loader` and
# `compute_accuracy` being defined by an earlier cell.
W = load_saved_parameters(save_dir, START_EPOCH, END_EPOCH)

# Work in the small (n_checkpoints x n_checkpoints) space: W_hat = (W W^T)^(1/2).
V = np.dot(W, W.T)
# sqrtm can return a complex matrix with negligible imaginary parts when
# round-off makes an eigenvalue of V slightly negative; keep the real part.
W_hat = np.real(sqrtm(V))

Cov = m_estimator(W_hat)

# NOTE(review): PCA centres the columns of Cov before extracting components,
# so U is not exactly the leading eigenvectors of Cov -- confirm this is
# intended (np.linalg.eigh(Cov) would give those directly).
pca = PCA(n_components=n_components)
pca.fit(Cov)
U = np.array(pca.components_)
print('U:',U.shape)

# Lift the basis back to parameter space.
P =  (W.T) @ LA.inv(W_hat) @ (U.T)

print ('P:', P.shape)
P = P.T
P = P.astype(np.float32)

# Keep P on the same device as the gradients instead of hard-coding CUDA.
P = torch.from_numpy(P).to(DEVICE)

optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss().to(DEVICE)

# Relative step size of the residual (out-of-subspace) gradient.
alpha = 0.1

for epoch in range(NUM_EPOCHS_FINE_TUNE):
    model.train()

    for batch_idx, (features, targets) in enumerate(train_loader):
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)

        # Forward pass
        logits, probas = model(features)
        cost = criterion(logits, targets)

        # Backward pass to compute gradients
        optimizer.zero_grad()
        cost.backward()

        # Full gradient as one flat vector.
        grad_vec = get_model_grad_vec(model)

        # Component of the gradient inside the reduced space.
        gk = torch.mm(P, grad_vec.reshape(-1, 1))
        grad_proj = torch.mm(P.T, gk).reshape(-1)

        # Component orthogonal to the reduced space.
        grad_res = grad_vec - grad_proj

        # Apply both components in ONE optimizer step. Two separate steps
        # with momentum=0.9 advance the momentum buffer twice per batch, so
        # the projected gradient was re-applied during the residual step.
        update_grad(model, grad_proj + alpha * grad_res)
        optimizer.step()

    # Evaluate the model after each epoch
    model.eval()
    train_acc = compute_accuracy(model, train_loader, DEVICE)
    test_acc = compute_accuracy(model, test_loader, DEVICE)
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS_FINE_TUNE} | Train Acc: {train_acc:.2f}% | Test Acc: {test_acc:.2f}%')

   


In [None]:
# FMS: subspace from an iteratively reweighted SVD of the trajectory, then projected fine-tuning

def FMS(X, dd, tol=1e-12, max_iter=1000):
    """Estimate a ``dd``-dimensional subspace of the columns of ``X`` by
    iteratively reweighted SVD.

    Starting from the leading ``dd`` left singular vectors of ``X``, each
    iteration weights every column by the inverse norm of its residual
    outside the current subspace, then takes the leading ``dd`` left singular
    vectors of the reweighted scatter matrix.

    Args:
        X: ``(D, N)`` array with one sample per column.
        dd: dimension of the subspace to recover.
        tol: stop once the norm of the principal angles between consecutive
            subspaces is at most this value.
        max_iter: iteration counter limit; the counter starts at 1, so at
            most ``max_iter - 1`` updates are performed.

    Returns:
        ``(D, dd)`` array whose columns span the estimated subspace.

    Raises:
        ValueError: if ``X`` is not 2-D.
    """
    if X.ndim != 2:
        raise ValueError('X must be a 2-D array of shape (D, N)')

    # Initialise with the plain SVD subspace.
    U, _, _ = svd(X, full_matrices=False)
    L = U[:, :dd]

    ang = 1
    n_iter = 1  # named n_iter so the builtin `iter` is not shadowed

    while ang > tol and n_iter < max_iter:
        Lold = L

        # Residual of every column outside span(L); equals (I - L L^T) X
        # without forming the D x D projector.
        residual = X - L @ (L.T @ X)
        w = np.sqrt(np.sum(residual**2, axis=0)) + 1e-10

        # Same as X @ diag(1 / w) @ X.T without building the N x N diagonal.
        XX = (X * (1.0 / w)) @ X.T

        U, _, _ = svd(XX, full_matrices=False)
        L = U[:, :dd]

        # Distance between the new and the previous subspace.
        ang = np.linalg.norm(subspace_angles(L, Lold))

        n_iter += 1

    return L

# Relies on `model`, `DEVICE`, `train_loader`, `test_loader` and
# `compute_accuracy` being defined by an earlier cell.
W = load_saved_parameters(save_dir, START_EPOCH, END_EPOCH)

# Work in the small (n_checkpoints x n_checkpoints) space: W_hat = (W W^T)^(1/2).
V = np.dot(W, W.T)
# sqrtm can return a complex matrix with negligible imaginary parts when
# round-off makes an eigenvalue of V slightly negative; keep the real part.
W_hat = np.real(sqrtm(V))
U = FMS(W_hat, n_components)

# Lift the basis back to parameter space.
P =  (W.T) @ LA.inv(W_hat) @ (U)

print ('P:', P.shape)
P = P.T
P = P.astype(np.float32)
# Keep P on the same device as the gradients instead of hard-coding CUDA.
P = torch.from_numpy(P).to(DEVICE)

optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss().to(DEVICE)

# Relative step size of the residual (out-of-subspace) gradient.
alpha = 0.1

for epoch in range(NUM_EPOCHS_FINE_TUNE):
    model.train()

    for batch_idx, (features, targets) in enumerate(train_loader):
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)

        # Forward pass
        logits, probas = model(features)
        cost = criterion(logits, targets)

        # Backward pass to compute gradients
        optimizer.zero_grad()
        cost.backward()

        # Full gradient as one flat vector.
        grad_vec = get_model_grad_vec(model)

        # Component of the gradient inside the reduced space.
        gk = torch.mm(P, grad_vec.reshape(-1, 1))
        grad_proj = torch.mm(P.T, gk).reshape(-1)

        # Component orthogonal to the reduced space.
        grad_res = grad_vec - grad_proj

        # Apply both components in ONE optimizer step. Two separate steps
        # with momentum=0.9 advance the momentum buffer twice per batch, so
        # the projected gradient was re-applied during the residual step.
        update_grad(model, grad_proj + alpha * grad_res)
        optimizer.step()

    # Evaluate the model after each epoch
    model.eval()
    train_acc = compute_accuracy(model, train_loader, DEVICE)
    test_acc = compute_accuracy(model, test_loader, DEVICE)
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS_FINE_TUNE} | Train Acc: {train_acc:.2f}% | Test Acc: {test_acc:.2f}%')

    
