## Подготовка данных

In [None]:
%reload_ext autoreload
%autoreload 2

import torch

import jax.numpy as torch
import scipy
import copy
import sys

import torch
from torch import nn
from torchvision.datasets import CIFAR10
from sklearn.preprocessing import StandardScaler
import numpy as np

from torchvision.models import resnet18
from torchvision.models import resnet50

from tucker_riemopt.tucker import Tucker
from tucker_riemopt import backend as back

from tqdm.auto import tqdm
from src.TuckerLinear import TuckerLinear, TuckerLinearPermute
from src.utils import optim
from src.utils.riemann_model import RiemannModel, RiemannParameter
from tucker_riemopt import set_backend

# Use the first CUDA device when one is available; otherwise fall back to the CPU
# so the notebook can still be executed on a machine without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Select the PyTorch backend of tucker_riemopt.
back.set_backend('pytorch')

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from copy import deepcopy

# Use the first CUDA device when one is available; otherwise fall back to the CPU
# so the notebook can still be executed on a machine without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

model_name = "fabriceyhc/bert-base-uncased-imdb"

# Load the pretrained sequence classifier onto `device`, plus the tokenizer
# published under the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Untouched reference copy: the compression helpers deep-copy it for every
# configuration, so one experiment never sees another experiment's layers.
model_orig = deepcopy(model)

In [1]:
from datasets import load_dataset

dataset = load_dataset("imdb")


def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and ``truncation=True``
    and its result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize every split in batches, drop the raw text column and rename the
# target column to "labels" (the key the evaluation/training loops read).
tokenized_datasets = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

# Shuffle each split once with a fixed seed and reuse the result for the small
# subsets below, instead of shuffling the same split twice with the same seed.
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

# First 1000 examples of each shuffled split.
small_train_dataset = train_dataset.select(range(1000))
small_eval_dataset = eval_dataset.select(range(1000))

In [5]:
def get_model_with_rank(c=10, n_layers=50):
    """Copy ``model_orig`` and replace a tail of its ``Linear`` layers with ``TuckerLinearPermute``.

    Modules whose class is named ``Linear`` are listed in ``named_modules()``
    order. Up to ``n_layers`` of them, taken immediately before the last one,
    are replaced; the last ``Linear`` module itself is left untouched.

    Args:
        c: Rank used for every mode of the decomposition (``rank=[c, c, c]``).
        n_layers: Number of ``Linear`` modules (counted back from, and
            excluding, the last one) to replace.

    Returns:
        A tuple ``(model, compression)`` where ``model`` is the modified copy
        and ``compression`` is ``n_params / (n_params - old_params + new_params)``.
        ``old_params`` counts the parameters of the replaced layers and
        ``new_params`` counts the ``riemann_parameters()`` of their replacements;
        both totals are also printed.

    Raises:
        ValueError: If a selected layer has an ``(in_features, out_features)``
            shape for which no factorization is listed.
    """
    # (in_features, out_features) -> (dims1, dims2) handed to the Tucker layer.
    factorizations = {
        (768, 768): ([8, 8, 12], [8, 8, 12]),
        (768, 3072): ([8, 8, 12], [16, 16, 12]),
        (3072, 768): ([16, 16, 12], [8, 8, 12]),
        (768, 2): ([8, 8, 12], [2, 1, 1]),
    }

    model = deepcopy(model_orig)
    n_params = sum(p.numel() for p in model.parameters())
    old_params = 0
    new_params = 0

    linear_names = [name for name, m in model.named_modules() if type(m).__name__ == 'Linear']
    linear_names = linear_names[-1 - n_layers: -1]

    for name in linear_names:
        # Resolve the dotted module path directly instead of eval()-ing a string.
        layer = model.get_submodule(name)
        shape = (layer.in_features, layer.out_features)
        if shape not in factorizations:
            raise ValueError(f"no Tucker factorization defined for a Linear layer of shape {shape}")
        # Fresh lists per layer so no two replacement layers share a dims object.
        dims1, dims2 = (list(dims) for dims in factorizations[shape])
        rank = [c, c, c]

        old_params += sum(p.numel() for p in layer.parameters())
        replacement = TuckerLinearPermute(
            layer.in_features, layer.out_features,
            layer=layer, rank=rank, dims1=dims1, dims2=dims2,
        )
        # Setting the attribute on the parent registers the new module under the
        # same child name (this also works for numeric names inside a ModuleList).
        parent_name, _, child_name = name.rpartition('.')
        setattr(model.get_submodule(parent_name), child_name, replacement)
        new_params += sum(p.numel() for p in replacement.riemann_parameters())

    print(old_params)
    print(new_params)
    return model, n_params / (n_params - old_params + new_params)

In [19]:
from transformers import TrainingArguments, Trainer

def compute_metrics(eval_pred):
    """Return ``{'Accuracy': value}`` for a ``(logits, labels)`` pair.

    The predicted class is the argmax of ``logits`` over the last axis; the
    accuracy is the mean of the element-wise comparison with ``labels``.
    """
    scores, targets = eval_pred
    hits = np.argmax(scores, axis=-1) == targets
    return {'Accuracy': np.mean(hits)}

# Training arguments with only the output directory set explicitly.
training_args = TrainingArguments(output_dir="test_trainer")

# Trainer over the 1000-example subsets; metrics come from compute_metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [20]:
# trainer.evaluate()

## Tucker

In [7]:
@torch.no_grad()
def eval_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and return ``(mean_loss, accuracy)``.

    Relies on the module-level ``device`` and ``criterion``. Every batch is a
    dict whose values are moved to ``device`` and passed to the model as
    keyword arguments; the ``'labels'`` entry is compared with the argmax of
    ``output.logits``.

    Returns:
        A tuple of the summed per-batch loss divided by the number of batches,
        and the number of correct predictions divided by
        ``len(test_loader.dataset)``.
    """
    model.eval()
    total_loss = torch.zeros((1,), device=device)
    n_correct = torch.zeros((1,), device=device)
    for raw_batch in test_loader:
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        logits = model(**inputs).logits
        targets = inputs['labels']
        predicted = logits.argmax(dim=-1).detach()
        n_correct += (predicted == targets).float().sum()
        total_loss += criterion(logits, targets).detach()
    mean_loss = total_loss.item() / len(test_loader)
    accuracy = n_correct.item() / len(test_loader.dataset)
    return mean_loss, accuracy


def fine_tune_epoch(model, train_loader, riem_optimizer, criterion):
    """Run one pass over ``train_loader``, stepping ``riem_optimizer`` on every batch.

    Each batch is a dict whose values are moved to the module-level ``device``
    and passed to the model as keyword arguments. The loss is
    ``criterion(output.logits, batch['labels'])``; on every 10th batch it is
    logged to wandb under ``'train_loss'`` before the backward pass.
    """
    model.train()
    for step, raw_batch in enumerate(tqdm(train_loader), start=1):
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        riem_optimizer.zero_grad()
        logits = model(**inputs).logits
        loss = criterion(logits, inputs['labels'])
        if step % 10 == 0:
            wandb.log({'train_loss': loss})
        loss.backward()
        riem_optimizer.step()

In [8]:
def get_riemann_optimizer(model):
    """Build an ``optim.SGDmomentum`` optimizer over the Tucker layers of ``model``.

    One parameter group ``{"params": layer.riemann_parameters(), "rank": layer.rank}``
    is created for each module whose class is named ``TuckerLinearPermute``,
    in ``named_modules()`` order. The group of the last such module is dropped
    before the optimizer is constructed with ``base_lr=1e-1``.

    Args:
        model: Module that contains ``TuckerLinearPermute`` layers.

    Returns:
        The constructed ``optim.SGDmomentum`` instance.
    """
    # named_modules() already yields the module objects, so there is no need to
    # rebuild and eval() an attribute path for each of them.
    tucker_layers = [m for _, m in model.named_modules() if type(m).__name__ == 'TuckerLinearPermute']
    riem_params = [
        {"params": layer.riemann_parameters(), "rank": layer.rank}
        for layer in tucker_layers
    ]

    # NOTE(review): the last Tucker layer is deliberately left out of the
    # optimizer here, as in the original code - confirm this is intended.
    riem_params = riem_params[:-1]

    return optim.SGDmomentum(riem_params, base_lr=1e-1)

In [9]:
from src.TuckerLinear import TuckerLinearPermute
import torch
from torch import nn
from tucker_riemopt import backend as back

back.set_backend('pytorch')

# Smoke test of TuckerLinearPermute on a small layer: dims1 multiplies out to
# in_features (2 * 3 * 4 = 24) and dims2 to out_features (5 * 6 * 7 = 210).
layer = TuckerLinearPermute(24, 210, layer=nn.Linear(24, 210), rank=[3, 3, 3], dims1=[2, 3, 4], dims2=[5, 6, 7], bias=False)

# Use the first CUDA device when one is available; otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Allocate the random input directly on the target device instead of creating
# it on the CPU and copying it over.
A = torch.randn(100, 40, 24, device=device)
result = layer(A)

print(result.shape)

torch.Size([100, 40, 210])


In [2]:
from torch.utils.data import DataLoader
from torch.optim import SGD, AdamW, Adam
import wandb

def _report_row(path, *fields):
    """Print one tab-separated row to stdout and append the same row to the file at ``path``."""
    print(*fields, sep='\t')
    # Open/close around every row so the file is never left open on an error
    # and each finished row is on disk before the next long-running step.
    with open(path, "a") as log_file:
        print(*fields, sep='\t', file=log_file)


num_model_params = sum(p.numel() for p in model_orig.parameters())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(small_eval_dataset, batch_size=8, shuffle=False)

cs = [1, 2, 3, 5]
nlrs = [10, 20, 30, 50]

_report_row("log_usual.txt", "C\tn_layers\tmethod\tloss\tacc\tcompression")

# eval_model reads the module-level `criterion`, so it has to exist before the
# first evaluation; it does not depend on the configuration, so build it once.
criterion = torch.nn.CrossEntropyLoss()

# Grid over the Tucker rank `c` and the number of replaced Linear layers `nlr`.
for c in cs:
    for nlr in nlrs:
        model, compression = get_model_with_rank(c=c, n_layers=nlr)

        # named_modules() yields the module objects themselves, so no attribute
        # path has to be rebuilt and eval()-ed.
        tucker_layers = [m for _, m in model.named_modules() if type(m).__name__ == 'TuckerLinearPermute']

        # Freeze everything, then re-enable gradients only for the Riemannian
        # parameters of the Tucker layers.
        for p in model.parameters():
            p.requires_grad_(False)

        riem_params = []
        for layer in tucker_layers:
            for p in layer.riemann_parameters():
                p.requires_grad_(True)

            riem_params.append({
                "params": layer.riemann_parameters(),
                "rank": layer.rank
            })

        # Use for usual SGD
        # riemann_opt = SGD(riem_params, lr=1e-3, momentum=0.9)

        # Use for riemopt
        riemann_opt = optim.SGDmomentum(riem_params, base_lr=1e-3)

        # Quality right after the layers were replaced, before any fine-tuning.
        eval_loss, eval_acc = eval_model(model, test_loader)
        _report_row("log_usual.txt", c, nlr, "replace", round(eval_loss, 3), round(eval_acc, 3), round(compression, 3))

        wandb.init(project="rieman-neural-nets")

        # Quality after one fine-tuning epoch over the full training split.
        fine_tune_epoch(model, train_loader, riemann_opt, criterion)
        eval_loss, eval_acc = eval_model(model, test_loader)
        n_params = sum(p.numel() for p in model.parameters())
        _report_row("log_usual.txt", c, nlr, "finetune", round(eval_loss, 3), round(eval_acc, 3), round(compression, 3))

## SF-Tucker

In [10]:
def get_symmetric_model_with_rank(c=10, n_layers=50):
    """Copy ``model_orig`` and replace a tail of its ``Linear`` layers with ``TuckerLinearSymmetric``.

    Modules whose class is named ``Linear`` are listed in ``named_modules()``
    order. Up to ``n_layers`` of them, taken immediately before the last one,
    are replaced (with a tqdm progress bar); the last ``Linear`` module itself
    is left untouched.

    Args:
        c: Rank used for every mode of the decomposition (``rank=[c, c, c]``).
        n_layers: Number of ``Linear`` modules (counted back from, and
            excluding, the last one) to replace.

    Returns:
        A tuple ``(model, compression)`` where ``model`` is the modified copy
        and ``compression`` is ``n_params / (n_params - old_params + new_params)``.
        ``old_params`` counts the parameters of the replaced layers and
        ``new_params`` counts the ``riemann_parameters()`` of their replacements;
        both totals are also printed.

    Raises:
        ValueError: If a selected layer has an ``(in_features, out_features)``
            shape for which no factorization is listed.
    """
    # (in_features, out_features) -> (dims1, dims2) handed to the Tucker layer.
    factorizations = {
        (768, 768): ([8, 8, 12], [8, 8, 12]),
        (768, 3072): ([8, 8, 12], [16, 16, 12]),
        (3072, 768): ([16, 16, 12], [8, 8, 12]),
        (768, 2): ([8, 8, 12], [2, 1, 1]),
    }

    model = deepcopy(model_orig)
    n_params = sum(p.numel() for p in model.parameters())
    old_params = 0
    new_params = 0

    linear_names = [name for name, m in model.named_modules() if type(m).__name__ == 'Linear']
    linear_names = linear_names[-1 - n_layers: -1]

    for name in tqdm(linear_names):
        # Resolve the dotted module path directly instead of eval()-ing a string.
        layer = model.get_submodule(name)
        shape = (layer.in_features, layer.out_features)
        if shape not in factorizations:
            raise ValueError(f"no Tucker factorization defined for a Linear layer of shape {shape}")
        # Fresh lists per layer so no two replacement layers share a dims object.
        dims1, dims2 = (list(dims) for dims in factorizations[shape])
        rank = [c, c, c]

        old_params += sum(p.numel() for p in layer.parameters())
        replacement = TuckerLinearSymmetric(
            layer.in_features, layer.out_features,
            layer=layer, rank=rank, dims1=dims1, dims2=dims2,
        )
        # Setting the attribute on the parent registers the new module under the
        # same child name (this also works for numeric names inside a ModuleList).
        parent_name, _, child_name = name.rpartition('.')
        setattr(model.get_submodule(parent_name), child_name, replacement)
        new_params += sum(p.numel() for p in replacement.riemann_parameters())

    print(old_params)
    print(new_params)
    return model, n_params / (n_params - old_params + new_params)

In [11]:
@torch.no_grad()
def eval_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` (with a progress bar) and return ``(mean_loss, accuracy)``.

    Relies on the module-level ``device`` and ``criterion``. Every batch is a
    dict whose values are moved to ``device`` and passed to the model as
    keyword arguments; the ``'labels'`` entry is compared with the argmax of
    ``output.logits``.

    Returns:
        A tuple of the summed per-batch loss divided by the number of batches,
        and the number of correct predictions divided by
        ``len(test_loader.dataset)``.
    """
    model.eval()
    total_loss = torch.zeros((1,), device=device)
    n_correct = torch.zeros((1,), device=device)
    for raw_batch in tqdm(test_loader):
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        logits = model(**inputs).logits
        targets = inputs['labels']
        predicted = logits.argmax(dim=-1).detach()
        n_correct += (predicted == targets).float().sum()
        total_loss += criterion(logits, targets).detach()
    mean_loss = total_loss.item() / len(test_loader)
    accuracy = n_correct.item() / len(test_loader.dataset)
    return mean_loss, accuracy


def fine_tune_epoch(model, train_loader, riem_optimizer, criterion):
    """Run one pass over ``train_loader``, stepping ``riem_optimizer`` on every batch.

    Each batch is a dict whose values are moved to the module-level ``device``
    and passed to the model as keyword arguments. The loss is
    ``criterion(output.logits, batch['labels'])``; on every 10th batch its
    value is logged to wandb under ``'train_loss'`` before the backward pass.

    Args:
        model: Model to fine-tune; it is switched to training mode.
        train_loader: Iterable of batches (dicts of tensors with a ``'labels'`` key).
        riem_optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        criterion: Callable mapping ``(logits, labels)`` to a scalar loss tensor.
    """
    model.train()
    for step, raw_batch in enumerate(tqdm(train_loader), start=1):
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        riem_optimizer.zero_grad()
        logits = model(**inputs).logits
        loss = criterion(logits, inputs['labels'])
        if step % 10 == 0:
            # Log a plain Python float. The previous running sum of loss tensors
            # was never read and kept autograd history alive between resets, so
            # it has been removed.
            wandb.log({'train_loss': loss.item()})
        loss.backward()
        riem_optimizer.step()

In [3]:
from torch.utils.data import DataLoader
from src.TuckerLinear import TuckerLinearPermute, TuckerLinearSymmetric
import wandb

# eval_model reads the module-level `criterion`, so define it before evaluating.
criterion = torch.nn.CrossEntropyLoss()

# Full shuffled training split and the 1000-example evaluation subset.
test_loader = DataLoader(small_eval_dataset, shuffle=False, batch_size=8)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# Sanity check: rank-1 symmetric replacement of 10 layers, evaluated without
# any fine-tuning (the cell displays the returned (loss, accuracy) tuple).
model, compression = get_symmetric_model_with_rank(n_layers=10, c=1)
eval_model(model, test_loader)

In [4]:
from torch.utils.data import DataLoader
from tucker_riemopt.symmetric.optim import SGDmomentum as SGDmomentumSym
from src.utils import optim
import wandb

def _report_row(path, *fields):
    """Print one tab-separated row to stdout and append the same row to the file at ``path``."""
    print(*fields, sep='\t')
    # Open/close around every row so the file is never left open on an error
    # and each finished row is on disk before the next long-running step.
    with open(path, "a") as log_file:
        print(*fields, sep='\t', file=log_file)


num_model_params = sum(p.numel() for p in model_orig.parameters())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(small_eval_dataset, batch_size=8, shuffle=False)

cs = [1, 2, 3, 5]

nlrs = [10, 20, 30, 50]

_report_row("log_sym.txt", "C\tn_layers\tmethod\tloss\tacc\tcompression")

# eval_model reads the module-level `criterion`, so it has to exist before the
# first evaluation; it does not depend on the configuration, so build it once.
criterion = torch.nn.CrossEntropyLoss()

# Grid over the Tucker rank `c` and the number of replaced Linear layers `nlr`.
for c in cs:
    for nlr in nlrs:
        model, compression = get_symmetric_model_with_rank(c=c, n_layers=nlr)

        # named_modules() yields the module objects themselves, so no attribute
        # path has to be rebuilt and eval()-ed.
        tucker_layers = [m for _, m in model.named_modules() if type(m).__name__ == 'TuckerLinearSymmetric']

        # Freeze everything, then re-enable gradients only for the Riemannian
        # parameters of the Tucker layers.
        for p in model.parameters():
            p.requires_grad_(False)

        riem_params = []
        for layer in tucker_layers:
            for p in layer.riemann_parameters():
                p.requires_grad_(True)

            riem_params.append({
                "params": layer.riemann_parameters(),
                "rank": layer.rank
            })

        riemann_opt = SGDmomentumSym(riem_params, max_lr=1e-4)

        # Quality right after the layers were replaced, before any fine-tuning.
        eval_loss, eval_acc = eval_model(model, test_loader)
        _report_row("log_sym.txt", c, nlr, "replace", round(eval_loss, 3), round(eval_acc, 3), round(compression, 3))

        wandb.init(project="rieman-neural-nets")

        # Quality after one fine-tuning epoch over the full training split.
        fine_tune_epoch(model, train_loader, riemann_opt, criterion)
        eval_loss, eval_acc = eval_model(model, test_loader)
        n_params = sum(p.numel() for p in model.parameters())
        _report_row("log_sym.txt", c, nlr, "finetune", round(eval_loss, 3), round(eval_acc, 3), round(compression, 3))