In [1]:
import torch
import torch.nn.functional as F
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel
from torch.utils.tensorboard import SummaryWriter
import pytorch_lightning as pl
import random
import dotenv
import omegaconf
import hydra
import logging
from typing import List

import wandb
from datetime import date
import dotenv
import os
import pathlib
from typing import Dict, Any
from copy import deepcopy

from rigl_torch.models import ModelFactory
from rigl_torch.rigl_scheduler import RigLScheduler
from rigl_torch.rigl_constant_fan import RigLConstFanScheduler
from rigl_torch.datasets import get_dataloaders
from rigl_torch.optim import (
    get_optimizer,
    get_lr_scheduler,
)
from rigl_torch.utils.rigl_utils import get_names_and_W
from rigl_torch.utils.checkpoint import Checkpoint
from rigl_torch.utils.rigl_utils import get_T_end, get_fan_in_after_ablation, get_conv_idx_from_flat_idx
from hydra import initialize, compose

from fvcore.nn import FlopCountAnalysis
import pandas as pd


In [2]:
def get_pruner_model_loader(dense_alloc, model, dataset, checkpoint=None, require_cuda=True):
    """Compose the hydra config and build the pruner, model and train loader.

    Args:
        dense_alloc: Value substituted into the ``rigl.dense_allocation``
            override (callers in this notebook pass ``"null"`` or a float).
        model: Name substituted into the ``model=`` hydra override.
        dataset: Name substituted into the ``dataset=`` hydra override.
        checkpoint: Optional object exposing ``optimizer``, ``scheduler``,
            ``pruner``, ``model`` and ``cfg`` attributes plus
            ``get_single_process_model_state_from_distributed_state()``
            (presumably a ``rigl_torch`` ``Checkpoint`` -- TODO confirm).
            When given, its states are restored and its ``cfg`` replaces the
            composed one. Defaults to ``None`` (fresh model).
        require_cuda: When ``True`` (default, the historical behaviour) a
            ``SystemError`` is raised if CUDA is unavailable or disabled in the
            config. When ``False`` the model is built on CPU and a warning is
            logged instead.

    Returns:
        Tuple ``(pruner, model, train_loader)``. ``pruner`` is ``None`` when
        ``cfg.rigl.dense_allocation`` is ``None``.

    Raises:
        KeyError: If ``IMAGE_NET_PATH`` is not set after loading ``../.env``.
        SystemError: If CUDA cannot be used and ``require_cuda`` is ``True``.
    """
    with initialize("../configs", version_base="1.2.0"):
        cfg = compose(
            "config.yaml",
            overrides=[
                f"dataset={dataset}",
                "compute.distributed=False",
                f"model={model}",
                f"rigl.dense_allocation={dense_alloc}",
            ],
        )
    dotenv.load_dotenv("../.env")
    # Fail early (same exception type as the bare lookup it replaces) when the
    # dataset location has not been exported.
    if "IMAGE_NET_PATH" not in os.environ:
        raise KeyError("IMAGE_NET_PATH")

    rank = 0
    if checkpoint is not None:
        optimizer_state = checkpoint.optimizer
        scheduler_state = checkpoint.scheduler
        pruner_state = checkpoint.pruner
        model_state = checkpoint.model
        cfg = checkpoint.cfg
    else:
        optimizer_state = scheduler_state = pruner_state = model_state = None

    print(cfg.compute)
    # This helper always runs single-process, so the model is never wrapped in
    # DistributedDataParallel here.
    cfg.compute.distributed = False

    pl.seed_everything(cfg.training.seed)
    use_cuda = not cfg.compute.no_cuda and torch.cuda.is_available()
    if use_cuda:
        print(f"loading to device rank: {rank}")
        device = torch.device(f"cuda:{rank}")
    elif require_cuda:
        raise SystemError("GPU has stopped responding...waiting to die!")
    else:
        logging.getLogger(__name__).warning(
            "Using CPU! Verify cfg.compute.no_cuda and "
            "torch.cuda.is_available() are properly set if this is unexpected"
        )
        device = torch.device("cpu")

    train_loader, _ = get_dataloaders(cfg)

    net = ModelFactory.load_model(
        model=cfg.model.name, dataset=cfg.dataset.name
    )
    net.to(device)
    if model_state is not None:
        try:
            net.load_state_dict(model_state)
        except RuntimeError:
            # Fall back to the checkpoint's single-process view of the weights.
            model_state = checkpoint.get_single_process_model_state_from_distributed_state()
            net.load_state_dict(model_state)

    optimizer = get_optimizer(cfg, net, state_dict=optimizer_state)
    # Built for parity with the original cell; the scheduler object itself is
    # not returned.
    get_lr_scheduler(cfg, optimizer, state_dict=scheduler_state)

    pruner = None
    if cfg.rigl.dense_allocation is not None:
        # A placeholder list stands in for the train loader when computing
        # T_end. NOTE(review): presumably only its length (1251) matters to
        # get_T_end -- confirm against rigl_torch.
        steps_per_epoch_placeholder = [0] * 1251
        T_end = get_T_end(cfg, steps_per_epoch_placeholder)
        if cfg.rigl.const_fan_in:
            rigl_scheduler = RigLConstFanScheduler
        else:
            rigl_scheduler = RigLScheduler
        pruner = rigl_scheduler(
            net,
            optimizer,
            dense_allocation=cfg.rigl.dense_allocation,
            alpha=cfg.rigl.alpha,
            delta=cfg.rigl.delta,
            static_topo=cfg.rigl.static_topo,
            T_end=T_end,
            ignore_linear_layers=cfg.rigl.ignore_linear_layers,
            grad_accumulation_n=cfg.rigl.grad_accumulation_n,
            sparsity_distribution=cfg.rigl.sparsity_distribution,
            erk_power_scale=cfg.rigl.erk_power_scale,
            state_dict=pruner_state,
            filter_ablation_threshold=cfg.rigl.filter_ablation_threshold,
            static_ablation=cfg.rigl.static_ablation,
            dynamic_ablation=cfg.rigl.dynamic_ablation,
            min_salient_weights_per_neuron=cfg.rigl.min_salient_weights_per_neuron,  # noqa
            use_sparse_init=cfg.rigl.use_sparse_initialization,
            init_method_str=cfg.rigl.init_method_str,
            use_sparse_const_fan_in_for_ablation=cfg.rigl.use_sparse_const_fan_in_for_ablation,  # noqa
        )

    return pruner, net, train_loader

In [3]:
def get_flops_df(
    model_name,
    dataset,
    dense_allocations=("null", 0.01, 0.05, 0.0625, 0.1, 0.2, 0.25),
):
    """Tabulate sparsity-scaled FLOPs (via fvcore) per dense allocation.

    For every dense allocation a fresh model/pruner is loaded, fvcore traces a
    single training sample, and the per-module FLOPs of each layer returned by
    ``get_names_and_W`` are scaled by ``1 - sparsity`` and summed.

    Args:
        model_name: Model name forwarded to ``get_pruner_model_loader``.
        dataset: Dataset name forwarded to ``get_pruner_model_loader``.
        dense_allocations: Allocations to evaluate. Must contain exactly one
            ``"null"`` entry, which is used as the normalisation baseline.

    Returns:
        ``pandas.DataFrame`` with columns ``rigl.dense_allocation``, ``flops``,
        ``model`` and ``normalized_flops``.

    Raises:
        ValueError: If the train loader yields no batch, if a tracked module
            does not report exactly one operator, or if the ``"null"``
            baseline row is missing or duplicated.
    """
    records = []
    for da in dense_allocations:
        print(f"Calculating with dense_alloc == {da}")
        pruner, model, train_loader = get_pruner_model_loader(da, model_name, dataset)
        model.train()

        # Only one sample is needed to trace the network.
        try:
            batch, _ = next(iter(train_loader))
        except StopIteration:
            raise ValueError(
                "train_loader yielded no batches; cannot build a sample input"
            ) from None
        data = batch[0].to("cpu").unsqueeze(0)

        flops = FlopCountAnalysis(model.to("cpu"), data)

        names, _ = get_names_and_W(model)
        S = pruner.S if pruner is not None else [0.0] * len(names)
        index_by_name = {layer_name: i for i, layer_name in enumerate(names)}

        total_flops = 0
        for name, counter in flops.by_module_and_operator().items():
            layer_idx = index_by_name.get(name)
            if layer_idx is None:
                continue  # module is not one of the tracked layers
            if len(counter) != 1:
                raise ValueError(f"Too many items found in {name}. Goodbye")
            (layer_flops,) = counter.values()
            s = S[layer_idx]
            if s is None:
                s = 0  # a missing sparsity entry is treated as a dense layer
            total_flops += layer_flops * (1 - s)

        # Release the model, pruner and loader before loading the next config.
        del model
        del pruner
        del train_loader
        records.append(
            {"rigl.dense_allocation": da, "flops": total_flops, "model": model_name}
        )

    df = pd.DataFrame(records, columns=["rigl.dense_allocation", "flops", "model"])
    is_dense = df["rigl.dense_allocation"] == "null"
    df["normalized_flops"] = df["flops"] / df.loc[is_dense, "flops"].item()
    return df

# Compute the FLOPs table for ResNet-50 on ImageNet and write it to CSV.
df = get_flops_df("resnet50", "imagenet")
# NOTE(review): a later cell reads "../training_flops_fvcore.csv", not the
# name written here -- confirm which file name is intended.
df.to_csv("../train_flops_fvcore.csv")

Calculating with dense_alloc == null


Global seed set to 42


{'no_cuda': False, 'cuda_kwargs': {'num_workers': '${ oc.decode:${oc.env:NUM_WORKERS} }', 'pin_memory': True}, 'distributed': False, 'world_size': 4, 'dist_backend': 'nccl'}
loading to device rank: 0


INFO:/home/mike/condensed-sparsity/src/rigl_torch/models/model_factory.py:Loading model resnet50/imagenet using <function get_imagenet_resnet50 at 0x7fc9ce06e170> with args: () and kwargs: {}
Global seed set to 42


Calculating with dense_alloc == 0.01
{'no_cuda': False, 'cuda_kwargs': {'num_workers': '${ oc.decode:${oc.env:NUM_WORKERS} }', 'pin_memory': True}, 'distributed': False, 'world_size': 4, 'dist_backend': 'nccl'}
loading to device rank: 0


INFO:/home/mike/condensed-sparsity/src/rigl_torch/models/model_factory.py:Loading model resnet50/imagenet using <function get_imagenet_resnet50 at 0x7fc9ce06e170> with args: () and kwargs: {}
Global seed set to 42


Calculating with dense_alloc == 0.05
{'no_cuda': False, 'cuda_kwargs': {'num_workers': '${ oc.decode:${oc.env:NUM_WORKERS} }', 'pin_memory': True}, 'distributed': False, 'world_size': 4, 'dist_backend': 'nccl'}
loading to device rank: 0


INFO:/home/mike/condensed-sparsity/src/rigl_torch/models/model_factory.py:Loading model resnet50/imagenet using <function get_imagenet_resnet50 at 0x7fc9ce06e170> with args: () and kwargs: {}
Global seed set to 42


Calculating with dense_alloc == 0.0625
{'no_cuda': False, 'cuda_kwargs': {'num_workers': '${ oc.decode:${oc.env:NUM_WORKERS} }', 'pin_memory': True}, 'distributed': False, 'world_size': 4, 'dist_backend': 'nccl'}
loading to device rank: 0


INFO:/home/mike/condensed-sparsity/src/rigl_torch/models/model_factory.py:Loading model resnet50/imagenet using <function get_imagenet_resnet50 at 0x7fc9ce06e170> with args: () and kwargs: {}
Global seed set to 42


Calculating with dense_alloc == 0.1
{'no_cuda': False, 'cuda_kwargs': {'num_workers': '${ oc.decode:${oc.env:NUM_WORKERS} }', 'pin_memory': True}, 'distributed': False, 'world_size': 4, 'dist_backend': 'nccl'}
loading to device rank: 0


INFO:/home/mike/condensed-sparsity/src/rigl_torch/models/model_factory.py:Loading model resnet50/imagenet using <function get_imagenet_resnet50 at 0x7fc9ce06e170> with args: () and kwargs: {}
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 1 set to 0.0
Global seed set to 42


Calculating with dense_alloc == 0.2
{'no_cuda': False, 'cuda_kwargs': {'num_workers': '${ oc.decode:${oc.env:NUM_WORKERS} }', 'pin_memory': True}, 'distributed': False, 'world_size': 4, 'dist_backend': 'nccl'}
loading to device rank: 0


INFO:/home/mike/condensed-sparsity/src/rigl_torch/models/model_factory.py:Loading model resnet50/imagenet using <function get_imagenet_resnet50 at 0x7fc9ce06e170> with args: () and kwargs: {}
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 1 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 3 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 4 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 5 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 7 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 8 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 10 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/r

Calculating with dense_alloc == 0.25
{'no_cuda': False, 'cuda_kwargs': {'num_workers': '${ oc.decode:${oc.env:NUM_WORKERS} }', 'pin_memory': True}, 'distributed': False, 'world_size': 4, 'dist_backend': 'nccl'}
loading to device rank: 0


INFO:/home/mike/condensed-sparsity/src/rigl_torch/models/model_factory.py:Loading model resnet50/imagenet using <function get_imagenet_resnet50 at 0x7fc9ce06e170> with args: () and kwargs: {}
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 1 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 3 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 4 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 5 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 7 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 8 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/rigl_scheduler.py:Sparsity of layer at index 10 set to 0.0
INFO:/home/mike/condensed-sparsity/src/rigl_torch/r

In [4]:
# Display the FLOPs table held in `df`.
df

Unnamed: 0,rigl.dense_allocation,flops,model,normalized_flops
0,,4089184000.0,resnet50,1.0
1,0.01,99517040.0,resnet50,0.024337
2,0.05,497551600.0,resnet50,0.121675
3,0.0625,621925700.0,resnet50,0.15209
4,0.1,988923800.0,resnet50,0.241839
5,0.2,1692089000.0,resnet50,0.413796
6,0.25,1946310000.0,resnet50,0.475965


In [5]:
# Load pruner, model and train loader for the "null" dense allocation.
# NOTE(review): with "null" the pruner is presumably None (no sparsity) --
# confirm against the hydra config.
p, m, l = get_pruner_model_loader("null", "resnet50", "imagenet")

Global seed set to 42


{'no_cuda': False, 'cuda_kwargs': {'num_workers': '${ oc.decode:${oc.env:NUM_WORKERS} }', 'pin_memory': True}, 'distributed': False, 'world_size': 4, 'dist_backend': 'nccl'}
loading to device rank: 0


INFO:/home/mike/condensed-sparsity/src/rigl_torch/models/model_factory.py:Loading model resnet50/imagenet using <function get_imagenet_resnet50 at 0x7fc9ce06e170> with args: () and kwargs: {}


In [6]:
from micronet_challenge import counting
import torch.nn as nn

from functools import partial
def register_forward_hook(model, masked_layers):
    """Wrap ``forward`` of the named sub-modules so they record their input shape.

    Each sub-module of ``model`` whose qualified name appears in
    ``masked_layers`` gets its ``forward`` replaced by a wrapper that stores
    the shape of the first positional input on ``mod._input_shape`` (only the
    first time it is seen) and then delegates to the original ``forward``.

    Registration is idempotent: a module that has already been wrapped is left
    untouched, so repeated calls do not stack wrappers.

    Args:
        model: Module exposing ``named_modules()``.
        masked_layers: Iterable of qualified sub-module names to instrument.
    """
    def hook_input_shape(x, *args, mod, _unwrapped_forward, **kwargs):
        # Keep the first observed shape; later calls must not overwrite it.
        if not hasattr(mod, "_input_shape"):
            mod._input_shape = x.shape
        return _unwrapped_forward(x, *args, **kwargs)

    target_names = set(masked_layers)
    for n, mod in model.named_modules():
        if n not in target_names:
            continue
        if getattr(mod, "_input_shape_hooked", False):
            continue  # already instrumented by an earlier call
        mod.forward = partial(
            hook_input_shape, mod=mod, _unwrapped_forward=mod.forward
        )
        mod._input_shape_hooked = True

def get_conv_op(conv: nn.Conv2d):
    """Build the ``counting.Conv2D`` descriptor for a convolution layer.

    Args:
        conv: Convolution whose ``_input_shape`` attribute has already been
            populated (see ``register_forward_hook``).

    Returns:
        ``counting.Conv2D`` built from the last dimension of the recorded input
        shape, the layer's kernel dimensions and its stride. Bias, "same"
        padding and a ReLU activation are always assumed, regardless of how
        ``conv`` itself is configured.
    """
    out_channels, in_channels, kernel_h, kernel_w = conv.weight.shape
    spatial_size = conv._input_shape[-1]
    descriptor_kwargs = dict(
        input_size=spatial_size,
        kernel_shape=(kernel_h, kernel_w, in_channels, out_channels),
        strides=conv.stride,
        # Bias is counted unconditionally; detection from conv.bias was
        # deliberately disabled by the author.
        use_bias=True,
        padding="same",
        activation="relu",
    )
    return counting.Conv2D(**descriptor_kwargs)

def get_add_op(conv_downsample: nn.Conv2d):
    """Build the ``counting.Add`` descriptor associated with a downsample conv.

    Args:
        conv_downsample: Convolution whose ``_input_shape`` attribute has
            already been recorded.

    Returns:
        ``counting.Add`` sized by the last dimension of the recorded input
        shape and the convolution's ``out_channels``.
    """
    spatial_size = conv_downsample._input_shape[-1]
    channel_count = conv_downsample.out_channels
    return counting.Add(input_size=spatial_size, n_channels=channel_count)

def get_linear_op(linear: nn.Linear, use_relu_activation: bool = True):
    """Build the ``counting.FullyConnected`` descriptor for a linear layer.

    The descriptor depends only on the weight shape, so the layer does not
    need a recorded ``_input_shape``.

    Args:
        linear: Linear layer to describe.
        use_relu_activation: When ``True`` the op is counted with a ReLU
            activation, otherwise with no activation.

    Returns:
        ``counting.FullyConnected`` with ``kernel_shape=(in, out)``. Bias is
        always counted, regardless of ``linear.bias``.
    """
    c_out, c_in = linear.weight.shape
    return counting.FullyConnected(
        kernel_shape=(c_in, c_out),
        # Bias is counted unconditionally; detection from linear.bias was
        # deliberately disabled by the author.
        use_bias=True,
        activation="relu" if use_relu_activation else None,
    )

In [7]:
from typing import List, Optional, NamedTuple
import numpy as np

def get_op_from_module(m):
    """Translate a torch module into its ``counting`` op descriptor.

    Args:
        m: Module to translate.

    Returns:
        The result of ``get_conv_op`` for ``nn.Conv2d``, the result of
        ``get_linear_op`` (without ReLU) for ``nn.Linear``, and ``None`` for
        any other module type.
    """
    if isinstance(m, nn.Conv2d):
        op = get_conv_op(m)
    elif isinstance(m, nn.Linear):
        # Only 1 layer -- presumably the final classifier, hence no ReLU.
        op = get_linear_op(m, use_relu_activation=False)
    else:
        op = None
    return op

def get_names_and_ops(
    module,
    target_names: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Map each target layer name to its ``counting`` op descriptor.

    Args:
        module: Model whose ``named_modules()`` are inspected.
        target_names: Qualified names of the layers to describe. When ``None``
            the names returned by ``get_names_and_W(module)`` are used.

    Returns:
        Dict keyed by the target names, in their given order. A value is the
        descriptor returned by ``get_op_from_module`` or ``None`` when no
        sub-module with that name exists (or its type is unsupported).
        Residual-add ops are not included.
    """
    if target_names is None:
        target_names, _ = get_names_and_W(module)
    names_ops = {k: None for k in target_names}
    for n, m in module.named_modules():
        # Dict membership keeps the lookup O(1) per module.
        if n in names_ops:
            names_ops[n] = get_op_from_module(m)
    return names_ops


def get_model_info(m, p):
    """Aggregate FLOPs, parameter bits and global sparsity for a model.

    Args:
        m: Model passed to ``get_names_and_ops``.
        p: Pruner exposing a per-layer sparsity sequence ``S``, or ``None`` for
            a dense model. ``None`` entries inside ``S`` are treated as dense
            layers (sparsity 0).

    Returns:
        Tuple ``(total_flops, total_param_bits, global_sparsity)`` where
        ``total_flops`` sums multiplies and adds reported by
        ``counting.count_ops`` with ``param_bits=32``, and ``global_sparsity``
        is the fraction of zeroed parameters (``0.0`` when the model has no
        counted parameters).
    """
    names_ops = get_names_and_ops(m)
    if p is not None:
        S = p.S
    else:
        S = [0.0] * len(names_ops)

    total_flops = 0
    total_param_bits = 0
    total_params = 0.0
    n_zeros = 0.0
    for s, o in zip(S, names_ops.values()):
        if s is None:
            s = 0.0  # a missing sparsity entry means the layer is dense
        param_count, n_mults, n_adds = counting.count_ops(o, s, param_bits=32)
        total_flops += n_mults + n_adds
        if isinstance(o, counting.Add):
            continue  # add ops contribute FLOPs but carry no parameters
        n_param = np.prod(o.kernel_shape)
        total_param_bits += param_count
        total_params += n_param
        n_zeros += int(n_param * s)

    global_sparsity = n_zeros / total_params if total_params else 0.0
    return total_flops, total_param_bits, global_sparsity

In [8]:
# NOTE(review): `total_flops` is only ever assigned inside functions in the
# visible cells, so this cell raises NameError on a fresh kernel. Compute the
# value first (e.g. from get_model_info) or drop the cell.
total_flops/1e9

NameError: name 'total_flops' is not defined

In [None]:
def init_input_shape(m, input_shape=(1, 3, 224, 224)):
    """Run one dummy forward pass so instrumented layers record input shapes.

    The model is moved to CPU, the layers returned by ``get_names_and_W`` are
    instrumented with ``register_forward_hook``, and a tensor of ones is pushed
    through the model.

    Args:
        m: Model to instrument; moved to CPU in place.
        input_shape: Shape of the dummy batch. Defaults to ``(1, 3, 224, 224)``.

    Returns:
        None. The effect is the state recorded on the instrumented layers.
    """
    m.to("cpu")
    masked_layers, _ = get_names_and_W(m)
    register_forward_hook(m, masked_layers)
    dummy_batch = torch.ones(size=input_shape)
    # Only the shapes are of interest, so skip building an autograd graph.
    with torch.no_grad():
        m(dummy_batch)

In [None]:
def get_flops_df(
    model_name,
    dataset,
    dense_allocations=("null", 0.01, 0.05, 0.0625, 0.1, 0.2, 0.25),
):
    """Tabulate inference/training FLOPs per dense allocation.

    For every dense allocation a fresh model/pruner is loaded, layer input
    shapes are recorded with ``init_input_shape`` and the FLOPs are taken from
    ``get_model_info``.

    Args:
        model_name: Model name forwarded to ``get_pruner_model_loader``.
        dataset: Dataset name forwarded to ``get_pruner_model_loader``.
        dense_allocations: Allocations to evaluate. Must contain exactly one
            ``"null"`` entry, which is used as the normalisation baseline.

    Returns:
        ``pandas.DataFrame`` with columns ``rigl.dense_allocation``, ``flops``,
        ``model``, ``normalized_flops``, ``training_flops`` and
        ``normalized_training_flops``.

    Raises:
        ValueError: If the ``"null"`` baseline row is missing or duplicated.
    """
    # Training cost is taken as 3x the inference cost for every row.
    # NOTE(review): presumably forward + backward ~= 3x forward -- confirm.
    training_flops_multiplier = 3

    records = []
    for da in dense_allocations:
        print(f"Calculating with dense_alloc == {da}")
        pruner, model, _ = get_pruner_model_loader(da, model_name, dataset)
        model.eval()
        model.to("cpu")
        init_input_shape(model)
        total_flops, _params, _global_sparsity = get_model_info(model, pruner)
        # Release the model and pruner before loading the next configuration.
        del model
        del pruner
        records.append(
            {"rigl.dense_allocation": da, "flops": total_flops, "model": model_name}
        )

    df = pd.DataFrame(records, columns=["rigl.dense_allocation", "flops", "model"])
    is_dense = df["rigl.dense_allocation"] == "null"
    df["normalized_flops"] = df["flops"] / df.loc[is_dense, "flops"].item()
    # The previous chained-indexing write (df.loc[mask]["training_flops"] = ...)
    # assigned to a temporary copy and never reached `df`; it also repeated the
    # value computed here, so one vectorised assignment covers every row.
    df["training_flops"] = df["flops"] * training_flops_multiplier
    df["normalized_training_flops"] = (
        df["training_flops"] / df.loc[is_dense, "training_flops"].item()
    )
    return df
    
# Recompute the FLOPs table with the get_flops_df defined in this cell and
# display it.
df = get_flops_df("resnet50", "imagenet")
df

Global seed set to 42


Calculating with dense_alloc == null
{'no_cuda': False, 'cuda_kwargs': {'num_workers': '${ oc.decode:${oc.env:NUM_WORKERS} }', 'pin_memory': True}, 'distributed': False, 'world_size': 4, 'dist_backend': 'nccl'}
loading to device rank: 0


INFO:/home/mike/condensed-sparsity/src/rigl_torch/models/model_factory.py:Loading model resnet50/imagenet using <function get_imagenet_resnet50 at 0x7f93be862170> with args: () and kwargs: {}


NameError: name 'init_input_shape' is not defined

In [None]:
# Display whatever FLOPs table `df` currently holds.
df

Unnamed: 0,rigl.dense_allocation,flops,model,normalized_flops
0,,4089184000.0,resnet50,1.0
1,0.01,99517040.0,resnet50,0.024337
2,0.05,497551600.0,resnet50,0.121675
3,0.0625,621925700.0,resnet50,0.15209
4,0.1,988923800.0,resnet50,0.241839
5,0.2,1692089000.0,resnet50,0.413796
6,0.25,1946310000.0,resnet50,0.475965


In [None]:
# Show the `flops` column divided by 1e9.
df["flops"]/1e9

0    4.089184
1    0.099517
2    0.497552
3    0.621926
4    0.988924
5    1.692089
6    1.946310
Name: flops, dtype: float64

In [None]:
# Load the saved FLOPs table, using the stored "Unnamed: 0" column as index.
# NOTE(review): the variable name says "pytorch_profiler" but the file name
# says fvcore -- confirm which tool produced this file.
pytorch_profiler_flops = pd.read_csv("../flops_fvcore.csv", index_col="Unnamed: 0")
pytorch_profiler_flops

Unnamed: 0,rigl.dense_allocation,flops,model
0,0.01,99517040.0,resnet50
1,0.05,497551600.0,resnet50
2,0.0625,621925700.0,resnet50
3,0.1,988923800.0,resnet50
4,0.2,1692089000.0,resnet50
5,0.25,1946310000.0,resnet50


In [None]:
# Load the saved training-FLOPs table, using "Unnamed: 0" as the index.
# NOTE(review): an earlier cell writes "../train_flops_fvcore.csv", not the
# name read here -- confirm the file name.
pytorch_profiler_flops = pd.read_csv("../training_flops_fvcore.csv", index_col="Unnamed: 0")
pytorch_profiler_flops