<a href="https://colab.research.google.com/github/ameek2/CISC662/blob/master/empirical_FLOPS_TPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook adapted from Marius Hobbhahn, link: https://www.lesswrong.com/posts/jJApGWG95495pYM7C/how-to-measure-flop-s-for-neural-networks-empirically

We are adapting this code to run on both GPU and TPUs to determine NN utilization for analysis in roofline models. This is for a project for our Computer Architecture class (CISC 662 at the University of Delaware).

### Measuring FLOPS in Pytorch

To test FLOPS/s for currently used ML models we train multiple different classic NN architectures by training them for 10 epochs on CIFAR10.

By reconnecting our Colab instance a bunch of times, we will try to get different GPUs to make the process comparable.

In [1]:
### set up google files
from google.colab import drive
drive.mount("/content/drive", force_remount=True) 

file_location = '/content/drive/My Drive/Parameters, Compute and Data Trends in Machine Learning/empirical_FLOPs/'
data_file_location = file_location + 'data/'

Mounted at /content/drive


In [2]:
!nvidia-smi
#Tesla P100-PCIE... (Austin note: when we use a GPU accelerator, it may be a P100 or a T4)

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [6]:
!pip install --upgrade git+https://github.com/sovrasov/flops-counter.pytorch.git
!pip install -U 'git+https://github.com/facebookresearch/fvcore'
!pip install onnx -U
!pip install fprint
!pip install pthflops -U
!pip install --upgrade git+https://github.com/Lyken17/pytorch-OpCounter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/sovrasov/flops-counter.pytorch.git
  Cloning https://github.com/sovrasov/flops-counter.pytorch.git to /tmp/pip-req-build-axfcg71e
  Running command git clone -q https://github.com/sovrasov/flops-counter.pytorch.git /tmp/pip-req-build-axfcg71e
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/facebookresearch/fvcore
  Cloning https://github.com/facebookresearch/fvcore to /tmp/pip-req-build-tifn8zur
  Running command git clone -q https://github.com/facebookresearch/fvcore /tmp/pip-req-build-tifn8zur
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels

In [9]:
import torch
import torchvision
import torchvision.transforms as transforms
import time
import matplotlib.pyplot as plt
import torch.optim as optim
import torchvision.models as models

#for working with TPUs
import torch_xla
import torch_xla.core.xla_model as xm

ImportError: ignored

In [6]:
### import dataset

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

batch_size_default = 128

CIFAR10_testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
CIFAR10_testloader = torch.utils.data.DataLoader(CIFAR10_testset, batch_size=batch_size_default,
                                         shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data


In [7]:
### import different models
import torch.nn as nn
import torch.nn.functional as F

class TestNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 5, 10)
        self.conv2 = nn.Conv2d(5, 6, 10)
        self.fc1 = nn.Linear(254616, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = self.fc1(x)
        return x

test_net = TestNet().cuda()

resnet18 = models.resnet18(num_classes=10).cuda()
resnet34 = models.resnet34(num_classes=10).cuda()
resnet50 = models.resnet50(num_classes=10).cuda()
resnet101 = models.resnet101(num_classes=10).cuda()
resnet152 = models.resnet152(num_classes=10).cuda()

vgg11 = models.vgg11(num_classes=10).cuda()
vgg13 = models.vgg13(num_classes=10).cuda()
vgg16 = models.vgg16(num_classes=10).cuda()
vgg19 = models.vgg19(num_classes=10).cuda()

wide_resnet50_2 = models.wide_resnet50_2(num_classes=10).cuda()
alexnet = models.alexnet(num_classes=10).cuda()
mobilenet_v2 = models.mobilenet_v2(num_classes=10).cuda()
efficientnet_b0 = models.efficientnet_b0(num_classes=10).cuda()

In [8]:
### set up flop counting libraries
import ptflops
import fvcore
from pthflops import count_ops#
from ptflops import get_model_complexity_info
from fvcore.nn import FlopCountAnalysis, parameter_count
from thop.profile import profile as thop_profile
from torch.profiler import profile, record_function, ProfilerActivity

In [9]:
# using fvcore
def get_flops_fvcore_per_layer(model, inputs=(torch.randn((1,3,224,224)).cuda(),)):

    FMAs_per_layer = FlopCountAnalysis(model, inputs).by_module_and_operator()
    return(FMAs_per_layer)

def get_flops_ptflops_per_layer(model, input_dims=(3, 224, 224)):

    with torch.cuda.device(0):
        FMAs, _ = get_model_complexity_info(model, input_dims, as_strings=False,
                                            print_per_layer_stat=True, verbose=False)
    return(FMAs, _)

print(get_flops_fvcore_per_layer(vgg13, inputs=(torch.randn((1,3,224,224)).cuda(),)))
print(get_flops_ptflops_per_layer(vgg13, input_dims=(3, 224, 224)))



{'': Counter({'conv': 11184832512, 'linear': 119578624, 'adaptive_avg_pool2d': 25088}), 'features': Counter({'conv': 11184832512}), 'features.0': Counter({'conv': 86704128}), 'features.1': Counter(), 'features.2': Counter({'conv': 1849688064}), 'features.3': Counter(), 'features.4': Counter(), 'features.5': Counter({'conv': 924844032}), 'features.6': Counter(), 'features.7': Counter({'conv': 1849688064}), 'features.8': Counter(), 'features.9': Counter(), 'features.10': Counter({'conv': 924844032}), 'features.11': Counter(), 'features.12': Counter({'conv': 1849688064}), 'features.13': Counter(), 'features.14': Counter(), 'features.15': Counter({'conv': 924844032}), 'features.16': Counter(), 'features.17': Counter({'conv': 1849688064}), 'features.18': Counter(), 'features.19': Counter(), 'features.20': Counter({'conv': 462422016}), 'features.21': Counter(), 'features.22': Counter({'conv': 462422016}), 'features.23': Counter(), 'features.24': Counter(), 'avgpool': Counter({'adaptive_avg_p

In [10]:
### returns the flops and number of parameters of a model
# using ptflops
def get_flops_ptflops(model, input_dims=(3, 224, 224)):

    with torch.cuda.device(0):
        macs, num_params = get_model_complexity_info(model, input_dims, as_strings=False,
                                            print_per_layer_stat=False, verbose=False)
    return(macs, num_params)

# using fvcore
def get_flops_fvcore(model, inputs=(torch.randn((1,3,224,224)),)):

    flops = FlopCountAnalysis(model, inputs)
    total_flops = flops.total()
    num_params = parameter_count(model)['']
    return(total_flops, num_params)

# using thop
def get_flops_thop(model, inputs=(torch.randn((1,3,224,224)),)):

    macs, params = thop_profile(model, inputs=inputs)
    return(macs, params)

# using pthflops
def get_flops_pthflops(model, inputs=torch.randn(1,3,224,224)):

    flops = count_ops(model, inputs)
    return(flops[0])


# using the profiler
def get_flops_profiler_old(model, inputs=(torch.randn((1,3,224,224)),)):

    with profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        with_flops=True) as prof:
        
        outputs = model(inputs)

    events = prof.events()
    flops = sum([int(evt.flops) for evt in events]) 
    flops = flops / 2 # divide by 2 because of FMAs (see text)
    return(flops)

    # using the profiler
def get_flops_profiler(model, inputs=(torch.randn((1,3,224,224)),)):

    # warm up cuda memory allocator, recommended here: https://github.com/pytorch/pytorch/blob/master/torch/autograd/profiler.py
    outputs = model(inputs)

    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        with_flops=True) as prof:
            with record_function("model_inference"):
                outputs = model(inputs)

    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
    events = prof.events()
    flops = sum([int(evt.flops) for evt in events]) 
    flops = flops / 2 # divide by 2 because of FMAs (see text)
    return(flops)

def get_flops_profiler_bw(model, inputs=(torch.randn((1,3,224,224)),), y=torch.tensor([0]).cuda()):

    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()
    # warm up cuda memory allocator, recommended here: https://github.com/pytorch/pytorch/blob/master/torch/autograd/profiler.py
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()

    ### measure only backward pass
    optimizer.zero_grad()
    outputs = model(inputs)
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        with_flops=True) as prof:
            with record_function("model_inference"):
                loss = criterion(outputs, y)
                loss.backward()
                optimizer.step()
                

    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
    events = prof.events()
    flops = sum([int(evt.flops) for evt in events]) 
    flops = flops / 2 # divide by 2 because of FMAs (see text)
    return(flops)

In [11]:
#flops_profiler = get_flops_profiler(resnet18, inputs=torch.randn((1,3,224,224)).cuda())
#print("profiler: ", flops_profiler)

flops_profiler = get_flops_profiler_bw(resnet18, inputs=torch.randn((1,3,224,224)).cuda())
print("profiler: ", flops_profiler)
### doesn't seem to measure the backward pass ... 

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  Total KFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
autograd::engine::evaluate_function: ConvolutionBack...         1.21%     359.000us        14.19%       4.203ms     210.150us       0.000us         0.00%       6.231ms     311.550us            20            --  
                                   ConvolutionBackward0         0.57%     169.000us        12.38%       3.669ms     183.450us       0.000us         0.0

In [12]:
def get_flops_profiler_full_table(model, inputs=torch.randn(1,3,224,224).cuda()):

    # warm up cuda memory allocator, recommended here: https://github.com/pytorch/pytorch/blob/master/torch/autograd/profiler.py
    #outputs = model(inputs)

    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        with_flops=True) as prof:
            with record_function("model_inference"):
                outputs = model(inputs)

    print(prof.key_averages().table(row_limit=1000))
    #print(prof.events())
    events = prof.events()
    flops = sum([int(evt.flops) for evt in events]) 
    flops = flops / 2 # divide by 2 because of FMAs (see text)
    return(flops)

get_flops_profiler_full_table(alexnet, inputs=torch.randn(1,3,224,224).cuda())

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::zeros         0.30%      20.000us         0.46%      30.000us      30.000us       0.000us         0.00%       0.000us       0.000us             1            --  
                                            aten::empty         1.82%     120.000us         1.82%     120.000us      15.000us       0.000us         0.0

710133440.0

In [21]:
### test for TestNet
flops_ptflops, num_params_ptflops = get_flops_ptflops(test_net, input_dims=(3, 224, 224))
print("ptflops: ", flops_ptflops, "num_params: ", num_params_ptflops)

flops_fvcore, num_params_fvcore = get_flops_fvcore(test_net, inputs=(torch.randn((1,3,224,224)).cuda(),))
print("fvcore: ", flops_fvcore, "num_params: ", num_params_fvcore)

#flops_thop, num_params_thop = get_flops_thop(test_net, inputs=(torch.randn((1,3,224,224)),))
#print("thop: ", flops_thop, "num_params: ", num_params_thop)
# some problem with fprint

flops_pthflops = get_flops_pthflops(test_net, inputs=torch.randn(1,3,224,224).cuda())
print("pthflops: ", flops_pthflops)

flops_profiler = get_flops_profiler(test_net, inputs=torch.randn((1,3,224,224)).cuda())
print("profiler: ", flops_profiler)

ptflops:  199677411.0 num_params:  2550681
fvcore:  199191660 num_params:  2550681
OperationOPS         
------  ----------  
conv1   69568625    
conv2   127562616   
fc1     2546170     
-----   ---------   
Input size: (1, 3, 224, 224)
199,677,411 FLOPs or approx. 0.20 GFLOPs
pthflops:  199677411
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference       

In [14]:
from torch.autograd import DeviceType

def get_time_profiler(profiler):

    events = profiler.events()
    sum_self_cuda_time_total = 0
    for evt in events:
        if evt.device_type == DeviceType.CPU:
            # in legacy profiler, kernel info is stored in cpu events
            if evt.is_legacy:
                sum_self_cuda_time_total += evt.self_cuda_time_total
        elif evt.device_type == DeviceType.CUDA:
            # in kineto profiler, there're events with the correct device type (e.g. CUDA)
            sum_self_cuda_time_total += evt.self_cuda_time_total

    return(sum_self_cuda_time_total)

def get_time_epoch_profiler(model, trainloader): 

    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    criterion = torch.nn.CrossEntropyLoss()

    ### measure time
    print("measure time")
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ]) as prof:
        for x,y in trainloader:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
    time_ = get_time_profiler(profiler=prof)
    return(time_)

In [None]:
### test with TestNet
#time_testnet = get_time_epoch_profiler(test_net, CIFAR10_testloader)
#print(time_testnet / 1e6)

In [15]:
### compute flops for one forward pass with one datapoint for all methods
def get_flops_forward_all_methods(model): 

    flops_ptflops, num_params_ptflops = get_flops_ptflops(model, input_dims=(3, 224, 224))
    flops_fvcore, num_params_fvcore = get_flops_fvcore(model, inputs=(torch.randn((1,3,224,224)).cuda(),))
    flops_pthflops = get_flops_pthflops(model, inputs=torch.randn(1,3,224,224).cuda())
    flops_profiler = get_flops_profiler(model, inputs=torch.randn((1,3,224,224)).cuda())

    return(flops_ptflops, flops_fvcore, flops_pthflops, flops_profiler, num_params_fvcore)

In [16]:
### run the function for multiple models and batch_sizes

def compare_model_batch_sizes(model, model_name, batch_sizes): 

    ### compute flops for one forward pass on one datapoint
    flops_ptflops, flops_fvcore, flops_pthflops, flops_profiler, num_params_fvcore = get_flops_forward_all_methods(model)

    ### compute times for different batch_sizes
    batch_sizes_time_dict = dict()
    for batch_size in batch_sizes:

        print("model: {}; batch_size: {};".format(model_name, batch_size))
        ### define training data
        testloader = torch.utils.data.DataLoader(CIFAR10_testset, batch_size=batch_size,
                                        shuffle=True, num_workers=1)

        ### compute flops for one epoch an training time for a bunch of epochs
        time_epoch = get_time_epoch_profiler(model, testloader)

        batch_sizes_time_dict[batch_size] = time_epoch

    ### save results for current batch size in dict
    results = {
        "flops_ptflops_forward":flops_ptflops,
        "flops_fvcore_forward":flops_fvcore,
        "flops_pthflops_forward":flops_pthflops,
        "flops_profiler_forward":flops_profiler,
        "num_params":num_params_fvcore,
        "time_epoch_batch_sizes":batch_sizes_time_dict
    }
    print(results)

    return(results)

In [17]:
#batch_sizes_test = [256, 128, 64]
#batch_sizes_test = [128, 64]
batch_sizes_test = [64]
#model, model_name = test_net, "test_net"
#model, model_name = resnet18, "resnet18"
#model, model_name = resnet34, "resnet34"
#model, model_name = resnet50, "resnet50"
#model, model_name = resnet101, "resnet101"
#model, model_name = resnet152, "resnet152"
#model, model_name = vgg11, "vgg11"
#model, model_name = vgg13, "vgg13" 
#model, model_name = vgg16, "vgg16"
#model, model_name = vgg19, "vgg19"
#model, model_name = wide_resnet50_2, "wide_resnet50_2"
#model, model_name = alexnet, "alexnet"
#model, model_name = mobilenet_v2, "mobilenet_v2"
model, model_name = efficientnet_b0, "efficientnet_b0"
t0 = time.time()
results_dict = compare_model_batch_sizes(model, model_name, batch_sizes_test)
t1 = time.time()
print("comparing models took {} seconds".format(t1 - t0))

features.1.0.stochastic_depth, features.2.0.stochastic_depth, features.2.1.stochastic_depth, features.3.0.stochastic_depth, features.3.1.stochastic_depth, features.4.0.stochastic_depth, features.4.1.stochastic_depth, features.4.2.stochastic_depth, features.5.0.stochastic_depth, features.5.1.stochastic_depth, features.5.2.stochastic_depth, features.6.0.stochastic_depth, features.6.1.stochastic_depth, features.6.2.stochastic_depth, features.6.3.stochastic_depth, features.7.0.stochastic_depth


Operation                      OPS        
-----------------------------  ---------  
features_0_0                   10838016   
features_0_1                   802816     
features_1_0_block_0_0         3612672    
features_1_0_block_0_1         802816     
features_1_0_block_1_avgpool   401408     
features_1_0_block_1_fc1       264        
features_1_0_block_1_fc2       288        
mul                            802816     
features_1_0_block_2_0         6422528    
features_1_0_block_2_1         401408     
features_2_0_block_0_0         19267584   
features_2_0_block_0_1         2408448    
features_2_0_block_1_0         2709504    
features_2_0_block_1_1         602112     
features_2_0_block_2_avgpool   301056     
features_2_0_block_2_fc1       388        
features_2_0_block_2_fc2       480        
mul_1                          602112     
features_2_0_block_3_0         7225344    
features_2_0_block_3_1         150528     
features_2_1_block_0_0         10838016   
features_2_

In [18]:
!nvidia-smi

Wed Nov 30 23:19:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    29W /  70W |  10432MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [19]:
results_dict

{'flops_ptflops_forward': 400401542.0,
 'flops_fvcore_forward': 400392192,
 'flops_pthflops_forward': 405434822,
 'flops_profiler_forward': 384552032.0,
 'num_params': 4020358,
 'time_epoch_batch_sizes': {64: 46360468}}

In [20]:
### don't forget to name the GPU that you were using and the right model class (e.g. resnets)
GPU_type = "Tesla_P100"
results_dict['GPU_type'] = GPU_type
torch.save(results_dict, data_file_location + "{}_flops_time_{}.pt".format(model_name, GPU_type))

FileNotFoundError: ignored

Alright, from here on I'll add some loops to use some of the functions above over all the models above.

List of the models:

resnet18

resnet34

resnet50

resnet101

resnet152

vgg11

vgg13

vgg16

vgg19

wide_resnet50_2 

alexnet

mobilenet_v2

efficientnet_b0 

In [34]:
def get_size(model):
  #credit to ptrblack on the pytorch forums for this bit of code
  #https://discuss.pytorch.org/t/finding-model-size/130275
  param_size = 0
  buffer_size = 0
  for param in model.parameters():
    param_size += param.nelement() * param.element_size()
  for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()
  size_all_mb = (param_size + buffer_size) / 1024**2
  return size_all_mb

flops_per_model = {}
size_per_model = {}

flops_ptflops, num_params_ptflops = get_flops_ptflops(resnet18)
flops_per_model['resnet18'] = flops_ptflops
size_per_model['resnet18'] = get_size(resnet18)

flops_ptflops, num_params_ptflops = get_flops_ptflops(resnet34)
flops_per_model['resnet34'] = flops_ptflops
size_per_model['resnet34'] = get_size(resnet34)

flops_ptflops, num_params_ptflops = get_flops_ptflops(resnet50)
flops_per_model['resnet50'] = flops_ptflops
size_per_model['resnet50'] = get_size(resnet50)

flops_ptflops, num_params_ptflops = get_flops_ptflops(resnet101)
flops_per_model['resnet101'] = flops_ptflops
size_per_model['resnet101'] = get_size(resnet101)

flops_ptflops, num_params_ptflops = get_flops_ptflops(resnet152)
flops_per_model['resnet152'] = flops_ptflops
size_per_model['resnet152'] = get_size(resnet152)

flops_ptflops, num_params_ptflops = get_flops_ptflops(vgg11)
flops_per_model['vgg11'] = flops_ptflops
size_per_model['vgg11'] = get_size(vgg11)

flops_ptflops, num_params_ptflops = get_flops_ptflops(vgg13)
flops_per_model['vgg13'] = flops_ptflops
size_per_model['vgg13'] = get_size(vgg13)

flops_ptflops, num_params_ptflops = get_flops_ptflops(vgg16)
flops_per_model['vgg16'] = flops_ptflops
size_per_model['vgg16'] = get_size(vgg16)

flops_ptflops, num_params_ptflops = get_flops_ptflops(vgg19)
flops_per_model['vgg19'] = flops_ptflops
size_per_model['vgg19'] = get_size(vgg19)

flops_ptflops, num_params_ptflops = get_flops_ptflops(wide_resnet50_2)
flops_per_model['wide_resnet50_2'] = flops_ptflops
size_per_model['wide_resnet50_2'] = get_size(wide_resnet50_2)

flops_ptflops, num_params_ptflops = get_flops_ptflops(alexnet)
flops_per_model['alexnet'] = flops_ptflops
size_per_model['alexnet'] = get_size(alexnet)

flops_ptflops, num_params_ptflops = get_flops_ptflops(mobilenet_v2)
flops_per_model['mobilenet_v2'] = flops_ptflops
size_per_model['mobilenet_v2'] = get_size(mobilenet_v2)

flops_ptflops, num_params_ptflops = get_flops_ptflops(efficientnet_b0)
flops_per_model['efficientnet_b0'] = flops_ptflops
size_per_model['efficientnet_b0'] = get_size(efficientnet_b0)

print(flops_per_model)
print(size_per_model)

{'resnet18': 1821669898.0, 'resnet34': 3675121162.0, 'resnet50': 4119896586.0, 'resnet101': 7847471626.0, 'resnet152': 11578659338.0, 'vgg11': 7626050058.0, 'vgg13': 11335059978.0, 'vgg16': 15499467274.0, 'vgg19': 19663874570.0, 'wide_resnet50_2': 11438593034.0, 'alexnet': 711505866.0, 'mobilenet_v2': 318969098.0, 'efficientnet_b0': 400401542.0}
{'resnet18': 42.69135284423828, 'resnet34': 81.27936553955078, 'resnet50': 89.95722961425781, 'resnet101': 162.6060562133789, 'resnet152': 222.4580078125, 'vgg11': 491.36087799072266, 'vgg13': 492.06473541259766, 'vgg16': 512.3196182250977, 'vgg19': 532.5745010375977, 'wide_resnet50_2': 255.2912139892578, 'alexnet': 217.60868072509766, 'mobilenet_v2': 8.662788391113281, 'efficientnet_b0': 15.497100830078125}
