In [1]:
import torch
from models import ClipCaptionModel, RobertaDiscriminator

def print_model_architecture(prefix_length=10, prefix_dim=768, num_layers=4,
                             mapping_type='mlp', discriminator_type='roberta-base',
                             sc_baseline_type='greedy', max_gen_length=20, device='cpu'):
    """Build the caption generator and the discriminator, then print both modules.

    Every argument defaults to the value that used to be hard-coded in this
    function, so a call with no arguments behaves exactly as before.

    Args:
        prefix_length: Forwarded to ``ClipCaptionModel`` twice: as its first
            positional argument and as ``clip_length``.
        prefix_dim: Forwarded as ``prefix_size`` (768 is the example value
            for ViT-L/14).
        num_layers: Forwarded as ``num_layers``.
        mapping_type: Forwarded as ``mapping_type``; 'mlp' or 'transformer'
            depending on the setup.
        discriminator_type: Stored on the args object and also passed as the
            third positional argument of ``RobertaDiscriminator``.
        sc_baseline_type: Stored on the args object and also passed as the
            second positional argument of ``RobertaDiscriminator``.
        max_gen_length: Stored on the args object as ``max_gen_length``.
        device: Stored on the args object as ``device`` ('cpu' or 'cuda').
            This function does not move either model itself.

    Returns:
        None. Both models are written to stdout via ``print``.
    """
    # Local import keeps the block self-contained.
    from types import SimpleNamespace

    # Create instance of the generator
    generator = ClipCaptionModel(
        prefix_length,
        clip_length=prefix_length,
        prefix_size=prefix_dim,
        num_layers=num_layers,
        mapping_type=mapping_type,
    )

    # Plain attribute bag carrying the fields the original ad-hoc Args class set.
    args = SimpleNamespace(
        sc_baseline_type=sc_baseline_type,
        discriminator_type=discriminator_type,
        max_gen_length=max_gen_length,
        device=device,
    )

    # Create instance of the discriminator
    discriminator = RobertaDiscriminator(args, args.sc_baseline_type, args.discriminator_type)

    # Print the models
    print("Generator Model Architecture:")
    print(generator)
    print("\nDiscriminator Model Architecture:")
    print(discriminator)

# Entry point: print both architectures when this code is executed directly.
# The guard keeps an import of this code from building the models as a side effect.
if __name__ == "__main__":
    print_model_architecture()


Using MLP as Mapper


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generator Model Architecture:
ClipCaptionModel(
  (gpt): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )
  (

In [2]:
import torch
from torchsummary import summary
from models import ClipCaptionModel, RobertaDiscriminator

def print_model_architecture(prefix_length=10, prefix_dim=768, num_layers=4,
                             mapping_type='mlp', discriminator_type='roberta-base',
                             sc_baseline_type='greedy', max_gen_length=20, device=None):
    """Build the generator and discriminator and print a torchsummary report for each.

    Every argument defaults to the value that used to be hard-coded in this
    function. The one exception is ``device``, which now falls back to the CPU
    when CUDA is unavailable instead of failing on ``.to('cuda')``.

    Args:
        prefix_length: Forwarded to ``ClipCaptionModel`` twice: as its first
            positional argument and as ``clip_length``.
        prefix_dim: Forwarded as ``prefix_size`` (768 is the example value
            for ViT-L/14) and used as the ``input_size`` given to ``summary``.
        num_layers: Forwarded as ``num_layers``.
        mapping_type: Forwarded as ``mapping_type``; 'mlp' or 'transformer'
            depending on the setup.
        discriminator_type: Stored on the args object and also passed as the
            third positional argument of ``RobertaDiscriminator``.
        sc_baseline_type: Stored on the args object and also passed as the
            second positional argument of ``RobertaDiscriminator``.
        max_gen_length: Stored on the args object as ``max_gen_length``.
        device: Device both models are moved to, and the value handed to
            ``summary`` and stored on the args object. ``None`` (the default)
            selects 'cuda' when ``torch.cuda.is_available()`` and 'cpu' otherwise.

    Returns:
        None. The section headings are printed here; the tables themselves are
        produced by ``summary``.
    """
    # Local import keeps the block self-contained.
    from types import SimpleNamespace

    if device is None:
        # Previously 'cuda' was unconditional, which breaks on CPU-only machines.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Create instance of the generator
    generator = ClipCaptionModel(
        prefix_length,
        clip_length=prefix_length,
        prefix_size=prefix_dim,
        num_layers=num_layers,
        mapping_type=mapping_type,
    )
    generator.to(device)

    # Plain attribute bag carrying the fields the original ad-hoc Args class set.
    args = SimpleNamespace(
        sc_baseline_type=sc_baseline_type,
        discriminator_type=discriminator_type,
        max_gen_length=max_gen_length,
        device=device,
    )
    discriminator = RobertaDiscriminator(args, args.sc_baseline_type, args.discriminator_type)
    discriminator.to(device)

    # Print the models using torchsummary
    print("Generator Model Architecture:")
    # NOTE(review): the recorded output of this cell lists only a "Param #"
    # column, which suggests no forward pass was traced with this input_size.
    # Confirm how the installed torchsummary interprets the keyword.
    summary(generator, input_size=(prefix_dim,), device=device)

    print("\nDiscriminator Model Architecture:")
    # Adjust input_size based on the expected input dimensions to the discriminator.
    # NOTE(review): (prefix_dim,) is reused from the generator here; verify it
    # matches what the discriminator's forward actually expects.
    summary(discriminator, input_size=(prefix_dim,), device=device)

# Entry point: print both summaries when this code is executed directly.
# The guard keeps an import of this code from building the models as a side effect.
if __name__ == "__main__":
    print_model_architecture()


Using MLP as Mapper


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generator Model Architecture:
Layer (type:depth-idx)                   Param #
├─GPT2LMHeadModel: 1-1                   --
|    └─GPT2Model: 2-1                    --
|    |    └─Embedding: 3-1               38,597,376
|    |    └─Embedding: 3-2               786,432
|    |    └─Dropout: 3-3                 --
|    |    └─ModuleList: 3-4              85,054,464
|    |    └─LayerNorm: 3-5               1,536
|    └─Linear: 2-2                       38,597,376
├─MLP: 1-2                               --
|    └─Sequential: 2-3                   --
|    |    └─Linear: 3-6                  2,952,960
|    |    └─Tanh: 3-7                    --
|    |    └─Linear: 3-8                  29,498,880
Total params: 195,489,024
Trainable params: 195,489,024
Non-trainable params: 0

Discriminator Model Architecture:
Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               38,603,52