In [1]:
import argparse
import os

from model.visualglm import VisualGLMModel

[2023-12-12 18:47:11,214] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained "visualglm-6b" checkpoint. from_pretrained is unpacked
# into a (model, model_args) pair; the options are passed as an
# argparse.Namespace rather than parsed from a command line.
# NOTE(review): fp16/skip_init/use_gpu_initialization/device presumably select
# half-precision weights created directly on the CUDA device — confirm against
# the loader; device='cuda' means this cell needs a GPU.
model, model_args = VisualGLMModel.from_pretrained(
    name="visualglm-6b",
    args=argparse.Namespace(
        fp16=True,
        skip_init=True,
        use_gpu_initialization=True,
        device='cuda',
        cls_fusion=True,
    ),
)
# Rebind `model` to the result of .eval(). Assuming standard torch.nn.Module
# semantics this switches the model to evaluation mode — TODO confirm for
# VisualGLMModel.
model = model.eval()

[2023-12-12 18:47:12,617] [INFO] building VisualGLMModel model ...
[2023-12-12 18:47:12,645] [INFO] [RANK 0] > initializing model parallel with size 1
[2023-12-12 18:47:12,647] [INFO] [RANK 0] You are using model-only mode.
For torch.distributed users or loading model parallel models, set environment variables RANK, WORLD_SIZE and LOCAL_RANK.
[2023-12-12 18:47:21,535] [INFO] [RANK 0]  > number of parameters on model parallel rank 0: 7802201600


In [3]:
def show_children_struct(model):
    """Print every direct child of ``model`` as a name / module pair.

    Walks ``model.named_children()`` and, for each entry, prints a dashed
    rule, the child's name, the child object itself, and a closing dashed
    rule. Nothing is returned.
    """
    rule = '-----------------'
    for child_name, child_module in model.named_children():
        print(rule)
        print(child_name)
        print(child_module)
        print(rule)

def count_model_parameters(model):
    """Print the total number of parameter elements of ``model`` in millions.

    Adds up ``nelement()`` over everything yielded by ``model.parameters()``
    and prints the sum divided by 1e6 with two decimals. Nothing is returned.
    """
    n_elements = 0
    for tensor in model.parameters():
        n_elements += tensor.nelement()
    millions = n_elements / 1e6
    print("Number of parameter: %.2fM" % millions)

In [4]:
# Print the name and module repr of each direct child of the loaded VisualGLM model.
show_children_struct(model)

-----------------
mixins
ModuleDict(
  (chatglm-final): ChatGLMFinalMixin(
    (lm_head): ColumnParallelLinear()
  )
  (chatglm-attn): ChatGLMAttnMixin(
    (rotary_emb): RotaryEmbedding()
  )
  (chatglm-layer): ChatGLMLayerMixin()
  (eva): ImageMixin(
    (model): BLIP2(
      (vit): EVAViT(
        (mixins): ModuleDict(
          (patch_embedding): ImagePatchEmbeddingMixin(
            (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
          )
          (pos_embedding): InterpolatedPositionEmbeddingMixin()
          (cls): LNFinalyMixin(
            (ln_vision): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)
          )
        )
        (transformer): BaseTransformer(
          (embedding_dropout): Dropout(p=0.1, inplace=False)
          (word_embeddings): Embedding(1, 1408)
          (position_embeddings): Embedding(257, 1408)
          (layers): ModuleList(
            (0-38): 39 x BaseTransformerLayer(
              (input_layernorm): LayerNorm((1408,), ep

In [5]:
# Drill into the `mixins` ModuleDict shown in the previous dump and list its
# entries one by one.
mixins = model.mixins
show_children_struct(mixins)

-----------------
chatglm-final
ChatGLMFinalMixin(
  (lm_head): ColumnParallelLinear()
)
-----------------
-----------------
chatglm-attn
ChatGLMAttnMixin(
  (rotary_emb): RotaryEmbedding()
)
-----------------
-----------------
chatglm-layer
ChatGLMLayerMixin()
-----------------
-----------------
eva
ImageMixin(
  (model): BLIP2(
    (vit): EVAViT(
      (mixins): ModuleDict(
        (patch_embedding): ImagePatchEmbeddingMixin(
          (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
        )
        (pos_embedding): InterpolatedPositionEmbeddingMixin()
        (cls): LNFinalyMixin(
          (ln_vision): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)
        )
      )
      (transformer): BaseTransformer(
        (embedding_dropout): Dropout(p=0.1, inplace=False)
        (word_embeddings): Embedding(1, 1408)
        (position_embeddings): Embedding(257, 1408)
        (layers): ModuleList(
          (0-38): 39 x BaseTransformerLayer(
            (input_layernor

In [6]:
# `eva` is the ImageMixin entry of the mixins (it wraps a BLIP2 model, per the
# dump above); print its direct children.
eva = model.mixins.eva
show_children_struct(eva)

-----------------
model
BLIP2(
  (vit): EVAViT(
    (mixins): ModuleDict(
      (patch_embedding): ImagePatchEmbeddingMixin(
        (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
      )
      (pos_embedding): InterpolatedPositionEmbeddingMixin()
      (cls): LNFinalyMixin(
        (ln_vision): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)
      )
    )
    (transformer): BaseTransformer(
      (embedding_dropout): Dropout(p=0.1, inplace=False)
      (word_embeddings): Embedding(1, 1408)
      (position_embeddings): Embedding(257, 1408)
      (layers): ModuleList(
        (0-38): 39 x BaseTransformerLayer(
          (input_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (attention): SelfAttention(
            (query_key_value): ColumnParallelLinear()
            (attention_dropout): Dropout(p=0.1, inplace=False)
            (dense): RowParallelLinear()
            (output_dropout): Dropout(p=0.1, inplace=False)
          )
        

In [7]:
# Total parameter count of the `eva` image mixin, printed in millions.
count_model_parameters(eva)

Number of parameter: 1094.27M


In [8]:
# Pull the `vit` and `qformer` submodules out of the BLIP2 model and report
# each one's parameter count separately (printed in millions).
vit = eva.model.vit
qformer = eva.model.qformer
count_model_parameters(vit)
count_model_parameters(qformer)

Number of parameter: 985.95M
Number of parameter: 105.16M


In [9]:
# Print the direct children of the ViT submodule.
show_children_struct(vit)

-----------------
mixins
ModuleDict(
  (patch_embedding): ImagePatchEmbeddingMixin(
    (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
  )
  (pos_embedding): InterpolatedPositionEmbeddingMixin()
  (cls): LNFinalyMixin(
    (ln_vision): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)
  )
)
-----------------
-----------------
transformer
BaseTransformer(
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (word_embeddings): Embedding(1, 1408)
  (position_embeddings): Embedding(257, 1408)
  (layers): ModuleList(
    (0-38): 39 x BaseTransformerLayer(
      (input_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (attention): SelfAttention(
        (query_key_value): ColumnParallelLinear()
        (attention_dropout): Dropout(p=0.1, inplace=False)
        (dense): RowParallelLinear()
        (output_dropout): Dropout(p=0.1, inplace=False)
      )
      (post_attention_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (

In [10]:
# Go one level deeper: print the children of the ViT's `transformer` module.
vit_transformer = vit.transformer
show_children_struct(vit_transformer)

-----------------
embedding_dropout
Dropout(p=0.1, inplace=False)
-----------------
-----------------
word_embeddings
Embedding(1, 1408)
-----------------
-----------------
position_embeddings
Embedding(257, 1408)
-----------------
-----------------
layers
ModuleList(
  (0-38): 39 x BaseTransformerLayer(
    (input_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (attention): SelfAttention(
      (query_key_value): ColumnParallelLinear()
      (attention_dropout): Dropout(p=0.1, inplace=False)
      (dense): RowParallelLinear()
      (output_dropout): Dropout(p=0.1, inplace=False)
    )
    (post_attention_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (mlp): MLP(
      (dense_h_to_4h): ColumnParallelLinear()
      (dense_4h_to_h): RowParallelLinear()
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)
-----------------


In [11]:
from transformers import AutoModel

In [12]:
# Location of the ChatGLM-6B checkpoint. Defaults to the original local path,
# but can be overridden with the CHATGLM_6B_PATH environment variable so the
# notebook runs on other machines without editing this cell.
CHATGLM_6B_PATH = os.environ.get("CHATGLM_6B_PATH") or "/home/qianq/model/chatglm-6b"

# NOTE: this rebinds `model`, replacing the VisualGLM model loaded earlier in
# the notebook. Re-running earlier cells that use `model` after this one will
# operate on the ChatGLM checkpoint instead.
# trust_remote_code=True lets the checkpoint's own Python code run during
# loading, so only point CHATGLM_6B_PATH at a directory you trust.
model = AutoModel.from_pretrained(CHATGLM_6B_PATH, trust_remote_code=True)

Loading checkpoint shards: 100%|██████████| 8/8 [00:11<00:00,  1.41s/it]


In [13]:
# `model` now refers to the checkpoint loaded through AutoModel; print its
# direct children for comparison with the VisualGLM structure shown earlier.
show_children_struct(model)

-----------------
transformer
ChatGLMModel(
  (word_embeddings): Embedding(130528, 4096)
  (layers): ModuleList(
    (0-27): 28 x GLMBlock(
      (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (attention): SelfAttention(
        (rotary_emb): RotaryEmbedding()
        (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)
        (dense): Linear(in_features=4096, out_features=4096, bias=True)
      )
      (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (mlp): GLU(
        (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)
        (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)
      )
    )
  )
  (final_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
)
-----------------
-----------------
lm_head
Linear(in_features=4096, out_features=130528, bias=False)
-----------------


: 