In [1]:
import torch
import re
from open_flamingo.src.helpers import PerceiverResampler, PerceiverAttention, FeedForward, VisionTokenizer
from einops_exts import rearrange_many
# from open_flamingo.src.helpers import Forward, 
DIR = '/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct'
ckpt = torch.load(DIR + '/xgenmm.projector')

In [2]:
# projector = PerceiverResampler(dim=1152, dim_inner=3072, depth=6, dim_head=96,heads=16,num_latents=128)
# projector.load_state_dict(ckpt, strict=True)
from torch import einsum, nn
class MyPerceiverAttention(nn.Module):
    def __init__(self, *, dim, dim_head=64, heads=8):
        super().__init__()
        self.scale = dim_head**-0.5
        self.heads = heads
        inner_dim = dim_head * heads

        self.norm_media = nn.LayerNorm(dim)
        self.norm_latents = nn.LayerNorm(dim)

        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)

    def forward(self, x, latents, vision_attn_masks=None):
        """
        Args:
            x (torch.Tensor): image features
                shape (b, T, n1, D)
            latent (torch.Tensor): latent features
                shape (b, T, n2, D)
        """
        x = self.norm_media(x)
        # print('latents:', latents.shape)
        # print('before ln:', latents)
        latents = self.norm_latents(latents)
        # print('after ln:', latents)
        # print(latents)
        h = self.heads

        q = self.to_q(latents)
        kv_input = torch.cat((x, latents), dim=-2) # TODO: Change the shape of vision attention mask according to this.
        if vision_attn_masks is not None:
            vision_attn_masks = torch.cat((vision_attn_masks, 
                                            torch.ones((latents.shape[0], latents.shape[-2]), dtype=latents.dtype, device=latents.device)),
                                            dim=-1)
        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
        # print('q:', q.shape, 'k:', k.shape)
        # print('q * self.scale:', q * self.scale)
        q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h)
        q = q * self.scale
        # print('q:', q.shape, 'k:', k.shape)
        # print('q', q)
        # attention
        sim = einsum("... i d, ... j d  -> ... i j", q, k)
        # print('sim:', sim.shape)
        # print('sim:', sim)
        # Apply vision attention mask here.
        # Reference: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html#torch.nn.functional.scaled_dot_product_attention
        if vision_attn_masks is not None:
            attn_bias = torch.zeros((q.size(0), 1, 1, q.size(-2), k.size(-2)), dtype=q.dtype, device=q.device)
            vision_attn_masks = repeat(vision_attn_masks, 'b n -> b 1 1 l n', l=q.size(-2))
            attn_bias.masked_fill_(vision_attn_masks.logical_not(), float("-inf"))
            sim += attn_bias
        # print('remove safe softmax')
        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
        attn = sim.softmax(dim=-1)
        # print('attn:', attn.shape)
        # print('attn:', attn)
        
        out = einsum("... i j, ... j d -> ... i d", attn, v)
        out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
        return self.to_out(out)


In [3]:
import torch.nn as nn
torch.set_printoptions(precision=6)
class MyModel(VisionTokenizer):
    def __init__(
        self,
        *,
        dim,
        dim_inner=None,
        depth=6,
        dim_head=96,
        heads=16,
        num_latents=64,
        ff_mult=4,
    ):
        if dim_inner is not None:
            projection = nn.Linear(dim, dim_inner)
        else:
            projection = None
            dim_inner = dim
        super().__init__(dim_media=dim, num_tokens_per_media=num_latents)
        self.projection = projection
        self.latents = nn.Parameter(torch.randn(num_latents, dim))

        self.layers = nn.ModuleList([])
        print('use MyPerceiverAttention')
        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        MyPerceiverAttention(
                            dim=dim, dim_head=dim_head, heads=heads
                        ),
                        FeedForward(dim=dim, mult=ff_mult),
                    ]
                )
            )

        self.norm = nn.LayerNorm(dim)

    def forward(self, x, up_layer_idx, vision_attn_masks=None):
        """
        Args:
            x (torch.Tensor): image features
                shape (b, T, F, v, D)
            vision_attn_masks (torch.Tensor): attention masks for padded visiont tokens (i.e., x)
                shape (b, v)
        Returns:
            shape (b, T, n, D) where n is self.num_latents
        """
        b, T, F, v = x.shape[:4]
        x = rearrange(
            x, "b T F v d -> b T (F v) d"
        )  # flatten the frame and spatial dimensions

        # blocks
        # FIXME: extending query tokens proportional to the vision sequence length. Hard-coded as dfn5b token_len=729.
        latents = self.latents
        latents = repeat(latents, "n d -> b T n d", b=b, T=T)
        for attn, ff in self.layers[up_layer_idx]:
            latents = attn(x, latents, vision_attn_masks) + latents
            latents = ff(latents) + latents
        return self.projection(self.norm(latents)) 
   

In [4]:
projector = MyModel(dim=1152, dim_inner=3072, depth=6, dim_head=96,heads=16,num_latents=128)
projector.load_state_dict(ckpt, strict=True)

use MyPerceiverAttention


<All keys matched successfully>

In [28]:
from einops import rearrange, repeat
x = torch.load('torch_input.pt')
b, T, F, v = x.shape[:4]
x = rearrange(
    x, "b T F v d -> b T (F v) d"
)  # flatten the frame and spatial dimensions

# blocks
# FIXME: extending query tokens proportional to the vision sequence length. Hard-coded as dfn5b token_len=729.
latents = projector.latents
latents = repeat(latents, "n d -> b T n d", b=b, T=T)
x.shape, latents.shape

(torch.Size([1, 1, 729, 1152]), torch.Size([1, 1, 128, 1152]))

In [33]:
latents = projector.latents
latents = repeat(latents, "n d -> b T n d", b=b, T=T)
for attn, ff in projector.layers[:7]:
    print('one layer')
    ffn_input = attn(x, latents, None) + latents
    ffn_output = ff(ffn_input) + ffn_input
    latents = ffn_output
ffn_output

one layer
one layer
one layer
one layer
one layer
one layer


tensor([[[[-0.212549, -1.531190,  1.109906,  ..., -0.526986,  1.954799,
            0.747433],
          [-2.086961, -0.522794,  1.756205,  ..., -0.127052, -0.632777,
           -0.725916],
          [ 0.810579, -0.438765,  0.084244,  ...,  1.184662,  3.135023,
           -0.504171],
          ...,
          [ 0.877389, -0.372569,  0.230555,  ...,  1.573029,  0.251383,
           -0.565987],
          [ 1.149897, -0.206728,  0.088850,  ...,  0.226263, -0.506940,
            0.667587],
          [-2.386264, -0.104232,  4.303236,  ...,  0.231421,  2.927466,
            0.611564]]]], grad_fn=<AddBackward0>)

In [34]:
projector.projection(projector.norm(ffn_output))



tensor([[[[-0.051345,  0.106190, -0.329059,  ..., -0.076452, -0.646982,
            0.387117],
          [ 0.146720,  0.416739,  0.177801,  ...,  0.757915, -0.779272,
            0.101562],
          [-0.861851, -0.739894,  0.088086,  ..., -0.329879, -0.030024,
            0.196694],
          ...,
          [-0.044654, -0.909024,  0.351772,  ...,  1.402903,  0.128508,
            0.024843],
          [ 0.564767,  1.046333,  0.147816,  ...,  0.293189,  0.838271,
            0.520264],
          [-0.476983,  0.052826, -0.849679,  ...,  0.401451, -0.028640,
            0.563384]]]], grad_fn=<ViewBackward0>)

In [10]:
x

tensor([[[[-1.125840, -1.152360, -0.250579,  ...,  0.702943, -0.667477,
            0.125382],
          [ 0.404710, -0.654932,  0.052124,  ...,  0.409271,  1.328065,
           -1.037527],
          [ 0.224494,  0.217863, -0.925740,  ..., -1.003946, -1.285568,
           -0.940877],
          ...,
          [ 1.961059, -0.128769,  1.245687,  ..., -0.747905, -1.223406,
           -0.803803],
          [ 1.250931,  0.260638, -0.322367,  ...,  1.388404,  0.753187,
            1.057841],
          [-0.822385,  1.267051, -0.065255,  ...,  0.139525, -1.134014,
           -0.970328]]]])

In [11]:
attn, ff = projector.layers[0]

In [12]:
outputs = attn(x, latents)
ffn_input = outputs + latents

In [13]:
ffn_input

tensor([[[[ 0.193154, -1.191534,  1.030737,  ..., -1.064739,  0.755063,
            0.570516],
          [-1.445420, -0.834621,  1.985458,  ...,  0.295841, -1.793584,
           -1.009780],
          [ 0.590621,  0.971060, -0.044593,  ...,  1.033780,  1.789779,
            0.500850],
          ...,
          [-0.190476, -0.839463,  0.623966,  ...,  0.057086,  0.115040,
           -0.028689],
          [-0.572107,  0.195076, -2.075379,  ...,  0.128773,  0.362365,
            0.636456],
          [-0.542767, -0.468020,  2.784523,  ...,  0.120420,  1.266044,
            0.393691]]]], grad_fn=<AddBackward0>)

In [14]:

ff[2](ff[1](ff[0](ffn_input)))

tensor([[[[ 0.237422,  0.595619,  0.052967,  ...,  0.083949,  0.468008,
           -0.169237],
          [ 0.221389,  0.089346,  0.026736,  ...,  0.359654, -0.048257,
            0.539953],
          [-0.083690,  1.282188,  0.372275,  ...,  0.252378,  0.091954,
            0.194415],
          ...,
          [ 0.075917, -0.148179, -0.169421,  ..., -0.169345, -0.004600,
           -0.135732],
          [-0.169918, -0.131036,  1.439505,  ..., -0.026955, -0.133199,
           -0.160709],
          [-0.151510,  0.193553, -0.015910,  ..., -0.002669, -0.125417,
           -0.112417]]]], grad_fn=<GeluBackward0>)

In [15]:

nn.GELU(approximate='tanh')(ff[1](ff[0](ffn_input)))

tensor([[[[ 0.237422,  0.595619,  0.052967,  ...,  0.083949,  0.468008,
           -0.169237],
          [ 0.221389,  0.089346,  0.026736,  ...,  0.359654, -0.048257,
            0.539953],
          [-0.083690,  1.282188,  0.372275,  ...,  0.252378,  0.091954,
            0.194415],
          ...,
          [ 0.075917, -0.148179, -0.169421,  ..., -0.169345, -0.004600,
           -0.135732],
          [-0.169918, -0.131036,  1.439505,  ..., -0.026955, -0.133199,
           -0.160709],
          [-0.151510,  0.193553, -0.015910,  ..., -0.002669, -0.125417,
           -0.112417]]]], grad_fn=<GeluBackward0>)

In [210]:

ff(ffn_input) + ffn_input

tensor([[[[ 0.10968, -1.06094,  0.92263,  ..., -1.09242,  1.04648,  0.94565],
          [-1.74872, -0.71779,  1.96360,  ...,  0.25901, -1.39720, -0.80547],
          [ 0.32887,  0.59734, -0.12273,  ...,  1.13904,  2.14656,  0.41976],
          ...,
          [-0.10411, -0.80275,  0.60681,  ...,  0.20535, -0.04840, -0.15943],
          [-0.26529, -0.23041, -1.59478,  ..., -0.28988,  0.21144,  0.10470],
          [-1.21751, -0.48135,  2.93902,  ...,  0.41143,  1.67213,  0.36061]]]],
       grad_fn=<AddBackward0>)

In [68]:
# torch.save(attn.to_q.state_dict()['weight'], 'w.pt')
# torch.save(attn.norm_media.weight, 'ln_w.pt')
# torch.save(attn.norm_media.bias, 'ln_b.pt')

In [30]:
x

tensor([[[[[-1.1258, -1.1524, -0.2506,  ...,  0.7029, -0.6675,  0.1254],
           [ 0.4047, -0.6549,  0.0521,  ...,  0.4093,  1.3281, -1.0375],
           [ 0.2245,  0.2179, -0.9257,  ..., -1.0039, -1.2856, -0.9409],
           ...,
           [ 1.9611, -0.1288,  1.2457,  ..., -0.7479, -1.2234, -0.8038],
           [ 1.2509,  0.2606, -0.3224,  ...,  1.3884,  0.7532,  1.0578],
           [-0.8224,  1.2671, -0.0653,  ...,  0.1395, -1.1340, -0.9703]]]]])

In [59]:
x = torch.load('torch_input.pt')
attn.norm_media(x)

tensor([[[[[-1.1170, -1.1500, -0.2608,  ...,  0.6681, -0.6668,  0.1079],
           [ 0.4050, -0.6630,  0.0482,  ...,  0.4039,  1.3266, -1.0374],
           [ 0.2603,  0.2543, -0.9079,  ..., -0.9850, -1.2718, -0.9196],
           ...,
           [ 1.9527, -0.1003,  1.2426,  ..., -0.7054, -1.1713, -0.7581],
           [ 1.2876,  0.2804, -0.3153,  ...,  1.4153,  0.7773,  1.0838],
           [-0.7789,  1.3087, -0.0273,  ...,  0.1751, -1.0870, -0.9215]]]]],
       grad_fn=<NativeLayerNormBackward0>)

In [60]:
attn.norm_media

LayerNorm((1152,), eps=1e-05, elementwise_affine=True)

In [65]:
(x - x.mean(-1, keepdim=True)) / torch.sqrt(x.var(-1, keepdim=True) + 1e-5) 

tensor([[[[[-1.1175, -1.1435, -0.2604,  ...,  0.6734, -0.6686,  0.1078],
           [ 0.4032, -0.6596,  0.0495,  ...,  0.4078,  1.3293, -1.0434],
           [ 0.2587,  0.2519, -0.9096,  ..., -0.9890, -1.2751, -0.9250],
           ...,
           [ 1.9496, -0.1005,  1.2478,  ..., -0.7078, -1.1743, -0.7627],
           [ 1.2851,  0.2778, -0.3151,  ...,  1.4249,  0.7788,  1.0887],
           [-0.7796,  1.2996, -0.0262,  ...,  0.1776, -1.0898, -0.9269]]]]])

In [53]:
(x - x.mean(-1, keepdim=True)) / torch.sqrt(x.var(-1, keepdim=True) + 1e-5) * attn.norm_media.weight + attn.norm_media.bias

tensor([[[[[-1.1165, -1.1495, -0.2606,  ...,  0.6678, -0.6665,  0.1079],
           [ 0.4048, -0.6628,  0.0482,  ...,  0.4037,  1.3260, -1.0369],
           [ 0.2602,  0.2542, -0.9075,  ..., -0.9846, -1.2713, -0.9192],
           ...,
           [ 1.9518, -0.1003,  1.2421,  ..., -0.7051, -1.1708, -0.7578],
           [ 1.2870,  0.2803, -0.3152,  ...,  1.4147,  0.7770,  1.0833],
           [-0.7785,  1.3082, -0.0273,  ...,  0.1750, -1.0865, -0.9211]]]]],
       grad_fn=<AddBackward0>)

In [39]:
import numpy as np

class LayerNorm:
    def __init__(self, normalized_shape, eps=1e-5):
        """
        Initialize LayerNorm.
        
        Args:
        normalized_shape (tuple): The shape of the normalized dimensions (usually the last dimensions).
        eps (float): Small epsilon value to avoid division by zero.
        """
        self.eps = eps
        self.gamma = np.ones(normalized_shape)
        print('gamma', self.gamma.shape)
        self.beta = np.zeros(normalized_shape)
        print('beta', self.beta.shape)
    
    def __call__(self, x):
        """
        Apply layer normalization to the input tensor.
        
        Args:
        x (numpy.ndarray): Input tensor.
        
        Returns:
        numpy.ndarray: The layer-normalized tensor.
        """
        # Compute the mean and variance along the specified dimensions
        mean = np.mean(x, axis=-1, keepdims=True)
        print('mean', mean.shape)
        variance = np.var(x, axis=-1, keepdims=True)
        print('variance', variance.shape)
        
        # Normalize
        normalized_x = (x - mean) / np.sqrt(variance + self.eps)
        
        # Apply affine transformation
        normalized_x = self.gamma * normalized_x + self.beta
        
        return normalized_x

# Example usage
normalized_shape = (64,)  # Assuming normalization over the last dimension of size 64
layer_norm = LayerNorm(normalized_shape)

x = np.random.randn(10, 32, 32, 64)  # Example 4D tensor
normalized_x = layer_norm(x)
print(normalized_x.shape)  # Should be (10, 32, 32, 64)

gamma (64,)
beta (64,)
mean (10, 32, 32, 1)
variance (10, 32, 32, 1)
(10, 32, 32, 64)


In [2]:
print(ckpt.keys())

dict_keys(['latents', 'projection.weight', 'projection.bias', 'layers.0.0.norm_media.weight', 'layers.0.0.norm_media.bias', 'layers.0.0.norm_latents.weight', 'layers.0.0.norm_latents.bias', 'layers.0.0.to_q.weight', 'layers.0.0.to_kv.weight', 'layers.0.0.to_out.weight', 'layers.0.1.0.weight', 'layers.0.1.0.bias', 'layers.0.1.1.weight', 'layers.0.1.3.weight', 'layers.1.0.norm_media.weight', 'layers.1.0.norm_media.bias', 'layers.1.0.norm_latents.weight', 'layers.1.0.norm_latents.bias', 'layers.1.0.to_q.weight', 'layers.1.0.to_kv.weight', 'layers.1.0.to_out.weight', 'layers.1.1.0.weight', 'layers.1.1.0.bias', 'layers.1.1.1.weight', 'layers.1.1.3.weight', 'layers.2.0.norm_media.weight', 'layers.2.0.norm_media.bias', 'layers.2.0.norm_latents.weight', 'layers.2.0.norm_latents.bias', 'layers.2.0.to_q.weight', 'layers.2.0.to_kv.weight', 'layers.2.0.to_out.weight', 'layers.2.1.0.weight', 'layers.2.1.0.bias', 'layers.2.1.1.weight', 'layers.2.1.3.weight', 'layers.3.0.norm_media.weight', 'layers.3

In [3]:
ckpt['latents'].shape

torch.Size([128, 1152])

In [4]:
ckpt['latents'].shape

torch.Size([128, 1152])

In [5]:
ckpt['layers.0.0.to_q.weight'].shape, ckpt['layers.0.0.to_kv.weight'].shape

(torch.Size([1536, 1152]), torch.Size([3072, 1152]))

In [6]:
ckpt['layers.0.0.to_kv.weight'].chunk(2, dim=0)[0].shape, ckpt['layers.0.0.to_kv.weight'].chunk(2, dim=0)[1].shape

(torch.Size([1536, 1152]), torch.Size([1536, 1152]))

In [7]:
for k in ckpt.keys():
    if 'layers.0' in k:
        print(k)
        
def _replace_attn_layer(key, value):
    # Check for the special case first
    if re.match(r'layers\.(\d+)\.0\.to_kv\.weight', key):
        idx = re.search(r'layers\.(\d+)\.0\.to_kv\.weight', key).group(1)
        KVweight = value.chunk(2, dim=0)
        return {f'blk.{idx}.attn.to_k.weight': KVweight[0],
                f'blk.{idx}.attn.to_v.weight': KVweight[1]
                }
    
    # Apply general replacements for other patterns
    # Define the replacement patterns
    patterns = [
        (r'layers\.(\d+)\.0\.norm_media\.(weight|bias)', r'blk.\1.attn.norm_media.\2'),
        (r'layers\.(\d+)\.0\.norm_latents\.(weight|bias)', r'blk.\1.attn.norm_latents.\2'),
        (r'layers\.(\d+)\.0\.to_q\.(weight)', r'blk.\1.attn.to_q.\2'),
        (r'layers\.(\d+)\.0\.to_out\.(weight)', r'blk.\1.attn.to_out.\2'),
        (r'layers\.(\d+)\.1\.0\.(weight|bias)', r'blk.\1.ffn.ln.\2'),
        (r'layers\.(\d+)\.1\.1\.weight', r'blk.\1.ffn.linear_up.weight'),
        (r'layers\.(\d+)\.1\.3\.weight', r'blk.\1.ffn.linear_down.weight'),
    ]
    for pattern, replacement in patterns:
        key = re.sub(pattern, replacement, key)
    
    return {key: value}

def replace_tensor_name_xgenmm_projector(ckpt):
    identifier = 'perceiver_resampler.'
    new_state_dict = {}
    for k, v in ckpt.items():
        # handel the layer
        if 'layers' in k:
            new_kvs = _replace_attn_layer(k, v)
            for new_k, new_v in new_kvs.items():
                new_state_dict[identifier+new_k] = new_v
        elif k == 'norm.weight':
            new_k = 'ln.weight'
            new_state_dict[identifier+new_k] = v
        elif k == 'norm.bias':
            new_k = 'ln.bias'
            new_state_dict[identifier+new_k] = v  
        else:
            new_state_dict[identifier+k] = v
    return new_state_dict     
        

layers.0.0.norm_media.weight
layers.0.0.norm_media.bias
layers.0.0.norm_latents.weight
layers.0.0.norm_latents.bias
layers.0.0.to_q.weight
layers.0.0.to_kv.weight
layers.0.0.to_out.weight
layers.0.1.0.weight
layers.0.1.0.bias
layers.0.1.1.weight
layers.0.1.3.weight


In [8]:
projector = replace_tensor_name_xgenmm_projector(ckpt)

In [11]:
list(projector.keys())

['perceiver_resampler.latents',
 'perceiver_resampler.projection.weight',
 'perceiver_resampler.projection.bias',
 'perceiver_resampler.blk.0.attn.norm_media.weight',
 'perceiver_resampler.blk.0.attn.norm_media.bias',
 'perceiver_resampler.blk.0.attn.norm_latents.weight',
 'perceiver_resampler.blk.0.attn.norm_latents.bias',
 'perceiver_resampler.blk.0.attn.to_q.weight',
 'perceiver_resampler.blk.0.attn.to_k.weight',
 'perceiver_resampler.blk.0.attn.to_v.weight',
 'perceiver_resampler.blk.0.attn.to_out.weight',
 'perceiver_resampler.blk.0.ffn.ln.weight',
 'perceiver_resampler.blk.0.ffn.ln.bias',
 'perceiver_resampler.blk.0.ffn.linear_up.weight',
 'perceiver_resampler.blk.0.ffn.linear_down.weight',
 'perceiver_resampler.blk.1.attn.norm_media.weight',
 'perceiver_resampler.blk.1.attn.norm_media.bias',
 'perceiver_resampler.blk.1.attn.norm_latents.weight',
 'perceiver_resampler.blk.1.attn.norm_latents.bias',
 'perceiver_resampler.blk.1.attn.to_q.weight',
 'perceiver_resampler.blk.1.attn.to

In [6]:
from transformers import AutoModel, AutoTokenizer
model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
resampler = model.resampler

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

configuration_minicpm.py:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2:
- configuration_minicpm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_minicpmv.py:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

resampler.py:   0%|          | 0.00/36.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2:
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_minicpm.py:   0%|          | 0.00/71.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2:
- modeling_minicpm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2:
- modeling_minicpmv.py
- resampler.py
- modeling_minicpm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [7]:

def _replace_name_resampler(s, v):
    if re.match("resampler.pos_embed", s):
        return {
            s: v,
            re.sub("pos_embed", "pos_embed_k", s): None,
        }
    if re.match("resampler.proj", s):
        return {
            re.sub("proj", "pos_embed_k", s): None, 
            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
        }
    if re.match("resampler.attn.in_proj_.*", s):
        return {
            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
        }
    return {s: v}

res = {}
for k in model.state_dict().keys():
    if re.match("resampler", k):
        print(k)
        temp = _replace_name_resampler(k, model.state_dict()[k])
        res.update(temp)
        print(temp.keys())
        print('===')
            

resampler.pos_embed
dict_keys(['resampler.pos_embed', 'resampler.pos_embed_k'])
===
resampler.query
dict_keys(['resampler.query'])
===
resampler.proj
dict_keys(['resampler.pos_embed_k', 'resampler.proj.weight'])
===
resampler.kv_proj.weight
dict_keys(['resampler.kv_proj.weight'])
===
resampler.attn.in_proj_weight
dict_keys(['resampler.attn.q.weight', 'resampler.attn.k.weight', 'resampler.attn.v.weight'])
===
resampler.attn.in_proj_bias
dict_keys(['resampler.attn.q.bias', 'resampler.attn.k.bias', 'resampler.attn.v.bias'])
===
resampler.attn.out_proj.weight
dict_keys(['resampler.attn.out_proj.weight'])
===
resampler.attn.out_proj.bias
dict_keys(['resampler.attn.out_proj.bias'])
===
resampler.ln_q.weight
dict_keys(['resampler.ln_q.weight'])
===
resampler.ln_q.bias
dict_keys(['resampler.ln_q.bias'])
===
resampler.ln_kv.weight
dict_keys(['resampler.ln_kv.weight'])
===
resampler.ln_kv.bias
dict_keys(['resampler.ln_kv.bias'])
===
resampler.ln_post.weight
dict_keys(['resampler.ln_post.weight']

In [9]:
res['resampler.attn.q.weight'].shape, res['resampler.attn.k.weight'].shape, res['resampler.attn.v.weight'].shape

(torch.Size([2304, 2304]), torch.Size([2304, 2304]), torch.Size([2304, 2304]))