In [1]:
from initialized_model.configuration_phi import PhiConfig
from initialized_model.modeling_phi import PhiForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the "microsoft/phi-2" checkpoint through the transformers auto class.
# NOTE(review): trust_remote_code=True allows Python code shipped with the
# checkpoint repository to run locally — confirm the source is trusted.
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True)

Loading checkpoint shards: 100%|██████████| 2/2 [00:22<00:00, 11.39s/it]


In [9]:
# Fetch the configuration published under "microsoft/phi-2" using the local PhiConfig class.
original_config = PhiConfig.from_pretrained("microsoft/phi-2")

In [20]:
# Override n_layer and n_embd on the loaded config, in place.
# NOTE(review): after this cell `original_config` no longer describes the
# phi-2 checkpoint, despite its name.
original_config.n_layer = 12
original_config.n_embd = 768

In [21]:
# Build the smaller model from the edited config.
# Presumably its weights start from PhiForCausalLM's default initialisation — TODO confirm.
smol = PhiForCausalLM(original_config)

In [25]:
import torch
import numpy as np

def uniform_selection(weights, old_dim, new_dim, vocab_size):
    """Shrink every tensor in ``weights`` by keeping evenly spaced entries.

    Each axis of length ``size`` is reduced to ``int(size * new_dim / old_dim)``
    entries. The kept positions are ``np.linspace(0, size - 1, target)``
    truncated to integers. Axes whose length equals ``vocab_size`` are left
    untouched.

    Unlike a round-trip through NumPy and ``torch.Tensor(...)``, the selection
    is done with torch indexing, so each result keeps the dtype and device of
    its source tensor, and 0-d entries pass through unchanged.

    Args:
        weights: Mapping from name to tensor (anything ``torch.as_tensor``
            accepts).
        old_dim: Reference width of the source model.
        new_dim: Reference width of the target model.
        vocab_size: Axis length that must never be resized.

    Returns:
        A new dict with the same keys; every value is a freshly allocated
        tensor that does not share storage with the input.
    """
    new_weights = {}
    for key, value in weights.items():
        # detach() so tensors that require grad are handled without touching autograd.
        selected = torch.as_tensor(value).detach()
        # The shape is captured once here; each axis is resized independently.
        for axis, size in enumerate(selected.shape):
            if size == vocab_size:
                continue
            target = int(size * new_dim / old_dim)
            positions = np.linspace(0, size - 1, target, dtype=int)
            index = torch.as_tensor(positions, dtype=torch.long, device=selected.device)
            selected = selected.index_select(axis, index)
        # clone() guarantees independence from the input even when no axis was resized.
        new_weights[key] = selected.clone()
    return new_weights

In [28]:
# Now we want to copy all the weights from the original model to the smol model
# We can do this by copying the state dict from the original model to the smol model
def initialize_model(model, smol):
    """Initialise ``smol`` from the weights of ``model`` and return it.

    Every state-dict entry whose key exists in both models is taken from
    ``model``. When the two configs disagree on ``n_embd`` those copied
    tensors are down-selected with ``uniform_selection`` before loading.
    Entries that exist only in ``smol`` keep their current values; they are
    already target-sized, so they are never passed through the selection step.

    Args:
        model: Source model exposing ``state_dict()`` and ``config.n_embd``.
        smol: Target model exposing ``state_dict()``, ``load_state_dict()``,
            ``config.n_embd`` and ``config.vocab_size``.

    Returns:
        ``smol``, after ``load_state_dict`` has been applied to it.
    """
    original = model.state_dict()
    smol_dict = smol.state_dict()

    # Only tensors present in both models are transferred.
    shared = {key: original[key] for key in original if key in smol_dict}

    if model.config.n_embd != smol.config.n_embd:
        print("Model dimensions do not match. We need to perform weight selection")
        # Resize only what came from the source model.
        shared = uniform_selection(shared, model.config.n_embd, smol.config.n_embd, smol.config.vocab_size)

    smol_dict.update(shared)
    smol.load_state_dict(smol_dict)
    return smol

In [29]:
# Overwrite smol's weights with values taken from the loaded model;
# initialize_model returns the same object it was given, so `smol` is rebound to itself.
smol = initialize_model(model, smol)

Model dimensions do not match. We need to perform weight selection


In [40]:
# Element count of the first parameter tensor yielded by smol.parameters(),
# read without materialising the whole parameter list.
next(iter(smol.parameters())).numel()

39321600

In [42]:
# Total number of elements across the parameters of `model` that have requires_grad set.
sum(param.numel() for param in model.parameters() if param.requires_grad)

2779683840

In [1]:
from initialized_model.configuration_phi import PhiConfig
from initialized_model.modeling_phi import PhiForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Construct a model from a config with n_layer=16; every other field is left at PhiConfig's defaults.
# No checkpoint is loaded in this cell.
model = PhiForCausalLM(PhiConfig(n_layer=16))

In [None]:
# `re` is not imported by any earlier cell in this session, so import it here.
import re

# NOTE(review): `regex_patterns` is not defined in any visible cell — it must be
# an iterable of pattern strings defined before this cell is run.

# Escape periods so "." matches a literal dot, then compile the regex patterns.
compiled_patterns = [
    re.compile(pattern.replace(".", "\\.")) for pattern in regex_patterns
]

# First, freeze all parameters in the model
for param in model.parameters():
    param.requires_grad = False

# Unfreeze parameters whose name matches any pattern.
# Pattern.match anchors at the start of the name only, so a pattern acts as a prefix match.
for name, param in model.named_parameters():
    if any(pattern.match(name) for pattern in compiled_patterns):
        print(f"Unfreezing {name}")
        param.requires_grad = True