In [1]:
from MergeModels.ActivationMerging import get_calib_feat, activation_merge

In [2]:
import yaml

# Parse the YAML registry that names the base model and the fine-tuned checkpoints.
with open('checkpoints.yml', 'r') as checkpoints_file:
    checkpoints = yaml.safe_load(checkpoints_file)

# The base model id is the 'HF_Repo' field of the first entry under 'Instruct'.
pretrained_model_name = checkpoints['Instruct'][0]['HF_Repo']

# Each entry under 'Checkpoints' is a mapping: look up the value stored under its
# first key and read 'HF_Repo' from the first element of that value.
finetuned_model_names = []
for checkpoint_entry in checkpoints['Checkpoints']:
    first_key = next(iter(checkpoint_entry))
    finetuned_model_names.append(checkpoint_entry[first_key][0]['HF_Repo'])

In [None]:
# Manual override of the model ids (replaces whatever was bound to these names before).
#
# Alternative configuration, kept for reference (Instruct as the base model):
#   finetuned_model_names = ['markrodrigo/Llama-3.1-8B-Instruct-Spatial-SQL-1.0', 'nvidia/OpenMath2-Llama3.1-8B', 'passthepizza/NarrativAI-Reflection']
#   pretrained_model_name = 'meta-llama/Llama-3.1-8B-Instruct'
#
# Active configuration: the non-Instruct model is the base, and the Instruct model is
# listed as one of the checkpoints to merge.
pretrained_model_name = 'meta-llama/Llama-3.1-8B'
finetuned_model_names = [
    'markrodrigo/Llama-3.1-8B-Instruct-Spatial-SQL-1.0',
    'nvidia/OpenMath2-Llama3.1-8B',
    'passthepizza/NarrativAI-Reflection',
    'meta-llama/Llama-3.1-8B-Instruct',
]

In [4]:
# Sanity check: display the base-model id currently bound to this name
# (a bare last expression is rendered as the cell's output).
pretrained_model_name

'meta-llama/Llama-3.1-8B'

In [5]:
from MergeModels.ActivationMerging._utils import  *

In [6]:
def _linear_modules_by_name(model):
    """Return a dict mapping qualified module names to the ``nn.Linear`` submodules of ``model``."""
    return {
        module_name: module
        for module_name, module in model.named_modules()
        if isinstance(module, nn.Linear)
    }


# --- 1. Load every fine-tuned checkpoint (on CPU, bfloat16) -------------------------------
print('loading models...')
models_to_merge, finetuned_tokenizers, finetuned_configs = [], [], []
for finetuned_model_name in finetuned_model_names:
    try:
        finetuned_model = AutoModelForCausalLM.from_pretrained(
            finetuned_model_name, device_map='cpu', torch_dtype=torch.bfloat16
        )
        finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_name)
        finetuned_config = AutoConfig.from_pretrained(finetuned_model_name)
    except Exception as e:
        # Best effort: report the failure and continue with the remaining checkpoints.
        print(f"Model {finetuned_model_name} could not be loaded.")
        print(f"Reason: {e}")
    else:
        # Append only once all three loads succeeded, so the three lists stay index-aligned.
        models_to_merge.append(finetuned_model)
        finetuned_tokenizers.append(finetuned_tokenizer)
        finetuned_configs.append(finetuned_config)

# --- 2. Load the base model, its tokenizer and its config ---------------------------------
pretrained_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name, torch_dtype=torch.bfloat16
)
pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name)
pretrained_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=pretrained_model_name)

# --- 3. Align the tokens of the pretrained and fine-tuned tokenizers ----------------------
logger = logging.getLogger(__name__)
align_tokenizers_and_embeddings(
    pretrained_model=pretrained_model,
    pretrained_tokenizer=pretrained_tokenizer,
    pretrained_config=pretrained_config,
    finetuned_models=models_to_merge,
    finetuned_tokenizers=finetuned_tokenizers,
    finetuned_configs=finetuned_configs,
    logger=logger,
)

# --- 4. Calibration data: fixed-seed shuffle of the validation split ----------------------
print('loading calibration dataset...')
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation").shuffle(seed=42)

# --- 5. Calibration features, one model on the GPU at a time ------------------------------
print('getting calibration features...')
scale_dicts = []
for calib_model, calib_tokenizer in zip(models_to_merge, finetuned_tokenizers):
    calib_model.to('cuda')
    scale_dicts.append(get_calib_feat(calib_model, calib_tokenizer, dataset))
    # Move the model back to the CPU before the next one is moved to the GPU.
    calib_model.to('cpu')

pretrained_model.to('cuda')
pretrained_scale_dict = get_calib_feat(pretrained_model, pretrained_tokenizer, dataset)
pretrained_model.to('cpu')

# --- 6. Index the nn.Linear submodules of every model by qualified name -------------------
layer_mapping_dicts = [_linear_modules_by_name(finetuned) for finetuned in models_to_merge]
pretrained_layer_mapping_dict = _linear_modules_by_name(pretrained_model)

# NOTE: an earlier revision deep-copied `pretrained_model` into `merged_model` at this point
# and indexed its Linear layers as `merged_layer_mapping_dict`; neither is created any more.
print('merging models...')

loading models...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


loading calibration dataset...
getting calibration features...
 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (7573 > 4096). Running this sequence through the model will result in indexing errors


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

merging models...


In [7]:
# Build merged Linear weights for every (top_k, gamma) combination.
#
# For each Linear layer of the base model:
#   * normalise the base calibration statistics (softmax over dim 0, divided by the max)
#     and pick the indices of the top `top_k` fraction of entries;
#   * start every gamma variant from a copy of the base weight;
#   * for each fine-tuned model, add gamma * (finetuned - base) * scale, where `scale`
#     is that model's normalised calibration statistics broadcast along dim 1 of the weight;
#   * finally write the base weight's original values back at the selected dim-1 indices.
#
# Result: final_weight_dict[(top_k, gamma)][layer_name] -> merged weight tensor.
with torch.no_grad():
    top_ks = [0.01, 0.05, 0.1]
    gammas = [1.0, 2.0, 3.0, 5.0]

    final_weight_dict = {}
    for top_k in top_ks:
        # One empty name -> weight table per gamma for this top_k.
        final_weight_dict.update({(top_k, gamma): {} for gamma in gammas})

        for name, base_linear in tqdm(pretrained_layer_mapping_dict.items(), total=len(pretrained_layer_mapping_dict)):
            base_weight = base_linear.weight.data

            # Normalised importance of the base model's entries for this layer.
            base_importance = torch.softmax(pretrained_scale_dict[name], dim=0)
            base_importance = base_importance / base_importance.max()

            # Indices of the most important entries, and the base values stored there.
            num_protected = int(base_importance.numel() * top_k)
            topk = torch.topk(base_importance, num_protected).indices
            important_clone = base_weight[:, topk].clone()

            # Every gamma variant starts from its own copy of the base weight.
            for gamma in gammas:
                final_weight_dict[(top_k, gamma)][name] = base_weight.clone()

            # Accumulate each fine-tuned model's scaled delta into every gamma variant.
            for scale_dict, layer_mapping_dict in zip(scale_dicts, layer_mapping_dicts):
                scale = torch.softmax(scale_dict[name], dim=0)
                scale = scale / scale.max()
                delta = layer_mapping_dict[name].weight.data - base_weight
                delta = delta * scale[None, :]

                for gamma in gammas:
                    final_weight_dict[(top_k, gamma)][name] += delta * gamma

            # Restore the base model's values at the protected indices.
            for gamma in gammas:
                final_weight_dict[(top_k, gamma)][name][:, topk] = important_clone

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

In [8]:
# Write every (top_k, gamma) variant into `pretrained_model` and save it with the tokenizer.
# `pretrained_model` is mutated in place, so each 'changed' check compares against whatever
# weights the model held when that iteration started, not necessarily the original base.
for top_k in top_ks:
    for gamma in gammas:
        merged_weights = final_weight_dict[(top_k, gamma)]
        output_dir = f'./ActiveM_base/ActiveM_{top_k}_{gamma}'

        has_changed = False
        for name, mod in pretrained_model.named_modules():
            if not isinstance(mod, nn.Linear):
                continue
            # Print at most once per variant: stop comparing after the first differing layer.
            if not has_changed and not torch.allclose(mod.weight.data, merged_weights[name]):
                print('changed')
                has_changed = True
            mod.weight.data = merged_weights[name]

        pretrained_model.save_pretrained(output_dir)
        pretrained_tokenizer.save_pretrained(output_dir)

changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed
changed


In [None]:
# Copy every parameter of `merged_model` into the matching parameter of `pretrained_model`,
# in place.
#
# NOTE(review): nothing visible in this notebook assigns `merged_model`, so on a fresh
# kernel this cell raises NameError. Define `merged_model` first, or skip this cell.
#
# The state dict is built once up front: calling `state_dict()` inside the loop rebuilt the
# full name -> tensor mapping for every single parameter.
merged_state_dict = merged_model.state_dict()
for param_name, param_value in pretrained_model.named_parameters():
    param_value.data.copy_(merged_state_dict[param_name])

In [9]:
# Save the weights currently held by `pretrained_model`, plus its tokenizer, side by side.
merged_output_dir = './ActiveM'
pretrained_model.save_pretrained(merged_output_dir)
# Kept as the cell's final expression so the notebook displays the tokenizer call's return value.
pretrained_tokenizer.save_pretrained(merged_output_dir)

('./ActiveM/tokenizer_config.json',
 './ActiveM/special_tokens_map.json',
 './ActiveM/tokenizer.json')

In [10]:
from hfUtils import *

In [11]:
# Reload both checkpoints on the CPU for a weight-by-weight comparison:
#   * trailing-underscore names -> the checkpoint saved under ./ActiveM
#   * plain names               -> the checkpoint identified by `pretrained_model_name`
tokenizer_, model_ = load_model_and_tokenizer('./ActiveM', device='cpu')
tokenizer, model = load_model_and_tokenizer(pretrained_model_name, device='cpu')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
# Compare the Linear weights of the two loaded models layer by layer.
import torch

# Reference table: qualified module name -> weight of each nn.Linear in `model`.
lin_dict = {
    name: module.weight
    for name, module in model.named_modules()
    if isinstance(module, nn.Linear)
}

# Walk the Linear layers of `model_` and report whether each one matches the reference
# (torch.allclose, default tolerances).
for name, module in model_.named_modules():
    if not isinstance(module, nn.Linear):
        continue
    if torch.allclose(module.weight, lin_dict[name]):
        print(f'{name} is equal')
    else:
        print(f'{name} is not equal')

model.layers.0.self_attn.q_proj is equal
model.layers.0.self_attn.k_proj is equal
model.layers.0.self_attn.v_proj is equal
model.layers.0.self_attn.o_proj is not equal
model.layers.0.mlp.gate_proj is equal
model.layers.0.mlp.up_proj is equal
model.layers.0.mlp.down_proj is not equal
model.layers.1.self_attn.q_proj is not equal
model.layers.1.self_attn.k_proj is not equal
model.layers.1.self_attn.v_proj is not equal
model.layers.1.self_attn.o_proj is not equal
model.layers.1.mlp.gate_proj is not equal
model.layers.1.mlp.up_proj is not equal
model.layers.1.mlp.down_proj is equal
model.layers.2.self_attn.q_proj is not equal
model.layers.2.self_attn.k_proj is not equal
model.layers.2.self_attn.v_proj is not equal
model.layers.2.self_attn.o_proj is not equal
model.layers.2.mlp.gate_proj is not equal
model.layers.2.mlp.up_proj is equal
model.layers.2.mlp.down_proj is equal
model.layers.3.self_attn.q_proj is not equal
model.layers.3.self_attn.k_proj is not equal
model.layers.3.self_attn.v_pro

In [None]:
# Run the packaged merge helper on the configured model ids with topk=0.05 and scaling=1.0.
# NOTE(review): the output recorded for this cell ends, after "merging models...", with
# "ValueError: only one element tensors can be converted to Python scalars", so this call
# did not complete in the saved run.
# This also rebinds `model` and `tokenizer`, replacing the objects loaded earlier for the
# weight comparison.
model, tokenizer = activation_merge(finetuned_model_names, pretrained_model_name, topk=0.05, scaling=1.0)

loading models...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model ahmeterdempmk/Llama-3.1-8B-Fast-Food-Based-Tuned could not be loaded.
Reason: Unrecognized model in ahmeterdempmk/Llama-3.1-8B-Fast-Food-Based-Tuned. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, f

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model yitzshapiro/brochat-meta-llama-31-8b could not be loaded.
Reason: yitzshapiro/brochat-meta-llama-31-8b does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model rishitdass/Youtube-Video-Summarizer could not be loaded.
Reason: Unrecognized model in rishitdass/Youtube-Video-Summarizer. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fasts

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

getting calibration features...


Repo card metadata block was not found. Setting CardData to empty.


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.
Token indices sequence length is longer than the specified maximum sequence length for this model (7573 > 2048). Running this sequence through the model will result in indexing errors


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.
Token indices sequence length is longer than the specified maximum sequence length for this model (7573 > 4096). Running this sequence through the model will result in indexing errors


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


 * Split into 124 blocks


  0%|          | 0/124 [00:00<?, ?it/s]

merging models...


  0%|          | 0/225 [00:00<?, ?it/s]

ValueError: only one element tensors can be converted to Python scalars