In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from datasets import Dataset
import random
from peft import get_peft_model, LoraConfig, TaskType
import torch.nn.functional as F

# Add device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize teacher model normally in full precision
model_name = "meta-llama/Llama-2-7b-hf"
teacher_tokenizer = AutoTokenizer.from_pretrained(model_name, fast=False) # Fast=False ensures we get the correct tokenizer. IMPORTANT!  
teacher_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"  # This will handle CUDA allocation efficiently
)

# Set padding token for the tokenizer
if teacher_tokenizer.pad_token is None:
    teacher_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Resize token embeddings for the model to account for the new token
    teacher_model.resize_token_embeddings(len(teacher_tokenizer))

# Configure LoRA to only train the adapters
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    modules_to_save=None  # Don't save any full modules
)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.14s/it]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [2]:
#save as a checksum all the layers of teacher
import hashlib
def compute_model_checksums(model):
    checksums = {}
    for name, param in model.state_dict().items():
        param_data = param.detach().cpu().numpy()
        param_bytes = param_data.tobytes()
        checksum = hashlib.md5(param_bytes).hexdigest()
        checksums[name] = checksum
    return checksums


# 1. Compute and store initial checksums
initial_checksums = compute_model_checksums(teacher_model)
# 3. Compute checksums again after fine-tuning
fine_tuned_checksums = compute_model_checksums(teacher_model)

# 4. Compare initial and fine-tuned checksums
def compare_checksums(initial, fine_tuned):
    differences = {}
    for layer_name in initial.keys():
        if initial[layer_name] != fine_tuned[layer_name]:
            differences[layer_name] = {
                "initial": initial[layer_name],
                "fine_tuned": fine_tuned[layer_name],
            }
    return differences


checksum_differences = compare_checksums(initial_checksums, fine_tuned_checksums)



In [3]:
# Display differences
if checksum_differences:
    print("\nLayers with changed checksums:")
    for layer, diff in checksum_differences.items():
        print(f"{layer}:")
        print(f"  Initial: {diff['initial']}")
        print(f"  Fine-tuned: {diff['fine_tuned']}")
else:
    print("\nNo changes detected in the model's layers.")


No changes detected in the model's layers.


In [4]:
def generate_response(model, tokenizer, prompt, max_length=512):
    inputs = tokenizer(
        prompt, 
        padding_side="left",
        return_tensors="pt", 
        padding='max_length',
        truncation=True, 
        max_length=max_length // 2  # Reduce input length to leave room for generation
    ).to(device)  # Move inputs to GPU
    print(inputs.attention_mask)
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=max_length // 4,  # Allow generation of new tokens up to half max_length
            temperature=None, #0.7,
            top_p=None, #was 0.9
            do_sample=False,
            #we allow repetition:
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.pad_token_id,
            #output_scores=True,
            output_logits=True,
            return_dict_in_generate=True
        )
    # Convert tuple of tensors into a single tensor
    logits_tensor = torch.cat([t.unsqueeze(1) for t in outputs.logits], dim=1)
    new_logits = logits_tensor  # Replace tuple with tensor. This is the tensor of logits for the new tokens, shape: (batch_size, num_new_tokens, vocab_size)
    new_tokens = outputs.sequences[:, inputs.input_ids.shape[-1]:]
    old_tokens = inputs.input_ids
    #print shapes
    print(new_logits.shape, new_tokens.shape, old_tokens.shape, outputs.sequences.shape, len(outputs.logits))
    #return tokenizer.decode(new_tokens[0], skip_special_tokens=True)
    #print (outputs)
    original=[tokenizer.decode(seq, skip_special_tokens=False) for seq in outputs.sequences]
    print(original)
    decoded=[tokenizer.decode(seq, skip_special_tokens=False) for seq in new_tokens]
    return decoded, new_logits, original

In [5]:
#return full text = False is an option in pipeline but not in generate.

In [6]:
decoded,logits, original =generate_response(teacher_model, teacher_tokenizer, ["What is the capital of Spain?", "Cual es la capital de Francia?"],64)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')


From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


torch.Size([2, 16, 32001]) torch.Size([2, 16]) torch.Size([2, 32]) torch.Size([2, 48]) 16
["[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]<s> What is the capital of Spain?\nWhat is Spain's currency?</s>[PAD][PAD][PAD][PAD][PAD][PAD][PAD]", '[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]<s> Cual es la capital de Francia?\nWhat is the capital of France? Paris is a city in France. It']


In [7]:
decoded

["\nWhat is Spain's currency?</s>[PAD][PAD][PAD][PAD][PAD][PAD][PAD]",
 '\nWhat is the capital of France? Paris is a city in France. It']

In [8]:
inputs = teacher_tokenizer(
    original[0], 
    padding=True,
    return_tensors="pt",
    truncation=True,
    add_special_tokens=False  # Do not add special tokens like <s> or </s>
).to(device)
padTokens=inputs.input_ids[inputs.input_ids == 32000].shape[0]
tokens=inputs.input_ids[:8+padTokens].view(1,-1)
inputs.attention_mask[inputs.input_ids == 32000]=0
amask=inputs.attention_mask[:8+padTokens].view(1,-1).to(device)
#cut some padding
tokens=tokens[:,:-1]
amask=amask[:,:-1]
for i in range (30):
    #run model for a next token
    with torch.no_grad():
        outputs = teacher_model(
            input_ids=tokens,
            attention_mask=amask
        )
    next_token=torch.argmax(outputs.logits[0,-1,:])
    next_token = next_token.view(1, 1)
    tokens=torch.cat((tokens,next_token),dim=1)
    amask=torch.cat((amask,torch.ones(1,1).to(device)),dim=1)
print(tokens)
print(teacher_tokenizer.decode(tokens[0], skip_special_tokens=False))
print(amask)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


tensor([[32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000,     1, 29871,  1724,   338,   278,  7483,
           310, 13616, 29973,    13,  5618,   338, 13616, 29915, 29879, 27550,
         29973,     2, 32000, 32000, 32000, 32000, 32000, 32000,   310,   278,
         29871, 29896, 29929, 29929, 29900, 29879, 29889,    13,  1576, 29871,
         29896, 29929, 29929, 29900, 29879,   892,   263,   316,  6332,   310,
          1735, 29889,    13,  1576, 29871, 29896, 29929, 29929]],
       device='cuda:0')
[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]<s>  What is the capital of Spain?
What is Spain's currency?</s>[PAD][PAD][PAD][PAD][PAD][PAD] of the 1990s.
The 1990s were a decade of change.
The 199
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
   

In [9]:
tokens=inputs.input_ids[:8+padTokens].view(1,-1)
inputs.attention_mask[inputs.input_ids == 32000]=0
amask=inputs.attention_mask[:8+padTokens].view(1,-1).to(device)
for i in range (30):
    #run model for a next token
    with torch.no_grad():
        outputs = teacher_model(
            input_ids=tokens,
            attention_mask=amask
        )
    next_token=torch.argmax(outputs.logits[0,-1,:])
    next_token = next_token.view(1, 1)
    tokens=torch.cat((tokens,next_token),dim=1)
    amask=torch.cat((amask,torch.ones(1,1).to(device)),dim=1)
print(tokens)
print(teacher_tokenizer.decode(tokens[0], skip_special_tokens=False))
print(amask)

tensor([[32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000,     1, 29871,  1724,   338,   278,  7483,
           310, 13616, 29973,    13,  5618,   338, 13616, 29915, 29879, 27550,
         29973,     2, 32000, 32000, 32000, 32000, 32000, 32000, 32000,   310,
           278,   310,   278,   310,   278,   310,   278,   310,   278,   310,
           278,   310,   278,   310,   278,   310,   278,   310,   278,   310,
           278,   310,   278,   310,   278,   310,   278,   310,   278]],
       device='cuda:0')
[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]<s>  What is the capital of Spain?
What is Spain's currency?</s>[PAD][PAD][PAD][PAD][PAD][PAD][PAD] of the of the of the of the of the of the of the of the of the of the of the of the of the of the of the
tensor([[0., 0., 0., 0.,

In [10]:
tokens=inputs.input_ids[:8+padTokens].view(1,-1)
#tokens=tokens[:,:-5]
for i in range (30):
    #run model for a next token
    with torch.no_grad():
        outputs = teacher_model(
            input_ids=tokens,
        )
    next_token=torch.argmax(outputs.logits[0,-1,:])
    next_token = next_token.view(1, 1)
    tokens=torch.cat((tokens,next_token),dim=1)
    amask=torch.cat((amask,torch.ones(1,1).to(device)),dim=1)
print(tokens)
print(teacher_tokenizer.decode(tokens[0], skip_special_tokens=False))

tensor([[32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000,     1, 29871,  1724,   338,   278,  7483,
           310, 13616, 29973,    13,  5618,   338, 13616, 29915, 29879, 27550,
         29973,     2, 32000, 32000, 32000, 32000, 32000, 32000, 32000,    13,
         29892,   341, 29892,   341, 29889, 29943, 29889, 29889, 29889, 29889,
         29896,  4345,  4345,  4345, 29892, 29896, 29889, 29943, 29879, 29924,
         29924,  4345,  4345, 29896, 29889, 29892, 29924, 29924,  4345]],
       device='cuda:0')
[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]<s>  What is the capital of Spain?
What is Spain's currency?</s>[PAD][PAD][PAD][PAD][PAD][PAD][PAD]
, M, M.F....1MSMSMS,1.FsMMMSMS1.,MMMS


In [11]:
# Tokenize the first element of the decoded list
inputs = teacher_tokenizer(
    original[0], 
    padding=True,
    return_tensors="pt",
    truncation=True,
    add_special_tokens=False  # Do not add special tokens like <s> or </s>
).to(device)
print("Input shape:", inputs.input_ids.shape)
print(inputs.input_ids)
print(inputs.attention_mask)
#wait, attention_mask must be zero for padding tokens, the ones that are 32000 in the input_ids:
inputs.attention_mask[inputs.input_ids == 32000] = 0
#inputs.attention_mask[inputs.input_ids == 1] = 0  # esto en cambio no debería ser necesario, porque indica de verdad el comienzo de la secuencia
print(inputs.attention_mask)
#reshape removing all the padding (32000)
#inputs.input_ids=inputs.input_ids[inputs.input_ids != 32000]
#inputs.input_ids=inputs.input_ids.view(1,-1)
print(inputs.input_ids)
# Forward pass through the teacher model
with torch.no_grad():
    outputs = teacher_model(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        labels=inputs.input_ids,  # For calculating loss
        output_hidden_states=True
    )

# Print the logits and the decoded output
print("Logits shape:", outputs.logits.shape)
decoded_output = teacher_tokenizer.decode(torch.argmax(outputs.logits, dim=-1)[0], skip_special_tokens=False)
print("Decoded output:", decoded_output)
print("output tokens:", torch.argmax(outputs.logits, dim=-1)[0])
#now we do the same but token to token.
tokens=inputs.input_ids[inputs.input_ids != 32000].view(1,-1)
for i in range (30):
    #run model for a next token
    with torch.no_grad():
        outputs = teacher_model(
            input_ids=tokens
        )
    #get the next token
    next_token=torch.argmax(outputs.logits[0,-1,:])
    #print the token and its decoded value
    #print(next_token.item(), teacher_tokenizer.decode(next_token.item()))
    #reshape next_token to match the dimensions of tokens
    next_token = next_token.view(1, 1)
    #append the token to the input tensor
    tokens=torch.cat((tokens,next_token),dim=1)
    #print the new tensor
print(tokens)
    #print the decoded tensor
print(teacher_tokenizer.decode(tokens[0], skip_special_tokens=False))
    


Input shape: torch.Size([1, 49])
tensor([[32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000,     1, 29871,  1724,   338,   278,  7483,
           310, 13616, 29973,    13,  5618,   338, 13616, 29915, 29879, 27550,
         29973,     2, 32000, 32000, 32000, 32000, 32000, 32000, 32000]],
       device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]], device='cuda:0')
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
         0]], device='cuda:0')
tensor([[32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         

In [12]:
# Define system prompt and tasks
SYSTEM_PROMPT = """You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly."""

tasks = [
    "Explain how a binary search works.",
    "What is the difference between a list and tuple in Python?",
    "How does garbage collection work in Python?",
    "Explain the concept of decorators in Python.",
]
#teacher_model=student_model
def create_training_examples():
    examples = []
    for task in tasks:
        full_prompt = f"{SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
        student_prompt = f"\n\nTask: {task} \n\n Your Answer:"
        # Get teacher's response
        print(f"Teacher prompt: \n{full_prompt}")
        #with teacher_model.disable_adapter():
        teacher_response, new_logits, _ = generate_response(teacher_model, teacher_tokenizer, full_prompt)
        print (f"Teacher response: \n{teacher_response}")
        print("=====================================")
        examples.append({
            "prompt": full_prompt,
            "student_prompt": f"\n\nTask: {task} \n\n Your Answer:",
            "response_logits": new_logits,
            "combined": f"{full_prompt}{teacher_response}",
            "combined_student": f"{student_prompt}{teacher_response}"
        })
        print("shape of new_logits", new_logits.shape)
        print("len of teacher response", len(teacher_response))
        print("len of combined_student", len(f"{student_prompt}{teacher_response}"))
    return examples

examples= create_training_examples()

Teacher prompt: 
You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.

Your Task: Explain how a binary search works. 

 Your Answer:
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [13]:
# Create student model using LoRA - this only creates adapter weights
student_model = get_peft_model(teacher_model, lora_config)

#student_model=teacher_model
#student_model.add_adapter(lora_config)



#teacher_model=None

In [14]:
student_model.active_adapters

['default']

In [15]:
# Print only the trainable parameters (should be much smaller)
print("Trainable parameters for LoRA adapters:")
student_model.print_trainable_parameters()

Trainable parameters for LoRA adapters:
trainable params: 4,194,304 || all params: 6,742,618,112 || trainable%: 0.0622


In [16]:
with student_model.disable_adapter():
    student_model.print_trainable_parameters()

trainable params: 0 || all params: 6,742,618,112 || trainable%: 0.0000


In [17]:
student_model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,618,112 || trainable%: 0.0622


In [18]:
with student_model.disable_adapter():
   student_checksums=compute_model_checksums(student_model.base_model.model)

In [19]:
for key in initial_checksums.keys(): 
    #replace termination .weight with .base_layer.weight
    newkey=key.replace('.weight','.base_layer.weight')
    #compare with key in student model if exists
    if newkey in student_checksums.keys():
        print(newkey, initial_checksums[key] == student_checksums[newkey])
    elif key in student_checksums.keys():
        print(key, initial_checksums[key] == student_checksums[key])
    else:
        print(f"Key {newkey} not found in student model")

model.embed_tokens.weight True
model.layers.0.self_attn.q_proj.base_layer.weight True
model.layers.0.self_attn.k_proj.weight True
model.layers.0.self_attn.v_proj.base_layer.weight True
model.layers.0.self_attn.o_proj.weight True
model.layers.0.mlp.gate_proj.weight True
model.layers.0.mlp.up_proj.weight True
model.layers.0.mlp.down_proj.weight True
model.layers.0.input_layernorm.weight True
model.layers.0.post_attention_layernorm.weight True
model.layers.1.self_attn.q_proj.base_layer.weight True
model.layers.1.self_attn.k_proj.weight True
model.layers.1.self_attn.v_proj.base_layer.weight True
model.layers.1.self_attn.o_proj.weight True
model.layers.1.mlp.gate_proj.weight True
model.layers.1.mlp.up_proj.weight True
model.layers.1.mlp.down_proj.weight True
model.layers.1.input_layernorm.weight True
model.layers.1.post_attention_layernorm.weight True
model.layers.2.self_attn.q_proj.base_layer.weight True
model.layers.2.self_attn.k_proj.weight True
model.layers.2.self_attn.v_proj.base_layer

In [20]:
generate_response(student_model, teacher_tokenizer, "What is the capital of France?\n Answer:")

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
['[PAD][PAD][PAD][P

(['Paris\nWhat is a French fry? Answer : A potato cut into the shape of a stick\nHow many countries are there in the world?  Answer  : 195\nWhere is France located?   Answer   : In Europe\nWho is Napoleon?    Answer    : The leader of the French army\nWhy is Paris called the City of Light?     Answer     : Because it is\nbright and beautiful\nThe French flag has three colors. What are they?      Answer      : Blue,\nwhite, and red\nFrance is in Europe. Where is Europe?       Answer       : It is on'],
 tensor([[[-4.7355, -3.4162, 11.0670,  ..., -4.2172, -0.6887,  1.2191],
          [ 1.7003,  2.7376, 15.6574,  ...,  0.0611, -0.3284,  2.7565],
          [-9.7093, -8.5697,  2.1901,  ..., -7.9747, -4.8540, -4.1084],
          ...,
          [-6.2352, -7.4487, 10.2826,  ..., -3.4094, -3.1806, -0.1665],
          [-2.5318, -2.8924, 12.1853,  ..., -0.0659, -2.8017,  1.4626],
          [-4.4827, -5.0353, 12.1889,  ..., -2.8257, -2.6675,  0.7306]]],
        device='cuda:0'),
 ['[PAD][PAD][PAD]

In [21]:
with student_model.disable_adapter():
    print(generate_response(student_model, teacher_tokenizer, "What is the capital of France?\n Answer:"))

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
['[PAD][PAD][PAD][P

In [22]:
dataset= Dataset.from_list(examples)
#for row in dataset:
#    print(row)

In [23]:
DEBUG=True
def train_step(batch, model, tokenizer, optimizer):
    global DEBUG
    # Tokenize the combined student text (prompt + response)
    inputs = tokenizer(
        batch['combined_student'], 
        padding=True,
        return_tensors="pt",
        truncation=True
    ).to(device)
    
    # Forward pass through student model
    student_outputs = model(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        labels=inputs.input_ids,  # For calculating loss
        output_hidden_states=True
    )
    
    # Get teacher logits from dataset
    #note that it is a list of tensors, so we need to stack them
    #teacher_logits =  torch.cat(batch['response_logits'], dim=1).to(device)
    teacher_logits =  batch['response_logits'] #.to(device)

    
    if DEBUG:
        print("teacher_logits", teacher_logits.shape)
        print("student_outputs", student_outputs.logits.shape)
        #also print the decode, we need to apply argmax to get the token, and then decode
        #using repr() to show escaped characters
        print("student decoded", repr(tokenizer.decode(torch.argmax(student_outputs.logits, dim=-1)[0], skip_special_tokens=False)))
        print("teacher decoded", repr(tokenizer.decode(torch.argmax(teacher_logits, dim=-1)[0], skip_special_tokens=False)))
        #print the argmax too:
        print("student argmax", torch.argmax(student_outputs.logits, dim=-1)[0])
        print("teacher argmax", torch.argmax(teacher_logits, dim=-1)[0])
        #also decode the input_ids to see if they are correct
        print("input_ids", inputs.input_ids)
        print("input_ids decoded", repr(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=False)))
        print("input_ids", inputs.input_ids.shape)
        #print also the attention mask
        print("attention mask", inputs.attention_mask.shape)
        print("attention mask", inputs.attention_mask)
    
    # Calculate KL divergence loss between student and teacher logits
    # Only consider the logits for generated tokens (not prompt)
    #print shapes, to debug:
    kl_loss = F.kl_div(
        F.log_softmax(student_outputs.logits[:, -teacher_logits.size(1):], dim=-1),
        F.softmax(teacher_logits, dim=-1),
        reduction='batchmean'
    )
    
    # Backward pass and optimization
    kl_loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    return kl_loss.item()

# Set up optimizer
#optimizer = torch.optim.AdamW(student_model.parameters(), lr=1e-4)
#set the optimizer only in parameters that require grad, not the ones of the teacher model
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, student_model.parameters()), lr=1e-4)



# Training loop
num_epochs = 300
for epoch in range(num_epochs):
    total_loss = 0
    for batch in examples:
        loss = train_step(batch, student_model, teacher_tokenizer, optimizer)
        total_loss += loss
    DEBUG=False

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(examples)}")
    if epoch % 100 == 4:
        DEBUG=True
        #print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(examples)}")
        #print("Saving model")
        #student_model.save_adapter_fusion("student_model")
        #print("Model saved")

teacher_logits torch.Size([1, 128, 32001])
student_outputs torch.Size([1, 151, 32001])
student decoded '#1\n#:\nlain how to function search tree.\n\n\n## task:\nBinaryn\\ binary search is a tree data structure in which each node has at most two children. The simplest treesearch tree is an efficient algorithm for searching an element in a sorted array. The is by repeatedly dividing the array in half until the desired element is found. The algorithm is be used to search for an item in a array, a sorted, or a binary. The search is a used in computer science to find an element match in a data. It example, it can be the index of a given element in the position of a element in a sorted list. Binary computer, it search can be be used to other types of data\n'
teacher decoded '\n\n binary search is a tree data structure in which each node has at most two children.\n binary searchsearch algorithm is an efficient algorithm for finding an item in a sorted array.\n works by repeatedly dividing the

In [24]:
with student_model.disable_adapter():
   student_checksums=compute_model_checksums(student_model.base_model.model)
for key in initial_checksums.keys(): 
    #replace termination .weight with .base_layer.weight
    newkey=key.replace('.weight','.base_layer.weight')
    #compare with key in student model if exists
    if newkey in student_checksums.keys():
        print(newkey, initial_checksums[key] == student_checksums[newkey])
    elif key in student_checksums.keys():
        print(key, initial_checksums[key] == student_checksums[key])
    else:
        print(f"Key {newkey} not found in student model")

model.embed_tokens.weight True
model.layers.0.self_attn.q_proj.base_layer.weight True
model.layers.0.self_attn.k_proj.weight True
model.layers.0.self_attn.v_proj.base_layer.weight True
model.layers.0.self_attn.o_proj.weight True
model.layers.0.mlp.gate_proj.weight True
model.layers.0.mlp.up_proj.weight True
model.layers.0.mlp.down_proj.weight True
model.layers.0.input_layernorm.weight True
model.layers.0.post_attention_layernorm.weight True
model.layers.1.self_attn.q_proj.base_layer.weight True
model.layers.1.self_attn.k_proj.weight True
model.layers.1.self_attn.v_proj.base_layer.weight True
model.layers.1.self_attn.o_proj.weight True
model.layers.1.mlp.gate_proj.weight True
model.layers.1.mlp.up_proj.weight True
model.layers.1.mlp.down_proj.weight True
model.layers.1.input_layernorm.weight True
model.layers.1.post_attention_layernorm.weight True
model.layers.2.self_attn.q_proj.base_layer.weight True
model.layers.2.self_attn.k_proj.weight True
model.layers.2.self_attn.v_proj.base_layer

In [25]:

student_checksums=compute_model_checksums(student_model.base_model.model)
for key in initial_checksums.keys(): 
    #replace termination .weight with .base_layer.weight
    newkey=key.replace('.weight','.base_layer.weight')
    #compare with key in student model if exists
    if newkey in student_checksums.keys():
        print(newkey, initial_checksums[key] == student_checksums[newkey])
    elif key in student_checksums.keys():
        print(key, initial_checksums[key] == student_checksums[key])
    else:
        print(f"Key {newkey} not found in student model")

model.embed_tokens.weight True
model.layers.0.self_attn.q_proj.base_layer.weight True
model.layers.0.self_attn.k_proj.weight True
model.layers.0.self_attn.v_proj.base_layer.weight True
model.layers.0.self_attn.o_proj.weight True
model.layers.0.mlp.gate_proj.weight True
model.layers.0.mlp.up_proj.weight True
model.layers.0.mlp.down_proj.weight True
model.layers.0.input_layernorm.weight True
model.layers.0.post_attention_layernorm.weight True
model.layers.1.self_attn.q_proj.base_layer.weight True
model.layers.1.self_attn.k_proj.weight True
model.layers.1.self_attn.v_proj.base_layer.weight True
model.layers.1.self_attn.o_proj.weight True
model.layers.1.mlp.gate_proj.weight True
model.layers.1.mlp.up_proj.weight True
model.layers.1.mlp.down_proj.weight True
model.layers.1.input_layernorm.weight True
model.layers.1.post_attention_layernorm.weight True
model.layers.2.self_attn.q_proj.base_layer.weight True
model.layers.2.self_attn.k_proj.weight True
model.layers.2.self_attn.v_proj.base_layer

In [26]:
#now we test for the tasks... TIENE PINTA DE QIE SE ESTA DESTRUYENDO EL TEACHER MODEL TAMBIEN!!!!
print(SYSTEM_PROMPT)
for task in tasks:
    full_prompt = f" {SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
    student_prompt = f" \n\nTask: {task} \n\n Your Answer:"
    #with teacher_model.disable_adapter():
    decoded,logits,_=generate_response(teacher_model, teacher_tokenizer, [full_prompt])
    print (f"Task: {task}")
    print (f"Answer: {decoded}")
    print("=====================================")

You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [27]:
#now we test for the tasks
print(SYSTEM_PROMPT)
for task in tasks:
    full_prompt = f"{SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
    student_prompt = f" \n\nTask: {task} \n\n Your Answer:"
    decoded,logits,_=generate_response(student_model, teacher_tokenizer, [full_prompt])
    print (f"Task: {task}")
    print (f"Answer: {decoded}")
    print("=====================================")

You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [28]:
#now we test for the tasks
for task in tasks:
    full_prompt = f"{SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
    student_prompt = f" \n\nTask: {task} \n\n Your Answer:"
    decoded,logits,_=generate_response(student_model, teacher_tokenizer, [student_prompt])
    print (f"Task: {task}")
    print (f"Answer: {decoded}")
    print("=====================================")

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
torch.Size([1, 15, 32001]) torch.Size([1, 15]) torch.Size([1, 256]) torch.Size([1, 271]) 15
['[PAD][PAD][PAD][PAD]

In [29]:
# Define system prompt and tasks
SYSTEM_PROMPT = """You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly."""

tasks = [
    "Explain how a binary search works.",
    "What is the difference between a list and tuple in Python?",
    "How does garbage collection work in Python?",
    "Explain the concept of decorators in Python.",
]
def create_training_examples():
    examples = []
    for task in tasks:
        full_prompt = f"{SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
        student_prompt = f"\n\nTask: {task} \n\n Your Answer:"
        # Get teacher's response
        print(f"Teacher prompt: \n{full_prompt}")
        #with teacher_model.disable_adapter():
        teacher_response, new_logits, _ = generate_response(teacher_model, teacher_tokenizer, full_prompt)
        print (f"Teacher response: \n{teacher_response}")
        print("=====================================")
        examples.append({
            "prompt": full_prompt,
            "student_prompt": f"\n\nTask: {task} \n\n Your Answer:",
            "response_logits": new_logits,
            "combined": f"{full_prompt}{teacher_response}",
            "combined_student": f"{student_prompt}{teacher_response}"
        })
        print("shape of new_logits", new_logits.shape)
        print("len of teacher response", len(teacher_response))
        print("len of combined_student", len(f"{student_prompt}{teacher_response}"))
    return examples

examples= create_training_examples()

Teacher prompt: 
You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.

Your Task: Explain how a binary search works. 

 Your Answer:
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [30]:
# Define system prompt and tasks
SYSTEM_PROMPT = """You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly."""

tasks = [
    "Explain how a binary search works.",
    "What is the difference between a list and tuple in Python?",
    "How does garbage collection work in Python?",
    "Explain the concept of decorators in Python.",
]
def create_training_examples():
    examples = []
    for task in tasks:
        full_prompt = f"{SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
        student_prompt = f"\n\nTask: {task} \n\n Your Answer:"
        # Get teacher's response
        print(f"Teacher prompt: \n{full_prompt}")
        #with teacher_model.disable_adapter():
        teacher_response, new_logits, _ = generate_response(teacher_model, teacher_tokenizer, full_prompt)
        print (f"Teacher response: \n{teacher_response}")
        print("=====================================")
        examples.append({
            "prompt": full_prompt,
            "student_prompt": f"\n\nTask: {task} \n\n Your Answer:",
            "response_logits": new_logits,
            "combined": f"{full_prompt}{teacher_response}",
            "combined_student": f"{student_prompt}{teacher_response}"
        })
        print("shape of new_logits", new_logits.shape)
        print("len of teacher response", len(teacher_response))
        print("len of combined_student", len(f"{student_prompt}{teacher_response}"))
    return examples

examples= create_training_examples()

Teacher prompt: 
You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.

Your Task: Explain how a binary search works. 

 Your Answer:
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [31]:
type(teacher_model)

transformers.models.llama.modeling_llama.LlamaForCausalLM

In [32]:
type(student_model)

peft.peft_model.PeftModelForCausalLM

See https://huggingface.co/docs/transformers/v4.48.0/en/model_doc/llama#transformers.LlamaForCausalLM for more information on the LlamaForCausalLM model.

It seems to implement left-censoring incorrectly. The result is different if one proceeds token to token, compared to a complete phrase.

Parte de la culpa es del tokenizador, que en modo fast escoje diferentes tokens