In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from datasets import Dataset
import random
from peft import get_peft_model, LoraConfig, TaskType
import torch.nn.functional as F

# Add device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize teacher model normally in full precision
model_name = "meta-llama/Llama-2-7b-hf"
teacher_tokenizer = AutoTokenizer.from_pretrained(model_name)
teacher_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"  # This will handle CUDA allocation efficiently
)

# Set padding token for the tokenizer
if teacher_tokenizer.pad_token is None:
    teacher_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Resize token embeddings for the model to account for the new token
    teacher_model.resize_token_embeddings(len(teacher_tokenizer))

# Configure LoRA to only train the adapters
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    modules_to_save=None  # Don't save any full modules
)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.12s/it]
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [2]:
#save as a checksum all the layers of teacher
import hashlib
def compute_model_checksums(model):
    checksums = {}
    for name, param in model.state_dict().items():
        param_data = param.detach().cpu().numpy()
        param_bytes = param_data.tobytes()
        checksum = hashlib.md5(param_bytes).hexdigest()
        checksums[name] = checksum
    return checksums


# 1. Compute and store initial checksums
initial_checksums = compute_model_checksums(teacher_model)
# 3. Compute checksums again after fine-tuning
fine_tuned_checksums = compute_model_checksums(teacher_model)

# 4. Compare initial and fine-tuned checksums
def compare_checksums(initial, fine_tuned):
    differences = {}
    for layer_name in initial.keys():
        if initial[layer_name] != fine_tuned[layer_name]:
            differences[layer_name] = {
                "initial": initial[layer_name],
                "fine_tuned": fine_tuned[layer_name],
            }
    return differences


checksum_differences = compare_checksums(initial_checksums, fine_tuned_checksums)



In [3]:
# Display differences
if checksum_differences:
    print("\nLayers with changed checksums:")
    for layer, diff in checksum_differences.items():
        print(f"{layer}:")
        print(f"  Initial: {diff['initial']}")
        print(f"  Fine-tuned: {diff['fine_tuned']}")
else:
    print("\nNo changes detected in the model's layers.")


No changes detected in the model's layers.


In [4]:
def generate_response(model, tokenizer, prompt, max_length=512):
    inputs = tokenizer(
        prompt, 
        padding_side="left",
        return_tensors="pt", 
        padding='max_length',
        truncation=True, 
        max_length=max_length // 2  # Reduce input length to leave room for generation
    ).to(device)  # Move inputs to GPU
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=max_length // 4,  # Allow generation of new tokens up to half max_length
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            #output_scores=True,
            output_logits=True,
            return_dict_in_generate=True
        )
    # Convert tuple of tensors into a single tensor
    logits_tensor = torch.cat([t.unsqueeze(1) for t in outputs.logits], dim=1)
    new_logits = logits_tensor  # Replace tuple with tensor. This is the tensor of logits for the new tokens, shape: (batch_size, num_new_tokens, vocab_size)
    new_tokens = outputs.sequences[:, inputs.input_ids.shape[-1]:]
    old_tokens = inputs.input_ids
    #print shapes
    print(new_logits.shape, new_tokens.shape, old_tokens.shape, outputs.sequences.shape, len(outputs.logits))
    #return tokenizer.decode(new_tokens[0], skip_special_tokens=True)
    #print (outputs)
    decoded=[tokenizer.decode(seq, skip_special_tokens=True) for seq in new_tokens]
    return decoded, new_logits

In [5]:
#return full text = False is an option in pipeline but not in generate.

In [6]:
decoded,logits=generate_response(teacher_model, teacher_tokenizer, ["What is the capital of France?", "Cual es la capital de Francia?"])

From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


torch.Size([2, 128, 32001]) torch.Size([2, 128]) torch.Size([2, 256]) torch.Size([2, 384]) 128


In [7]:
decoded

['\nThe capital of France is Paris.\nWhat is the capital of the country of France?\nThe capital of France is Paris.\nWhat is the capital of France in French?\nThe capital of France is Paris. In French it is "Paris".\nWhat is the capital of France called?\nThe capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital',
 '\nLa capital de Francia es la ciudad de París.\nWhat is the capital of France?\nThe capital of France is the city of Paris.\nWhat is the capital of France\nWhat is the capital of France\nWhat is the capital of France?\nWhat is the capital of France?\nWhat is the capital of France\nWhat is the capital of France?\nWhat is the capital of France?\nWhat is the capital of France?\nWhat is the capital of France?\nWhat is the capital of France?

In [8]:
# Create student model using LoRA - this only creates adapter weights
student_model = get_peft_model(teacher_model, lora_config)

#student_model=teacher_model
#student_model.add_adapter(lora_config)



#teacher_model=None

In [9]:
student_model.active_adapters

['default']

In [10]:
# Print only the trainable parameters (should be much smaller)
print("Trainable parameters for LoRA adapters:")
student_model.print_trainable_parameters()

Trainable parameters for LoRA adapters:
trainable params: 4,194,304 || all params: 6,742,618,112 || trainable%: 0.0622


In [11]:
with student_model.disable_adapter():
    student_model.print_trainable_parameters()

trainable params: 0 || all params: 6,742,618,112 || trainable%: 0.0000


In [12]:
student_model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,618,112 || trainable%: 0.0622


In [13]:
with student_model.disable_adapter():
   student_checksums=compute_model_checksums(student_model.base_model.model)

In [14]:
#aun haciendo el disable, el modelo tiene los pesos descompuestos
for key in (set(student_checksums.keys())-set(initial_checksums.keys())):
    #check if key layer is trainable:
    # Get the layer from the model
    layer = student_model.base_model.model
    for part in key.split('.'):
        if hasattr(layer, part):
            layer = getattr(layer, part)
        else:
            print(f"Layer {key} not found")
            break

    # Check if layer is trainable
    if hasattr(layer, 'requires_grad'):
        print(f"Layer {key} trainable: {layer.requires_grad}")
    else:
        print(key)


Layer model.layers.14.self_attn.v_proj.lora_A.default.weight trainable: True
Layer model.layers.14.self_attn.v_proj.base_layer.weight trainable: False
Layer model.layers.29.self_attn.v_proj.lora_A.default.weight trainable: True
Layer model.layers.18.self_attn.q_proj.lora_B.default.weight trainable: True
Layer model.layers.29.self_attn.v_proj.base_layer.weight trainable: False
Layer model.layers.17.self_attn.q_proj.base_layer.weight trainable: False
Layer model.layers.30.self_attn.v_proj.lora_B.default.weight trainable: True
Layer model.layers.3.self_attn.v_proj.base_layer.weight trainable: False
Layer model.layers.2.self_attn.q_proj.base_layer.weight trainable: False
Layer model.layers.24.self_attn.v_proj.lora_B.default.weight trainable: True
Layer model.layers.24.self_attn.v_proj.lora_A.default.weight trainable: True
Layer model.layers.21.self_attn.q_proj.lora_A.default.weight trainable: True
Layer model.layers.3.self_attn.v_proj.lora_B.default.weight trainable: True
Layer model.layer

In [15]:
for key in (set(initial_checksums.keys())-set(student_checksums.keys())):
    print(key)

model.layers.6.self_attn.q_proj.weight
model.layers.0.self_attn.v_proj.weight
model.layers.5.self_attn.q_proj.weight
model.layers.6.self_attn.v_proj.weight
model.layers.9.self_attn.v_proj.weight
model.layers.19.self_attn.v_proj.weight
model.layers.15.self_attn.q_proj.weight
model.layers.26.self_attn.v_proj.weight
model.layers.18.self_attn.q_proj.weight
model.layers.4.self_attn.v_proj.weight
model.layers.14.self_attn.v_proj.weight
model.layers.5.self_attn.v_proj.weight
model.layers.13.self_attn.q_proj.weight
model.layers.7.self_attn.v_proj.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self_attn.v_proj.weight
model.layers.20.self_attn.q_proj.weight
model.layers.15.self_attn.v_proj.weight
model.layers.13.self_attn.v_proj.weight
model.layers.11.self_attn.v_proj.weight
model.layers.8.self_attn.v_proj.weight
model.layers.26.self_attn.q_proj.weight
model.layers.10.self_attn.q_proj.weight
model.layers.29.self_attn.q_proj.weight
model.layers.28.self_attn.q_proj.weight
model.layer

In [16]:
generate_response(student_model, teacher_tokenizer, "What is the capital of France?\n Answer:")

torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128


(['Paris\nWhat is the capital of India?\nAnswer: New Delhi\nWhat is the capital of the United States?\nAnswer: Washington, DC\nWhat is the capital of Mexico?\nAnswer: Mexico City\nWhat is the capital of Canada?\nAnswer: Ottawa\nWhat is the capital of Brazil?\nAnswer: Brasilia\nWhat is the capital of Japan?\nAnswer: Tokyo\nWhat is the capital of Australia?\nAnswer: Canberra\nWhat is the capital of China?\nAnswer: Beijing\nWhat is the capital of Russia?\nAnswer: Moscow\nWhat is the capital of'],
 tensor([[[-4.7355, -3.4162, 11.0670,  ..., -4.2172, -0.6887,  1.2191],
          [ 1.7003,  2.7376, 15.6574,  ...,  0.0611, -0.3284,  2.7565],
          [-9.7093, -8.5697,  2.1901,  ..., -7.9747, -4.8540, -4.1084],
          ...,
          [-3.1797, -1.2692,  8.2771,  ..., -1.7841, -1.3454,  1.9069],
          [-6.2309, -5.7187,  6.0449,  ..., -3.3022, -2.2102, -0.0846],
          [-4.4438, -0.7359, 10.1711,  ..., -3.3048, -1.5421,  1.3707]]],
        device='cuda:0'))

In [17]:
with student_model.disable_adapter():
    print(generate_response(student_model, teacher_tokenizer, "What is the capital of France?\n Answer:"))

torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
(['Paris\nQ: What is the capital of Italy?\nA: Rome\nQ: What is the capital of Canada?\nA: Ottawa\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of Australia?\nA: Canberra\nQ: What is the capital of Switzerland?\nA: Bern\nQ: What is the capital of Germany?\nA: Berlin\nQ: What is the capital of the Netherlands?\nA: Amsterdam\nQ: What is the capital of Russia?\nA: Moscow\nQ: What is the capital of Denmark?'], tensor([[[-4.7355, -3.4162, 11.0670,  ..., -4.2172, -0.6887,  1.2191],
         [ 1.7003,  2.7376, 15.6574,  ...,  0.0611, -0.3284,  2.7565],
         [-9.7093, -8.5697,  2.1901,  ..., -7.9747, -4.8540, -4.1084],
         ...,
         [-6.7797, -4.7434,  4.6559,  ..., -6.6647, -4.1691, -2.4007],
         [-6.0433, -1.9026,  8.5781,  ..., -0.1937, -1.5194, -0.0733],
         [-1.2706,  3.0582, 13.6994,  ..., -0.0560,  0.0270,  3.1330]]],
       device='cuda:0'))


In [18]:
# Define system prompt and tasks
SYSTEM_PROMPT = """You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly."""

tasks = [
    "Explain how a binary search works.",
    "What is the difference between a list and tuple in Python?",
    "How does garbage collection work in Python?",
    "Explain the concept of decorators in Python.",
]
teacher_model=student_model
def create_training_examples():
    examples = []
    for task in tasks:
        full_prompt = f"{SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
        student_prompt = f"\n\nTask: {task} \n\n Your Answer:"
        # Get teacher's response
        print(f"Teacher prompt: \n{full_prompt}")
        with teacher_model.disable_adapter():
            teacher_response, new_logits = generate_response(teacher_model, teacher_tokenizer, full_prompt)
        print (f"Teacher response: \n{teacher_response}")
        print("=====================================")
        examples.append({
            "prompt": full_prompt,
            "student_prompt": f"\n\nTask: {task} \n\n Your Answer:",
            "response_logits": new_logits,
            "combined": f"{full_prompt}{teacher_response}",
            "combined_student": f"{student_prompt}{teacher_response}"
        })
        print("shape of new_logits", new_logits.shape)
        print("len of teacher response", len(teacher_response))
        print("len of combined_student", len(f"{student_prompt}{teacher_response}"))
    return examples

examples= create_training_examples()

Teacher prompt: 
You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.

Your Task: Explain how a binary search works. 

 Your Answer:
torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
Teacher response: 
['\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
shape of new_logits torch.Size([1, 128, 32001])
len of teacher response 1
len of combined_student 318
Teacher prompt: 
You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.

Your Task: What is the difference between a list and tuple in Python? 

 Your Answer:
torch.Size([1, 128, 32001]) torch.Siz

In [19]:
dataset= Dataset.from_list(examples)
#for row in dataset:
#    print(row)

In [20]:
DEBUG=True
def train_step(batch, model, tokenizer, optimizer):
    global DEBUG
    # Tokenize the combined student text (prompt + response)
    inputs = tokenizer(
        batch['combined_student'], 
        padding=True,
        return_tensors="pt",
        truncation=True
    ).to(device)
    
    # Forward pass through student model
    student_outputs = model(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        labels=inputs.input_ids,  # For calculating loss
        output_hidden_states=True
    )
    
    # Get teacher logits from dataset
    #note that it is a list of tensors, so we need to stack them
    #teacher_logits =  torch.cat(batch['response_logits'], dim=1).to(device)
    teacher_logits =  batch['response_logits'] #.to(device)

    
    if DEBUG:
        print("teacher_logits", teacher_logits.shape)
        print("student_outputs", student_outputs.logits.shape)
        #also print the decode, we need to apply argmax to get the token, and then decode
        #using repr() to show escaped characters
        print("student decoded", repr(tokenizer.decode(torch.argmax(student_outputs.logits, dim=-1)[0], skip_special_tokens=False)))
        print("teacher decoded", repr(tokenizer.decode(torch.argmax(teacher_logits, dim=-1)[0], skip_special_tokens=False)))
        #print the argmax too:
        print("student argmax", torch.argmax(student_outputs.logits, dim=-1)[0])
        print("teacher argmax", torch.argmax(teacher_logits, dim=-1)[0])
        #also decode the input_ids to see if they are correct
        print("input_ids", inputs.input_ids)
        print("input_ids decoded", repr(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=False)))
        print("input_ids", inputs.input_ids.shape)
        #print also the attention mask
        print("attention mask", inputs.attention_mask.shape)
        print("attention mask", inputs.attention_mask)
    
    # Calculate KL divergence loss between student and teacher logits
    # Only consider the logits for generated tokens (not prompt)
    #print shapes, to debug:
    kl_loss = F.kl_div(
        F.log_softmax(student_outputs.logits[:, -teacher_logits.size(1):], dim=-1),
        F.softmax(teacher_logits, dim=-1),
        reduction='batchmean'
    )
    
    # Backward pass and optimization
    kl_loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    return kl_loss.item()

# Set up optimizer
#optimizer = torch.optim.AdamW(student_model.parameters(), lr=1e-4)
#set the optimizer only in parameters that require grad, not the ones of the teacher model
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, student_model.parameters()), lr=1e-4)



# Training loop
num_epochs = 300
for epoch in range(num_epochs):
    total_loss = 0
    for batch in examples:
        loss = train_step(batch, student_model, teacher_tokenizer, optimizer)
        total_loss += loss
    DEBUG=False

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(examples)}")
    if epoch % 100 == 4:
        DEBUG=True
        #print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(examples)}")
        #print("Saving model")
        #student_model.save_adapter_fusion("student_model")
        #print("Model saved")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


teacher_logits torch.Size([1, 128, 32001])
student_outputs torch.Size([1, 278, 32001])
student decoded '#1\n#:\nlain how to function search tree.\n\n\n## task:\nBinaryn\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\\n'
teacher decoded '\n\nA\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'
student argmax tensor([  396, 29896,    13, 29937, 29901,    13,  7420,   920,   304,   740,
         2740,  5447, 29889,    13,    13,    13,  2277,  3414, 29901,    13,


In [21]:
#now we test for the tasks... TIENE PINTA DE QIE SE ESTA DESTRUYENDO EL TEACHER MODEL TAMBIEN!!!!
print(SYSTEM_PROMPT)
for task in tasks:
    full_prompt = f" {SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
    student_prompt = f" \n\nTask: {task} \n\n Your Answer:"
    with teacher_model.disable_adapter():
        decoded,logits=generate_response(teacher_model, teacher_tokenizer, [full_prompt])
    print (f"Task: {task}")
    print (f"Answer: {decoded}")
    print("=====================================")

You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.
torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
Task: Explain how a binary search works.
Answer: ["\n\nThe binary search algorithm is a divide-and-conquer algorithm that is used to search for an element in a sorted array. The algorithm works by dividing the array in half and recursively searching the smaller half until the element is found.\n\nTo explain how a binary search works, let's take the example of a sorted array of 10 elements. Suppose we are searching for the element 7. The first step is to divide the array in half and place the middle element at the head of the array. In our example, the middle element is 5, so we place it at the head of"]
torch.Size([1, 48, 32001]) torch.Size([1, 48]) torch.Size([1, 256]) torch.Size([1, 304]) 48
Task: What is the difference between a list and tup

In [22]:
#now we test for the tasks
print(SYSTEM_PROMPT)
for task in tasks:
    full_prompt = f"{SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
    student_prompt = f" \n\nTask: {task} \n\n Your Answer:"
    decoded,logits=generate_response(student_model, teacher_tokenizer, [full_prompt])
    print (f"Task: {task}")
    print (f"Answer: {decoded}")
    print("=====================================")

You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.
torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
Task: Explain how a binary search works.
Answer: ['\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
Task: What is the difference between a list and tuple in Python?
Answer: ['\n\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
torch.Size([1, 12

In [23]:
#now we test for the tasks
for task in tasks:
    full_prompt = f"{SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
    student_prompt = f" \n\nTask: {task} \n\n Your Answer:"
    decoded,logits=generate_response(student_model, teacher_tokenizer, [student_prompt])
    print (f"Task: {task}")
    print (f"Answer: {decoded}")
    print("=====================================")

torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
Task: Explain how a binary search works.
Answer: ['\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
Task: What is the difference between a list and tuple in Python?
Answer: ['```\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
Task: How does garbage collection work in Python?
Answer: ['\n\n\n\n\n

In [24]:
# Define system prompt and tasks
SYSTEM_PROMPT = """You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly."""

tasks = [
    "Explain how a binary search works.",
    "What is the difference between a list and tuple in Python?",
    "How does garbage collection work in Python?",
    "Explain the concept of decorators in Python.",
]
def create_training_examples():
    examples = []
    for task in tasks:
        full_prompt = f"{SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
        student_prompt = f"\n\nTask: {task} \n\n Your Answer:"
        # Get teacher's response
        print(f"Teacher prompt: \n{full_prompt}")
        with teacher_model.disable_adapter():
            teacher_response, new_logits = generate_response(teacher_model, teacher_tokenizer, full_prompt)
        print (f"Teacher response: \n{teacher_response}")
        print("=====================================")
        examples.append({
            "prompt": full_prompt,
            "student_prompt": f"\n\nTask: {task} \n\n Your Answer:",
            "response_logits": new_logits,
            "combined": f"{full_prompt}{teacher_response}",
            "combined_student": f"{student_prompt}{teacher_response}"
        })
        print("shape of new_logits", new_logits.shape)
        print("len of teacher response", len(teacher_response))
        print("len of combined_student", len(f"{student_prompt}{teacher_response}"))
    return examples

examples= create_training_examples()

Teacher prompt: 
You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.

Your Task: Explain how a binary search works. 

 Your Answer:
torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
Teacher response: 
['\n\nA binary search is a search algorithm that finds the position of a given element in a sorted array. \n\nThe algorithm starts at the middle of the array and compares the element with the element at the middle. \n\nIf the element is greater than the middle element, then the algorithm moves to the left side of the array. \n\nIf the element is less than the middle element, then the algorithm moves to the right side of the array. \n\nOnce the algorithm reaches the end of the array, it compares the element with the last element and returns the index of the element. \n']
shape of new_logits torch.Size([1, 128, 32001])
len of teacher response 1
len 

In [25]:
# Define system prompt and tasks
SYSTEM_PROMPT = """You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly."""

tasks = [
    "Explain how a binary search works.",
    "What is the difference between a list and tuple in Python?",
    "How does garbage collection work in Python?",
    "Explain the concept of decorators in Python.",
]
def create_training_examples():
    examples = []
    for task in tasks:
        full_prompt = f"{SYSTEM_PROMPT}\n\nYour Task: {task} \n\n Your Answer:"
        student_prompt = f"\n\nTask: {task} \n\n Your Answer:"
        # Get teacher's response
        print(f"Teacher prompt: \n{full_prompt}")
        #with teacher_model.disable_adapter():
        teacher_response, new_logits = generate_response(teacher_model, teacher_tokenizer, full_prompt)
        print (f"Teacher response: \n{teacher_response}")
        print("=====================================")
        examples.append({
            "prompt": full_prompt,
            "student_prompt": f"\n\nTask: {task} \n\n Your Answer:",
            "response_logits": new_logits,
            "combined": f"{full_prompt}{teacher_response}",
            "combined_student": f"{student_prompt}{teacher_response}"
        })
        print("shape of new_logits", new_logits.shape)
        print("len of teacher response", len(teacher_response))
        print("len of combined_student", len(f"{student_prompt}{teacher_response}"))
    return examples

examples= create_training_examples()

Teacher prompt: 
You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.

Your Task: Explain how a binary search works. 

 Your Answer:
torch.Size([1, 128, 32001]) torch.Size([1, 128]) torch.Size([1, 256]) torch.Size([1, 384]) 128
Teacher response: 
['\n\n```\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
shape of new_logits torch.Size([1, 128, 32001])
len of teacher response 1
len of combined_student 318
Teacher prompt: 
You are a helpful AI assistant that provides clear, accurate, and concise answers.
Always format code properly and explain technical concepts clearly.

Your Task: What is the difference between a list and tuple in Python? 

 Your Answer:
torch.Size([1, 128, 32001]) torch.Siz