In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.optim import AdamW

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium").to("cuda" if torch.cuda.is_available() else "cpu")
device = "cuda" if torch.cuda.is_available() else "cpu"



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [12]:
# Add special tokens to the tokenizer and model
special_tokens_dict = {'pad_token': '[PAD]'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 1024)

In [5]:
# Test Inference
text = "Why is the sky blue?"
inputs = tokenizer.encode(text, return_tensors='pt').to(device)
outputs = model.generate(inputs, max_length=50)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Why is the sky blue?

The sky blue is a color that is used to represent the sky. It is a color that is used to represent the sky. It is a color that is used to represent the sky. It is a color


In [6]:
# Load the dataset
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca")
dataset = dataset['train']

# Data formatting functions
def add_to_csv(csv_name, prompt, answer):
    # Placeholder function, can be implemented to save data to CSV if needed
    pass

def format_data(instruction, input, output):
    user = f"Instruction: {instruction} \n Input: {input}"
    assistant = f"Output: {output}"

    return {
        "user": user,
        "assistant": assistant
    }

def format_prompt(messages):
    prompt = ""
    for message in messages:
        prompt += f"{message['content']} \n"
    return prompt


Downloading readme:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18612 [00:00<?, ? examples/s]

In [9]:
# Generate RLAIF dataset
constitution = "Identify specific ways in which the assistant’s last output code was incorrect, the code wasn't clear, or the output wasn't well written."
constitution_revision = "Please rewrite the assistant response making sure the output is correct, the code is very clear, and very well written."

# Limit the dataset for a demo
sample_size = 2
demo_dataset = dataset.select(range(sample_size))

for data in tqdm(demo_dataset):
    messages = []
    formatted_data = format_data(data['instruction'], data['input'], data['output'])
    messages.append({"role": "user", "content": formatted_data['user']})
    messages.append({"role": "assistant", "content": formatted_data['assistant']})
    messages.append({"role": "user", "content": constitution})

    prompt = format_prompt(messages)

    # Ensure the input length does not exceed model's max length
    max_length = 1024 - 100  # leaving space for generated text
    input_ids = tokenizer.encode(prompt, return_tensors='pt', truncation=True, max_length=max_length).to(device)
    initial_output = model.generate(input_ids, max_length=input_ids.shape[1] + 100, do_sample=True, temperature=0.4, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(initial_output[0], skip_special_tokens=True)
    generated_text = generated_text.replace(prompt, '').strip()

    messages.append({"role": "assistant", "content": generated_text})
    messages.append({"role": "user", "content": constitution_revision})

    prompt = format_prompt(messages)
    input_ids = tokenizer.encode(prompt, return_tensors='pt', truncation=True, max_length=max_length).to(device)
    final_output = model.generate(input_ids, max_length=input_ids.shape[1] + 100, do_sample=True, temperature=0.4, pad_token_id=tokenizer.eos_token_id)
    output_text = tokenizer.decode(final_output[0], skip_special_tokens=True)
    output_text = output_text.replace(prompt, '').strip()
    print(f"Revised Output: {output_text}")

 50%|█████     | 1/2 [00:43<00:43, 43.69s/it]

Revised Output: Identify any other issues with the assistant’s last output code. 
Identify any other issues with the assistant’s last output code. 
Identify any other issues with the assistant’s last output code. 
Identify any other issues with the assistant’s last output code. 
Identify any other issues with the assistant’s last output code. 
Identify any other issues with the assistant’s last output code.


100%|██████████| 2/2 [01:26<00:00, 43.49s/it]

Revised Output: Please rewrite the assistant response making sure the output is correct, the code is very clear, and very well written. Identify the specific way in which the assistant’s last output code was incorrect, the code wasn't clear, or the output wasn't well written. Identify the specific way in which the assistant’s last output code was incorrect, the code wasn't clear, or the output wasn't well written. Identify the specific way in which the assistant’s last





In [13]:
# Format the dataset for training
train_data = [(f"Below is an instruction that describes a task. Write a response that appropriately completes the request. \n Instruction: {demo_dataset[i]['instruction']} \n Input: {demo_dataset[i]['input']}", f"\n Output: \n {demo_dataset[i]['output']}") for i in range(sample_size)]
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
train_encodings = tokenizer([f"{q} {tokenizer.eos_token} {a}" for q, a in train_data], truncation=True, padding=True)

# Create a Dataset class
class DatasetClass(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]).to(device) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = DatasetClass(train_encodings)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 1  # Reduced for a quick demo
accumulation_steps = 4
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50258, bias=False)
)

In [14]:
for epoch in range(epochs):
    for i, batch in enumerate(tqdm(train_loader), start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss = loss / accumulation_steps
        loss.backward()

        if i % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        # Free up memory
        del input_ids, attention_mask, outputs, loss
        torch.cuda.empty_cache()

100%|██████████| 1/1 [02:32<00:00, 152.14s/it]


In [16]:
# Evaluate the model
model.eval()
prompt = "Write me a python function that adds 2 numbers together."
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Ensure all parts of the inputs are moved to the correct device
for key in inputs.keys():
    inputs[key] = inputs[key].to(device)

outputs = model.generate(**inputs, max_length=125, pad_token_id=tokenizer.eos_token_id)
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated response:", output_text)


Generated response: Write me a python function that adds 2 numbers together. - to


In [17]:
# Ensure all parts of the inputs are moved to the correct device
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    **inputs,
    max_length=100,  # Adjust max_length to be reasonable
    num_return_sequences=1,
    no_repeat_ngram_size=2,  # Prevent repeating the same n-grams
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id
)
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated response:", output_text)

Generated response: Write me a python function that adds 2 numbers together.
 # 1 2 3 4 5 6 to of1.1),- the "" ( and/ in139991115onis The:oso aob this that or(DdCce
