Citation: The following notebook was used as a reference guide for fine-tuning Gemma 2B models:
https://colab.research.google.com/github/adithya-s-k/LLM-Cookbook/blob/main/Finetuning/Gemma_finetuning_notebook.ipynb

In [None]:
!pip install -q -U bitsandbytes==0.42.0
!pip install -q -U peft==0.8.2 # for parameter efficient fine-tuning
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U datasets==2.17.0
!pip install -q -U transformers==4.38.0

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model_name = 'google/gemma-2b-it' # we use the google gemma 2B instruct model

# Load the base model with 8-bit weights through bitsandbytes. The LoRA
# target-module discovery later in this notebook looks for bnb.nn.Linear8bitLt
# layers, so the quantization mode chosen here has to stay 8-bit.
# Double quantization is a 4-bit-only option (`bnb_4bit_use_double_quant`);
# the previous `bnb_8bit_use_double_quant` keyword is not a BitsAndBytesConfig
# option, so it has been removed instead of being passed along silently.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd outputs

/content/drive/MyDrive/CS 224N Dataset/outputs


In [None]:
cd '/content/drive/MyDrive/CS 224N Dataset'

In [None]:
from datasets import load_dataset
dataset = load_dataset("json", data_files="MALLS-v0.1-train.json", split="train")

In [None]:
def prompt(row):
  """Format one dataset row as a single Gemma chat-style training example.

  Reads the 'NL' and 'FOL' fields of `row` and returns one string made of a
  user turn (instruction followed by the natural-language statement) and a
  model turn (the first-order-logic statement), separated by a newline.
  """
  user_turn = "<start_of_turn>user Convert this natural language statement to its first order logic statement: {} <end_of_turn>".format(row['NL'])
  model_turn = "<start_of_turn>model {} <end_of_turn>".format(row['FOL'])
  return "\n".join([user_turn, model_turn])


# Render the training text for every row, then attach it as a new "prompt" column.
prompt_column = list(map(prompt, dataset))
dataset = dataset.add_column(name="prompt", column=prompt_column)

In [None]:
dataset = dataset.shuffle(seed=1234)
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)
# Seed the split too: train_test_split shuffles again by default, so without a
# seed the train/held-out membership changes on every run.
dataset = dataset.train_test_split(test_size=0.55, seed=1234)
train_data = dataset["train"]
test_data = dataset["test"]

In [None]:
train_data

Dataset({
    features: ['FOL', 'NL', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 12277
})

In [None]:
# we use parameter efficient fine-tuning
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
# Gradient checkpointing trades extra compute for lower memory use during training.
model.gradient_checkpointing_enable()
# Prepare the quantized model for k-bit training (the weights are loaded in 8-bit in this notebook).
model = prepare_model_for_kbit_training(model)

In [None]:
import bitsandbytes as bnb
# returns the names of 8-bit quantized layers
def find_all_linear_names(model, linear_cls=None):
  """Collect the leaf names of the quantized linear layers to target with LoRA.

  Args:
    model: object exposing `named_modules()` that yields `(name, module)` pairs.
    linear_cls: class (or tuple of classes) a module must be an instance of to
      be selected. Defaults to `bnb.nn.Linear8bitLt`, the original behaviour.

  Returns:
    Sorted list of unique last path components (e.g. 'q_proj') of the matching
    modules, with 'lm_head' excluded.
  """
  if linear_cls is None:
    linear_cls = bnb.nn.Linear8bitLt
  # The last dotted component is the layer's own name; for an undotted name it
  # is the name itself, so no special case is needed.
  lora_module_names = {
    name.rsplit('.', 1)[-1]
    for name, module in model.named_modules()
    if isinstance(module, linear_cls)
  }
  # Drop the output head once, after the scan, instead of on every iteration.
  lora_module_names.discard('lm_head')
  # Sorted so the result is stable across runs (set order is not).
  return sorted(lora_module_names)

In [None]:
modules = find_all_linear_names(model)
modules

['up_proj', 'down_proj', 'k_proj', 'v_proj', 'gate_proj', 'q_proj', 'o_proj']

In [None]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=64, # rank: capacity for LoRA layers
    lora_alpha=32, # influence of LoRA layers
    target_modules=modules,
    lora_dropout=0.03, # dropout probability for LoRA layers for regularization
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [None]:
import transformers

from trl import SFTTrainer

# Only fall back to EOS as the pad token when the tokenizer has none (Gemma's
# tokenizer is expected to ship its own pad token). DataCollatorForLanguageModeling
# sets the label of every pad-token position to -100, so pad == eos would also
# mask the real EOS at the end of each example and the model would never be
# trained to stop generating.
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    # eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=3,
        gradient_accumulation_steps=4, #number of steps to accumulate gradients before performing an update step.
        # 0.1 is a fraction, not a step count: `warmup_steps` expects an integer
        # number of steps, while `warmup_ratio` takes the share of total steps.
        warmup_ratio=0.1, #fraction of total training steps used to ramp the learning rate up from 0.
        num_train_epochs=2, #number of epochs to train for.
        learning_rate=2e-4, #learning rate for training.
        fp16=True, #use mixed precision training.
        logging_steps=10,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)



Map:   0%|          | 0/12277 [00:00<?, ? examples/s]



In [None]:
model.config.use_cache = False
trainer.train()

Step,Training Loss
10,3.1028
20,1.214
30,0.8549
40,0.757
50,0.7324
60,0.6702
70,0.6024
80,0.6039
90,0.5882
100,0.5934


Checkpoint destination directory outputs/checkpoint-1023 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=2046, training_loss=0.44146922286071855, metrics={'train_runtime': 6052.1308, 'train_samples_per_second': 4.057, 'train_steps_per_second': 0.338, 'total_flos': 2.838040032927744e+16, 'train_loss': 0.44146922286071855, 'epoch': 2.0})

In [None]:
test_data[0]

In [None]:
test_dataset = load_dataset("json", data_files="MALLS-v0.1-test.json", split="train")
# NOTE: this re-defines `prompt` and shadows the training-time formatter of the
# same name; the instruction wording here differs from the training one.
def prompt(row):
  """Format one held-out row (fields 'NL' and 'FOL') as a Gemma chat-style string."""
  text = f"<start_of_turn>user Convert the following natural language statement to a first-order logic notation: {row['NL']} <end_of_turn>\n<start_of_turn>model {row['FOL']} <end_of_turn>"
  return text


prompt_column = [prompt(row) for row in test_dataset]
test_dataset = test_dataset.add_column("prompt", prompt_column)

test_dataset = test_dataset.shuffle(seed=1234)
test_dataset = test_dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

# Seed the split so the dev/test membership is the same on every run; otherwise
# predictions saved from different runs are not comparable.
test_dataset = test_dataset.train_test_split(test_size=0.3, seed=1234)
dev_data = test_dataset["train"]
test_data = test_dataset["test"]

In [None]:
dev_data

Dataset({
    features: ['FOL', 'NL', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 700
})

In [None]:
import pandas as pd

df = pd.DataFrame(dev_data)
NL_list = df['NL'].tolist()
true_FOL = df['FOL'].tolist()
predicted_FOL = []
tokenizer.padding_side = "left"

def get_completion(query: str, model, tokenizer, max_new_tokens: int = 150, do_sample: bool = True) -> str:
  """Generate a first-order-logic translation for one natural-language statement.

  Args:
    query: natural-language statement to translate.
    model: causal LM exposing `generate(**inputs, ...)`; its `device`
      attribute, when present, decides where the inputs are placed.
    tokenizer: tokenizer matching `model`.
    max_new_tokens: cap on the number of generated tokens (default 150).
    do_sample: whether to sample instead of decoding greedily (default True).

  Returns:
    The decoded full sequence (prompt plus completion) with special tokens
    stripped.
  """
  # Follow the model's own device; fall back to the previous hard-coded GPU.
  device = getattr(model, "device", None)
  if device is None:
    device = "cuda:0"

  prompt_template = """
  <start_of_turn>user
  Convert the following natural language statement to a first-order logic notation. The output should ONLY be First order logic statement. It should include nothing else:
  {query}
  <end_of_turn>\n<start_of_turn>model


  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
  model_inputs = {name: tensor.to(device) for name, tensor in encodeds.items()}

  # A tokenizer created with add_eos_token=True appends EOS to the prompt; the
  # model would then be asked to continue after an end-of-sequence marker.
  # Drop that trailing EOS so generation starts right after the model turn.
  eos_id = tokenizer.eos_token_id
  input_ids = model_inputs["input_ids"]
  if eos_id is not None and input_ids.shape[-1] > 1 and bool((input_ids[:, -1] == eos_id).all()):
    model_inputs = {name: tensor[:, :-1] for name, tensor in model_inputs.items()}

  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    generated_ids = model.generate(
      **model_inputs,
      max_new_tokens=max_new_tokens,
      do_sample=do_sample,
      pad_token_id=tokenizer.eos_token_id,
    )
  return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Generate one completion per statement, sequentially (no batching); each raw
# decoded output is echoed and collected in predicted_FOL.
for nl_statement in NL_list:
  prediction = get_completion(nl_statement, model, tokenizer)
  print(prediction)
  predicted_FOL.append(prediction)



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

In [None]:
predicted_FOL

NameError: name 'predicted_FOL' is not defined

In [None]:
import json

# Function to clean and extract FOL statements
def extract_fol_statements(entries):
    """Strip the echoed prompt from each decoded generation.

    For every entry, returns the text after the first 'model\\n\\n\\n' marker,
    with surrounding whitespace removed. When the marker is missing, the whole
    stripped entry is kept, so the result always has exactly one item per
    input and stays aligned with parallel lists (dropping such entries would
    shift every later prediction onto the wrong reference).

    Args:
        entries: iterable of decoded generation strings.

    Returns:
        List of cleaned strings, one per entry, in input order.
    """
    marker = 'model\n\n\n'
    cleaned_data = []
    for entry in entries:
        _, found, completion = entry.partition(marker)
        cleaned_data.append((completion if found else entry).strip())
    return cleaned_data

# Clean the predicted FOL statements
cleaned_predicted_FOL = extract_fol_statements(predicted_FOL)

# zip() stops at the shortest input, so a length mismatch would silently drop or
# misalign records; surface it instead of hiding it.
if not (len(NL_list) == len(cleaned_predicted_FOL) == len(true_FOL)):
    print(
        f"Warning: length mismatch (NL={len(NL_list)}, "
        f"predicted={len(cleaned_predicted_FOL)}, true={len(true_FOL)}); "
        "records are truncated to the shortest list and may be misaligned."
    )

# Combine NL_list, cleaned_predicted_FOL, and true_FOL into a single list of dictionaries
combined_data = [
    {"NL": nl, "predicted_FOL": pred_fol, "true_FOL": true_fol}
    for nl, pred_fol, true_fol in zip(NL_list, cleaned_predicted_FOL, true_FOL)
]

# Serialize without ASCII escaping so logic symbols stay readable in the file.
json_object = json.dumps(combined_data, indent=4, ensure_ascii=False)

# Write the JSON text as UTF-8 regardless of the platform's default encoding
with open('combined_data.json', 'w', encoding='utf-8') as f:
    f.write(json_object)

print("Combined data has been saved to 'combined_data.json'")

Redundant Code:

In [None]:
import pandas as pd

df = pd.DataFrame(test_data)
NL_list = df['NL'].tolist()
true_FOL = df['FOL'].tolist()


def get_completion(query: str, model, tokenizer) -> str:
  """Run one sampled beam-search generation for `query` and return the decoded text.

  The query is inserted into a fixed chat-style template, tokenized, moved to
  "cuda:0", and passed to `model.generate`. The first returned sequence is
  decoded with special tokens skipped.
  """
  prompt_template = """
  <start_of_turn>user
  Convert the following natural language statement to a first-order logic notation:
  {query}
  <end_of_turn>\n<start_of_turn>model


  """
  target_device = "cuda:0"

  tokenized = tokenizer(prompt_template.format(query=query), return_tensors="pt", add_special_tokens=True)
  inputs_on_device = tokenized.to(target_device)

  generation_kwargs = dict(
    max_new_tokens=1000,
    do_sample=True,
    num_beams=3,
    pad_token_id=tokenizer.eos_token_id,
  )
  output_ids = model.generate(**inputs_on_device, **generation_kwargs)

  first_sequence = output_ids[0]
  return tokenizer.decode(first_sequence, skip_special_tokens=True)

# The list built earlier in this cell is `NL_list`; `nl_list` is undefined here.
get_completion(NL_list[0], model, tokenizer)



In [None]:
cd '/content/drive/MyDrive/CS 224N Dataset'

/content/drive/MyDrive/CS 224N Dataset


In [None]:
cd outputs

/content/drive/MyDrive/CS 224N Dataset/outputs


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, LoraConfig
from peft import LoraConfig, get_peft_model


# Directory (relative to the current working directory) of the saved checkpoint.
# NOTE(review): the training run shown above wrote checkpoint-1023/-2046;
# confirm that checkpoint-878 is the checkpoint you intend to evaluate.
output_dir = "checkpoint-878"


# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Load the base model the adapter was trained on, quantized and placed on GPU 0
# exactly like the training-time load (reuses `model_name` and
# `quantization_config` from the model-loading cell). Loading it unquantized and
# then calling `.to(device)` is what ran out of GPU memory.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map={"": 0},
)

# Attach the *trained* LoRA weights stored in the checkpoint directory.
# get_peft_model() would instead create a brand-new, untrained adapter.
# Presumably the checkpoint holds only adapter weights, as saved by the trainer
# for a PEFT model — verify the directory contains adapter_config.json.
model = PeftModel.from_pretrained(base_model, output_dir)
model.eval()

# device_map already placed the quantized weights, so the model is not moved
# again; `device` is kept for placing input tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 

In [None]:
import pandas as pd

df = pd.DataFrame(test_data)
NL_list = df['NL'].tolist()
true_FOL = df['FOL'].tolist()

# Show a small sample instead of dumping the whole list into the cell output.
print(NL_list[:5])
#print(true_FOL)

NUM_EXAMPLES = 3  # number of statements to run in this smoke test
BATCH_SIZE = 8

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Decoder-only generation needs left padding so each prompt ends where generation starts.
tokenizer.padding_side = "left"

predicted_FOL = []
model.eval()

# NOTE(review): the statements are fed raw, without the <start_of_turn> chat
# template used for fine-tuning — confirm whether that is intended.
eval_queries = NL_list[:NUM_EXAMPLES]
with torch.no_grad():
  for start in range(0, len(eval_queries), BATCH_SIZE):
    batch_queries = eval_queries[start:start + BATCH_SIZE]
    # Tokenize per batch: padding the entire dataset to its longest example at
    # once wastes memory, and the old loop bound (first 3) did not match the
    # 8-row slices it actually generated for.
    batch_inputs = tokenizer(batch_queries, return_tensors='pt', padding=True).to(device)
    # max_new_tokens bounds only the generated part; max_length also counted
    # the (padded) prompt tokens.
    outputs = model.generate(
      **batch_inputs,
      max_new_tokens=200,
      do_sample=False,
      num_beams=3,
      pad_token_id=tokenizer.eos_token_id,
    )
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predicted_FOL.extend(decoded_outputs)

print(predicted_FOL)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['If people have erratic and diverse eating habits, then they do not have consistent everyday routines and like sticking to a solid schedule.', 'No buildings in New Haven are higher than 400 meters. ', 'Humans are mammals.', 'All players who successfully shoot a high percentage of 3-pointers are solid at shooting 2-pointers.', "The security deposit can be either equal to one month's rent or more.", 'All people who prefer working at home over going to the office every day do not have regular 9-5 jobs.', 'If the monitor has a type-c port, then it is not produced by AOC.', '"Your Woman" is a song by British one-man band White Town.', 'Peter tidies his space consistently and enjoys the process of cleaning.', 'BERT is a giant language model. ', 'Nothing preceded by data processing acquires data.', "Mia's favorite season is not the same as Emma's. ", "Tom's license plate is from Istanbul.", 'Dagfinn Aarskog is a Norwegian physician.', 'Yale University has been organized into 27 constituent c

OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 GiB. GPU 