Citation: Notebook used as reference guide for fine-tuning Gemma 2B models
https://colab.research.google.com/github/adithya-s-k/LLM-Cookbook/blob/main/Finetuning/Gemma_finetuning_notebook.ipynb

In [None]:
!pip install -q -U bitsandbytes==0.42.0
!pip install -q -U peft==0.8.2 # for parameter efficient fine-tuning
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U datasets==2.17.0
!pip install -q -U transformers==4.38.0

In [None]:
!pip install transformers[cohere]



In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Authenticate against the Hugging Face Hub. In a notebook this renders an
# interactive login widget (see the VBox output below) instead of reading a
# token from code, so no credential is stored in the notebook itself.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Mount Google Drive at /content/drive so that the dataset folder and the
# training checkpoints stored under MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
ls

[0m[01;34mcheckpoint-100[0m/  [01;34mcheckpoint-1023[0m/  [01;34mcheckpoint-2046[0m/  [01;34mcheckpoint-439[0m/  [01;34mcheckpoint-878[0m/  [01;34mruns[0m/


In [None]:
cd '/content/drive/MyDrive/CS 224N Dataset'

/content/drive/MyDrive/CS 224N Dataset


In [None]:
cd outputs

/content/drive/MyDrive/CS 224N Dataset/outputs


In [None]:
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel


# Directory (relative to the current working directory) that holds the
# fine-tuned LoRA checkpoint written by the trainer.
output_dir = "checkpoint-878"

# The checkpoint directory has no config.json (loading it directly as a full
# model raises OSError), so it cannot be used as the base model. Read the
# adapter configuration instead and take the base model id from it.
# Assumes the checkpoint contains adapter_config.json — TODO confirm.
adapter_config = PeftConfig.from_pretrained(output_dir)
base_model_name = adapter_config.base_model_name_or_path

# Prefer the tokenizer saved next to the adapter; fall back to the base
# model's tokenizer when the checkpoint does not ship tokenizer files.
has_saved_tokenizer = (Path(output_dir) / "tokenizer_config.json").is_file()
tokenizer = AutoTokenizer.from_pretrained(output_dir if has_saved_tokenizer else base_model_name)

# Load the original (un-adapted) base model.
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# Attach the *trained* LoRA weights from the checkpoint. Calling
# get_peft_model() here would create freshly initialised adapters and throw
# the fine-tuning away. is_trainable=True keeps the adapter trainable so the
# training cells further down can continue from this checkpoint.
model = PeftModel.from_pretrained(base_model, output_dir, is_trainable=True)

# Expose the adapter's LoRA configuration under the name later cells use.
lora_config = model.peft_config["default"]

# Move the model to the GPU when one is available, otherwise stay on CPU.
# Assigning the result avoids dumping the whole module tree as cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

OSError: checkpoint-878 does not appear to have a file named config.json. Checkout 'https://huggingface.co/checkpoint-878/tree/None' for available files.

In [None]:
ls

combined_data_corrected_11k.json  dev_data.json          output_file.json  train_data.json
combined_data_corrected.json      folio_parsed.json      [0m[01;34moutputs[0m/
combined_data.json                MALLS-v0.1-test.json   test_data.json
combined_data_vanilla_model.json  MALLS-v0.1-train.json  test_data.pkl


In [None]:
from datasets import load_dataset
# Read the local JSON file into a single Dataset. The path is relative, so
# this depends on the notebook's current working directory (set by the
# earlier `cd` cells). With the "json" builder and one data file, the records
# are exposed under the split name "train", which is why split="train" is
# requested even though the file is the MALLS *test* file.
dataset = load_dataset("json", data_files="MALLS-v0.1-test.json", split="train")

In [None]:
def prompt(row):
  """Build the Gemma chat-formatted training text for one dataset record.

  Args:
    row: mapping with an 'NL' entry (natural-language statement) and a
      'FOL' entry (its first-order-logic translation).

  Returns:
    A single string holding the user turn (instruction plus statement) and,
    after a newline, the model turn containing the FOL answer.
  """
  user_turn = f"<start_of_turn>user Convert the following natural language statement to a first-order logic notation: {row['NL']} <end_of_turn>"
  model_turn = f"<start_of_turn>model {row['FOL']} <end_of_turn>"
  return user_turn + "\n" + model_turn


# Render one training prompt per record and attach the result to the dataset
# as a new "prompt" column.
prompt_column = list(map(prompt, dataset))
dataset = dataset.add_column("prompt", prompt_column)

In [None]:
# Seed for the train/test split. Without it the split is re-drawn at random
# on every run, so the saved test set and the reported predictions cannot be
# reproduced after a kernel restart.
SPLIT_SEED = 1234

# Tokenize every prompt (adds the tokenizer's output columns such as
# input_ids), then hold out 40% of the records as the test split.
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)
dataset = dataset.train_test_split(test_size=0.4, seed=SPLIT_SEED)
train_data = dataset["train"]
test_data = dataset["test"]

# NOTE(review): the JSON export cell at the end of the notebook reads a
# `dev_data` variable, but the only code that would create it is the
# commented-out dev/test split below. Re-enable it (with a seed) if a dev
# split is needed — confirm the intended proportions first.
#test_data = test_data.train_test_split(test_size=0.6)
#dev_data = test_data['train']
#test_data = test_data["test"]


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
test_data

{'NL': 'A beverage that is carbonated or caffeinated can be considered refreshing.',
 'FOL': '∀x (Beverage(x) ∧ (Carbonated(x) ∨ Caffeinated(x)) → Refreshing(x))',
 'prompt': '<start_of_turn>user Convert the following natural language statement to a first-order logic notation: A beverage that is carbonated or caffeinated can be considered refreshing. <end_of_turn>\n<start_of_turn>model ∀x (Beverage(x) ∧ (Carbonated(x) ∨ Caffeinated(x)) → Refreshing(x)) <end_of_turn>',
 'input_ids': [2,
  106,
  1645,
  25151,
  573,
  2412,
  4158,
  5255,
  6218,
  577,
  476,
  1370,
  235290,
  2308,
  14855,
  36932,
  235292,
  586,
  51877,
  674,
  603,
  8933,
  840,
  689,
  137865,
  43446,
  798,
  614,
  5604,
  47225,
  235265,
  235248,
  107,
  108,
  106,
  2516,
  235248,
  239196,
  235297,
  591,
  88078,
  732,
  235278,
  235297,
  235275,
  235248,
  242555,
  591,
  36439,
  840,
  235278,
  235297,
  235275,
  235248,
  243804,
  5861,
  10303,
  43446,
  235278,
  235297,
  126

In [None]:
# Parameter-efficient fine-tuning (PEFT) setup applied to the already loaded `model`.
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
# Enable gradient checkpointing on the model.
# NOTE(review): gradient checkpointing is generally a memory-for-compute
# trade-off (lower memory use, slower steps), not a speed-up — confirm
# against the transformers documentation.
model.gradient_checkpointing_enable()
# Run PEFT's preparation step for k-bit (quantized) training.
# NOTE(review): the visible model-loading cell passes no quantization config,
# so confirm the model is actually loaded in 8-bit before relying on this.
model = prepare_model_for_kbit_training(model)

In [None]:
import bitsandbytes as bnb
# returns the names of 8-bit quantized layers
def find_all_linear_names(model, linear_cls=None):
  """Collect the distinct leaf names of the model's quantized linear layers.

  The result is meant to be used as `target_modules` for a LoRA config.

  Args:
    model: any object exposing `named_modules()` that yields
      `(qualified_name, module)` pairs.
    linear_cls: module class to look for. Defaults to
      `bnb.nn.Linear8bitLt` (8-bit bitsandbytes linear layers), which is what
      the function always used before this parameter existed.

  Returns:
    Alphabetically sorted list of unique layer names (last component of the
    dotted module path), with 'lm_head' excluded. Sorting makes the result
    independent of set iteration order.
  """
  if linear_cls is None:
    linear_cls = bnb.nn.Linear8bitLt

  lora_module_names = {
    # "model.layers.0.self_attn.q_proj" -> "q_proj"; undotted names map to themselves.
    name.rsplit('.', 1)[-1]
    for name, module in model.named_modules()
    if isinstance(module, linear_cls)
  }
  # The output head is excluded from the LoRA targets; drop it once, after
  # the scan, instead of re-checking on every iteration.
  lora_module_names.discard('lm_head')
  return sorted(lora_module_names)

In [None]:
# Names of the layers LoRA should be applied to; the LoRA config cell below
# reads this variable, so this cell must run first. The bare expression on
# the last line displays the list.
modules = find_all_linear_names(model)
modules

['up_proj', 'k_proj', 'v_proj', 'gate_proj', 'o_proj', 'down_proj', 'q_proj']

In [None]:
from peft import LoraConfig, PeftModel, get_peft_model

# LoRA hyper-parameters. `modules` comes from the find_all_linear_names cell,
# which therefore has to be executed before this one (otherwise NameError).
lora_config = LoraConfig(
    r=64,  # rank: capacity of the LoRA layers
    lora_alpha=32,  # scaling applied to the LoRA update
    target_modules=modules,
    lora_dropout=0.03,  # dropout on the LoRA layers, for regularization
    bias="none",
    task_type="CAUSAL_LM",
)

# An earlier cell in this notebook already returns a PEFT-wrapped model.
# Wrapping a PeftModel a second time would nest adapters, so only wrap when
# the model is still a plain transformers model.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)

NameError: name 'modules' is not defined

In [None]:
import transformers

from trl import SFTTrainer

# The collator pads batches, so the tokenizer needs a pad token; reuse EOS.
tokenizer.pad_token = tokenizer.eos_token
# Release cached GPU memory left over from earlier cells before training.
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    # eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,  # micro-batches accumulated per optimizer update
        # 0.1 is a fraction, not a step count: `warmup_steps` expects an
        # integer number of steps, so 0.1 there gives effectively no warmup.
        # `warmup_ratio` warms the learning rate up over the first 10% of
        # the training steps, which is what the value was meant to express.
        warmup_ratio=0.1,
        num_train_epochs=2,  # number of passes over the training data
        learning_rate=2e-4,  # peak learning rate
        fp16=True,  # mixed-precision training
        logging_steps=10,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)



Map:   0%|          | 0/1756 [00:00<?, ? examples/s]



In [None]:
# Disable the `use_cache` flag on the model config before training.
# NOTE(review): presumably because the generation cache is not needed while
# training and does not combine with gradient checkpointing — confirm.
model.config.use_cache = False
# Run the fine-tuning loop; checkpoints are written under the trainer's output_dir.
trainer.train()



Step,Training Loss
10,4.6664
20,2.1529
30,1.5788
40,1.4381
50,1.3299
60,1.2788
70,1.1708
80,1.1811
90,1.1084
100,1.1144


Checkpoint destination directory outputs/checkpoint-439 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory outputs/checkpoint-878 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=878, training_loss=0.8114158374030389, metrics={'train_runtime': 3572.8432, 'train_samples_per_second': 0.983, 'train_steps_per_second': 0.246, 'total_flos': 2350631647027200.0, 'train_loss': 0.8114158374030389, 'epoch': 2.0})

In [None]:
test_data[0]

{'NL': 'A beverage that is carbonated or caffeinated can be considered refreshing.',
 'FOL': '∀x (Beverage(x) ∧ (Carbonated(x) ∨ Caffeinated(x)) → Refreshing(x))',
 'prompt': '<start_of_turn>user Convert the following natural language statement to a first-order logic notation: A beverage that is carbonated or caffeinated can be considered refreshing. <end_of_turn>\n<start_of_turn>model ∀x (Beverage(x) ∧ (Carbonated(x) ∨ Caffeinated(x)) → Refreshing(x)) <end_of_turn>',
 'input_ids': [2,
  106,
  1645,
  25151,
  573,
  2412,
  4158,
  5255,
  6218,
  577,
  476,
  1370,
  235290,
  2308,
  14855,
  36932,
  235292,
  586,
  51877,
  674,
  603,
  8933,
  840,
  689,
  137865,
  43446,
  798,
  614,
  5604,
  47225,
  235265,
  235248,
  107,
  108,
  106,
  2516,
  235248,
  239196,
  235297,
  591,
  88078,
  732,
  235278,
  235297,
  235275,
  235248,
  242555,
  591,
  36439,
  840,
  235278,
  235297,
  235275,
  235248,
  243804,
  5861,
  10303,
  43446,
  235278,
  235297,
  126

In [None]:
import pandas as pd

# Materialise the held-out split as a DataFrame and pull out the parallel
# lists used for evaluation: the natural-language inputs and their reference
# FOL translations. Model outputs are collected into `predicted_FOL` later.
df = pd.DataFrame(test_data)
NL_list, true_FOL = df['NL'].tolist(), df['FOL'].tolist()
predicted_FOL = []

def get_completion(query: str, model, tokenizer, max_new_tokens: int = 60, do_sample: bool = True) -> str:
  """Generate a first-order-logic translation for one natural-language statement.

  Args:
    query: the natural-language statement to translate.
    model: causal language model exposing `generate(...)` and a `device`
      attribute (falls back to the device of its first parameter).
    tokenizer: tokenizer matching `model`; called on the prompt and used to
      decode the generated ids.
    max_new_tokens: upper bound on generated tokens (default 60, the value
      that used to be hard-coded).
    do_sample: whether to sample instead of decoding greedily (default True,
      as before; pass False for reproducible outputs).

  Returns:
    The decoded text of the first generated sequence with special tokens
    removed. This includes the prompt itself; downstream cells locate the
    answer after the 'model\\n\\n\\n' marker, so the template below must not
    be reformatted.
  """
  # Run on whatever device the model already lives on instead of assuming
  # "cuda:0", so the function also works on CPU-only or multi-GPU runtimes.
  device = getattr(model, "device", None)
  if device is None:
    device = next(model.parameters()).device

  prompt_template = """
  <start_of_turn>user
  Convert the following natural language statement to a first-order logic notation. The output should only be a First order logic statement. Do not add any explanations/other text.
  {query}
  <end_of_turn>\n<start_of_turn>model


  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
  model_inputs = encodeds.to(device)

  generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=max_new_tokens,
    do_sample=do_sample,
    pad_token_id=tokenizer.eos_token_id,
  )
  return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Only every EVAL_STRIDE-th test example is sent through the model (this is
# a subsample for speed, not batching: examples are generated one at a time).
EVAL_STRIDE = 40
eval_indices = list(range(0, len(NL_list), EVAL_STRIDE))

# Start from an empty list so that re-running this cell does not append a
# second copy of the predictions to the previous run's results.
predicted_FOL = []
for i in eval_indices:
  prediction = get_completion(NL_list[i], model, tokenizer)
  print('Predicted FOL', prediction)
  print('True FOL', true_FOL[i])
  predicted_FOL.append(prediction)



Predicted FOL 
  user
  Convert the following natural language statement to a first-order logic notation. The output should only be a First order logic statement. Do not add any explanations/other text.
  A beverage that is carbonated or caffeinated can be considered refreshing.
  
model


  ∧(Beverage is Carbonated or Beverage is Caffeinated) → Refreshing
True FOL ∀x (Beverage(x) ∧ (Carbonated(x) ∨ Caffeinated(x)) → Refreshing(x))
Predicted FOL 
  user
  Convert the following natural language statement to a first-order logic notation. The output should only be a First order logic statement. Do not add any explanations/other text.
  A mountain with snow is suitable for skiing or snowboarding.
  
model


  ∧ (mountain ∧ (snow → skiing ∨ snowboarding))
True FOL ∀x (Mountain(x) ∧ HasSnow(x) → SuitableForSkiing(x) ∨ SuitableForSnowboarding(x))
Predicted FOL 
  user
  Convert the following natural language statement to a first-order logic notation. The output should only be a First order lo

In [None]:
cd '/content/drive/MyDrive/CS 224N Dataset'

/content/drive/MyDrive/CS 224N Dataset


In [None]:
import shutil
from pathlib import Path

from google.colab import files

# Persist the evaluation DataFrame `df` as Parquet so it can be reloaded
# later, keep a copy in the Drive dataset folder, and offer it for download.
DRIVE_DATASET_DIR = Path("/content/drive/MyDrive/CS 224N Dataset")

local_parquet = Path("test_data.parquet")
df.to_parquet(local_parquet)

drive_parquet = DRIVE_DATASET_DIR / local_parquet.name
# When the working directory already is the Drive folder, the file has been
# written in place; copying it onto itself would fail, so skip the copy.
if local_parquet.resolve() != drive_parquet.resolve():
    shutil.copy(local_parquet, drive_parquet)

# `files` must be imported from google.colab (see above) before use.
files.download(str(drive_parquet))


In [None]:
cd outputs

/content/drive/MyDrive/CS 224N Dataset/outputs


In [None]:
import json

# Dump the raw model generations collected in `predicted_FOL` to disk:
# serialise the list to a JSON string first, then write that string out.
my_list = predicted_FOL
json_object = json.dumps(my_list)

with open('my_list.json', mode='w') as out_file:
    out_file.write(json_object)


In [None]:
def extract_fol_statements(json_file_path, output_file_path):
    """Strip the echoed prompt from saved generations and store the answers.

    Reads a JSON list of raw generation strings from `json_file_path`. For
    every string containing the 'model\\n\\n\\n' marker, keeps the text after
    the first occurrence of the marker (whitespace-stripped); strings without
    the marker are dropped. The resulting list is written to
    `output_file_path` as indented JSON and a confirmation line is printed.
    """
    marker = 'model\n\n\n'

    with open(json_file_path, 'r') as source:
        raw_entries = json.load(source)

    statements = []
    for text in raw_entries:
        _prompt_part, found, answer = text.partition(marker)
        if found:  # empty when the marker does not occur in the entry
            statements.append(answer.strip())

    with open(output_file_path, 'w') as target:
        json.dump(statements, target, indent=4)

    print(f"Cleaned data has been saved to {output_file_path}")

# Read the raw generations from 'my_list.json' and write the extracted FOL
# statements to 'output_file.json'. Both paths are relative to the current
# working directory; change them here to process other files.
extract_fol_statements('my_list.json', 'output_file.json')

Cleaned data has been saved to output_file.json


In [None]:
import json

# Function to clean and extract FOL statements
def extract_fol_statements(entries):
    """Return the model answers contained in raw generation strings.

    For each entry that contains the 'model\\n\\n\\n' marker, the text after
    the first occurrence of the marker is kept, stripped of surrounding
    whitespace. Entries without the marker are skipped, so the result can be
    shorter than `entries`.
    """
    marker = 'model\n\n\n'
    return [
        answer.strip()
        for _prompt_part, found, answer in (entry.partition(marker) for entry in entries)
        if found
    ]

COMBINED_PATH = 'combined_data_corrected_11k.json'


def _pair_predictions(nl_statements, gold_fol, raw_predictions):
    """Pair each raw generation with the NL statement / gold FOL it belongs to.

    Predictions may exist for only a subset of the statements (the generation
    loop can skip examples), so pairing by list position is only valid when
    the lists have equal length. Otherwise each prediction is matched to the
    next statement, in order, whose text appears inside the generation (the
    generation echoes its prompt, which contains the statement).
    Predictions that cannot be matched, or from which no FOL answer can be
    extracted, are left out rather than shifting the following rows.
    """
    if len(raw_predictions) == len(nl_statements):
        positions = list(range(len(nl_statements)))
    else:
        positions, cursor = [], 0
        for raw in raw_predictions:
            hit = next(
                (j for j in range(cursor, len(nl_statements)) if nl_statements[j] in raw),
                None,
            )
            positions.append(hit)
            if hit is not None:
                cursor = hit + 1  # predictions were produced in dataset order

    records = []
    for pos, raw in zip(positions, raw_predictions):
        cleaned = extract_fol_statements([raw])  # [] when the answer marker is missing
        if pos is None or not cleaned:
            continue
        records.append(
            {"NL": nl_statements[pos], "predicted_FOL": cleaned[0], "true_FOL": gold_fol[pos]}
        )
    return records


# Clean the predicted FOL statements
cleaned_predicted_FOL = extract_fol_statements(predicted_FOL)

# Combine each prediction with its own NL statement and reference FOL
combined_data = _pair_predictions(NL_list, true_FOL, predicted_FOL)
if len(combined_data) < len(predicted_FOL):
    print(f"Skipped {len(predicted_FOL) - len(combined_data)} prediction(s) that could not be aligned or parsed.")

# Convert the combined data to a JSON object
json_object = json.dumps(combined_data, indent=4)

# Write the JSON object to a file
with open(COMBINED_PATH, 'w') as f:
    f.write(json_object)

# Report the file that was actually written.
print(f"Combined data has been saved to '{COMBINED_PATH}'")


Combined data has been saved to 'combined_data.json'


In [None]:
import json

# Export every available split to JSON. `Dataset.to_dict()` is called on each
# split and its result is what gets serialised (the *_list names are kept
# for compatibility with the rest of the notebook).
train_list = train_data.to_dict()
test_list = test_data.to_dict()
# No visible cell defines `dev_data` (the dev/test split is commented out),
# so on a fresh kernel this name may not exist. Export it only when present
# instead of failing the whole cell with a NameError.
dev_list = dev_data.to_dict() if 'dev_data' in globals() else None

for split_path, split_columns in (
    ('train_data.json', train_list),
    ('dev_data.json', dev_list),
    ('test_data.json', test_list),
):
    if split_columns is None:
        print(f"Skipping {split_path}: no matching split is defined in this session.")
        continue
    with open(split_path, 'w') as split_file:
        json.dump(split_columns, split_file, indent=4)

print("Datasets have been saved as JSON files.")

Datasets have been saved as JSON files.
