#### Define constants & setup


In [None]:
# --- Data and base model ----------------------------------------------------
PATH_TO_DATA = '/content/drive/MyDrive/cleaned_api_bank_data.xlsx'
MODEL = "microsoft/Phi-3-mini-4k-instruct"  # alternative: "microsoft/phi-2"
LOAD_IN_4_BIT = True

SAMPLE = None  # number of rows to subsample before splitting; None keeps every row
MAX_STEPS = 2000

# --- LoRA -------------------------------------------------------------------
R = 32
ALPHA = R * 2
LORA_DROPOUT = 0.05
MODULES_TO_TUNE = ["o_proj", "qkv_proj", "gate_up_proj", "down_proj"]

# --- Optimisation -----------------------------------------------------------
LR = 4e-4  # 5e-5
hub_name = "ac-99/phi-3"

EVAL_FRACTION = 0.5  # fraction of the 'eval' split rows used for evaluation (1.0 keeps all of them)

WARMUP_STEPS = 50
LOGGING_STEPS = 200
EVAL_STEPS = 200
SAVE_STEPS = 200
PER_DEVICE_TRAIN_BATCH_SIZE = 1
PER_DEVICE_EVAL_BATCH_SIZE = 4
GRAD_ACCUMULATION_STEPS = 5
METRIC_FOR_BEST_MODEL = 'eval_loss'

EARLY_STOPPING_EVALS = 3

# Hyperparameters recorded with the experiment-tracking run. Existing keys are
# kept exactly as they were (including the upper-case "LORA_DROPOUT") so runs
# stay comparable; the last four entries were previously not tracked at all.
config = {
    "path_to_data": PATH_TO_DATA,
    "model": MODEL,
    "load_in_4_bit": LOAD_IN_4_BIT,
    "alpha": ALPHA,
    "r": R,
    "LORA_DROPOUT": LORA_DROPOUT,
    "learning_rate": LR,
    "max_steps": MAX_STEPS,
    "hub_name": hub_name,
    "modules_to_tune": MODULES_TO_TUNE,
    "warmup_steps": WARMUP_STEPS,
    "logging_steps": LOGGING_STEPS,
    "eval_steps": EVAL_STEPS,
    "save_steps": SAVE_STEPS,
    "per_device_train_batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
    "per_device_eval_batch_size": PER_DEVICE_EVAL_BATCH_SIZE,
    "grad_accumulation_steps": GRAD_ACCUMULATION_STEPS,
    "sample": SAMPLE,
    "eval_fraction": EVAL_FRACTION,
    "metric_for_best_model": METRIC_FOR_BEST_MODEL,
    "early_stopping_evals": EARLY_STOPPING_EVALS,
}




#### Preamble and Setup

In [None]:

# Mount Google Drive under /content/drive; the data file path used by this
# notebook points into that mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

In [None]:
# !pip install flash-attn --no-build-isolation -q
!pip install peft -q
!pip install datasets -q
!pip install trl transformers -q
!pip install -q -U bitsandbytes>=0.44.0
!pip install rouge-score -q
!pip install accelerate -q
!pip install wandb -q

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes
import wandb
import pandas as pd
from peft import LoraConfig
from datasets import load_metric
import numpy as np
from rouge_score import rouge_scorer
from datasets import Dataset
from google.colab import userdata
import huggingface_hub
import os
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback




#### Config

In [None]:
# Credentials are read from the Colab secret store instead of being hard-coded.
WANDB_LOGIN_KEY = userdata.get('WANDB_LOGIN')
# The API key is passed by keyword. The positional order of wandb.login's
# parameters depends on the installed wandb release (older releases take
# `anonymous` first), so a positional key can bind to the wrong parameter.
wandb.login(key=WANDB_LOGIN_KEY)

HF_WRITE_KEY = userdata.get('HF_WRITE_KEY')
huggingface_hub.login(token=HF_WRITE_KEY)


# Quantisation settings passed to the model loader below.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=LOAD_IN_4_BIT,
)

# Tokeniser and base model are both loaded from the same checkpoint name.
tokeniser = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# Run name: checkpoint name (text after the last "/") plus the main hyperparameters.
model_name_only = MODEL.rsplit("/", 1)[-1]
run_name = f"{model_name_only}-LR-{LR}-R-{R}-ALPHA-{ALPHA}-steps-{MAX_STEPS}"
run_name = (run_name + "_4_BIT") if LOAD_IN_4_BIT else run_name

# Open a new wandb run for this script; `config` carries the hyperparameters
# and run metadata to be tracked.
wandb.init(
    config=config,
    name=run_name,
    project="tool-learning",
)
# Adapter configuration built from the constants defined at the top of the notebook.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=MODULES_TO_TUNE,
    lora_dropout=LORA_DROPOUT,
    lora_alpha=ALPHA,
    r=R,
)

# Bare expression at the end of the cell: displays the loaded model.
model

#### Data

In [None]:


# Fixed seed so the sampled rows are the same on every Restart & Run All.
RANDOM_SEED = 42

api_bank_df = pd.read_excel(PATH_TO_DATA)

if SAMPLE:
  api_bank_df = api_bank_df.sample(SAMPLE, random_state=RANDOM_SEED)
else:
  print("No sampling!")

# Rows are routed by the value of the 'split' column.
api_train = Dataset.from_pandas(api_bank_df[api_bank_df['split'] == 'train'])
# .copy(): new columns are assigned to the test frame later in the notebook; an
# explicit copy avoids writing into a slice of api_bank_df.
api_test = api_bank_df[api_bank_df['split'] == 'test'].copy()
api_eval = Dataset.from_pandas(
    api_bank_df[api_bank_df['split'] == 'eval'].sample(frac=EVAL_FRACTION, random_state=RANDOM_SEED)
)

output_dir = f"drive/MyDrive/{run_name}"

# exist_ok=True tolerates a directory left by an earlier run; any other failure
# (e.g. a permissions problem) now raises here instead of being printed and
# then surfacing later during training.
os.makedirs(output_dir, exist_ok=True)

#### Training

In [None]:

import gc
# Collect unreachable Python objects first so any GPU tensors they hold are
# returned to the allocator's cache, then release that cache. Emptying the
# cache before collecting would leave the newly freed memory cached.
gc.collect()
torch.cuda.empty_cache()

# Stop training after EARLY_STOPPING_EVALS evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_EVALS)

# NOTE(review): newer transformers releases may spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    warmup_steps=WARMUP_STEPS,
    max_steps=MAX_STEPS,
    learning_rate=LR,
    fp16=False,
    logging_steps=LOGGING_STEPS,
    output_dir=output_dir,
    optim="paged_adamw_8bit",
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="wandb",
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
)

def format_prompt_phi(prompt, completion=None):
    """Wrap a prompt (and optionally its completion) in the chat markup used here.

    Args:
        prompt: Text placed in the user turn.
        completion: Text placed in the assistant turn. When it is falsy
            (None or an empty string) the result ends right after the
            assistant tag, ready for generation.

    Returns:
        The formatted string.
    """
    if not completion:
        return f"<|user|>\n {prompt} <|end|>\n<|assistant|>"
    return f"<|user|>\n {prompt} <|end|>\n<|assistant|> {completion} <|end|>"

def formatting_prompts_func(example):
    """Format a batch of examples into training texts.

    Args:
        example: Mapping with parallel 'prompt' and 'completion' sequences.

    Returns:
        A list with one formatted string per prompt, in the same order.
    """
    return [
        format_prompt_phi(prompt=prompt_text, completion=example['completion'][row])
        for row, prompt_text in enumerate(example['prompt'])
    ]


# Assemble the fine-tuning trainer from the pieces prepared above: model,
# datasets, training arguments, adapter config, prompt formatter and the
# early-stopping callback.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=lora_config,
    train_dataset=api_train,
    eval_dataset=api_eval,
    formatting_func=formatting_prompts_func,
    callbacks=[early_stopping],
)

print(trainer)

trainer.train()


#### Evaluate Best Model




In [None]:
import gc
# Garbage-collect first so tensors held by unreachable objects are returned to
# the allocator's cache, then release the cache; the reverse order leaves that
# memory cached.
gc.collect()
torch.cuda.empty_cache()

In [None]:



from tqdm import tqdm

# Register tqdm's pandas integration; the `progress_apply` call further down
# in this cell relies on it.
tqdm.pandas()

def generate_completion(input_str, max_new_tokens, model_object=None, tokeniser_object=None):
    """Generate the model's continuation for one already-formatted prompt.

    Args:
        input_str: Prompt text, already wrapped in the chat markup.
        max_new_tokens: Upper bound on the number of generated tokens.
        model_object: Object exposing `generate` (and optionally `device`).
            Defaults to the notebook-level `model`.
        tokeniser_object: Tokeniser used to encode the prompt and decode the
            output. Defaults to the notebook-level `tokeniser`.

    Returns:
        The decoded continuation only: prompt tokens are sliced off, special
        tokens are skipped and surrounding whitespace is stripped, so the
        answer can be compared directly against reference strings.
    """
    if model_object is None:
        model_object = model
    if tokeniser_object is None:
        tokeniser_object = tokeniser

    inputs = tokeniser_object(input_str, return_tensors="pt")
    input_ids = inputs['input_ids']
    attention_mask = inputs.get('attention_mask')

    # Put the inputs on the model's device when the model reports one.
    device = getattr(model_object, 'device', None)
    if device is not None:
        input_ids = input_ids.to(device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

    generate_kwargs = {
        'input_ids': input_ids,
        'max_new_tokens': max_new_tokens,
        'do_sample': True,
        'temperature': 0.01,
    }
    if attention_mask is not None:
        generate_kwargs['attention_mask'] = attention_mask

    outputs = model_object.generate(**generate_kwargs)

    # The output sequence starts with the prompt tokens; keep only what follows.
    prompt_length = input_ids.shape[1]
    answer = tokeniser_object.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return answer.strip()


def generate_model_answers(test_results, model_object, model_name, tokeniser=None):
    """Add token counts, formatted prompts and model answers to a results frame.

    Args:
        test_results: DataFrame with 'function_calls', 'prompt' and 'split' columns.
        model_object: Model used for generation (passed on to `generate_completion`).
        model_name: Label written to the 'model' column.
        tokeniser: Tokeniser used for counting and generation. Defaults to the
            notebook-level `tokeniser`, looked up when the function is called.

    Returns:
        A copy of `test_results` with 'answer_tokens', 'prompt_tokens',
        'formatted_prompt', 'model_answer' and 'model' columns; the frame
        passed in is left untouched.
    """
    if tokeniser is None:
        # The parameter shadows the notebook-level name, hence the explicit lookup.
        tokeniser = globals()["tokeniser"]

    # Switch to inference mode so dropout layers are inactive while generating.
    if hasattr(model_object, "eval"):
        model_object.eval()

    test_results = test_results.copy()

    # NOTE(review): this reads 'function_calls' while the evaluation cell reads
    # 'function_call' — confirm both columns exist in the data.
    test_results['answer_tokens'] = test_results['function_calls'].apply(lambda x: len(tokeniser.encode(x)))
    test_results['prompt_tokens'] = test_results['prompt'].apply(lambda x: len(tokeniser.encode(x)))

    test_results['formatted_prompt'] = test_results['prompt'].apply(format_prompt_phi)

    # Only rows of the 'test' split are generated for; the budget is the
    # reference answer length plus 20 tokens of slack.
    test_results['model_answer'] = test_results.progress_apply(
        lambda row: generate_completion(
            row['formatted_prompt'],
            max_new_tokens=row['answer_tokens'] + 20,
            model_object=model_object,
            tokeniser_object=tokeniser,
        ) if row['split'] == 'test' else None,
        axis=1,
    )

    test_results['model'] = model_name

    return test_results


# The trainer was configured with load_best_model_at_end=True, so its model is
# the one used for test-set generation.
best_model = trainer.model

test_results = generate_model_answers(
    api_test,
    model_object=best_model,
    model_name=run_name,
    tokeniser=tokeniser,
)

In [None]:

import re

# Cast the reference columns to stripped strings before they are compared
# with the model output.
for reference_column in ('completion', 'function_call', 'function_name'):
    test_results[reference_column] = test_results[reference_column].astype(str).str.strip()

# ROUGE-1 scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
# scores =
def evaluate_test_results(df, rouge_scorer=None):
    """Score model answers against the reference columns.

    Args:
        df: DataFrame with 'completion', 'function_call', 'function_name' and
            'model_answer' columns.
        rouge_scorer: Object with a `score(target=..., prediction=...)` method
            returning a mapping whose 'rouge1' entry has `precision`, `recall`
            and `fmeasure` attributes. Defaults to the notebook-level `scorer`,
            looked up when the function is called.

    Returns:
        A copy of `df` with these added columns:
        - contained_correct_call: reference completion is a substring of the
          answer (False when the answer is missing).
        - exact_match / contains_function_name / answer_included_list:
          None when the answer is missing or empty.
        - answer_list: first `[...]` span found in the answer, '' otherwise.
        - rouge_1_score and its precision / recall / f1 components, computed
          between the reference completion and `answer_list`.
    """
    if rouge_scorer is None:
        rouge_scorer = globals()["scorer"]

    list_pattern = re.compile(r'\[.*\]')

    results = df.copy()

    raw_answers = results['model_answer'].tolist()
    # Usable answers are non-empty strings; anything else (None, NaN, '') is missing.
    answers = [a if isinstance(a, str) and a else None for a in raw_answers]
    completions = results['completion'].tolist()
    function_calls = results['function_call'].tolist()
    function_names = results['function_name'].tolist()

    # A missing answer counts as a miss instead of raising TypeError on `in`.
    results['contained_correct_call'] = [
        (ref in raw) if isinstance(raw, str) else False
        for ref, raw in zip(completions, raw_answers)
    ]
    results['exact_match'] = [
        (ref == ans) if ans is not None else None
        for ref, ans in zip(function_calls, answers)
    ]
    results['contains_function_name'] = [
        (ref in ans) if ans is not None else None
        for ref, ans in zip(function_names, answers)
    ]

    matches = [list_pattern.search(ans) if ans is not None else None for ans in answers]
    results['answer_included_list'] = [
        bool(found) if ans is not None else None
        for found, ans in zip(matches, answers)
    ]
    results['answer_list'] = [found.group(0) if found else '' for found in matches]

    scores = [
        rouge_scorer.score(target=ref, prediction=pred)
        for ref, pred in zip(completions, results['answer_list'].tolist())
    ]
    # Explicit object Series so each cell holds the scorer's result mapping as-is.
    results['rouge_1_score'] = pd.Series(scores, index=results.index, dtype=object)

    results['rouge_1_precision_list'] = [s['rouge1'].precision for s in scores]
    results['rouge_1_recall_list'] = [s['rouge1'].recall for s in scores]
    results['rouge_1_f1_list'] = [s['rouge1'].fmeasure for s in scores]

    return results

evaluated_results = evaluate_test_results(test_results)

# Column-wise mean of every per-row metric.
metric_columns = [
    'contained_correct_call',
    'exact_match',
    'answer_included_list',
    'contains_function_name',
    'rouge_1_precision_list',
    'rouge_1_recall_list',
    'rouge_1_f1_list',
]
agg_results = evaluated_results[metric_columns].mean()
agg_results

In [None]:
# Show five randomly chosen rows: model answer, reference completion and the prompt sent to the model.
evaluated_results[['model_answer','completion','formatted_prompt']].sample(5)

In [None]:
# Build the destination path, then swap every "-" in it for "_".
results_path = f"/content/drive/MyDrive/api_bank_results_{run_name}_fine_tune.xlsx"
OUTPUT_PATH = results_path.replace("-","_")

# Write the evaluated rows to Excel and echo where they went.
evaluated_results.to_excel(OUTPUT_PATH)
print(OUTPUT_PATH)

In [None]:
best_checkpoint = str(trainer.state.best_model_checkpoint)
# Build the payload under a new name instead of rebinding `agg_results` from a
# Series to a dict, so this cell can be re-run without failing on `.to_dict()`.
final_metrics = agg_results.to_dict()
final_metrics['best_checkpoint'] = best_checkpoint
wandb.log(final_metrics)

In [None]:
# Close the wandb run that was opened with wandb.init earlier in the notebook.
wandb.finish()

In [None]:
# Final step of the notebook: unassign the Colab runtime now that results are
# saved and logged.
from google.colab import runtime
runtime.unassign()