#### *Define* constants & setup


In [1]:
PATH_TO_DATA = '/content/drive/MyDrive/cleaned_api_bank_data.xlsx'
MODEL = "microsoft/Phi-3-mini-4k-instruct"#"microsoft/phi-2"# "microsoft/phi-2"
LOAD_IN_4_BIT = True

SAMPLE = None
MAX_STEPS=2000

R = 32
ALPHA = R*2

LR= 2e-4 # 5e-5
hub_name = "ac-99/phi-3"


MODULES_TO_TUNE = ["o_proj", "qkv_proj", "gate_up_proj","down_proj"]

EVAL_FRACTION = 0.5 # 1.0 is all 10% of the existing one

WARMUP_STEPS = 50
LOGGING_STEPS = 200
EVAL_STEPS = 200
SAVE_STEPS = 200
PER_DEVICE_TRAIN_BATCH_SIZE = 1
PER_DEVICE_EVAL_BATCH_SIZE = 4
GRAD_ACCUMULATION_STEPS = 5
METRIC_FOR_BEST_MODEL = 'eval_loss'
LORA_DROPOUT=0.05

EARLY_STOPPING_EVALS = 3

config ={
    "path_to_data": PATH_TO_DATA,
    "model": MODEL,
    "load_in_4_bit": LOAD_IN_4_BIT,
    "alpha": ALPHA,
    "r": R,
    "LORA_DROPOUT":LORA_DROPOUT,
    "learning_rate": LR,
    "max_steps": MAX_STEPS,
    "hub_name": hub_name,
    "modules_to_tune": MODULES_TO_TUNE,
    "warmup_steps": WARMUP_STEPS,
    "logging_steps": LOGGING_STEPS,
    "eval_steps": EVAL_STEPS,
    "save_steps": SAVE_STEPS,
    "per_device_train_batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
    "per_device_eval_batch_size": PER_DEVICE_EVAL_BATCH_SIZE,
    "grad_accumulation_steps": GRAD_ACCUMULATION_STEPS,
}




#### Preamble and Setup

In [2]:

from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# !pip install flash-attn --no-build-isolation -q
!pip install peft -q
!pip install datasets -q
!pip install trl transformers -q
!pip install -q -U bitsandbytes>=0.44.0
!pip install rouge-score -q
!pip install accelerate -q
!pip install wandb -q

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes
import wandb
import pandas as pd
from peft import LoraConfig
from datasets import load_metric
import numpy as np
from rouge_score import rouge_scorer
from datasets import Dataset
from google.colab import userdata
import huggingface_hub
import os
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback




[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.8/266.8 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

#### Config

In [4]:
WANDB_LOGIN_KEY =userdata.get('WANDB_LOGIN')
wandb.login("false",WANDB_LOGIN_KEY)

HF_WRITE_KEY = userdata.get('HF_WRITE_KEY')
huggingface_hub.login(token=HF_WRITE_KEY)


bnb_config = BitsAndBytesConfig(
    load_in_4bit=LOAD_IN_4_BIT,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
     bnb_4bit_use_double_quant=True
)

tokeniser = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL,
                                             quantization_config=bnb_config,
                                             trust_remote_code=True
                                             )

model_name_only = MODEL.split("/")[-1]
run_name = f"{model_name_only}-LR-{LR}-R-{R}-ALPHA-{ALPHA}-steps-{MAX_STEPS}"

if LOAD_IN_4_BIT:
  run_name+="_4_BIT"

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="tool-learning",
    name=run_name,

    # track hyperparameters and run metadata
    config=config
)
lora_config = LoraConfig(
    r=R,
    lora_alpha=ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=MODULES_TO_TUNE,
    task_type="CAUSAL_LM",
)


model

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33mac-99[0m. Use [1m`wandb login --relogin`[0m to force relogin


Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, o

#### Data

In [5]:


api_bank_df = pd.read_excel(PATH_TO_DATA)

if SAMPLE:
  api_bank_df=api_bank_df.sample(SAMPLE)
else:
  print("No sampling!")

api_train = Dataset.from_pandas(api_bank_df[api_bank_df['split']=='train'])
api_test = api_bank_df[api_bank_df['split']=='test']
api_eval = Dataset.from_pandas(api_bank_df[api_bank_df['split']=='eval'].sample(frac=EVAL_FRACTION))



output_dir = f"drive/MyDrive/{run_name}"

try:
  os.mkdir(output_dir)
except Exception as e:
  print(e)

No sampling!
[Errno 17] File exists: 'drive/MyDrive/Phi-3-mini-4k-instruct-LR-0.0002-R-32-ALPHA-64-steps-2000_4_BIT'


#### Training

In [6]:

import gc
torch.cuda.empty_cache()
gc.collect()

early_stopping = EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_EVALS)

training_args = TrainingArguments(
            per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
            per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
            gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
            warmup_steps=WARMUP_STEPS,
            max_steps=MAX_STEPS,
            learning_rate=LR,
            fp16=False,
            logging_steps=LOGGING_STEPS,
            output_dir=output_dir,
            optim="paged_adamw_8bit",
            load_best_model_at_end=True,
            save_total_limit=1,
            report_to="wandb",
            do_eval=True,
            evaluation_strategy="steps",
            eval_steps=EVAL_STEPS,
            save_steps=SAVE_STEPS,
            metric_for_best_model = METRIC_FOR_BEST_MODEL,
        )

def format_prompt_phi(prompt, completion=None):
    if completion:
      text = f"<|user|>\n {prompt} <|end|>\n<|assistant|> {completion} <|end|>"
    else:
      text = f"<|user|>\n {prompt} <|end|>\n<|assistant|>"
    return text

def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['prompt'])):
        text = format_prompt_phi(prompt=example['prompt'][i],completion=example['completion'][i])
        output_texts.append(text)
    return output_texts


trainer = SFTTrainer(
    model=model,
    train_dataset=api_train,
    eval_dataset=api_eval,
        args=training_args,
    peft_config=lora_config,
    formatting_func = formatting_prompts_func,
    callbacks=[early_stopping]
)


print(trainer)

trainer.train()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/9321 [00:00<?, ? examples/s]

Map:   0%|          | 0/524 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


<trl.trainer.sft_trainer.SFTTrainer object at 0x783a70aac070>




Step,Training Loss,Validation Loss
200,0.3529,0.255043
400,0.2092,0.237308
600,0.1916,0.220169
800,0.1769,0.202278
1000,0.1598,0.185085
1200,0.1413,0.170068
1400,0.1303,0.151645
1600,0.1152,0.138659
1800,0.1051,0.129376
2000,0.0794,0.125227


TrainOutput(global_step=2000, training_loss=0.1661754755973816, metrics={'train_runtime': 26889.8953, 'train_samples_per_second': 0.372, 'train_steps_per_second': 0.074, 'total_flos': 1.6837686532572365e+17, 'train_loss': 0.1661754755973816, 'epoch': 1.0728462611307799})

#### Evaluate Best Model




In [7]:
import gc
torch.cuda.empty_cache()
gc.collect()

63

In [8]:



from tqdm import tqdm

tqdm.pandas()

def generate_completion(input_str, max_new_tokens):
  inputs = tokeniser(input_str, return_tensors="pt")

  outputs = model.generate(input_ids=inputs['input_ids'],max_new_tokens=max_new_tokens,do_sample=True,temperature=0.01)

  prompt_length = inputs['input_ids'].shape[1]

  answer = tokeniser.decode(outputs[0][prompt_length:])

  return answer


  # api_bank_df.head(1)

def generate_model_answers(test_results, model_object, model_name, tokeniser=tokeniser):

    test_results['answer_tokens'] = test_results['function_calls'].apply(lambda x: len(tokeniser.encode(x)))
    test_results['prompt_tokens'] = test_results['prompt'].apply(lambda x: len(tokeniser.encode(x)))

    test_results['formatted_prompt'] = test_results['prompt'].apply(lambda x: format_prompt_phi(x))
    test_results['model_answer'] = test_results.progress_apply(lambda x: generate_completion(x['formatted_prompt'], max_new_tokens=x['answer_tokens']+20) if x['split'] == 'test' else None, axis=1)

    test_results['model'] = model_name


    return test_results


best_model = trainer.model

test_results = generate_model_answers(api_test, model_object=best_model,model_name=run_name,tokeniser=tokeniser)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_results['answer_tokens'] = test_results['function_calls'].apply(lambda x: len(tokeniser.encode(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_results['prompt_tokens'] = test_results['prompt'].apply(lambda x: len(tokeniser.encode(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tes

In [9]:

import re

test_results['completion']=test_results['completion'].astype(str).str.strip()
test_results['function_call']=test_results['function_call'].astype(str).str.strip()
test_results['function_name']=test_results['function_name'].astype(str).str.strip()

scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
# scores =
def evaluate_test_results(df,rouge_scorer=scorer):

  test_results = df.copy()
  test_results['contained_correct_call'] = test_results.apply(lambda x: x['completion'] in x['model_answer'], axis=1)
  test_results['exact_match'] = test_results.apply(lambda x: x['function_call'] == x['model_answer'] if x['model_answer'] else None, axis=1)
  test_results['contains_function_name'] = test_results.apply(lambda x: x['function_name'] in x['model_answer'] if x['model_answer'] else None, axis=1)
  test_results['answer_included_list'] = test_results.apply(lambda x: bool(re.search(r'\[.*\]', x['model_answer'])) if x['model_answer'] else None, axis=1)
  test_results['answer_list'] = test_results.apply(lambda x: re.search(r'\[.*\]', x['model_answer']).group(0) if x['answer_included_list'] else '', axis=1)



  test_results['rouge_1_score'] = test_results.apply(lambda x: rouge_scorer.score(target=x['completion'],prediction=x['answer_list']),axis=1)

  test_results['rouge_1_precision_list'] = test_results['rouge_1_score'].apply(lambda x: x['rouge1'].precision)
  test_results['rouge_1_recall_list'] =  test_results['rouge_1_score'].apply(lambda x: x['rouge1'].recall)
  test_results['rouge_1_f1_list'] =  test_results['rouge_1_score'].apply(lambda x: x['rouge1'].fmeasure)


  return test_results

evaluated_results = evaluate_test_results(test_results)
agg_results = evaluated_results[['contained_correct_call','exact_match','answer_included_list','contains_function_name','rouge_1_precision_list','rouge_1_recall_list','rouge_1_f1_list']].mean()
agg_results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_results['completion']=test_results['completion'].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_results['function_call']=test_results['function_call'].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_results['function_name']=test_results[

contained_correct_call    0.457372
exact_match               0.000000
answer_included_list      0.988967
contains_function_name    0.910732
rouge_1_precision_list    0.772752
rouge_1_recall_list       0.827263
rouge_1_f1_list           0.790048
dtype: float64

In [10]:
evaluated_results[['model_answer','completion','formatted_prompt']].sample(5)

Unnamed: 0,model_answer,completion,formatted_prompt
10821,[OrganizationMembers(organization='ByteDance'...,[OrganizationMembers(organization='ByteDance')],<|user|>\n \nGenerate an API request in the fo...
11160,[ToolSearcher(keywords='get likes for a post'...,[UserPosts(user_id='5')],<|user|>\n \nYou will be tested on your abilit...
10577,[Wiki(keyword='artificial intelligence')] <|e...,[Wiki(keyword='artificial intelligence')],<|user|>\n \nGenerate an API request in the fo...
10677,"[GetUserToken(username='newuser', password='n...","[GetUserToken(username='newuser', password='ne...",<|user|>\n \nGenerate an API request in the fo...
11107,[ToolSearcher(keywords='get_movies')] <|end|>,[ToolSearcher(keywords='UserWatchedMovies')],<|user|>\n \nYou will be tested on your abilit...


In [11]:
OUTPUT_PATH = f"/content/drive/MyDrive/api_bank_results_{run_name}_fine_tune.xlsx".replace("-","_")

evaluated_results.to_excel(OUTPUT_PATH)
print(OUTPUT_PATH)

/content/drive/MyDrive/api_bank_results_Phi_3_mini_4k_instruct_LR_0.0002_R_32_ALPHA_64_steps_2000_4_BIT_fine_tune.xlsx


In [12]:
best_checkpoint = str(trainer.state.best_model_checkpoint)
agg_results = agg_results.to_dict()
agg_results['best_checkpoint']=best_checkpoint
wandb.log(agg_results)

In [13]:
wandb.finish()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
answer_included_list,▁
contained_correct_call,▁
contains_function_name,▁
eval/loss,█▇▆▅▄▃▂▂▁▁
eval/runtime,▄█▂▄▃▁▃▄▆▃
eval/samples_per_second,█▁██████▁█
eval/steps_per_second,▁▁▁▁▁▁▁▁▁▁
exact_match,▁
rouge_1_f1_list,▁
rouge_1_precision_list,▁

0,1
answer_included_list,0.98897
best_checkpoint,drive/MyDrive/Phi-3-...
contained_correct_call,0.45737
contains_function_name,0.91073
eval/loss,0.12523
eval/runtime,639.0328
eval/samples_per_second,0.82
eval/steps_per_second,0.205
exact_match,0.0
rouge_1_f1_list,0.79005


In [14]:
from google.colab import runtime
runtime.unassign()