In [4]:
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM,AutoTokenizer,TrainingArguments,Trainer,BitsAndBytesConfig,DataCollatorForLanguageModeling
import pandas as pd
import numpy as np
import datasets
import random
from tqdm import tqdm
tqdm.pandas()
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
accelerator = Accelerator()
from peft import PeftModel

In [2]:
df = pd.read_json("function_call.jsonl",lines = True)

In [3]:
ids = random.choices(range(0,101),k = 10)

In [4]:
ids

[30, 61, 88, 36, 99, 20, 71, 35, 63, 85]

In [5]:
val = df.iloc[ids]

In [6]:
df = df.drop(ids)

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained('mistralai/Mistral-7B-Instruct-v0.3', quantization_config=bnb_config)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
tokenizer = AutoTokenizer.from_pretrained(
    'mistralai/Mistral-7B-Instruct-v0.3',
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token

In [9]:
df.loc[0,'instruction'],df.loc[0,'output']

('Reply with JSON for the following question: I want to do a total of 8945 and 1352',
 'Here is your generated JSON: \n```json\n{\n    "function_name": "add",\n    "parameter_1": "8945",\n    "parameter_2": "1352"\n}\n```')

In [10]:
def genPrompt(ex):
    prompt = f"""<s>[INST]You are an Expert Computer Programmer.
    You will Recieve a prompt to generate JSON for the given mathematical operation.
    You can only return the numbers and the operation as a function as JSON.
    Example:
    - User: Reply with JSON for the following question: I want to do a total of 8945 and 1352
    - Assistant: Here is your generated JSON: \n```json\n{{\n    "function_name": "add",\n    "parameter_1": "8945",\n    "parameter_2": "1352"\n}}\n```'
    [/INST]</s>
    <s>
    User:{ex['instruction']}
    Assistant:{ex['output']}
    </s>
    """
    return prompt

In [11]:
def tokenize(ex):
    return tokenizer(genPrompt(ex),truncation = True,padding = 'max_length',return_tensors = 'pt')

In [12]:
df = datasets.Dataset.from_pandas(df)
prompts = df.map(tokenize,batched = True,batch_size = 1)

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

In [13]:
prompts

Dataset({
    features: ['instruction', 'output', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 91
})

In [14]:
len(prompts['input_ids'][0])

512

In [15]:
val = datasets.Dataset.from_pandas(val).map(tokenize)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [16]:
model = prepare_model_for_kbit_training(model)

In [17]:
prompts = prompts.remove_columns(['instruction','output','__index_level_0__'])
val = val.remove_columns(['instruction','output','__index_level_0__'])
prompts = prompts.with_format("torch")
val = val.with_format('torch')

In [18]:
def correctFormat(ex):
    ex['input_ids'] = ex['input_ids'].squeeze(dim = 0)
    ex['attention_mask'] = ex['attention_mask'].squeeze(dim = 0)
    return ex

In [19]:
val = val.map(correctFormat)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [20]:
config = LoraConfig(
    r=8,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

In [21]:
trainer = Trainer(
    model=model,
    train_dataset=prompts,
    eval_dataset=val,
    args=TrainingArguments(
        output_dir="mistralMathTune",
        do_train=True,
        warmup_steps=5,
        per_device_train_batch_size=12,
        per_device_eval_batch_size = 10,
        gradient_accumulation_steps=1,
        weight_decay = 0.05,
        num_train_epochs=10,
        learning_rate=2e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=1,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=4,                # Save checkpoints every 50 steps
        eval_strategy="steps", # Evaluate the model every logging step
        eval_steps=1,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
        report_to=None,
        gradient_checkpointing = False,
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [22]:
trainer.train('mistralMathTune/checkpoint-40')

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
41,0.3038,0.409521
42,0.2994,0.403842
43,0.2838,0.398489
44,0.2787,0.392871
45,0.2725,0.381754
46,0.2826,0.374038
47,0.2677,0.373761
48,0.2734,0.374218
49,0.2469,0.373374
50,0.262,0.372184




TrainOutput(global_step=80, training_loss=0.1181545952335, metrics={'train_runtime': 5899.1545, 'train_samples_per_second': 0.154, 'train_steps_per_second': 0.014, 'total_flos': 1.994623719309312e+16, 'train_loss': 0.1181545952335, 'epoch': 10.0})

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="cuda",
    trust_remote_code=True,
    use_auth_token=True
)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [27]:
model_input = tokenizer("What is 12*2.5/6?\nOnly give me the answer, I do not need the process.",return_tensors = "pt").to("cuda")

In [28]:
ft_model = PeftModel.from_pretrained(base_model, "mistralMathTune/checkpoint-72")
ft_model.eval() 
''''''

''

In [29]:
with torch.no_grad():
    print(tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=100, pad_token_id=2,repetition_penalty = 1.2,do_sample = True,temperature = 0.1)[0], skip_special_tokens=True))

What is 12*2.5/6?
Only give me the answer, I do not need the process.
ANSWER: 2
EXPLANATION: The order of operations (PEMDAS) states that you should perform exponentiation before division or multiplication, and division before addition or subtraction. So in this problem, we first multiply 12 by 2.5 to get 30, then divide 30 by 6 to get 5. However, since all numbers are integers, there will be no decimal places when performing integer arith


In [30]:
model_input = tokenizer("What is 50+125+49?\nOnly give me the answer, I do not need the process.",return_tensors = "pt").to("cuda")

In [32]:
with torch.no_grad():
    print(tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=100, pad_token_id=2,repetition_penalty = 1.2,do_sample = True,temperature = 0.1)[0], skip_special_tokens=True))

What is 50+125+49?
Only give me the answer, I do not need the process.
ANSWER: The sum of 50 + 125 + 49 equals 224.
