In [1]:
import pandas as pd 
import torch
import torch.nn as nn
import transformers
import json
from datasets import load_dataset
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging,Trainer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
from datasets import load_dataset
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Base checkpoint that gets fine-tuned.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 weights with double quantization; quantized matmuls compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
# Quantization is fully described by `bnb_config`. Passing `load_in_4bit=True`
# next to `quantization_config` is redundant, and recent transformers releases
# raise a ValueError when both are given, so only the config object is passed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
# The KV cache is switched off for training; this also silences the warning
# emitted when caching is combined with gradient checkpointing.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.47s/it]


In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'left'
# Wrap every encoded sequence in BOS ... EOS so the model sees an explicit end marker.
tokenizer.add_eos_token = True
tokenizer.add_bos_token = True
# Use a pad token that is distinct from EOS. The training collator
# (DataCollatorForLanguageModeling with mlm=False) replaces every label equal
# to pad_token_id with -100, so padding with EOS would also remove the appended
# EOS from the loss and the model would never be trained to stop generating.
tokenizer.pad_token = tokenizer.unk_token
# Displayed as the cell result to confirm both flags are active.
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

In [7]:
def formatting_text(data):
    """Render one dataset row as the instruction prompt given to the model.

    Args:
        data: Mapping-like row providing 'u' (user question), 'r' (response to
            judge), 's' (background statements) and 's.type' (one entry per
            statement; the values are not rendered, but the shorter of
            's.type' and 's' bounds how many statements are listed).

    Returns:
        The prompt string: the instruction, the question, the background
        statements numbered from 1, and the response, wrapped in [INST]...[/INST].
    """
    instruction = '''[INST]Below is an English context with  user's question, situational background information, and my response. Your task is to judge whether my response to the user's question is appropriate or not only according to the situational background information provided. The user's question, situational background information and my response are specified with [User's question], [Background] and [My response] in the context respectively. If you think my response is appropriate, please answer "Yes"; if you think my response is inappropriate, please answer "No".'''
    user_question = data['u']
    my_response = data['r']
    # One "\n N. statement" entry per (type, statement) pair, numbered from 1.
    numbered_lines = [
        f"\n {position}. {statement}"
        for position, (_kind, statement) in enumerate(zip(data['s.type'], data['s']), start=1)
    ]
    background_block = "[Background]:" + "".join(numbered_lines) + "\n[/Background]"
    return (
        f"{instruction} \n\n[User's question]: {user_question} [/User's question] "
        f"\n{background_block}\n[My response]: {my_response} [/My response][/INST]"
    )
def formatting_label(data):
    """Build the target completion for one dataset row.

    Args:
        data: Mapping-like row providing 'r.label' (1 means the response is
            appropriate, anything else means it is not) and 's.gold.index'
            (0-based indices of the relevant background statements).

    Returns:
        A two-sentence string: the relevant statement numbers (sorted,
        converted to 1-based) followed by the verdict inside <answer> tags.
    """
    if data['r.label'] == 1:
        verdict, judgement = "Yes", "appropriate"
    else:
        verdict, judgement = "No", "inappropriate"
    # Shift to 1-based numbers so they match the numbering used in the prompt.
    statement_numbers = sorted(index + 1 for index in data['s.gold.index'])
    return (
        f"First, I identify the background statements relevant to user's question by number: {statement_numbers}."
        f"\nBased on the relevant background statements, the answer is <answer>{verdict}, your response is {judgement}.</answer>"
    )
def format_prompt(data):
    """Return the full training example: data['text'], a newline, then data['label']."""
    return "{}\n{}".format(data['text'], data['label'])
def read_json(mode):
    """Load `dataset/{mode}.json` into a DataFrame.

    Args:
        mode: File stem of the split to read (the notebook uses "train",
            "val" and "test").

    Returns:
        pandas.DataFrame built from the parsed JSON content.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; read it explicitly as such instead of relying on the
    # platform's default encoding.
    with open(f"dataset/{mode}.json", 'r', encoding="utf-8") as file:
        data = json.load(file)
    return pd.DataFrame(data)
# Load the three splits in the same order as before: train, val, test.
train_data, valid_data, test_data = (read_json(split) for split in ("train", "val", "test"))

# Render prompt text, target label and the concatenated training example for
# the training split, then keep only those three columns.
train_data = (
    train_data
    .assign(text=lambda frame: frame.apply(formatting_text, axis=1))
    .assign(label=lambda frame: frame.apply(formatting_label, axis=1))
    .assign(prompt=lambda frame: frame.apply(format_prompt, axis=1))
    [['text', 'label', 'prompt']]
)

# Same three derived columns for the validation split.
valid_data = (
    valid_data
    .assign(text=lambda frame: frame.apply(formatting_text, axis=1))
    .assign(label=lambda frame: frame.apply(formatting_label, axis=1))
    .assign(prompt=lambda frame: frame.apply(format_prompt, axis=1))
    [['text', 'label', 'prompt']]
)

In [8]:
# Spot-check one fully rendered training example (row label 1001).
print(train_data['prompt'][1001])

[INST]Below is an English context with  user's question, situational background information, and my response. Your task is to judge whether my response to the user's question is appropriate or not only according to the situational background information provided. The user's question, situational background information and my response are specified with [User's question], [Background] and [My response] in the context respectively. If you think my response is appropriate, please answer "Yes"; if you think my response is inappropriate, please answer "No". 

[User's question]: Can you turn on the TV for me, please? [/User's question] 
[Background]:
 1. [user] has a letter to send.
 2. There are some soft drinks in the refrigerator.
 3. It is Sunday afternoon now.
 4. [user] has two friends visiting.
 5. [user] has protein powder in the kitchen.
 6. [user] has a TV in the house.
 7. [user] has milk in the fridge.
 8. [user] is home.
 9. The TV is currently off.
 10. There are drinks and app

In [9]:
# Summary statistics of prompt length in characters (len() of each string, not token counts).
train_data['prompt'].map(len).describe()

count    3696.000000
mean     1416.364989
std        47.081108
min      1278.000000
25%      1384.000000
50%      1414.500000
75%      1446.000000
max      1622.000000
Name: prompt, dtype: float64

In [15]:
def preprocess(data, max_length=1650):  # full prompt
    """Tokenize a batch of rendered prompts for causal-LM fine-tuning.

    Args:
        data: Batch from `Dataset.map(batched=True)`; `data['prompt']` is the
            list of full training texts (instruction followed by the answer).
        max_length: Upper bound on tokens per example; longer prompts are
            truncated.

    Returns:
        The tokenizer output (`input_ids`, `attention_mask`) without padding.

    Padding and labels are deliberately left to the data collator defined
    below:
      * The earlier length statistics are character counts, so padding every
        example to 1650 tokens made each sequence far longer than its real
        content and made `group_by_length` meaningless, since every row had
        the same length. The collator pads each batch only to its longest
        member.
      * DataCollatorForLanguageModeling(mlm=False) derives `labels` from
        `input_ids` itself, so building them here was redundant.
    """
    return tokenizer(data['prompt'], max_length=max_length, truncation=True)


new_train_data = datasets.Dataset.from_pandas(train_data, split='train')
new_valid_data = datasets.Dataset.from_pandas(valid_data, split='train')
# Replace the text columns with token ids.
tokenized_train_data = new_train_data.map(preprocess, remove_columns=new_train_data.column_names, batched=True)
tokenized_valid_data = new_valid_data.map(preprocess, remove_columns=new_valid_data.column_names, batched=True)
# Pads per batch and builds causal-LM labels (pad positions are set to -100).
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)


Map: 100%|██████████| 3696/3696 [00:01<00:00, 2970.62 examples/s]
Map: 100%|██████████| 792/792 [00:00<00:00, 3110.92 examples/s]


In [10]:
# Prepare the quantized model for k-bit training before adapters are attached.
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Modules that receive LoRA adapters. A stray empty-string literal used to
    # sit between "k_proj" and "v_proj"; it only worked because Python
    # concatenates adjacent string literals, so it is removed here.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
)
model = get_peft_model(model, peft_config)

In [11]:
def print_trainable_parameters(model):
    """
    Print how many parameters of `model` require gradients.

    Emits one line with the trainable element count, the total element count
    (as reported by `numel()` of each named parameter) and the trainable
    share in percent.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
# Report the trainable share of the model after the LoRA adapters were attached.
print_trainable_parameters(model)

trainable params: 85041152 || all params: 3837112320 || trainable%: 2.2162799758751914


In [12]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

# Both the model and optimizer state dicts are configured the same way:
# offloaded to CPU and not restricted to rank 0.
state_dict_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
optim_state_dict_cfg = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=state_dict_cfg,
    optim_state_dict_config=optim_state_dict_cfg,
)

# Let accelerate wrap the PEFT model using the FSDP plugin.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
model = accelerator.prepare_model(model)

In [16]:
training_arguments = TrainingArguments(
    output_dir="./models/chat_model_finetune",  # baseline_with_complete_prompt
    logging_dir="./logs/chat_model_finetune",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    # Checkpoint and log every 100 steps; keep at most 4 checkpoints on disk.
    save_steps=100,
    save_total_limit=4,
    logging_steps=100,
    evaluation_strategy="steps",
    save_strategy="steps",
    learning_rate=2e-5,
    weight_decay=0.001,
    # The base model is loaded with torch_dtype=bfloat16 and a bfloat16 4-bit
    # compute dtype, so mixed precision must be bf16 as well; fp16 autocast
    # would mix two different half-precision formats.
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,  # -1: the number of steps is derived from num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,
    do_eval=True,
    # The plain "constant" schedule ignores warmup_ratio; the warmup variant
    # honours the 3% warmup requested above and is constant afterwards.
    lr_scheduler_type="constant_with_warmup",
    # NOTE(review): this only takes effect together with
    # load_best_model_at_end=True, which is not enabled here.
    metric_for_best_model="eval_loss",
)
# Plain Trainer over the pre-tokenized splits; batching and label creation
# are delegated to `data_collator`.
trainer = Trainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_valid_data,
)
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=new_train_data,
#     eval_dataset = new_valid_data,
#     peft_config=peft_config,
#     max_seq_length= 1150,
#     dataset_text_field="prompt",
#     tokenizer=tokenizer,
#     args=training_arguments,
#     packing= False,
#     compute_metrics = compute_metrics
# )

In [18]:
# Run fine-tuning with the arguments configured above; checkpoints are written under output_dir.
trainer.train()

## Test: load the fine-tuned adapter and generate predictions

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
# Use exactly the same repository id as the training cell ("Instruct" with a
# capital I) so both sections refer to the same checkpoint by the same name.
base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
# Same 4-bit NF4 / double-quantization / bfloat16 settings as used for training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
)

# Only BOS is added at inference time, so the model is free to continue
# generating after the prompt.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True)


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.01it/s]


In [5]:
# Attach the fine-tuned LoRA adapter weights from a saved checkpoint to the quantized base model.
# NOTE(review): this directory differs from the output_dir used in the training
# cell ("./models/chat_model_finetune") -- confirm it is the intended run.
ft_model = PeftModel.from_pretrained(base_model, "./models/chat_version_with_full_prompt7/checkpoint-1800")

In [21]:
import json
import pandas as pd
def formatting_text(data):
    """Render one dataset row as the instruction prompt given to the model.

    Args:
        data: Mapping-like row providing 'u' (user question), 'r' (response to
            judge), 's' (background statements) and 's.type' (one entry per
            statement; the values are not rendered, but the shorter of
            's.type' and 's' bounds how many statements are listed).

    Returns:
        The prompt string: the instruction, the question, the background
        statements numbered from 1, and the response, wrapped in [INST]...[/INST].
    """
    instruction = '''[INST]Below is an English context with  user's question, situational background information, and my response. Your task is to judge whether my response to the user's question is appropriate or not only according to the situational background information provided. The user's question, situational background information and my response are specified with [User's question], [Background] and [My response] in the context respectively. If you think my response is appropriate, please answer "Yes"; if you think my response is inappropriate, please answer "No".'''
    user_question = data['u']
    my_response = data['r']
    # One "\n N. statement" entry per (type, statement) pair, numbered from 1.
    numbered_lines = [
        f"\n {position}. {statement}"
        for position, (_kind, statement) in enumerate(zip(data['s.type'], data['s']), start=1)
    ]
    background_block = "[Background]:" + "".join(numbered_lines) + "\n[/Background]"
    return (
        f"{instruction} \n\n[User's question]: {user_question} [/User's question] "
        f"\n{background_block}\n[My response]: {my_response} [/My response][/INST]"
    )
def formatting_label(data):
    """Build the reference completion for one dataset row.

    Args:
        data: Mapping-like row providing 'r.label' (1 means the response is
            appropriate, anything else means it is not) and 's.gold.index'
            (0-based indices of the relevant background statements).

    Returns:
        A two-sentence string: the relevant statement numbers (sorted,
        1-based) followed by the verdict inside <answer> tags.
    """
    label = "yes" if data['r.label'] == 1 else "no"
    # The prompt numbers background statements from 1 and the training-time
    # labels use the same 1-based numbers, so the 0-based gold indices must be
    # shifted here as well; otherwise the two label formats disagree.
    relevant_background = [d + 1 for d in sorted(data['s.gold.index'])]
    judgement = "appropriate" if label == "yes" else "inappropriate"
    return (
        f"First, I identify the background statements relevant to user's question by number: {relevant_background}."
        f"\nBased on the relevant background statements, the answer is <answer>{label.capitalize()}, your response is {judgement}.</answer>"
    )
def format_prompt(data):
    """Return the full example: data['text'], a newline, then data['label']."""
    return "{}\n{}".format(data['text'], data['label'])
def read_json(mode):
    """Load `dataset/{mode}.json` into a DataFrame.

    Args:
        mode: File stem of the split to read (this section reads "test").

    Returns:
        pandas.DataFrame built from the parsed JSON content.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; read it explicitly as such instead of relying on the
    # platform's default encoding.
    with open(f"dataset/{mode}.json", 'r', encoding="utf-8") as file:
        data = json.load(file)
    return pd.DataFrame(data)
# Load the test split and add the rendered prompt as a 'text' column.
test_data = read_json("test").assign(
    text=lambda frame: frame.apply(formatting_text, axis=1)
)

In [24]:
import re
from tqdm import tqdm, trange
from transformers import StoppingCriteriaList,StoppingCriteria
# Generate with the fine-tuned adapter model loaded above. The previous code
# referenced `model`, which is never defined in this inference section and
# would only exist if the training kernel were still alive.
ft_model.eval()
predictions = []


class StopWords(StoppingCriteria):
    """Stopping criterion that ends generation once the text ends with a stop word.

    Args:
        tk: Tokenizer used to decode the running sequence.
        stop_words: Strings that terminate generation when the decoded text
            ends with any of them.
    """

    def __init__(self, tk, stop_words: list[str]):
        self.tk = tk
        self.stop_tokens = stop_words

    def __call__(self, input_ids, *_, **__) -> bool:
        # Only the first sequence of the batch is inspected; the loop below
        # generates for a single prompt at a time.
        s = self.tk.batch_decode(input_ids)[0]
        return any(s.endswith(t) for t in self.stop_tokens)


sw = StopWords(tokenizer, ["</answer>"])
scl = StoppingCriteriaList([sw])
with torch.no_grad():
    for i in trange(len(test_data)):
        eval_prompt = test_data['text'][i]
        # Place the inputs on the model's device instead of hard-coding "cuda".
        model_input = tokenizer(eval_prompt, return_tensors="pt").to(ft_model.device)
        output_ids = ft_model.generate(
            **model_input,
            max_new_tokens=500,
            pad_token_id=tokenizer.eos_token_id,
            stopping_criteria=scl,
        )[0]
        # Keep only the newly generated tokens. Removing the prompt with
        # str.replace depends on decode() reproducing the prompt text exactly,
        # which is not guaranteed after a tokenize/decode round trip.
        prompt_length = model_input["input_ids"].shape[1]
        pred = tokenizer.decode(output_ids[prompt_length:], skip_special_tokens=True)
        predictions.append(pred)
        print(pred)


In [9]:
# Coarse substring-based classification of each generated text:
# 1 if it mentions "Yes"/"yes", otherwise 0 if it mentions "No"/"no", otherwise 2.
ans = [
    1 if ("Yes" in pre or "yes" in pre)
    else 0 if ("No" in pre or "no" in pre)
    else 2
    for pre in predictions
]

In [10]:
# (number of predictions classified 1, number classified 0)
ans.count(1), ans.count(0)

(247, 545)

In [31]:
import re
ans = []
predictions = pd.read_csv("predictions.csv")['result']
# Capture whatever sits between <answer> and </answer>; matching is
# case-insensitive and '.' also spans newlines.
answer_re = re.compile(r'<answer>(.*?)</answer>', re.IGNORECASE | re.DOTALL)
for text in predictions:
    hit = answer_re.search(text)
    # Fall back to a sentinel string when no <answer> block is present.
    ans.append(hit.group(1) if hit else "No match found")

In [43]:
# `ans` holds the full text captured between the <answer> tags, for example
# "Yes, your response is appropriate.", so an exact comparison with 'Yes'
# would label every such prediction 0. Only the leading verdict word is
# checked, ignoring case and surrounding whitespace. A bare 'Yes' still maps
# to 1 and the "No match found" sentinel still maps to 0.
final_ans = [1 if a.strip().lower().startswith('yes') else 0 for a in ans]
# Show the number of positive predictions as a sanity check (a bare expression
# in the middle of a cell displays nothing).
print(sum(final_ans))
pd.DataFrame({"response_quality": final_ans}).to_csv("submission.csv", index_label="index")