In [17]:
import torch
import transformers
import sys
import os
import json
import os.path as osp

from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
from transformers import EarlyStoppingCallback, IntervalStrategy
from transformers import set_seed
from typing import List
from datasets import load_dataset
from peft import (LoraConfig, get_peft_model, get_peft_model_state_dict, prepare_model_for_int8_training, set_peft_model_state_dict)
from transformers import LlamaForCausalLM, LlamaTokenizer
from typing import Union
from peft import (LoraConfig, get_peft_model, get_peft_model_state_dict, prepare_model_for_int8_training, set_peft_model_state_dict,)



os.environ["WANDB_DISABLED"] = "true"
set_seed(42)
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


## Alpaca Instruction Frormat

In [18]:
def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

In [19]:
class Prompter(object):
    __slots__ = ("template", "_verbose")

    def __init__(self, template_name: str = ""):
        file_name = f"data/{template_name}.json"
        with open(file_name) as fp:
            self.template = json.load(fp)
        

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
            res = self.template["prompt_input"].format(
                instruction=instruction, input=input
            )
        else:
            res = self.template["prompt_no_input"].format(
                instruction=instruction
            )
        if label:
            res = f"{res}{label}"
        return res

    def get_response(self, output: str) -> str:
        return output.split(self.template["response_split"])[1].strip()

## Instruction Format for Sentiment Analysis

In [20]:

import pandas as pd
from datasets import Dataset,DatasetDict 

mapping = {2: "Positive", 1: "Negative"}
mapping_labels = {"Positive": 2, "Negative" :1}


def generate_prompting_instructions(dataset, train_dataset):
    examples = ""
    for i,example in train_dataset.iterrows():
        examples += f"review: {example['review']}\nResponse: {mapping[example['rating']]}\n"
        
    data = {"instruction":[], "input":[], "output":[]}
    for i, record in dataset.iterrows():
        rating = record["rating"]
        text = record["review"]
        data["instruction"].append(f"Classify the following review into two categories: 1) positive, and 2) negative based on its content, given the following examples:\n"+examples)
        data["output"].append(mapping[rating])
        data["input"].append(f"Review: {text}")
    return data



def generate_instructions(dataset):
    
    data = {"instruction":[], "input":[], "output":[]}
    for i, record in dataset.iterrows():
        rating = record["rating"]
        text = record["review"]
        data["instruction"].append(f"Classify the following review into two categories: 1) positive, and 2) negative based on its content.")
        data["output"].append(mapping[rating])
        data["input"].append(f"Review: {text}")
    return data
                           


In [21]:
def generate_and_tokenize_prompt(data_point):
    full_prompt = prompter.generate_prompt(
        data_point["instruction"],
        data_point["input"],
        data_point["output"],
    )
    tokenized_full_prompt = tokenize(full_prompt)
    if not train_on_inputs:
        user_prompt = prompter.generate_prompt(
            data_point["instruction"], data_point["input"]
        )
        tokenized_user_prompt = tokenize(
            user_prompt, add_eos_token=add_eos_token
        )
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        if add_eos_token:
            user_prompt_len -= 1

        tokenized_full_prompt["labels"] = [-100] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:]  
    return tokenized_full_prompt


def tokenize(prompt, add_eos_token=True):
    result = tokenizer(prompt, truncation=True, max_length=cutoff_len, padding=False, return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < cutoff_len
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

## Loading Data

In [22]:
def load_splits( training_samples = 2, prompting=False):
    df_training = pd.read_csv("data/train.csv", sep=",", encoding="utf-8")
    df_validation = pd.read_csv("data/val-tiny.csv", sep=",", encoding="utf-8")
    df_test = pd.read_csv("data/test.csv", sep=",", encoding="utf-8")
    df_training = df_training.sample(training_samples)
    dataset = {"train": df_training,"validation":df_validation , "test": df_test, }

    for split in dataset:
        if prompting:
            if split != "train":
                dataset[split]= generate_prompting_instructions(dataset[split], dataset["train"])
        else:
            dataset[split] = generate_instructions(dataset[split])
    return dataset

## Model Initilaization

In [23]:
generation_config = GenerationConfig()
prompter = Prompter("alpaca")
cutoff_len=1024
train_on_inputs=True
add_eos_token=False
prompt_template_name="alpaca"
lora_r = 8
lora_alpha = 16
lora_dropout = 0.05
lora_target_modules= ["q_proj", "v_proj",]

config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    

## Training Method

In [24]:
def train(model,
          data,
    
    batch_size: int = 32,
    micro_batch_size: int = 4,
    num_epochs: int = 110,
    learning_rate: float = 3e-4,    
    train_on_inputs: bool = True,  # if False, masks out inputs in loss
    add_eos_token: bool = False,
    early_stopping_threshold = 0.01,
    prompt_template_name: str = "alpaca",  # The prompt template to use, will default to alpaca.
):

    gradient_accumulation_steps = batch_size // micro_batch_size
    prompter = Prompter(prompt_template_name)

    device_map = "auto"

    tokenizer.pad_token_id = (0)
    tokenizer.padding_side = "left"  # Allow batched inference

    
    train_data = (data["train"].shuffle().map(generate_and_tokenize_prompt))
    val_data = (data["test"].shuffle().map(generate_and_tokenize_prompt))

    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=train_data,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=early_stopping_threshold)],
        data_collator=transformers.DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True),
        
        args=transformers.TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=100,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            fp16=True,
            logging_steps=10,
            lr_scheduler_type='linear',
            optim="adamw_torch",
            evaluation_strategy="steps", 
            metric_for_best_model="loss",
            save_strategy="steps",
            eval_steps=5 ,
            save_steps=5,
            save_total_limit=3,
            load_best_model_at_end = True,
            ddp_find_unused_parameters=None,
            group_by_length=False,
            do_eval=True,
            output_dir="/tmp/alpaca",
            report_to=None,
        ), 
    )
    model.config.use_cache = False
    trainer.train()
    return model

## Loading and Preprocesing Data

In [25]:
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score,f1_score
from tqdm import tqdm


## Evaluation

In [26]:
def main(few_shot_size, prompting=True, split="validation"):
    
    model_name = "wxjiao/alpaca-7b"   
    model = LlamaForCausalLM.from_pretrained(model_name,  device_map = 'auto', torch_dtype=torch.float16, )
    peft_model = get_peft_model(model, config)

    dataset = load_splits(prompting=prompting, training_samples=few_shot_size)
    data = DatasetDict()
    for k,v in dataset.items():
        data[k] = Dataset.from_dict(v)

    if prompting:
        
        data[split] = (data[split].shuffle().map(generate_and_tokenize_prompt))    
    if not prompting:
        learning_rate = 1e-3
        early_stopping_threshold=1e-2
        peft_model = train(peft_model,data, learning_rate=learning_rate, early_stopping_threshold=early_stopping_threshold)

    test_dataloader = DataLoader(data[split],batch_size=1)

    all_test_preds = []
    all_test_labels = []

    for step, (test_input) in enumerate(test_dataloader):

        prompt = generate_prompt(test_input["instruction"], test_input["input"])
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].cuda()
        generation_output = peft_model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=3
        )

        for s in generation_output.sequences:
            output = tokenizer.decode(s)
            all_test_preds.append(output.split("### Response:")[1].strip())
        all_test_labels.append(test_input["output"])
    all_test_preds = list(map( lambda x: 2 if "Positive" in x else 1, all_test_preds))
    all_test_labels = list(map( lambda x:2 if "Positive" in x else 1, all_test_labels))
    test_accuracy = accuracy_score(all_test_labels, all_test_preds)
    print(f"{test_accuracy}")
    return test_accuracy

In [29]:
## Few Shot size Analysis

acc = main(16, prompting=False, split="test")


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.24s/it]
Map: 100%|██████████| 16/16 [00:00<00:00, 962.29 examples/s]
Map: 100%|██████████| 4000/4000 [00:03<00:00, 1252.96 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss
5,No log,3.535245


KeyboardInterrupt: 