In [1]:
# https://colab.research.google.com/drive/1ggaa2oRFphdBmqIjSEbnb_HGkcIRC2ZB?usp=sharing#scrollTo=GlxX7p6Jdcmg
# https://www.kaggle.com/code/lucamassaron/fine-tune-llama-2-for-sentiment-analysis/comments
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install trl

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from datasets import load_dataset, Dataset, DatasetDict
from dataclasses import dataclass, field
from typing import Optional
import torch
from peft import LoraConfig
from tqdm import tqdm
import pandas as pd
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, AutoTokenizer, pipeline
from trl import SFTTrainer

import json
import requests
tqdm.pandas()

# Read the whole Emotional Support Conversation corpus
def read_ESConv(
    url='https://raw.githubusercontent.com/thu-coai/Emotional-Support-Conversation/main/ESConv.json',
    timeout=30,
):
    """Download the ESConv JSON file and return its parsed content.

    Args:
        url: Location of the JSON file; defaults to the raw GitHub copy.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON payload (its length is printed as a sanity check).

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
    response.raise_for_status()
    raw_data = response.json()

    print('Amount of data: {}'.format(len(raw_data)))
    return raw_data

raw_data = read_ESConv()

Amount of data: 1300


In [3]:
# Prepare data for problem_type classification.
def llama2_process_data(raw_data):
  """Turn every dialog utterance into one prompt/label training string.

  Args:
    raw_data: Sequence of conversations; each item is read through its
      'dialog' list (entries with a 'content' string) and its
      'problem_type' string.

  Returns:
    A flat list with one string per utterance, formatted as
    '### Instance: <content>### Class: <problem_type>'. Only trailing
    newlines are stripped from the content; other whitespace is kept.
    Every utterance of a conversation receives that conversation's label.
  """
  return [
      '### Instance: ' + turn['content'].rstrip('\n') + '### Class: ' + conversation['problem_type']
      for conversation in raw_data
      for turn in conversation['dialog']
  ]

processed_data = llama2_process_data(raw_data)
print(len(processed_data))
processed_data[2]

38365


'### Instance: I am having a lot of anxiety about quitting my current job. It is too stressful but pays well### Class: job crisis'

In [4]:
# Split the dataset.
import random

MAX_EXAMPLES = 3000   # cap on utterances used; takes the FIRST ones in corpus order
TRAIN_FRACTION = 0.7  # share of the capped data that goes to training
SPLIT_SEED = 42       # fixed seed so Restart & Run All yields the same split

processed_data = processed_data[:MAX_EXAMPLES]  ###

train_size = int(len(processed_data) * TRAIN_FRACTION)

# Shuffle a copy with a dedicated seeded generator: the split is reproducible,
# re-running this cell gives the same result, and the global `random` state
# is left untouched.
shuffled_data = processed_data.copy()
random.Random(SPLIT_SEED).shuffle(shuffled_data)

train_set = shuffled_data[:train_size]
val_set = shuffled_data[train_size:]

# Wrap both splits as single-column ("text") datasets under "train"/"eval".
ds_train = Dataset.from_dict({"text": train_set})
ds_validation = Dataset.from_dict({"text": val_set})
instructions_ds_dict = DatasetDict({"train": ds_train, "eval": ds_validation})

In [5]:
instructions_ds_dict['train']['text'][1]

'### Instance: Sorry i was having wifi issues ### Class: breakup with partner'

In [6]:
model_name = "NousResearch/Llama-2-7b-chat-hf"


def _arg(default, help_text):
    """Build a dataclass field carrying a default value and a help string."""
    return field(default=default, metadata={"help": help_text})


@dataclass
class ScriptArguments:
    """Every tunable setting of the fine-tuning run, with its default."""

    # --- model / data ---
    model_name: Optional[str] = _arg(model_name, "the model name")
    dataset_text_field: Optional[str] = _arg("text", "the text field of the dataset")
    log_with: Optional[str] = _arg(None, "use 'wandb' to log with wandb")

    # --- optimisation ---
    learning_rate: Optional[float] = _arg(1.41e-5, "the learning rate")
    batch_size: Optional[int] = _arg(4, "the batch size")
    seq_length: Optional[int] = _arg(512, "Input sequence length")
    gradient_accumulation_steps: Optional[int] = _arg(2, "the number of gradient accumulation steps")

    # --- loading / adapters ---
    load_in_8bit: Optional[bool] = _arg(False, "load the model in 8 bits precision")
    load_in_4bit: Optional[bool] = _arg(True, "load the model in 4 bits precision")
    use_peft: Optional[bool] = _arg(True, "Wether to use PEFT or not to train adapters")
    trust_remote_code: Optional[bool] = _arg(True, "Enable `trust_remote_code`")
    output_dir: Optional[str] = _arg("output", "the output directory")
    peft_lora_r: Optional[int] = _arg(64, "the r parameter of the LoRA adapters")
    peft_lora_alpha: Optional[int] = _arg(16, "the alpha parameter of the LoRA adapters")

    # --- logging / schedule / checkpoints ---
    logging_steps: Optional[int] = _arg(1, "the number of logging steps")
    use_auth_token: Optional[bool] = _arg(False, "Use HF auth token to access the model")
    num_train_epochs: Optional[int] = _arg(3, "the number of training epochs")
    max_steps: Optional[int] = _arg(-1, "the number of training steps")
    save_steps: Optional[int] = _arg(100, "Number of updates steps before two checkpoint saves")
    save_total_limit: Optional[int] = _arg(10, "Limits total number of checkpoints.")

    # --- hub ---
    push_to_hub: Optional[bool] = _arg(False, "Push the model to HF Hub")
    hub_model_id: Optional[str] = _arg(None, "The name of the model on HF Hub")


# Instantiate with the defaults; later cells read settings from this object.
script_args = ScriptArguments()

In [7]:
# Decide how the base model is loaded: quantized (8-bit or 4-bit, never both)
# or in the library's default precision.
if script_args.load_in_8bit and script_args.load_in_4bit:
    raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")
elif script_args.load_in_8bit or script_args.load_in_4bit:
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=script_args.load_in_8bit, load_in_4bit=script_args.load_in_4bit
    )
    # Map the root module ("") to device index 0.
    device_map = {"": 0}
    torch_dtype = torch.bfloat16
else:
    # No quantization requested: leave placement and dtype to the defaults.
    device_map = None
    quantization_config = None
    torch_dtype = None

model = AutoModelForCausalLM.from_pretrained(
    script_args.model_name,
    quantization_config=quantization_config,
    device_map=device_map,
    trust_remote_code=script_args.trust_remote_code,
    torch_dtype=torch_dtype,
    # `use_auth_token=` is deprecated in Transformers; `token=` is its
    # replacement and accepts the same boolean value.
    token=script_args.use_auth_token,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [8]:
dataset = instructions_ds_dict

# Trainer hyper-parameters, all taken from `script_args`.
trainer_settings = dict(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    learning_rate=script_args.learning_rate,
    logging_steps=script_args.logging_steps,
    num_train_epochs=script_args.num_train_epochs,
    max_steps=script_args.max_steps,
    report_to=script_args.log_with,
    save_steps=script_args.save_steps,
    save_total_limit=script_args.save_total_limit,
    push_to_hub=script_args.push_to_hub,
    hub_model_id=script_args.hub_model_id,
)
training_args = TrainingArguments(**trainer_settings)

# LoRA adapter configuration, or None when PEFT is switched off.
peft_config = (
    LoraConfig(
        r=script_args.peft_lora_r,
        lora_alpha=script_args.peft_lora_alpha,
        bias="none",
        task_type="CAUSAL_LM",
    )
    if script_args.use_peft
    else None
)

# Supervised fine-tuning trainer over the "train"/"eval" splits, reading the
# training text from the column named by `dataset_text_field`.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=peft_config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['eval'],
    dataset_text_field=script_args.dataset_text_field,
    max_seq_length=script_args.seq_length,
)

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]



In [9]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,5.633
2,4.7042
3,5.3733
4,4.9514
5,4.6903
6,4.4793
7,4.5012
8,5.2069
9,5.8982
10,5.1507


TrainOutput(global_step=786, training_loss=2.597572098706515, metrics={'train_runtime': 3327.8415, 'train_samples_per_second': 1.893, 'train_steps_per_second': 0.236, 'total_flos': 1.1623059145162752e+16, 'train_loss': 2.597572098706515, 'epoch': 2.99})

In [10]:
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Import the factory under a private alias. The original
# `pipeline = pipeline(...)` rebound the imported factory to its own result,
# so running this cell a second time called the generator instead of the
# factory and failed.
from transformers import pipeline as _make_pipeline

text_generator = _make_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map={'':0},
)

# Later cells call `pipeline(...)` to generate text, so keep that name bound
# to the generator; this cell itself no longer depends on it.
pipeline = text_generator

In [11]:
# Read the eval text column once instead of looking it up again for every row.
eval_texts = instructions_ds_dict['eval']['text']

# Keep everything up to the class marker and re-append the bare marker, so
# the model is prompted to continue with the class name.
queries = [text.split('### Class: ')[0] + '### Class:' for text in eval_texts]

# Single sample prompt, handy for manual inspection; None if the eval split
# has fewer than two rows.
query = queries[1] if len(queries) > 1 else None

sequences = pipeline(
    queries,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=10,
    early_stopping=True,
    # do_sample=True,
)



In [16]:
CLASS_MARKER = '### Class:'

# Text that follows the first class marker in each generated sequence
# (prompt + continuation), i.e. what the model produced as the class.
results = [seq[0]['generated_text'].split(CLASS_MARKER)[1] for seq in sequences]

# Gold labels, cut out of the eval strings in the same way.
labels = [text.split(CLASS_MARKER)[1] for text in instructions_ds_dict['eval']['text']]


def _is_correct(prediction, label):
  """True when the continuation STARTS with the gold label.

  A plain substring test would also accept the label appearing anywhere in
  the trailing text the model keeps generating after its answer.
  """
  return prediction.strip().startswith(label.strip())


num_correct = sum(_is_correct(x, y) for x, y in zip(results, labels))
# An empty eval split has no meaningful accuracy; report NaN instead of
# raising ZeroDivisionError.
accuracy = num_correct / len(labels) if labels else float('nan')

print("Accuracy: ", accuracy)

In [16]:
results[:10]

[' job crisis crisis crisis### Subclass: job',
 ' breakup with partner ### Subclass: ###Instance',
 ' breakup with partner. Unterscheidung: breakup with',
 " breakup with partner ### Instance: I'",
 ' breakup with partner ### Subclass: ###Instance',
 ' breakup with partner.',
 ' job crisis### Type: job crisis crisis##',
 ' job crisis crisis crisis ',
 ' breakup with partner. Unterscheidung: breakup with',
 ' breakup with partner### Assistant: How']

In [17]:
# Print up to the first 50 (prediction, label) pairs side by side.
count = 0
for count, (x, y) in enumerate(zip(results, labels), start=1):
  print(x, y)
  if count == 50:
    break


 job crisis crisis crisis### Subclass: job  job crisis
 breakup with partner ### Subclass: ###Instance  problems with friends
 breakup with partner. Unterscheidung: breakup with  ongoing depression
 breakup with partner ### Instance: I'  breakup with partner
 breakup with partner ### Subclass: ###Instance  problems with friends
 breakup with partner.  breakup with partner
 job crisis### Type: job crisis crisis##  breakup with partner
 job crisis crisis crisis   ongoing depression
 breakup with partner. Unterscheidung: breakup with  job crisis
 breakup with partner### Assistant: How  ongoing depression
 breakup with partner. ### Subclass: I  job crisis
 breakup with partner. ### Subclass: friends  breakup with partner
 job crisis crisis ### Subclass: job crisis crisis  job crisis
 ### ### ### ###############  breakup with partner
 job crisis  job crisis
 job crisis ### Instance: I see! I  academic pressure
 breakup with partner   ongoing depression
 job crisis ### Instance: I have been 