In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
with open('data/data_full.json') as f:
    data = json.load(f)

data['train'].extend(data['oos_train'])
data['val'].extend(data['oos_val'])
data['test'].extend(data['oos_test'])

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, TrainingArguments, BitsAndBytesConfig
import torch
from datasets import Dataset
from trl import SFTTrainer
from transformers import Trainer

In [4]:
model_id = 'google/flan-t5-base'
ssd_cache_dir = "/Volumes/LaCie/huggingface_cache"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id,cache_dir=ssd_cache_dir)
tokenizer.pad_token = tokenizer.eos_token

In [6]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

In [7]:
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,   
    # quantization_config=quantization_config,
    cache_dir=ssd_cache_dir,
    device_map="auto"
)

In [8]:
data['train'][:4]

[['what expression would i use to say i love you if i were an italian',
  'translate'],
 ["can you tell me how to say 'i do not speak much spanish', in spanish",
  'translate'],
 ["what is the equivalent of, 'life is good' in french", 'translate'],
 ["tell me how to say, 'it is a beautiful morning' in italian", 'translate']]

In [9]:
def format_data(data):
    return {
        "input": f"Classify the intent: {data[0]}",
        "output": data[1]
    }

In [10]:
train_data = Dataset.from_list([format_data(x) for x in data['train']])
val_data = Dataset.from_list([format_data(x) for x in data['val']])
test_data = Dataset.from_list([format_data(x) for x in data['test']])

In [11]:
def tokenize_data(data):
    return tokenizer(
        data["input"],
        text_target=data["output"],
        padding="max_length",
        max_length=256,
        truncation=True,
    )

In [12]:
tokenized_train_data = train_data.map(tokenize_data)
tokenized_val_data = val_data.map(tokenize_data)
tokenized_test_data = test_data.map(tokenize_data)

Map:   0%|          | 0/15100 [00:00<?, ? examples/s]

Map:   0%|          | 0/3100 [00:00<?, ? examples/s]

Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

In [13]:
from peft import get_peft_model, LoraConfig, TaskType

In [14]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],  # for Mistral
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

In [15]:
model = get_peft_model(model, lora_config)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [16]:
tokenized_train_data

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 15100
})

In [17]:
output_dir = "/Volumes/LaCie/Projects_portfolio/NLP/SupportIQ/data/"
training_args = TrainingArguments(
                output_dir=output_dir,
                learning_rate=1e-5,
                num_train_epochs=1,
                weight_decay=0.01,
                logging_steps=1,
                label_names=["labels"],
                )

In [18]:
trainer = Trainer(
                model=model,
                train_dataset=tokenized_train_data,
                eval_dataset=tokenized_val_data,
                # peft_config=lora_config,
                args=training_args,
                tokenizer=tokenizer,
                # formatting_func=None
            )

  trainer = Trainer(


In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,4.3984
