In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer
model = AutoModelForCausalLM.from_pretrained(
    "openchat/openchat-3.5-0106", 
    device_map="auto", 
    torch_dtype="auto"
)

tokenizer = AutoTokenizer.from_pretrained("openchat/openchat-3.5-0106")

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  9.30it/s]
Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [12]:
# apply lora config
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model,lora_config)

In [13]:
# loading the dataset

dataset = load_dataset("nbertagnolli/counsel-chat")
dataset

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['questionID', 'questionTitle', 'questionText', 'questionLink', 'topic', 'therapistInfo', 'therapistURL', 'answerText', 'upvotes', 'views'],
        num_rows: 2775
    })
})

In [14]:
dataset["train"][0]

{'questionID': 0,
 'questionTitle': 'Do I have too many issues for counseling?',
 'questionText': 'I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.\n   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?',
 'questionLink': 'https://counselchat.com/questions/do-i-have-too-many-issues-for-counseling',
 'topic': 'depression',
 'therapistInfo': 'Jennifer MolinariHypnotherapist & Licensed Counselor',
 'therapistURL': 'https://counselchat.com/therapists/jennifer-molinari',
 'answerText': 'It is very common for\xa0people to have multiple issues that they want to (and need to) address in counseling.\xa0 I have had clients ask that same question and through more exploration, there is often an underlying fear that they\xa0 "can\

In [15]:
'''
The dataset isnt in the correct format for training.
we are going to need to preprocess the data.
'''

def preprocess_data(example):
    return {
        'text':f"User: {example["questionTitle"]}\nUser Details: {example["questionTitle"]}\nAssistant: {example["answerText"]}"
    }

processed_df = dataset['train'].map(preprocess_data)
processed_df

Dataset({
    features: ['questionID', 'questionTitle', 'questionText', 'questionLink', 'topic', 'therapistInfo', 'therapistURL', 'answerText', 'upvotes', 'views', 'text'],
    num_rows: 2775
})

In [16]:
# tokenize

# pad token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = processed_df.map(tokenize_function, batched=True)
tokenized_datasets

Dataset({
    features: ['questionID', 'questionTitle', 'questionText', 'questionLink', 'topic', 'therapistInfo', 'therapistURL', 'answerText', 'upvotes', 'views', 'text', 'input_ids', 'attention_mask'],
    num_rows: 2775
})

In [17]:
tokenized_datasets = tokenized_datasets.remove_columns([col for col in tokenized_datasets.column_names if col != "input_ids" and col != "attention_mask"])
tokenized_datasets.set_format("torch")
tokenized_datasets

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2775
})

In [19]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./openchat-counsel-chat",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    push_to_hub=False # push to HG hub
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets, # try splitting after
    tokenizer=tokenizer
)

trainer.train()

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
model.save_pretrained("./openchat-counsel-chat")
tokenizer.save_pretrained("./openchat-counsel-chat")