In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_int8_training
import bitsandbytes as bnb

# Load the base causal language model with 8-bit weights. device_map="auto"
# leaves layer placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    "NYTK/PULI-GPT-3SX",
    load_in_8bit=True,
    device_map="auto",
)

# Prepare the 8-bit model for training before any adapters are attached.
model = prepare_model_for_int8_training(model)

# LoRA adapter settings.
# PULI-GPT-3SX is a GPT-NeoX model: its attention uses a single fused
# projection called `query_key_value`, and it has no separate `query` or
# `value` modules. Listing ["query", "value"] only worked where PEFT matched
# target names by loose suffix ("...query_key_value" ends with "value"), and
# fails with a "target modules not found" error under stricter matching.
# Naming the fused projection explicitly selects the same layers either way.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the LoRA adapters described by `config`.
model = get_peft_model(model, config)

tokenizer = AutoTokenizer.from_pretrained("NYTK/PULI-GPT-3SX")
# Register a dedicated padding token on the tokenizer.
# NOTE(review): this adds a token to the tokenizer without resizing the
# model's embedding table -- confirm the new token id fits inside the
# embedding matrix before training with padded batches.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
from datasets import load_from_disk
from datasets import load_dataset

# Read the instruction/input/output examples from a local CSV file.
DATA_FILE = "datasetv2.csv"
dataset = load_dataset("csv", data_files=DATA_FILE)
# Echo the loaded dataset's summary as the cell output.
dataset

In [None]:
def fill_none(data):
    """Replace ``None`` in the text fields of one example with ``''``.

    Only the 'input', 'instruction' and 'output' keys are examined; any
    other keys are left untouched.

    Args:
        data: Mapping for a single example. It is modified in place.

    Returns:
        The same ``data`` object, after the replacements.

    Raises:
        KeyError: If one of the three fields is absent from ``data``.
    """
    for name in ('input', 'instruction', 'output'):
        if data[name] is not None:
            continue
        data[name] = ''
    return data
#dataset = dataset.map(fill_none)

In [None]:
# Display the first row of the train split to sanity-check the loaded columns.
dataset['train'][0]

In [None]:
# Preamble used when an example carries extra context in its `input` field.
prompt = 'Alább található egy instrukció, ami egy feladatot ír le, valamint egy bemenet, ami további kontextust ad. Írj egy illő választ, ami helyesen teljesíti a kérést!\n\n'
# Preamble used when an example has an instruction only.
prompt_no_input = 'Alább található egy instrukció, ami egy feladatot ír le. Írj egy illő választ, ami helyesen teljesíti a kérést!\n\n'


def _clean_field(value):
    """Return ``value`` as stripped text, mapping ``None`` to ``''``."""
    return '' if value is None else str(value).strip()


def build_training_text(data):
    """Format one example into a single training string.

    Args:
        data: Mapping with 'instruction', 'input' and 'output' entries. Any
            of them may be ``None`` (for example, an empty CSV cell).

    Returns:
        A dict with one key, 'text', holding the preamble, the instruction
        section, an optional input section, the response section and a
        trailing '<|endoftext|>' marker.
    """
    instruction = _clean_field(data['instruction'])
    context = _clean_field(data['input'])
    answer = _clean_field(data['output'])

    # A missing *or blank* input selects the no-input preamble, so an empty
    # "### Bemenet:" section is never emitted.
    if context:
        preamble = prompt
        context_section = '\n\n### Bemenet:\n' + context
    else:
        preamble = prompt_no_input
        context_section = ''

    return {
        'text': preamble
        + '### Instrukció:\n' + instruction
        + context_section
        + '\n\n### Válasz:\n' + answer
        + '<|endoftext|>'
    }


# Replace the three raw columns with the single formatted `text` column.
dataset = dataset.map(build_training_text, remove_columns=['input', 'instruction', 'output'])

In [None]:
# Print one formatted example so the prompt layout (with its real line breaks)
# can be checked by eye.
print(dataset['train'][1]['text'])

In [None]:
# Spot-check one more row (index 69) of the train split; this lookup needs the
# split to contain at least 70 rows.
dataset['train'][69]

In [None]:
def tokenize_example(example):
    """Run the tokenizer over one example's 'text' field and return its output."""
    return tokenizer(example['text'])


# batched=False: the function is called with one example at a time.
dataset = dataset.map(tokenize_example, batched=False)

In [None]:
# Show the dataset summary again; after the tokenization step it should list
# the tokenizer's output columns (e.g. `input_ids`) alongside `text`.
dataset

In [None]:
# Keep only examples whose token id sequence is at most MAX_TOKENS long.
MAX_TOKENS = 2048
dataset = dataset.filter(lambda example: len(example['input_ids']) <= MAX_TOKENS)

In [None]:
import transformers
from transformers import Trainer

# Optimisation, logging and checkpoint settings for the fine-tuning run.
# One example per device per step, accumulated over 16 steps, gives 16
# examples per device for every optimizer update.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=3e-4,
    warmup_steps=50,
    weight_decay=0.0,
    fp16=True,
    logging_steps=1,
    save_total_limit=1,
)

# mlm=False: build batches for causal language modelling, not masked LM.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    data_collator=causal_lm_collator,
)

# Switch off the model's generation cache before the training loop starts.
model.config.use_cache = False
trainer.train()

In [None]:
# Directory that receives the fine-tuned model files.
SAVE_DIR = "./szurkemarha-6.5k"
model.save_pretrained(SAVE_DIR)