In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_int8_training
import bitsandbytes as bnb

# Hub identifier shared by the base model and its tokenizer.
base_checkpoint = "NYTK/PULI-GPT-3SX"

# Plain (non-quantised) load. The 8-bit path (load_in_8bit / device_map="auto" /
# prepare_model_for_int8_training) is deliberately not used in this cell.
model = AutoModelForCausalLM.from_pretrained(base_checkpoint, low_cpu_mem_usage=True)

# LoRA hyper-parameters for the adapter wrapped around the base model.
# NOTE(review): confirm that "query"/"value" match the module names of the
# loaded architecture for the installed PEFT version.
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "target_modules": ["query", "value"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
config = LoraConfig(**lora_settings)
model = get_peft_model(model, config)

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
# Last expression of the cell: the notebook displays how many tokens were added.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /home/boa/.conda/envs/ai/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/boa/.conda/envs/ai/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

0

In [2]:
# load a previous model for further finetuning

from peft import set_peft_model_state_dict

# torch.load unpickles the file, so only point it at checkpoints you trust.
# map_location="cpu" keeps the adapter tensors off the GPU while loading;
# load_state_dict copies them onto whatever device the model parameters use.
adapters_weights = torch.load(
    'kmonitor-gpt-fixed-loss-v2-40k/adapter_model.bin',
    map_location='cpu',
)

# The adapter weights are loaded into `model` in place. Do NOT rebind `model`
# to the return value: depending on the PEFT release it is the model itself,
# None, or a report of missing/unexpected keys, so rebinding can silently
# replace the model with something that is not a model.
load_result = set_peft_model_state_dict(model, adapters_weights)

In [3]:
# Fold the LoRA adapter created by get_peft_model into the underlying model and
# rebind `model` to the object PEFT returns. This cell is not idempotent:
# re-running it calls merge_and_unload() on the already-merged result.
model = model.merge_and_unload()


In [4]:
# Cast the model's floating-point parameters and buffers to float16.
# NOTE(review): if this half-precision model is the one later handed to a
# Trainer configured with fp16=True, mixed-precision gradient unscaling may
# reject fp16 parameters — confirm which model instance is trained.
model = model.half()


In [6]:
# Write the tokenizer files (including the added pad token) to ./tokmod.
# NOTE(review): execution counts jump from In [4] to In [6] and no visible cell
# saves the merged model itself — confirm that step was not lost.
tokenizer.save_pretrained('tokmod')

('tokmod/tokenizer_config.json',
 'tokmod/special_tokens_map.json',
 'tokmod/vocab.json',
 'tokmod/merges.txt',
 'tokmod/added_tokens.json',
 'tokmod/tokenizer.json')

In [1]:
from datasets import load_from_disk

# Load the dataset previously saved to ./dataset-fixed (path is relative to the
# notebook's working directory).
data = load_from_disk('dataset-fixed')



In [2]:
# Display the available splits with their columns and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['text', 'title', 'description', 'keywords', 'theme', 'url', 'authors', 'date', 'kmonitor_description', 'kmonitor_title', 'kmonitor_tags', 'institutes', 'people'],
        num_rows: 43060
    })
    test: Dataset({
        features: ['text', 'title', 'description', 'keywords', 'theme', 'url', 'authors', 'date', 'kmonitor_description', 'kmonitor_title', 'kmonitor_tags', 'institutes', 'people'],
        num_rows: 3618
    })
    validation: Dataset({
        features: ['text', 'title', 'description', 'keywords', 'theme', 'url', 'authors', 'date', 'kmonitor_description', 'kmonitor_title', 'kmonitor_tags', 'institutes', 'people'],
        num_rows: 6598
    })
})

In [None]:
def trunc_to(descr, n):
    """Shorten ``descr`` to at most ``n`` characters, cutting at a natural boundary.

    Text that already fits within ``n`` characters is returned unchanged
    (previously it could be cut at a '.' or ' ' even though nothing had to be
    removed). Longer text is cut, in order of preference:

    1. just after the last '.' found in the second half of the first ``n``
       characters,
    2. otherwise just after the last ' ' found in that same window,
    3. otherwise hard at ``n`` characters.

    Args:
        descr: The text to shorten. ``None`` or an empty string yields ``''``
            so that missing dataset fields do not raise.
        n: Maximum number of characters to keep.

    Returns:
        The (possibly shortened) string.
    """
    if not descr:
        return ''
    if len(descr) <= n:
        # Nothing to cut: keep the full text instead of dropping its tail.
        return descr

    head = descr[:n]
    half = n // 2
    # Prefer a sentence end, then a word boundary; only accept a cut point in
    # the second half so that at least half of the budget is kept.
    for separator in ('.', ' '):
        cut = head.rfind(separator)
        if cut >= half:
            return head[:cut + 1]
    return head

def _encode_sample(samples):
    """Build the classification prompt for one record and tokenize it.

    The prompt consists of the task tag, the truncated title, description and
    article text, a '###' separator, and the target line 'téma: <label>' where
    the label is 'korrupció' when ``theme == 1`` and 'egyéb' otherwise.
    """
    title = trunc_to(samples["title"], 150)
    description = trunc_to(samples["description"], 700)
    body = trunc_to(samples['text'], 2500)
    if samples['theme'] == 1:
        label = 'korrupció'
    else:
        label = 'egyéb'
    prompt = '[klasszifikáció]\n' + title + '\n\n' + description + '\n\n' + body + '\n\n###\n\ntéma: ' + label + '\n'
    return tokenizer(prompt)


# One record at a time (batched=False); applied to every split in `data`.
data = data.map(_encode_sample, batched=False)

In [None]:
# Original (pre-tokenization) columns that are dropped from the training split;
# only the columns added by the tokenization step remain afterwards.
raw_columns = [
    'text', 'title', 'description', 'keywords', 'theme', 'url', 'authors',
    'date', 'kmonitor_description', 'kmonitor_title', 'kmonitor_tags',
    'institutes', 'people',
]
train_split = data['train']
data['train'] = train_split.remove_columns(raw_columns)

data

In [None]:
# Training subset: the first 20000 rows of the train split.
train_split = data['train']
strain = train_split.select(range(20000))

In [None]:
from transformers import Trainer


class CustomTrainer(Trainer):
    """Trainer whose loss is the LM loss, halved when the label token is already predicted.

    Each sample is expected to end with ``<label token>`` followed by one final
    token, so the label is read from position ``-2`` of the first sequence in
    the batch (the notebook trains with a batch size of 1).

    Unlike the earlier version, ``compute_loss`` no longer calls ``backward()``
    itself and no longer returns a constant placeholder tensor. It returns the
    real, differentiable loss so that the Trainer's own backward pass, loss
    scaling and gradient accumulation operate on it, and so that the logged
    loss is the actual training loss.

    NOTE(review): the learning rate was chosen while gradients were produced by
    the manual ``backward()`` call; re-validate it with this version.

    Attributes:
        n_step: Number of ``compute_loss`` calls so far (created lazily).
        last_label_penalty: 0.0 if the last sample's label was predicted,
            1.0 if the label token was not one of ``KNOWN_LABELS``, else 0.5.
            This is the value the previous implementation returned as "loss".
    """

    # An extra checkpoint is written every SAVE_EVERY calls to compute_loss.
    SAVE_EVERY = 1000
    # The only labels the prompt template produces.
    KNOWN_LABELS = ('egyéb', 'korrupció')

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the training loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch mapping with at least ``input_ids``,
                ``attention_mask`` and the labels needed for ``model`` to
                return a ``loss``.
            return_outputs: When true, return ``(loss, outputs)`` as the
                Trainer expects during evaluation.
            **kwargs: Accepted and ignored so that Trainer versions passing
                extra keyword arguments keep working.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        self.n_step = getattr(self, 'n_step', 0) + 1
        if self.n_step % self.SAVE_EVERY == 0:
            model.save_pretrained("./kmonitor-gpt-fixed-loss-checkpoint-" + str(self.n_step))

        input_ids = inputs.get("input_ids")
        attention_mask = inputs.get("attention_mask")
        # `tokenizer` is the notebook-level tokenizer.
        label = tokenizer.decode([input_ids[0][-2]]).strip()

        output = model(**inputs)
        loss = output['loss']

        # Ask the model for the label token given everything before it. This is
        # only used to pick the loss weight, so no autograd graph is needed.
        with torch.no_grad():
            generated = model.generate(
                input_ids=input_ids[:, :-2],
                attention_mask=attention_mask[:, :-2],
                max_new_tokens=1,
            )
        result = tokenizer.decode([generated[0][-1]]).strip()

        penalty = 0.5
        # TODO better loss calculation
        if label in result:
            penalty = 0.0
            # Already-correct samples contribute half the gradient. Out-of-place
            # multiply so the model's output tensor is not modified.
            loss = loss * 0.5
        elif label not in self.KNOWN_LABELS:
            penalty = 1.0
        self.last_label_penalty = penalty

        return (loss, output) if return_outputs else loss

In [None]:
import transformers

# Training configuration.
# - max_steps takes precedence over num_train_epochs when it is positive.
# - `resume_from_checkpoint` is intentionally NOT set here: as a
#   TrainingArguments field it is not consumed by Trainer. To resume from the
#   latest checkpoint in `output_dir`, call
#   trainer.train(resume_from_checkpoint=True) instead.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    report_to='wandb',
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    max_steps=20000,
    warmup_steps=300,
    learning_rate=9e-3,
    weight_decay=0.0,
    fp16=True,
    logging_steps=10,
    save_total_limit=1,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=strain,
    # mlm=False: causal-LM collation (labels are derived from input_ids).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# The key/value cache is only useful for generation; turn it off for training.
model.config.use_cache = False
trainer.train()

In [None]:
# Persist the trained weights of `model` to ./kmonitor-gpt-fixed-loss-v2-10k.
model.save_pretrained("./kmonitor-gpt-fixed-loss-v2-10k")