In [1]:
from collections import Counter
import numpy as np
import os
from pathlib import Path
import random

import torch

from datasets import load_dataset
import evaluate
import transformers
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from transformers import BitsAndBytesConfig
from transformers import T5ForConditionalGeneration
from transformers import MT5ForConditionalGeneration

In [2]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(42)


Random seed set as 42


# Seq2Cls

In [3]:
DATA_ROOT = '/mnt/data/factcheck/wiki/cs/20230220/qacg'
NER_DIR = 'PAV-ner-CNEC'
QG_DIR = 'mt5-large-cp59k'
QA2D_DIR = 'mbart-large-cc25_cp26k'

CLAIM_ROOT = Path(DATA_ROOT, 'claim', NER_DIR, QG_DIR, QA2D_DIR)
CLAIM_QUALITY_DIR = 'claim_quality_v3'
CLAIM_QUALITY_ROOT = Path(DATA_ROOT, CLAIM_QUALITY_DIR, NER_DIR, QG_DIR, QA2D_DIR)

cq = load_dataset("json", data_files={
    "train": str(Path(CLAIM_QUALITY_ROOT, "train.jsonl")),
    "dev": str(Path(CLAIM_QUALITY_ROOT, "dev.jsonl")),
    "test": str(Path(CLAIM_QUALITY_ROOT, "test.jsonl"))
    })

Found cached dataset json (/home/drchajan/.cache/huggingface/datasets/json/default-4f9dc646e89ac590/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
cq["test"][1]

{'text': 'Eostre byl zasvěcen v měsíci Eosturmōnaþ.', 'label': 'ok'}

In [4]:
model_name = "deepset/xlm-roberta-large-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
def preprocess_text(examples):
    return tokenizer(examples["text"], truncation=True)

def preprocess_labels(examples):
    l2id = {"ok": 0, "bad": 1}
    label_id = [l2id[l] for l in examples["label"]]
    return {"label": label_id}

tokenized_cq = cq.map(preprocess_text, batched=True)
tokenized_cq = tokenized_cq.map(preprocess_labels, batched=True)

Loading cached processed dataset at /home/drchajan/.cache/huggingface/datasets/json/default-4f9dc646e89ac590/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-110bd72d5ee4e1b4.arrow


Loading cached processed dataset at /home/drchajan/.cache/huggingface/datasets/json/default-4f9dc646e89ac590/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-7cb1e2c96ad349aa.arrow
Loading cached processed dataset at /home/drchajan/.cache/huggingface/datasets/json/default-4f9dc646e89ac590/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-14f895b68e33ff17.arrow
Loading cached processed dataset at /home/drchajan/.cache/huggingface/datasets/json/default-4f9dc646e89ac590/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-b4496621d41011b7.arrow
Loading cached processed dataset at /home/drchajan/.cache/huggingface/datasets/json/default-4f9dc646e89ac590/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-76a2e36382dede19.arrow
Loading cached processed dataset at /home/drchajan/.cache/huggingface/datasets/json/default-4f9dc646e89ac590/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd531

In [6]:
tokenized_cq["train"][1]

{'text': 'šéfem firmy je doktor.',
 'label': 1,
 'input_ids': [0, 106066, 195, 13679, 55, 11761, 5, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

id2label = {0: "ok", 1: "bad"}
label2id = {"bad": 0, "ok": 1}

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at deepset/xlm-roberta-large-squad2 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classi

In [9]:
batch_size=1
eval_steps = 128

training_args = TrainingArguments(
    output_dir=f"EXP/classification-playground/{model_name}",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_first_step=True,
    logging_steps=eval_steps,
    eval_steps=eval_steps,
    save_steps=eval_steps,
    max_steps=16384,
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cq["train"],
    eval_dataset=tokenized_cq["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdrchajan[0m ([33maic-l2c[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
128,1.2036,0.719009,0.61
256,1.3103,1.304315,0.61


# Seq2Seq classification

In [3]:
DATA_ROOT = '/mnt/data/factcheck/wiki/cs/20230220/qacg'
NER_DIR = 'PAV-ner-CNEC'
QG_DIR = 'mt5-large-cp59k'
QA2D_DIR = 'mbart-large-cc25_cp26k'

CLAIM_ROOT = Path(DATA_ROOT, 'claim', NER_DIR, QG_DIR, QA2D_DIR)
CLAIM_QUALITY_DIR = 'claim_quality_v3'
CLAIM_QUALITY_ROOT = Path(DATA_ROOT, CLAIM_QUALITY_DIR, NER_DIR, QG_DIR, QA2D_DIR)

cq = load_dataset("json", data_files={
    "train": str(Path(CLAIM_QUALITY_ROOT, "train.jsonl")),
    "dev": str(Path(CLAIM_QUALITY_ROOT, "dev.jsonl")),
    "test": str(Path(CLAIM_QUALITY_ROOT, "test.jsonl"))
    })

Found cached dataset json (/home/drchajan/.cache/huggingface/datasets/json/default-4f9dc646e89ac590/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
model_name = "google/umt5-base"
# model_name = "google/umt5-xl"
# model_name = "google/flan-t5-base"
# model_name = "google/flan-t5-large"
# model_name = "google/flan-t5-xl"
# model_name = "google/flan-t5-xxl"
# model_name = "google/mt5-large"
# model_name = "google/mt5-base"
# model_name = "t5-base"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # load_in_4bit=True,
    # bnb_4bit_use_double_quant=True,
    # bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=torch.bfloat16
)

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=bnb_config, device_map={"":0})

T5ForConditionalGeneration._keep_in_fp32_modules = None
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")

In [5]:
model

UMT5ForConditionalGeneration(
  (shared): Embedding(256384, 768)
  (encoder): UMT5Stack(
    (embed_tokens): Embedding(256384, 768)
    (block): ModuleList(
      (0-11): 12 x UMT5Block(
        (layer): ModuleList(
          (0): UMT5LayerSelfAttention(
            (SelfAttention): UMT5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): UMT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): UMT5LayerFF(
            (DenseReluDense): UMT5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=

In [6]:
model.get_memory_footprint()
!nvidia-smi

Fri Aug  4 12:26:28 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB           On | 00000000:81:00.0 Off |                    0 |
| N/A   25C    P0               57W / 400W|   3127MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [7]:
def preprocess_function(examples):
    text_column = "text"
    label_column = "label"
    max_length = 128
    inputs = examples[text_column]
    targets = examples[label_column]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt")
    labels = labels["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

# tokenized_cq = cq.map(preprocess_text, batched=True)
# tokenized_cq = tokenized_cq.map(preprocess_labels, batched=True)
tokenized_cq = cq.map(preprocess_function, batched=True,  
                      remove_columns=cq["train"].column_names,
                      load_from_cache_file=False)

Map:   0%|          | 0/1201 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
tokenizer.decode(tokenized_cq["test"][3]["input_ids"])

'MKULTRA prohlásila, že MKULTRA byla zkoumána.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [9]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # TODO FIX --- ids needed herem, predict_with_generate=True does not work????
    print(predictions)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    print(predictions)
    print(labels)
    acc = metric.compute(predictions=predictions, references=labels)
    print(acc)
    assert False
    return acc

EVAL_STEPS=16
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_cq["train"],
    eval_dataset=tokenized_cq["dev"],
    args=transformers.Seq2SeqTrainingArguments(
        # auto_find_batch_size=True,
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        # per_device_train_batch_size=1,
        # gradient_accumulation_steps=32,
        # warmup_steps=100,
        learning_rate=2e-5,
        # fp16=True,
        bf16=True,
        output_dir=f"outputs/{model_name}",
        optim="adamw_torch",
        # optim="paged_adamw_8bit",
        # optim="paged_adamw_32bit",
        
        logging_first_step=True,
        logging_steps=EVAL_STEPS,
        eval_steps=EVAL_STEPS,
        save_steps=EVAL_STEPS,
        max_steps=3*1024,
        # max_steps=320,
        seed=42,
        evaluation_strategy="steps",
        save_strategy="steps",
        save_total_limit=3,
        load_best_model_at_end=True,
        # load_best_model_at_end=False
        # predict_with_generate=True,
    ),
    # compute_metrics=compute_mestrics,
    data_collator=transformers.DataCollatorForSeq2Seq(tokenizer, model=model,
        label_pad_token_id=-100),
)
# model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdrchajan[0m ([33maic-l2c[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
16,27.8263,25.454683
32,26.759,25.180508
48,26.6164,24.242197
64,25.028,23.24297
80,24.6384,21.946133
96,26.2681,21.102385
112,23.5139,21.198549
128,22.9409,20.570091
144,20.8202,19.651747
160,21.1867,18.542112


In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix
from aic_nlp_utils.batch import batch_apply

def split_predict(model, split, batch_size=128):
    def predict(txts):
        X = tokenizer(txts, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
        input_ids = X["input_ids"].to("cuda")
        attention_mask = X["attention_mask"].to("cuda")
        Y = model.generate(input_ids=input_ids, attention_mask=attention_mask)
        C = tokenizer.batch_decode(Y, skip_special_tokens=True)
        return C
        
    txts = [f"{txt}</s>" for txt in split["text"]]
    C = batch_apply(predict, txts, batch_size=batch_size)
    T = [l for l in split["label"]]
    return C, T

C, T = split_predict(model, cq["dev"])
print(f"acc: {accuracy_score(T, C)}")
print(f"cm:\n{confusion_matrix(T, C)}")

acc: 0.53
cm:
[[19 20]
 [27 34]]


In [11]:
C, T = split_predict(model, cq["test"])
print(f"acc: {accuracy_score(T, C)}")
print(f"cm:\n{confusion_matrix(T, C)}")

acc: 0.58
cm:
[[22 18]
 [24 36]]


In [12]:
C, T = split_predict(model, cq["train"])
print(f"acc: {accuracy_score(T, C)}")
print(f"cm:\n{confusion_matrix(T, C)}")

acc: 0.9117402164862615
cm:
[[451  23]
 [ 83 644]]


In [10]:
with torch.no_grad():
    for i in range(30):
        sample = cq["test"][i]
        txt = sample["text"] + "</s>"
        l = sample["label"]
        print(txt)
        # X = tokenizer(text_target=txt, return_tensors="pt")
        X = tokenizer(txt, max_length=128, padding="max_length", truncation=True, return_tensors="pt")

        # print(X)
        input_ids = X["input_ids"].to("cuda")
        attention_mask = X["attention_mask"].to("cuda")
        Y = model.generate(input_ids=input_ids, attention_mask=attention_mask)
        C = tokenizer.batch_decode(Y)
        print(C, l)
        print("---------")

Challenger přistál ve vesmíru 6. listopadu.</s>
['<pad> bad</s>'] bad
---------
Eostre byl zasvěcen v měsíci Eosturmōnaþ.</s>
['<pad> ok</s>'] ok
---------
Jméno Christian Rosencreutz se překládá jako křesťan Rosenkreuz.</s>
['<pad> bad</s>'] ok
---------
MKULTRA prohlásila, že MKULTRA byla zkoumána.</s>
['<pad> bad</s>'] bad
---------
Bývalý pilot F1, který podpořil projekt, byl Adrian Newey.</s>
['<pad> ok</s>'] bad
---------
Nová hoľa se nachází v oblasti Donovalského sedla na Slovensku.</s>
['<pad> ok</s>'] ok
---------
Magnus IV. začal vládnout v roce 1319.</s>
['<pad> ok</s>'] ok
---------
Josef Gočár se narodil 13. března 1880.</s>
['<pad> ok</s>'] ok
---------
Martine Moïse byla zastřelená.</s>
['<pad> bad</s>'] ok
---------
Armáda začala s ošetřovatelskými sbory v roce 1902.</s>
['<pad> ok</s>'] bad
---------
Spolu s Arnoldem von Schönerem byl Ferdinand Kronawetter během 80. let v Říšské radě.</s>
['<pad> ok</s>'] ok
---------
Vojtěcha Preissiga je nejcennější na Karáskově gal

In [31]:
model.save_pretrained('/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/outputs/checkpoint-last')
accelerate.save_model(model, '/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/outputs/checkpoint-last')



In [22]:
from accelerate import Accelerator
accelerate = Accelerator()

In [8]:
# torch.save(model.state_dict(), '/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/outputs/checkpoint-last/pytorch_model.bin2')

In [9]:
from accelerate.utils import BnbQuantizationConfig
from accelerate.utils import load_and_quantize_model

quantization_config = BnbQuantizationConfig(load_in_8bit=True, llm_int8_threshold = 10)

In [13]:
# model_path = '/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/outputs/google/flan-t5-xl/checkpoint-160'
# model_path = '/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/outputs/checkpoint-160'
model_path = '/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/outputs/google/flan-t5-large/checkpoint-160'
model2 = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model2.to("cuda")
# from accelerate import init_empty_weights

# with init_empty_weights():
    # model2 = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# model2 = load_and_quantize_model(model2, weights_location=model_path, bnb_quantization_config=quantization_config)


T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [60]:
w = model.encoder.block[0].layer[0].SelfAttention.q
w.weight

Parameter containing:
Parameter(Int8Params([[ 91, -56,  37,  ..., -21,   5,  67],
            [  7,  18,   3,  ..., -17,  -4,  38],
            [ 14,   7, -60,  ..., -12,  20,  50],
            ...,
            [ 21,  48,  37,  ..., -31, -26,  79],
            [ 30,  29,  39,  ...,   5,  21,  19],
            [-25,  -5,  -3,  ..., -16,  42, -30]], device='cuda:0',
           dtype=torch.int8))

In [11]:
w2 = model2.encoder.block[0].layer[0].SelfAttention.q
w2.weight

Parameter containing:
Parameter(Int8Params([[ 91, -56,  37,  ..., -21,   5,  67],
            [  7,  18,   3,  ..., -17,  -4,  38],
            [ 14,   7, -60,  ..., -12,  20,  50],
            ...,
            [ 21,  48,  37,  ..., -31, -26,  79],
            [ 35,  34,  45,  ...,   6,  24,  22],
            [-26,  -5,  -3,  ..., -17,  44, -32]], device='cuda:0',
           dtype=torch.int8))

In [18]:
w2.weight.SCB

In [40]:
model2

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear8bitLt(in_features=768, out_features=768, bias=False)
              (k): Linear8bitLt(in_features=768, out_features=768, bias=False)
              (v): Linear8bitLt(in_features=768, out_features=768, bias=False)
              (o): Linear8bitLt(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear8bitLt(in_features=768, out_features=3072, bias=False)
              (wo): Linear8bitLt(in_features=3072, out_features=768, bias

In [57]:
def compare_models(m1, m2):
    for (name1, param1), (name2, param2)in zip(m1.named_parameters(), m2.named_parameters()):
        assert name1 == name2, (name1, name2)
        same = (param1.data == param2.data).all().item()
        print(name1, same)
        if not same:
            print(param1)
            print(param2)
            break


compare_models(model, model2)

shared.weight True
encoder.block.0.layer.0.SelfAttention.q.weight False
Parameter containing:
Parameter(Int8Params([[ 91, -56,  37,  ..., -21,   5,  67],
            [  7,  18,   3,  ..., -17,  -4,  38],
            [ 14,   7, -60,  ..., -12,  20,  50],
            ...,
            [ 21,  48,  37,  ..., -31, -26,  79],
            [ 30,  29,  39,  ...,   5,  21,  19],
            [-25,  -5,  -3,  ..., -16,  42, -30]], device='cuda:0',
           dtype=torch.int8))
Parameter containing:
Parameter(Int8Params([[ 91, -56,  37,  ..., -21,   5,  67],
            [  7,  18,   3,  ..., -17,  -4,  38],
            [ 14,   7, -60,  ..., -12,  20,  50],
            ...,
            [ 21,  48,  37,  ..., -31, -26,  79],
            [ 35,  34,  45,  ...,   6,  24,  22],
            [-26,  -5,  -3,  ..., -17,  44, -32]], device='cuda:0',
           dtype=torch.int8))


# Seq2Seq QLORA

see https://www.philschmid.de/fine-tune-flan-t5-peft

In [2]:
DATA_ROOT = '/mnt/data/factcheck/wiki/cs/20230220/qacg'
NER_DIR = 'PAV-ner-CNEC'
QG_DIR = 'mt5-large-cp59k'
QA2D_DIR = 'mbart-large-cc25_cp26k'

CLAIM_ROOT = Path(DATA_ROOT, 'claim', NER_DIR, QG_DIR, QA2D_DIR)
CLAIM_QUALITY_DIR = 'claim_quality_v3'
CLAIM_QUALITY_ROOT = Path(DATA_ROOT, CLAIM_QUALITY_DIR, NER_DIR, QG_DIR, QA2D_DIR)

cq = load_dataset("json", data_files={
    "train": str(Path(CLAIM_QUALITY_ROOT, "train.jsonl")),
    "dev": str(Path(CLAIM_QUALITY_ROOT, "dev.jsonl")),
    "test": str(Path(CLAIM_QUALITY_ROOT, "test.jsonl"))
    })

Found cached dataset json (/home/drchajan/.cache/huggingface/datasets/json/default-4f9dc646e89ac590/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
model_id="google/flan-t5-xl"
# model_id="google/flan-t5-xxl"

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [4]:
def preprocess_function(examples):
    text_column = "text"
    label_column = "label"
    max_length = 128
    inputs = examples[text_column]
    targets = examples[label_column]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt")
    labels = labels["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

# tokenized_cq = cq.map(preprocess_text, batched=True)
# tokenized_cq = tokenized_cq.map(preprocess_labels, batched=True)
tokenized_cq = cq.map(preprocess_function, batched=True,  
                      remove_columns=cq["train"].column_names,
                      load_from_cache_file=False)

Map:   0%|          | 0/1201 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r=16,
#  r=8,
 lora_alpha=32,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



trainable params: 9,437,184 || all params: 2,859,194,368 || trainable%: 0.33006444422319176


In [7]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [8]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir="outputs/lora-flan-t5-xl"
EVAL_STEPS = 16

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
	auto_find_batch_size=True,
    learning_rate=1e-3, # higher learning rate
    # learning_rate=2e-3, # higher learning rate
    logging_dir=f"{output_dir}/logs",

    logging_first_step=True,
    logging_steps=EVAL_STEPS,
    eval_steps=EVAL_STEPS,
    save_steps=EVAL_STEPS,
    max_steps=16*1024,
    # max_steps=160,
    seed=42,
    logging_strategy="steps",
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=3,
    load_best_model_at_end=True,
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_cq["train"],
    eval_dataset=tokenized_cq["dev"],
)
model.config.use_cache = False

In [9]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdrchajan[0m ([33maic-l2c[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669364483095705, max=1.0…

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
16,1.7456,0.279393
32,0.2944,0.277203
48,0.3687,0.410017
64,0.325,0.274539
80,0.2595,0.26365
96,0.3233,0.267424
112,0.2674,0.333046
128,0.5644,0.297
144,0.2769,0.268535
160,0.2884,0.270468




KeyboardInterrupt: 

In [5]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load peft config for pre-trained checkpoint etc.
peft_model_id = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/outputs/lora-flan-t5-xl/checkpoint-240"
config = PeftConfig.from_pretrained(peft_model_id)

# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True,  device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
model.eval()

print("Peft model loaded")

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Peft model loaded


In [15]:
with torch.no_grad():
    sample = cq["train"][10]
    txt = sample["text"] + "</s>"
    l = sample["label"]
    print(txt)
    # X = tokenizer(text_target=txt, return_tensors="pt")
    X = tokenizer(txt, max_length=128, padding="max_length", truncation=True, return_tensors="pt")

    # print(X)
    input_ids = X["input_ids"].to("cuda")
    attention_mask = X["attention_mask"].to("cuda")
    Y = model.generate(input_ids=input_ids, attention_mask=attention_mask)
    C = tokenizer.batch_decode(Y)
    print(C, l)

"Ukřižovaná" vyšla v roce 1964.</s>
['<pad> ok</s>'] ok


In [16]:
def split_predict(model, split):
    txts = [f"{txt}</s>" for txt in split["text"]]
    X = tokenizer(txts, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
    input_ids = X["input_ids"].to("cuda")
    attention_mask = X["attention_mask"].to("cuda")
    Y = model.generate(input_ids=input_ids, attention_mask=attention_mask)
    C = tokenizer.batch_decode(Y, skip_special_tokens=True)
    
    return C

C = split_predict(model, cq["dev"][:50])
Counter(C)

Counter({'ok': 50})