In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from transformers import TrainingArguments
import os, wandb
from trl import SFTTrainer
from peft import LoraConfig, PeftModel
from peft import AutoPeftModelForCausalLM
from vllm import LLM, SamplingParams
import torch
import os
from datasets import load_dataset


[2024-02-12 16:31:03,615] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2024-02-12 16:31:05.167294: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-12 16:31:06.210969: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /mnt/appl/software/Python/3.10.4-GCCcore-11.3.0-bare/lib:/mnt/appl/software/libffi/3.4.2-GCCcore-11.3.0/lib64:/mnt/appl/software/XZ/5.2.5-GCCcore-11.3.0/lib:/mnt/appl/software/SQLite/3.38.3-GCCcore-11.3.0/lib:/mnt/appl/software/Tcl/8.6.12-GCCcore-11.3.0/lib:/mnt/appl/software/libreadline/8.1.2-GCCcore-11.3.0/lib:/mnt/appl/software/ncurses/6.3-GCCcore-11.3.0/lib:/mnt/appl/software/bzip2/1.0.8-GCCcor

In [2]:
import pandas as pd
# load "/mnt/data/factcheck/claim_extraction/feversum/train.jsonl", "/mnt/data/factcheck/claim_extraction/feversum/validation.jsonl", "/mnt/data/factcheck/claim_extraction/feversum/test.jsonl"
df_train = pd.read_json("/mnt/data/factcheck/claim_extraction/feversum/hf/train.jsonl", lines=True)
df_val = pd.read_json("/mnt/data/factcheck/claim_extraction/feversum/hf/validation.jsonl", lines=True)
df_test = pd.read_json("/mnt/data/factcheck/claim_extraction/feversum/hf/test.jsonl", lines=True)
#make datasetdict
raw_dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "validation": Dataset.from_pandas(df_val),
    "test": Dataset.from_pandas(df_test)
})

In [3]:
raw_dataset["train"]["sentence_context"][0]

"Napoleon\nHe won most of these wars and the vast majority of his battles, building a large empire that ruled over continental Europe before its final collapse in 1815.\nOne of the greatest commanders in history, his wars and campaigns are studied at military schools worldwide.\nNapoleon's political and cultural legacy has endured as one of the most celebrated and controversial leaders in human history."

In [5]:
raw_dataset["train"]["claim"][0]

'Napoleon was a commander.'

In [None]:
raw_dataset["validation"]

In [11]:
FORMAT_1 = """Extract a single factual claim from the following Wikipedia sentence:
{context}
----
You must print only the one claim extracted from this context, as a sentence that does not require additional context to interpret, printed as a single-line output."""


def format_prompt(datapoint, hide_output=False, format=FORMAT_1):
    result = [
        {"role": "user", "content": format.format(context="\n".join(datapoint["sentence_context"]))}
    ]
    if not hide_output:
        result.append({"role": "assistant", "content": datapoint["claim"]})
    return result

In [None]:
print(format_prompt(raw_dataset["train"][0]))

In [7]:
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2")

tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

In [8]:
compute_dtype = getattr(torch, "float16")
use_4bit = True
   
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

device_map = "auto"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,  # loading model in 4-bit
    device_map=device_map, # to use max gpu resources if exist
)

#Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## 🧪 Test prompts

In [9]:
raw_dataset["train"][2911]

{'id': 1747004,
 'source': 'Italy',
 'sentence_id': 1747,
 'claim': 'Italy had a civil war.',
 'source_text': "Italy\nItaly (Italia [iˈtaːlja]), officially the Italian Republic ([Repubblica italiana, links = no]), is a unitary parliamentary republic in Europe.The Italian peninsula is geographically located in Southern Europe, while North Italy can be placed partly or totally in Central Europe. Due to cultural, political and historical reasons, Italy is a Western European country. Located in the heart of the Mediterranean Sea, Italy shares open land borders with France, Switzerland, Austria, Slovenia, San Marino and Vatican City. Italy covers an area of 301338 km2 and has a largely temperate seasonal and Mediterranean climate. Due to its shape, it is often referred to in Italy as lo Stivale (the Boot). With 61 million inhabitants, it is the fourth most populous EU member state.\nSince classical times, ancient Phoenicians, Carthaginians and Greeks established settlements in the south of 

In [12]:
formats = [
    """Extract a single factual claim that follows from the following Wikipedia sentence:
{context}
----
You must print only the one claim extracted from this context, as a sentence that does not require additional context to interpret, printed as a single-line output.""",
    """Article: {context}
----
You are an expert semanticist tasked to find all the facts mentioned in the context above. Print one factual claim per line, each a single sentence that does not require additional context be understood. Print as many lines as it takes to cover all the input facts.""",
    """You are an expert journalist assistant, you summarize the Article into a list of atomic factual claim it makes. I.e., your task is to enumerate all the most relevant claims this article makes. Please print one claim per line and make them interpretable on their own.
Article: {context}"""
]

In [13]:
import json

In [15]:
sample = raw_dataset["train"][11]
for prompt in [format_prompt(sample, format=el, hide_output=True) for el in formats]:
    model_inputs = tokenizer.apply_chat_template(prompt, return_tensors="pt").to("cuda")
    # prettyprint json prompt
    print(json.dumps(prompt, indent=4))
    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=180, # set accordingly to your test_output
        do_sample=False
    )

    decoded_output = tokenizer.batch_decode(generated_ids,skip_special_tokens=True)
    # only preserve the output after last '[/INST]'
    decoded_output = [el.split("[/INST]")[-1].strip() for el in decoded_output]
    # Output results for comparison
    print(f"---\nGenerated Output: \n{decoded_output[0]}\n")
    print(f"---\nExpected Output: {sample['claim']}\n")
    print("-" * 75)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[
    {
        "role": "user",
        "content": "Extract a single factual claim that follows from the following Wikipedia sentence:\n2\n4\n \n(\n2\n0\n1\n6\n \nf\ni\nl\nm\n)\n\n\n2\n4\n \ni\ns\n \na\n \n2\n0\n1\n6\n \nI\nn\nd\ni\na\nn\n \nT\na\nm\ni\nl\n-\nl\na\nn\ng\nu\na\ng\ne\n \ns\nc\ni\ne\nn\nc\ne\n \nf\ni\nc\nt\ni\no\nn\n \nt\nh\nr\ni\nl\nl\ne\nr\n \nf\ni\nl\nm\n \nw\nr\ni\nt\nt\ne\nn\n \na\nn\nd\n \nd\ni\nr\ne\nc\nt\ne\nd\n \nb\ny\n \nV\ni\nk\nr\na\nm\n \nK\nu\nm\na\nr\n.\n\n\nB\na\ns\ne\nd\n \no\nn\n \nt\nh\ne\n \nc\no\nn\nc\ne\np\nt\n \no\nf\n \nt\ni\nm\ne\n-\nt\nr\na\nv\ne\nl\n,\n \nt\nh\ne\n \nf\ni\nl\nm\n \ns\nt\na\nr\ns\n \na\nc\nt\no\nr\n \nS\nu\nr\ni\ny\na\n \ni\nn\n \nt\nr\ni\np\nl\ne\n \nr\no\nl\ne\ns\n,\n \nw\ni\nt\nh\n \na\nc\nt\nr\ne\ns\ns\ne\ns\n \nS\na\nm\na\nn\nt\nh\na\n \nR\nu\nt\nh\n \nP\nr\na\nb\nh\nu\n,\n \nN\ni\nt\nh\ny\na\n \nM\ne\nn\ne\nn\n \na\nn\nd\n \nS\na\nr\na\nn\ny\na\n \nP\no\nn\nv\na\nn\nn\na\nn\n \ni\nn\n \nl\ne\na\nd\n \nr\no\nl\ne\ns\n.\n----

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


---
Generated Output: 
Vikram Kumar directed the film "Indian Tamil-language film Thiruvithamizh" (2001).

---
Expected Output: 24 is a 2016 film.

---------------------------------------------------------------------------
[
    {
        "role": "user",
        "content": "Article: 2\n4\n \n(\n2\n0\n1\n6\n \nf\ni\nl\nm\n)\n\n\n2\n4\n \ni\ns\n \na\n \n2\n0\n1\n6\n \nI\nn\nd\ni\na\nn\n \nT\na\nm\ni\nl\n-\nl\na\nn\ng\nu\na\ng\ne\n \ns\nc\ni\ne\nn\nc\ne\n \nf\ni\nc\nt\ni\no\nn\n \nt\nh\nr\ni\nl\nl\ne\nr\n \nf\ni\nl\nm\n \nw\nr\ni\nt\nt\ne\nn\n \na\nn\nd\n \nd\ni\nr\ne\nc\nt\ne\nd\n \nb\ny\n \nV\ni\nk\nr\na\nm\n \nK\nu\nm\na\nr\n.\n\n\nB\na\ns\ne\nd\n \no\nn\n \nt\nh\ne\n \nc\no\nn\nc\ne\np\nt\n \no\nf\n \nt\ni\nm\ne\n-\nt\nr\na\nv\ne\nl\n,\n \nt\nh\ne\n \nf\ni\nl\nm\n \ns\nt\na\nr\ns\n \na\nc\nt\no\nr\n \nS\nu\nr\ni\ny\na\n \ni\nn\n \nt\nr\ni\np\nl\ne\n \nr\no\nl\ne\ns\n,\n \nw\ni\nt\nh\n \na\nc\nt\nr\ne\ns\ns\ne\ns\n \nS\na\nm\na\nn\nt\nh\na\n \nR\nu\nt\nh\n \nP\nr\na\nb\nh\nu\n,\n \nN\

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


---
Generated Output: 
1. Vikram Kumar is the director of the film.
2. The film is called "Indian Tamil-language film".
3. The film is a sci-fi fiction.
4. The film is about a scientist.
5. The film takes place in a tranquil village.
6. The village is in Suriyana.
7. The film has Menen, Prabhuhu, and Ruthtah as roles.
8. The film has actors like Nithyasree, Samantha, and Saranya in it.
9. The film is produced by Prabhuhu and Ruthtah.
10. The film is penned by Menen.
11. The film is written, directed, and directed by Vikram Kumar.

---
Expected Output: 24 is a 2016 film.

---------------------------------------------------------------------------
[
    {
        "role": "user",
        "content": "You are an expert journalist assistant, you summarize the Article into a list of atomic factual claim it makes. I.e., your task is to enumerate all the most relevant claims this article makes. Please print one claim per line and make them interpretable on their own.\nArticle: 2\n4\n \n(\n2\n0\

## ✅ Pick the best

In [16]:
prompt_dataset = DatasetDict(
    {
        k: Dataset.from_dict(
            {"text": [tokenizer.apply_chat_template(format_prompt(el, format=formats[0]), tokenize=False) for el in v]}
        )
        for k, v in raw_dataset.items()
    }
)

In [17]:
prompt_dataset["train"]["text"][0]

"<s>[INST] Extract a single factual claim that follows from the following Wikipedia sentence:\nN\na\np\no\nl\ne\no\nn\n\n\nH\ne\n \nw\no\nn\n \nm\no\ns\nt\n \no\nf\n \nt\nh\ne\ns\ne\n \nw\na\nr\ns\n \na\nn\nd\n \nt\nh\ne\n \nv\na\ns\nt\n \nm\na\nj\no\nr\ni\nt\ny\n \no\nf\n \nh\ni\ns\n \nb\na\nt\nt\nl\ne\ns\n,\n \nb\nu\ni\nl\nd\ni\nn\ng\n \na\n \nl\na\nr\ng\ne\n \ne\nm\np\ni\nr\ne\n \nt\nh\na\nt\n \nr\nu\nl\ne\nd\n \no\nv\ne\nr\n \nc\no\nn\nt\ni\nn\ne\nn\nt\na\nl\n \nE\nu\nr\no\np\ne\n \nb\ne\nf\no\nr\ne\n \ni\nt\ns\n \nf\ni\nn\na\nl\n \nc\no\nl\nl\na\np\ns\ne\n \ni\nn\n \n1\n8\n1\n5\n.\n\n\nO\nn\ne\n \no\nf\n \nt\nh\ne\n \ng\nr\ne\na\nt\ne\ns\nt\n \nc\no\nm\nm\na\nn\nd\ne\nr\ns\n \ni\nn\n \nh\ni\ns\nt\no\nr\ny\n,\n \nh\ni\ns\n \nw\na\nr\ns\n \na\nn\nd\n \nc\na\nm\np\na\ni\ng\nn\ns\n \na\nr\ne\n \ns\nt\nu\nd\ni\ne\nd\n \na\nt\n \nm\ni\nl\ni\nt\na\nr\ny\n \ns\nc\nh\no\no\nl\ns\n \nw\no\nr\nl\nd\nw\ni\nd\ne\n.\n\n\nN\na\np\no\nl\ne\no\nn\n'\ns\n \np\no\nl\ni\nt\ni\nc\na\nl\n \na\nn\nd\n \

## 🏋️ Train

In [18]:
peft_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.1,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
)

In [19]:
run_name = "mistral_single_" + os.environ["SLURM_JOB_ID"]
training_arguments = TrainingArguments(
    output_dir="/home/ullriher/ullriher/models/claim_extraction/paper/" + run_name,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # Increase, if still giving OOM error
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    save_steps=500,
    logging_steps=200,
    learning_rate=1e-4,
    fp16=True,  # Enable fp16, bf16 only if your gfx card supports it
    evaluation_strategy="steps",
    max_grad_norm=0.3,
    num_train_epochs=4.0,
    weight_decay=0.001,
    warmup_steps=50,
    lr_scheduler_type="linear",
    run_name=run_name,
    report_to="wandb",
)

In [20]:
run = wandb.init(project="Mistral-Inst-7b-paper", name=run_name)

trainer = SFTTrainer(
    model=model,
    train_dataset=prompt_dataset["train"].shuffle(seed=42),
    eval_dataset=prompt_dataset["validation"].shuffle(seed=45),  # remove you have low VRAM and getting OOM errors
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=4096,  # depends on your dataset
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mbertik[0m ([33maic_nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin




Map:   0%|          | 0/13620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1678 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
200,0.3678,0.280408
400,0.2351,0.270532
600,0.2098,0.267078
800,0.1849,0.268418
1000,0.1745,0.268588
1200,0.159,0.27402
1400,0.1451,0.271952
1600,0.1362,0.27364
1800,0.1232,0.277287
2000,0.1158,0.283275




TrainOutput(global_step=13620, training_loss=0.05675035911597169, metrics={'train_runtime': 65666.6418, 'train_samples_per_second': 0.83, 'train_steps_per_second': 0.207, 'total_flos': 2.1254752148693975e+18, 'train_loss': 0.05675035911597169, 'epoch': 4.0})

In [None]:
%pip install bitsandbytes==0.42.0

In [None]:
%pip show bitsandbytes

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# reload bitsandbytes
import importlib
import bitsandbytes
importlib.reload(bitsandbytes)
bitsandbytes.__version__

In [None]:
trainer.save_model()

In [None]:
len(df)

In [None]:
df["text"][32999],df["summary"][32999]

In [None]:
from transformers import MistralForCausalLM

In [None]:
len(raw_dataset["test"])

In [55]:
import pysbd
seg = pysbd.Segmenter()

In [56]:
seg.segment("Hi! I am Bertie. How are You?")

['Hi! ', 'I am Bertie. ', 'How are You?']

In [None]:
import json

In [57]:
"aba".split("b")

['a', 'a']

In [22]:
from tqdm import tqdm

In [23]:
with open(training_arguments.output_dir+"/generated_predictions.jsonl", "w", encoding="utf-8") as f:
    pass

for i in tqdm(range(len(raw_dataset["test"][:1]))):
    eval_prompt = format_prompt(raw_dataset["test"][i], format=formats[0], hide_output=True)
    model_input = tokenizer.apply_chat_template(eval_prompt, return_tensors="pt").to("cuda")
    print(eval_prompt)
    
    with torch.no_grad():
        print()
        output = tokenizer.decode(model.generate(model_input, max_new_tokens=100)[0], skip_special_tokens=True)
        sents = [sent.strip() for sent in seg.segment(output.split("[/INST]")[-1])]
        print("\nGenerated:\n",sents)
        print("\nExpected:\n",raw_dataset["test"][i]["claims"])
        with open(training_arguments.output_dir+"/generated_predictions.jsonl", "a", encoding="utf-8") as f:
            print(json.dumps(sents),file=f)

  0%|          | 0/7 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'role': 'user', 'content': 'Extract a single factual claim that follows from the following Wikipedia sentence:\nM\ni\nc\nh\na\ne\nl\n \nF\na\ns\ns\nb\ne\nn\nd\ne\nr\n\n\nM\ni\nc\nh\na\ne\nl\n \nF\na\ns\ns\nb\ne\nn\nd\ne\nr\n \n(\nb\no\nr\nn\n \n2\n \nA\np\nr\ni\nl\n \n1\n9\n7\n7\n)\n \ni\ns\n \na\n \nG\ne\nr\nm\na\nn\n-\nb\no\nr\nn\n \nI\nr\ni\ns\nh\n \na\nc\nt\no\nr\n.\n\n\nH\ni\ns\n \nf\ne\na\nt\nu\nr\ne\n \nf\ni\nl\nm\n \nd\ne\nb\nu\nt\n \nw\na\ns\n \ni\nn\n \nt\nh\ne\n \nf\na\nn\nt\na\ns\ny\n \nw\na\nr\n \ne\np\ni\nc\n \n3\n0\n0\n \n(\n2\n0\n0\n7\n)\n \na\ns\n \na\n \nS\np\na\nr\nt\na\nn\n \nw\na\nr\nr\ni\no\nr\n;\n \nh\ni\ns\n \ne\na\nr\nl\ni\ne\nr\n \nr\no\nl\ne\ns\n \ni\nn\nc\nl\nu\nd\ne\nd\n \nv\na\nr\ni\no\nu\ns\n \ns\nt\na\ng\ne\n \np\nr\no\nd\nu\nc\nt\ni\no\nn\ns\n,\n \na\ns\n \nw\ne\nl\nl\n \na\ns\n \ns\nt\na\nr\nr\ni\nn\ng\n \nr\no\nl\ne\ns\n \no\nn\n \nt\ne\nl\ne\nv\ni\ns\ni\no\nn\n \ns\nu\nc\nh\n \na\ns\n \ni\nn\n \nt\nh\ne\n \nH\nB\nO\n \nm\ni\nn\ni\ns\ne\nr\ni\ne\ns\

  0%|          | 0/7 [00:09<?, ?it/s]


NameError: name 'seg' is not defined

In [None]:
eval_prompt = add_prompt(-1,test_data)

In [None]:


eval_prompt

In [None]:
model_name

In [None]:
model = model.merge()
model.save_pretrained("merged_adapters")

In [None]:
compute_dtype = getattr(torch, "float16")
use_4bit = True
   
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

device_map = "auto"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,  # loading model in 4-bit
    device_map=device_map, # to use max gpu resources if exist
)

#Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id