In [1]:
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from datasets import load_dataset
import inseq

# Load the XSum test split and the instruction-tuned model used to summarise it.
test_data = load_dataset("xsum", split="test")
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    cache_dir="/mnt/ssd/llms"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build a single-turn chat that asks for a summary of the first test document.
doc = test_data[0]['document']
instruction = "Summarise the document below:"
prompt_message = f"{instruction}\n{doc}"
messages = [{
    "role": "user", 
    "content": prompt_message
}]

# The same chat is rendered twice: once as token ids (fed to `generate`) and
# once as plain text (fed to inseq further down).
prompt = tokenizer.apply_chat_template(messages, 
                                        return_tensors="pt", 
                                        add_generation_prompt=True).to(model.device)
prompt_text = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

# Greedy decoding. `temperature` is intentionally not passed: it is a
# sampling-only parameter, so combining it with `do_sample=False` has no effect
# on the output and `temperature=0.0` is not a valid sampling temperature.
output_ids = model.generate(prompt,
                            do_sample=False,
                            max_new_tokens=64)

# Decode only the newly generated tokens (everything after the prompt).
output_text = tokenizer.decode(output_ids[0, prompt.shape[1]:], skip_special_tokens=False)
# Keep the text up to the first '.', then put back the period that split() removed.
output_text = output_text.split('.')[0] + "." 

print(output_text)

# Attribute the (already generated) first sentence back to the prompt.
# NOTE(review): `prompt_text` is the rendered chat template; if it already
# contains the BOS token and inseq adds special tokens again when tokenizing,
# BOS would be duplicated -- confirm against the tokenized attribution input.
inseq_model = inseq.load_model(model, "attention", tokenizer=model_name)
out = inseq_model.attribute(
    input_texts=prompt_text,
    generated_texts=prompt_text + output_text,
)

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


ValueError: Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, LlamaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MvpConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.

In [8]:
# Write json object to a file incrementally
import json
from datasets import load_dataset

test_data = load_dataset("xsum", split="test")
save_path = "test.json"

# Write one compact JSON object per line (JSON Lines) so the file can be parsed
# back record by record. Appending pretty-printed objects back-to-back, as was
# done before, produces a file that is neither valid JSON nor JSON Lines.
# The file is opened once in 'w' mode so re-running the cell replaces the
# previous dump instead of appending duplicate records to it.
with open(save_path, 'w') as fh:
    for idx, sample in enumerate(test_data):
        # Copy the sample rather than mutating the dict handed out by the dataset.
        processed_sample = {**sample, 'attributed sentences': [idx]}
        fh.write(json.dumps(processed_sample) + "\n")
        # Flush after every record so progress is on disk if the loop is interrupted.
        fh.flush()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [1]:
# Find the test samples with the longest input context in XSum 

import json
from datasets import load_dataset
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from tqdm import tqdm


# Load model and tokenizer
# NOTE(review): in this cell the model is only used for `model.device`; token
# counting itself needs just the tokenizer. Kept so `model` stays defined for
# any later cell that relies on it -- confirm whether the load can be dropped.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    cache_dir="/mnt/ssd/llms"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

test_data = load_dataset("xsum", split="test")

# Only the first MAX_SAMPLES test samples are measured. Bounding the loop (instead
# of tokenizing every sample and discarding those past the limit) yields the
# same `token_length` list without the wasted work, and gives tqdm a known total.
MAX_SAMPLES = 2500
n_samples = min(MAX_SAMPLES, len(test_data))

# Loop-invariant instruction text, hoisted out of the loop.
instruction = "Summarise the document below:"

# Collected as (sample index, number of prompt tokens).
token_length = []
for idx in tqdm(range(n_samples)):
    sample = test_data[idx]
    doc = sample['document']
    prompt_message = f"{instruction}\n{doc}"
    messages = [{
        "role": "user", 
        "content": prompt_message
    }]

    prompt = tokenizer.apply_chat_template(messages, 
                                           return_tensors="pt", 
                                           add_generation_prompt=True).to(model.device)
    # The last dimension of the encoded prompt is its length in tokens.
    token_length.append((idx, prompt.shape[-1]))

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.13s/it]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
11334it [00:18, 623.95it/s]


In [2]:
# Rank the (sample index, token length) pairs from the longest prompt to the shortest.

sorted_token_length = list(token_length)
sorted_token_length.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_token_length)

[(797, 5450), (1497, 4116), (2350, 3161), (2059, 2871), (165, 2528), (240, 2459), (2025, 2410), (1485, 2391), (1859, 2386), (456, 2361), (1099, 2322), (291, 2315), (1349, 2306), (1087, 2207), (663, 2206), (65, 2196), (123, 2184), (802, 2150), (1993, 2128), (140, 2100), (2458, 2065), (1182, 2061), (2081, 2032), (1560, 2018), (1504, 1996), (858, 1994), (398, 1986), (314, 1983), (300, 1959), (1124, 1932), (118, 1901), (2470, 1897), (1490, 1894), (1659, 1894), (188, 1888), (1027, 1874), (966, 1868), (607, 1862), (1358, 1860), (2249, 1860), (1392, 1858), (1142, 1835), (1914, 1831), (178, 1827), (2103, 1824), (43, 1802), (1958, 1799), (705, 1781), (1033, 1762), (232, 1753), (904, 1752), (710, 1746), (1080, 1744), (1035, 1734), (1059, 1724), (424, 1705), (1686, 1699), (548, 1691), (1503, 1689), (1920, 1688), (290, 1687), (908, 1681), (1435, 1674), (539, 1663), (152, 1660), (1022, 1652), (512, 1641), (2308, 1631), (2199, 1623), (1685, 1621), (674, 1612), (1612, 1606), (838, 1598), (363, 1592),

In [None]:
# modify the code to call inseq_model.attribute() directly
# check the attention sink paper -- how can attention sinks be avoided?
# technical novelty?
# implement the integrated-gradients method ourselves instead of using inseq

# what is the goal? generate a summary that is faithful to the attributed sentences?
# consider a long-form QA dataset rather than summarisation, since summarisation is hard to evaluate

# Run the whole generation pipeline and check ROUGE, BERTScore, and faithfulness metrics for different models: Mistral-7B, Llama-3-8B
# How to measure the quality of attribution? We can compute the ROUGE score or an NLI-based metric between each attributed sentence and the generated summary

In [1]:
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from datasets import load_dataset
import inseq

# XSum test split; one of its documents is attributed below.
test_data = load_dataset("xsum", split="test")

# Instruction-tuned Mistral checkpoint, loaded in bfloat16 with an automatic device map.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="/mnt/ssd/llms",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Attribution method handed to inseq.
attr_type = "attention"

doc = test_data[165]['document']  # supports at most a token length of 2500
input_prompt = f"Summarise the document below: {doc}"
messages = [dict(role="user", content=input_prompt)]

# Encode the chat-formatted prompt and report how many tokens it has.
# NOTE(review): this length is for the chat-templated prompt, whereas the
# attribute() call below receives the raw `input_prompt` (no chat template) --
# confirm that this difference is intended.
prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
print(prompt.shape[-1])

# Alternative hand-written prompt, kept for reference (currently disabled):
# input_prompt = '''A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: A 34-year-old man was arrested in connection with an outstanding warrant and is expected to appear at Glasgow Sherriff Court on Monday. A 15-year-old male was arrested for offensive behaviour and resisting arrest and a 16-year-old male was arrested for offensive behaviour. Three men were arrested outside the stadium in connection with assault. The men, aged 29, 28 and 27, and all from Glasgow, are expected to appear at Aberdeen Sherriff Court on Monday. Police said the two teenagers will be reported to the relevant authorities. Match Commander Supt Innes Walker said: "The vast majority of fans from both football clubs followed the advice given and conducted themselves appropriately. "The policing operation was assisted by specialist resources including the horses, the dog unit and roads policing and we appreciate the support of the overwhelming majority of fans and members of the public in allowing the Friday night game to be enjoyed and pass safely." Celtic won the match 3-1\nSummarize the provided document. The summary should be extremely short. ASSISTANT:'''

# Let inseq generate greedily (up to 100 new tokens) and attribute the result.
inseq_model = inseq.load_model(model, attr_type)
out = inseq_model.attribute(
    input_prompt,
    generation_args={"do_sample": False, "max_new_tokens": 100},
)

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.26s/it]
The model is loaded with a device map. The device cannot be changed after loading.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


2527


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Attributing with attention...: 2521it [00:16, 16.74s/it]
