In [2]:
# Extract top k important sentences from the input document based on attribution scores

import inseq
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from datasets import load_dataset
import evaluate

import json
import argparse
from pathlib import Path
from tqdm import tqdm


input_key = {
    "xsum": "document",
    "cnn_dm": "article"
}

output_key = {
    "xsum": "summary",
    "cnn_dm": "highlights"
}

# Check if the current token is the end of a sentence
# Note that: this algo cannot handle the corner case with abbreviation, e.g. "P.E."
def is_sentence_ending(text):
    if text.endswith(("!", ".", "?")):
        return True
    if text.endswith((".\"", "?\"", "!\"")):
        return True
    
def get_token_length(text, tokenizer):
    encoded_text = tokenizer(text, 
                             return_tensors="pt", 
                             add_special_tokens=False).input_ids
    
    return encoded_text.shape[-1]

In [3]:
login("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")

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# Load model and tokenizer
# model_name = "meta-llama/Llama-2-7b-hf"
config = AutoConfig.from_pretrained(model_name)
context_window_length = getattr(config, 'max_position_embeddings', 
                                getattr(config, 'n_positions', None))

model = AutoModelForCausalLM.from_pretrained(model_name, 
                                            torch_dtype=torch.bfloat16, 
                                            device_map="auto",
                                            use_auth_token=True,
                                            cache_dir="/mnt/ssd/llms")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = context_window_length

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.01s/it]


In [4]:
test_data = load_dataset("xsum", split="test")
test_data[0]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


{'document': 'Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.\nWorkers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.\nThe Welsh Government said more people than ever were getting help to address housing problems.\nChanges to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.\nPrison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.\nHowever, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a priority.\nAndrew Stevens, who works in Welsh prisons trying to secure housing for prison leavers, said the

In [8]:
instruction = "Summarise the document below:"
# prompt_message = f"{instruction} If you're famous and performing the American national anthem, be prepared to become a national hero or a national disgrace."
prompt_message = f"{instruction}\n\n{test_data[1099]['document']}"
messages = [{
    "role": "user", 
    "content": prompt_message
}]

prompt = tokenizer.apply_chat_template(messages, 
                                        return_tensors="pt", 
                                        add_generation_prompt=True).to(model.device)
print("Token length: ", prompt.shape)
prompt_text = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

inseq_model = inseq.load_model(model, "attention", tokenizer=model_name)
output_ids = model.generate(prompt,
                            do_sample=False,
                            max_new_tokens=64,
                            temperature=0.0)

output_text = tokenizer.decode(output_ids[0, prompt.shape[1]:], skip_special_tokens=False)
output_text = output_text.split('.')[0] + "."  # Note: only keep the first sentence for debugging; for summarisaiton task: keep until \n\n or the last complete sentence [TODO]
# output_text = tokenizer.decode(output_ids[0, prompt.shape[1]:], skip_special_tokens=True)

print(output_text)
out = inseq_model.attribute(
    input_texts=prompt_text,
    generated_texts=prompt_text + output_text,
)

# out.show()

The model is loaded with a device map. The device cannot be changed after loading.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Token length:  torch.Size([1, 2323])


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


The document discusses the ongoing process of selecting the nominees for the US presidential election, focusing on the Republican and Democratic parties.


Attributing with attention...: 2325it [00:12, 12.18s/it]


In [9]:
print(output_text)
out = inseq_model.attribute(
    input_texts=prompt_text,
    generated_texts=prompt_text + output_text,
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


A multi-millionaire property developer, Peter Morgan, was infatuated with a former burlesque dancer and escort, Georgina Symonds, whom he met three years prior.


Attributing with attention...: 2873it [00:00, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.16 GiB. GPU 

In [9]:
# Aggregate the attribution scores for each input sentence
# Process intrucitons and special tokens in chat template separately
start_marker = "<s><s>[INST]"
end_marker = "[/INST]"

# Calculate the token length for each part of the prompt
len_start_marker = get_token_length(start_marker, tokenizer)
len_end_marker = get_token_length(end_marker, tokenizer)
len_instruction = get_token_length(instruction, tokenizer)
len_prompt = get_token_length(prompt_message, tokenizer)
total_prompt_len = len_start_marker + len_prompt

doc_start_pos = len_start_marker + len_instruction
start_span = (0, len_start_marker)
instr_span = (len_start_marker, len_start_marker + len_instruction)
end_span = (total_prompt_len, total_prompt_len + len_end_marker)

ends = [i + 1 for i, t in enumerate(out[0].target) if is_sentence_ending(t.token) and i < total_prompt_len] + [total_prompt_len]
starts = [doc_start_pos] + [i + 1 for i, t in enumerate(out[0].target) if is_sentence_ending(t.token) and i < total_prompt_len]
spans = [start_span, instr_span] + list(zip(starts, ends)) + [end_span]

# Remove empty spans 
processed_spans = []
for span in spans:
    if span[0] + 1 < span[1]:
        processed_spans.append(span)

print(processed_spans)
res = out.aggregate("spans", target_spans=processed_spans)
res.show()

[(0, 5), (5, 12), (12, 40), (40, 59), (59, 100), (100, 140), (140, 179), (179, 186), (186, 204), (204, 269), (269, 353), (353, 390), (390, 423), (423, 487), (487, 498), (498, 526), (526, 534), (534, 581), (581, 622), (622, 678), (678, 720), (720, 760), (760, 771), (771, 794), (794, 833), (833, 851), (851, 869), (869, 896), (896, 954), (954, 995), (995, 1054), (1054, 1136), (1136, 1178), (1178, 1234), (1234, 1315), (1315, 1319), (1319, 1348), (1348, 1364), (1364, 1368), (1368, 1409), (1409, 1443), (1443, 1485), (1485, 1495), (1495, 1527), (1527, 1576), (1576, 1608), (1608, 1624), (1624, 1645), (1645, 1665), (1665, 1674), (1674, 1700), (1700, 1713), (1713, 1772), (1772, 1794), (1794, 1835), (1835, 1838), (1838, 1862), (1862, 1896), (1896, 1918), (1918, 1921), (1921, 1931), (1931, 1934), (1934, 1954), (1954, 1986), (1986, 2006), (2006, 2027), (2027, 2062), (2062, 2097), (2097, 2101), (2101, 2144), (2144, 2173), (2173, 2187), (2187, 2222), (2222, 2252), (2252, 2282), (2282, 2320), (2320, 2

Unnamed: 0_level_0,The,▁document,▁discuss,es,▁the,▁ongoing,▁process,▁of,▁selecting,▁the,▁nom,ine,es,▁for,▁the,▁US,▁presidential,▁election,",",▁focusing,▁on,▁the,▁Republican,▁and,▁Democratic,▁parties,.
<s><s>▁[INST],0.528,0.54,0.555,0.597,0.536,0.487,0.518,0.536,0.51,0.519,0.504,0.585,0.556,0.499,0.507,0.495,0.512,0.5,0.449,0.456,0.511,0.453,0.421,0.48,0.468,0.459,0.435
▁Summarise▁the▁document▁below:,0.049,0.033,0.041,0.018,0.011,0.01,0.005,0.004,0.004,0.002,0.002,0.001,0.005,0.002,0.002,0.002,0.003,0.001,0.002,0.005,0.004,0.003,0.002,0.001,0.001,0.001,0.002
"<0x0A><0x0A>For▁most▁of▁US▁history,▁party▁nominees▁have▁been▁decided▁by▁political▁power▁brokers▁and▁deal-makers▁behind▁closed▁doors.",0.088,0.084,0.063,0.062,0.069,0.076,0.075,0.069,0.085,0.086,0.082,0.092,0.068,0.085,0.079,0.074,0.074,0.079,0.065,0.059,0.059,0.062,0.065,0.072,0.062,0.07,0.061
▁Parties▁operate▁like▁private▁clubs▁-▁they▁make▁their▁own▁rules▁and▁are▁suspicious▁of▁outsiders.,0.005,0.008,0.004,0.003,0.005,0.007,0.003,0.003,0.005,0.003,0.003,0.002,0.002,0.003,0.003,0.002,0.002,0.002,0.003,0.005,0.002,0.003,0.003,0.002,0.001,0.002,0.007
<0x0A>Only▁in▁recent▁history▁has▁a▁more▁open▁system▁of▁primaries▁and▁caucuses▁been▁grafted▁onto▁the▁process▁to▁give▁the▁average▁American▁a▁say▁in▁who▁appears▁on▁the▁general▁election▁ballot.,0.01,0.014,0.006,0.004,0.009,0.016,0.009,0.014,0.012,0.009,0.007,0.004,0.005,0.008,0.006,0.006,0.006,0.005,0.008,0.013,0.004,0.005,0.01,0.004,0.003,0.004,0.005
"<0x0A>In▁a▁close,▁contentious▁primary▁season,▁however,▁the▁veneer▁of▁accountability▁can▁rub▁off,▁exposing▁the▁sometimes▁unsightly▁gears▁that▁still▁power▁the▁US▁political▁system.",0.009,0.015,0.005,0.004,0.011,0.018,0.015,0.008,0.01,0.007,0.008,0.003,0.003,0.006,0.006,0.007,0.015,0.005,0.005,0.008,0.004,0.007,0.011,0.005,0.003,0.004,0.006
<0x0A>This▁has▁prompted▁objection▁from▁the▁supporters▁of▁two▁candidates▁in▁particular▁-▁Donald▁Trump▁and▁Bernie▁Sanders▁-▁who▁feel▁that▁the▁party▁establishments▁are▁arrayed▁against▁their▁presidential▁quests.,0.008,0.005,0.004,0.005,0.009,0.01,0.009,0.005,0.007,0.009,0.01,0.005,0.005,0.006,0.008,0.007,0.006,0.01,0.005,0.007,0.006,0.014,0.008,0.006,0.005,0.005,0.006
<0x0A>But▁are▁their▁concerns▁valid?,0.004,0.003,0.003,0.005,0.008,0.007,0.004,0.002,0.002,0.001,0.001,0.0,0.002,0.001,0.001,0.0,0.0,0.0,0.001,0.004,0.004,0.005,0.004,0.001,0.001,0.001,0.001
▁Here▁are▁answers▁to▁four▁pertinent▁questions▁as▁the▁nomination▁battles▁approach▁its▁final▁months.,0.01,0.008,0.009,0.009,0.011,0.011,0.018,0.011,0.01,0.01,0.01,0.008,0.003,0.006,0.005,0.005,0.003,0.004,0.005,0.005,0.006,0.006,0.009,0.005,0.003,0.004,0.005
"<0x0A>34<0x0A>Number▁of▁delegates▁Ted▁Cruz▁received▁in▁Colorado,▁which▁held▁conventions▁to▁pick▁delegates<0x0A>759▁Trump▁total▁delegate▁count<0x0A>529▁Cruz▁total▁delegate▁count<0x0A>Mr▁Trump▁is▁leading▁the▁race▁for▁the▁Republican▁nomination,▁but▁it's▁starting▁to▁feel▁like▁he's▁not▁winning.",0.015,0.012,0.012,0.008,0.009,0.011,0.012,0.007,0.009,0.011,0.013,0.007,0.007,0.007,0.01,0.013,0.01,0.006,0.006,0.007,0.007,0.013,0.017,0.02,0.007,0.008,0.006
"<0x0A>While▁he's▁comfortably▁ahead,▁with▁757▁delegates▁to▁529▁for▁second-place▁Ted▁Cruz,▁there's▁mounting▁evidence▁that▁he's▁being▁outmanoeuvred▁in▁the▁behind-the-scenes▁political▁process▁that▁could▁come▁into▁play▁if▁he▁doesn't▁reach▁the▁magic▁1,237▁delegate▁number▁necessary▁to▁secure▁the▁nomination▁outright.",0.005,0.004,0.003,0.003,0.005,0.007,0.007,0.007,0.004,0.004,0.004,0.003,0.003,0.003,0.003,0.003,0.003,0.002,0.003,0.003,0.003,0.006,0.007,0.003,0.002,0.002,0.002
<0x0A>In▁Colorado▁-▁which▁selected▁its▁delegates▁at▁party▁gatherings▁last▁week▁and▁not▁through▁primaries▁or▁caucuses▁-▁Mr▁Cruz▁walked▁away▁with▁all▁34▁delegates.,0.003,0.002,0.002,0.001,0.002,0.003,0.004,0.002,0.002,0.007,0.002,0.001,0.002,0.002,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.004,0.004,0.002,0.001,0.001,0.002
"▁Even▁in▁states▁that▁have▁held▁contests▁won▁by▁Mr▁Trump,▁Mr▁Cruz's▁team▁has▁been▁working▁doggedly▁to▁ensure▁that▁their▁people▁become▁delegates.",0.001,0.001,0.001,0.001,0.001,0.002,0.002,0.002,0.002,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.002,0.001,0.001,0.001,0.001
"<0x0A>While▁Mr▁Trump▁swept▁South▁Carolina's▁50▁delegates,▁for▁instance,▁the▁state's▁convention▁delegation▁will▁be▁riddled▁with▁Cruz▁supporters▁who,▁while▁bound▁to▁Mr▁Trump▁on▁the▁first▁few▁ballots,▁can▁switch▁to▁the▁Texas▁senator▁if▁there▁is▁a▁protracted▁convention▁battle.",0.003,0.002,0.002,0.002,0.003,0.003,0.004,0.002,0.002,0.002,0.002,0.001,0.002,0.001,0.001,0.002,0.001,0.002,0.002,0.002,0.002,0.004,0.003,0.003,0.001,0.001,0.001
<0x0A>It▁has▁Mr▁Trump▁and▁his▁people▁crying▁foul.,0.002,0.001,0.001,0.001,0.002,0.002,0.002,0.001,0.001,0.001,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.003,0.003,0.001,0.001,0.001,0.001
"<0x0A>""This▁is▁happening▁all▁over▁our▁country▁-▁great▁people▁being▁disenfranchised▁by▁politicians,""▁Mr▁Trump▁tweeted▁on▁Monday.",0.002,0.002,0.002,0.001,0.002,0.003,0.003,0.002,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.002,0.001,0.001,0.001,0.001
"▁""Repub▁party▁is▁in▁trouble!""",0.001,0.001,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.002,0.002,0.002,0.001,0.001,0.001,0.0,0.001,0.001,0.002,0.001,0.001,0.002
"<0x0A>Paul▁Manafort,▁Mr▁Trump's▁new▁aide▁in▁charge▁of▁managing▁the▁delegate-selection▁process,▁accused▁the▁Cruz▁campaign▁of▁using▁""Gestapo▁tactics,▁scorched-earth▁tactics""▁in▁Colorado.",0.004,0.003,0.003,0.002,0.002,0.004,0.004,0.004,0.004,0.004,0.002,0.002,0.003,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.003,0.003,0.003,0.002,0.002,0.001
"<0x0A>If,▁as▁Mr▁Trump▁asserted▁on▁Monday,▁the▁system▁is▁""rigged""▁and▁""crooked"",▁however,▁it▁isn't▁always▁tilted▁in▁favour▁of▁Mr▁Trump's▁opponents.",0.007,0.006,0.005,0.003,0.007,0.009,0.005,0.004,0.004,0.003,0.003,0.002,0.003,0.003,0.002,0.002,0.002,0.002,0.002,0.004,0.003,0.005,0.005,0.003,0.002,0.002,0.003
"▁Thanks▁to▁the▁Republican▁Party's▁delegate-apportioning▁system,▁including▁Florida's▁winner-take-all▁primary,▁Mr▁Trump▁has▁secured▁a▁larger▁share▁of▁the▁delegates▁so▁far▁(45%)▁than▁he▁has▁of▁the▁raw▁primary▁vote▁(37%).",0.002,0.001,0.001,0.001,0.002,0.003,0.004,0.003,0.003,0.003,0.003,0.001,0.002,0.002,0.002,0.003,0.002,0.001,0.001,0.001,0.001,0.003,0.004,0.005,0.002,0.002,0.004
"<0x0A>If▁Mr▁Cruz▁manages▁to▁win▁the▁nomination▁at▁the▁Republican▁convention▁despite▁trailing▁Mr▁Trump▁in▁total▁delegates▁and▁share▁of▁the▁popular▁vote,▁Mr▁Trump▁may▁have▁reason▁to▁feel▁aggrieved.",0.001,0.001,0.001,0.001,0.002,0.002,0.003,0.003,0.003,0.003,0.005,0.003,0.002,0.003,0.004,0.005,0.003,0.003,0.002,0.002,0.001,0.002,0.003,0.005,0.003,0.002,0.002
"<0x0A>But▁before▁he▁complains▁too▁loudly,▁he▁might▁want▁to▁heed▁some▁sage▁advice▁attributed▁(incorrectly)▁to▁Albert▁Einstein:▁""You▁have▁to▁learn▁the▁rules▁of▁the▁game.",0.002,0.001,0.001,0.001,0.002,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"▁And▁then▁you▁have▁to▁play▁better▁than▁anyone▁else.""",0.001,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"<0x0A>Mr▁Trump▁should▁be▁familiar▁with▁the▁quote,▁since▁he▁tweeted▁it▁in▁October▁2014.",0.005,0.003,0.004,0.003,0.003,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001
<0x0A>17<0x0A>States▁Bernie▁Sanders▁has▁won<0x0A>18<0x0A>States▁Hillary▁Clinton▁has▁won<0x0A>Mr▁Sanders▁has▁won▁five▁state▁contests▁in▁a▁row▁and▁seven▁of▁the▁last▁eight.,0.003,0.002,0.002,0.002,0.002,0.002,0.002,0.001,0.001,0.002,0.002,0.001,0.004,0.002,0.002,0.002,0.002,0.002,0.001,0.001,0.002,0.004,0.002,0.003,0.007,0.003,0.001
▁If▁he▁were▁an▁American▁football▁team▁he'd▁be▁poised▁for▁the▁playoffs.,0.002,0.001,0.001,0.001,0.002,0.002,0.002,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.002,0.003,0.002,0.002,0.004,0.001,0.001
"▁If▁he▁were▁prize▁fighter,▁he'd▁be▁tuning▁up▁for▁a▁title▁bout.",0.001,0.001,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.0,0.0,0.0,0.001,0.0,0.0,0.001,0.0,0.001,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.001,0.0
<0x0A>Instead▁his▁pledged-delegate▁deficit▁to▁Mrs▁Clinton▁has▁gone▁from▁daunting▁to▁only▁fractionally▁less▁daunting.,0.003,0.002,0.003,0.002,0.002,0.002,0.002,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.002,0.002,0.001,0.002,0.001,0.001
"▁Over▁the▁course▁of▁his▁recent▁run,▁the▁Vermont▁Senator▁has▁picked▁up▁a▁net▁of▁just▁91▁delegates,▁despite▁winning▁Wisconsin▁56%▁to▁43%,▁Utah▁79%▁to▁20%▁and▁Washington▁72%▁to▁27%.",0.001,0.001,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"<0x0A>According▁to▁a▁New▁York▁Times▁calculation,▁the▁former▁secretary▁of▁state▁currently▁has▁1,305▁pledged▁delegates,▁while▁Sanders▁stands▁at▁1,086.",0.002,0.001,0.002,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001
"▁Add▁in▁the▁non-binding▁support▁of▁""super-delegates""▁-▁Democratic▁officeholders▁and▁party▁functionaries▁who▁also▁cast▁ballots▁for▁the▁nominee▁at▁the▁convention▁-▁and▁Mrs▁Clinton's▁lead▁balloons▁to▁1,774▁to▁1,117.",0.002,0.001,0.002,0.001,0.001,0.001,0.002,0.001,0.002,0.003,0.003,0.003,0.004,0.003,0.003,0.004,0.002,0.002,0.002,0.001,0.001,0.002,0.003,0.003,0.006,0.004,0.002
"<0x0A>To▁secure▁the▁Democratic▁nomination▁without▁drama▁at▁the▁convention▁a▁candidate▁needs▁the▁backing▁of▁2,383▁delegates<0x0A>The▁problem▁for▁Mr▁Sanders▁is▁that▁while▁he's▁been▁posting▁sizeable▁wins▁over▁the▁past▁month,▁they've▁largely▁been▁in▁delegate-poor▁states,▁like▁Wyoming▁(14▁delegates),▁Idaho▁(23)▁and▁Alaska▁(16).",0.003,0.002,0.002,0.002,0.004,0.003,0.003,0.002,0.002,0.004,0.005,0.004,0.003,0.004,0.005,0.005,0.003,0.003,0.002,0.002,0.002,0.004,0.004,0.006,0.013,0.009,0.003
"▁His▁successes▁pale▁when▁compared▁to▁Mrs▁Clinton's▁massive▁earlier▁wins▁in▁populous▁southern▁states▁like▁Texas,▁Florida▁and▁Georgia,▁which▁alone▁netted▁her▁184▁delegates▁over▁Mr▁Sanders.",0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.0,0.001,0.0,0.001,0.0,0.001,0.001,0.001,0.001,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.001,0.0
"<0x0A>If▁Mrs▁Clinton▁performs▁as▁expected▁in▁the▁coming▁contests▁in▁New▁York▁(291▁delegates),▁Maryland▁(118)▁and▁Pennsylvania▁(210),▁she'll▁largely▁erase▁all▁the▁modest▁ground▁Mr▁Sanders▁made▁up▁over▁the▁past▁three▁weeks.",0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"<0x0A>9,412,426<0x0A>Votes▁for▁Hillary▁Clinton▁during▁primary▁season▁so▁far<0x0A>7,034,997<0x0A>Votes▁for▁Bernie▁Sanders▁during▁primary▁season▁so▁far<0x0A>If▁delegate▁maths▁and▁selection▁rules▁make▁your▁head▁hurt,▁at▁least▁we▁can▁rely▁on▁the▁raw▁vote▁totals▁to▁get▁a▁feel▁for▁how▁popular▁the▁remaining▁candidates▁are,▁right?",0.003,0.002,0.001,0.001,0.002,0.003,0.003,0.002,0.002,0.006,0.002,0.002,0.002,0.002,0.002,0.002,0.001,0.002,0.002,0.001,0.001,0.002,0.002,0.002,0.002,0.002,0.002
<0x0A>Wrong.,0.002,0.001,0.001,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"<0x0A>According▁to▁current▁tabulations▁Hillary▁Clinton▁has▁received▁9,412,426▁votes▁during▁the▁primary▁season▁so▁far.",0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.001,0.001
"▁Bernie▁Sanders▁has▁received▁7,034,997.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0
▁That▁2.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4m▁vote▁lead▁has▁been▁relentlessly▁touted▁by▁the▁former▁secretary▁of▁state▁and▁her▁supporters▁to▁counter▁the▁claims▁of▁Sanders▁faithful▁that▁their▁man▁is▁more▁popular▁than▁the▁delegate▁tallies▁indicate.,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"<0x0A>Some▁states▁that▁hold▁caucuses▁-▁like▁Iowa▁and▁Washington▁-▁aren't▁included▁in▁that▁number,▁however,▁because▁they▁don't▁report▁vote▁totals.",0.005,0.004,0.004,0.002,0.003,0.003,0.002,0.002,0.002,0.002,0.002,0.002,0.004,0.002,0.002,0.002,0.002,0.001,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002
<0x0A>The▁Washington▁Post's▁Glenn▁Kessler▁tried▁to▁extrapolate▁numbers▁for▁the▁remaining▁states▁based▁on▁their▁total▁voter▁turnout▁and▁concluded▁that▁Hillary▁Clinton▁leads▁Bernie▁Sanders▁by▁2.,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.0,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.0
3m▁votes▁▁-▁still▁a▁significant▁margin.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"<0x0A>Among▁the▁Republicans,▁who▁are▁better▁about▁providing▁full▁vote▁totals,▁Mr▁Trump▁leads▁with▁8,256,309▁votes.",0.002,0.001,0.001,0.001,0.002,0.002,0.002,0.001,0.001,0.002,0.003,0.002,0.005,0.002,0.003,0.004,0.002,0.002,0.002,0.001,0.001,0.003,0.003,0.006,0.005,0.003,0.002
"▁Mr▁Cruz▁is▁second▁(6,319,244),▁former▁candidate▁Marco▁Rubio▁is▁third▁(3,482,129),▁followed▁by▁Mr▁Kasich▁(2,979,379).",0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.0
<0x0A>In▁the▁end▁the▁popular▁vote▁may▁give▁the▁leading▁candidates▁a▁claim▁of▁legitimacy▁as▁the▁people's▁choice▁-▁but▁appearances▁can▁be▁deceiving.,0.003,0.003,0.003,0.002,0.002,0.002,0.002,0.001,0.001,0.001,0.002,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"<0x0A>""The▁media▁has▁created▁the▁perception▁that▁the▁voters▁choose▁the▁nomination.",0.002,0.002,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.003,0.004,0.003,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"▁That's▁the▁conflict▁here,""▁North▁Dakota▁delegate▁Curly▁Haugland▁told▁a▁television▁interviewer.",0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.0,0.001,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.0
"▁""The▁rules▁are▁still▁designed▁to▁have▁a▁political▁party▁choose▁its▁nominee▁at▁a▁convention.",0.001,0.001,0.001,0.0,0.001,0.002,0.001,0.002,0.002,0.003,0.003,0.003,0.002,0.003,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.001,0.003
"▁That's▁just▁the▁way▁it▁is.""",0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.001,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.001,0.0,0.0,0.0,0.0
"<0x0A>""Well,▁there's▁the▁law,▁and▁then▁there's▁ethics,▁and▁then▁there's▁getting▁votes.",0.007,0.004,0.003,0.002,0.005,0.004,0.004,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.003,0.002,0.001,0.001,0.001,0.001
"▁I'm▁not▁going▁to▁get▁into▁what▁tactics▁are▁used.""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"<0x0A>If,▁as▁appears▁to▁be▁increasingly▁likely,▁the▁Republican▁primary▁season▁ends▁without▁Donald▁Trump▁securing▁the▁1,237▁delegates▁needed▁to▁secure▁the▁nomination,▁the▁Republican▁National▁Convention▁could▁turn▁into▁a▁political▁free-for-all▁unrivalled▁in▁modern▁US▁political▁history.",0.004,0.002,0.002,0.001,0.003,0.005,0.007,0.004,0.004,0.004,0.005,0.003,0.003,0.003,0.004,0.006,0.007,0.004,0.003,0.002,0.001,0.004,0.004,0.009,0.003,0.004,0.003
"<0x0A>After▁several▁rounds▁of▁deadlocked▁balloting,▁most▁convention▁delegates▁would▁be▁free▁to▁vote▁their▁conscience.",0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.0,0.0,0.001,0.001,0.001,0.0,0.001,0.0
"▁But▁could▁that▁conscience▁be▁nudged▁by,▁say,▁a▁free▁weekend▁at▁a▁Donald▁Trump▁golf▁resort,▁a▁nice▁dinner▁with▁the▁Cruz▁family▁or▁even▁a▁choice▁spot▁in▁a▁John▁Kasich▁administration?",0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.001,0.001
<0x0A>Maybe!,0.001,0.001,0.0,0.0,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"▁While▁there▁are▁detailed▁anti-corruption▁laws▁governing▁the▁behaviour▁of▁public▁officeholders,▁convention▁delegates▁are▁private▁citizens.",0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.0,0.001,0.001
"▁While▁government▁regulations▁prohibit▁them▁from▁taking▁money▁from▁corporations,▁labour▁unions,▁government▁contractors▁or▁foreign▁nationals,▁the▁law▁beyond▁that▁is▁much▁murkier.",0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"<0x0A>Campaigns▁and▁their▁wealth▁donors▁could▁likely▁cover▁delegate▁travel▁expenses,▁no▁matter▁how▁lavish.",0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0
▁Gold▁watches?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"▁Bags▁of▁small,▁unmarked▁bills?",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
▁Who▁knows?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"▁State▁anti-bribery▁laws▁may▁apply,▁but▁there's▁scant▁legal▁precedent.",0.002,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0
<0x0A>Perhaps▁the▁greatest▁deterrent▁to▁untoward▁action▁by▁campaigns▁is▁the▁negative▁publicity▁such▁naked▁attempts▁at▁influence▁could▁have▁if▁they're▁documented.,0.001,0.001,0.002,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001
"▁But▁public▁perceptions▁and▁attitudes▁this▁political▁season▁has▁been▁difficult▁to▁predict,▁to▁say▁the▁least.",0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"<0x0A>Although▁the▁national▁convention▁is▁still▁months▁away,▁accusations▁of▁dirty▁tricks▁have▁already▁started▁flying.",0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
▁On▁Sunday▁Mr▁Trump▁took▁to▁Twitter▁to▁accuse▁the▁Cruz▁campaign▁of▁misdeeds▁during▁the▁South▁Carolina▁state▁party▁convention▁-▁a▁charge▁Mr▁Cruz▁vehemently▁denied.,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"<0x0A>""I▁win▁a▁state▁in▁votes▁and▁then▁get▁non-representative▁delegates▁because▁they▁are▁offered▁all▁sorts▁of▁goodies▁by▁Cruz▁campaign,""▁Mr▁Trump▁wrote.",0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.001,0.001,0.001
"▁""Bad▁system!""",0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0
"<0x0A>During▁a▁television▁interview▁that▁same▁morning,▁however,▁Trump▁adviser▁Paul▁Manafort▁appeared▁to▁acknowledge▁that▁his▁campaign▁won't▁be▁shy▁in▁wooing▁delegates▁at▁the▁national▁convention,▁however.",0.004,0.003,0.003,0.002,0.002,0.002,0.002,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"<0x0A>""Well,▁there's▁the▁law,▁and▁then▁there's▁ethics,▁and▁then▁there's▁getting▁votes,""▁he▁said.",0.007,0.008,0.007,0.003,0.003,0.003,0.002,0.002,0.002,0.001,0.001,0.001,0.003,0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.002,0.002,0.002,0.001,0.001,0.001,0.001
"▁""I'm▁not▁going▁to▁get▁into▁what▁tactics▁are▁used.",0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"▁I▁happen▁to▁think▁the▁best▁way▁we're▁going▁to▁get▁delegates▁is▁to▁have▁Donald▁Trump▁be▁exposed▁to▁delegates,▁let▁the▁delegates▁hear▁what▁he▁says.""",0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.0,0.001,0.001,0.001,0.0,0.001,0.001
"<0x0A>Another▁Trump▁adviser,▁Barry▁Bennett,▁said▁they▁wouldn't▁be▁offering▁""seats▁on▁the▁Trump▁airplane▁or▁anything▁like▁that"".",0.003,0.002,0.003,0.002,0.002,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"<0x0A>""There's▁obviously▁a▁big▁line▁-▁we're▁not▁going▁to▁do▁anything▁immoral,▁illegal▁or▁unethical,""▁he▁said.",0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
"<0x0A>But▁when▁a▁presidential▁nomination▁is▁at▁stake,▁and▁it▁comes▁down▁to▁just▁a▁handful▁of▁delegates,▁that▁""big▁line""▁may▁end▁up▁looking▁awfully▁fuzzy.",0.006,0.004,0.002,0.002,0.002,0.003,0.003,0.003,0.003,0.005,0.006,0.007,0.004,0.004,0.003,0.003,0.003,0.007,0.003,0.002,0.002,0.002,0.002,0.003,0.002,0.002,0.002
▁[/INST],0.146,0.098,0.074,0.056,0.043,0.028,0.022,0.025,0.019,0.018,0.018,0.013,0.035,0.018,0.017,0.018,0.019,0.013,0.017,0.025,0.028,0.018,0.013,0.014,0.014,0.012,0.019
The,Unnamed: 1_level_78,0.08,0.052,0.037,0.03,0.02,0.013,0.014,0.011,0.011,0.01,0.006,0.015,0.009,0.01,0.01,0.009,0.006,0.009,0.015,0.014,0.01,0.007,0.006,0.006,0.005,0.009
▁document,Unnamed: 1_level_79,Unnamed: 2_level_79,0.078,0.069,0.042,0.025,0.014,0.012,0.01,0.008,0.008,0.005,0.013,0.006,0.007,0.007,0.007,0.005,0.008,0.017,0.014,0.01,0.007,0.005,0.005,0.004,0.006
▁discuss,Unnamed: 1_level_80,Unnamed: 2_level_80,Unnamed: 3_level_80,0.058,0.057,0.035,0.02,0.018,0.014,0.01,0.009,0.006,0.011,0.009,0.01,0.008,0.007,0.005,0.011,0.022,0.022,0.011,0.008,0.005,0.005,0.005,0.008
es,Unnamed: 1_level_81,Unnamed: 2_level_81,Unnamed: 3_level_81,Unnamed: 4_level_81,0.051,0.066,0.035,0.029,0.024,0.015,0.012,0.007,0.011,0.012,0.012,0.01,0.007,0.005,0.014,0.028,0.025,0.016,0.011,0.006,0.006,0.005,0.007
▁the,Unnamed: 1_level_82,Unnamed: 2_level_82,Unnamed: 3_level_82,Unnamed: 4_level_82,Unnamed: 5_level_82,0.056,0.051,0.031,0.025,0.016,0.012,0.007,0.009,0.012,0.009,0.007,0.006,0.005,0.01,0.017,0.012,0.011,0.012,0.005,0.004,0.005,0.006
▁ongoing,Unnamed: 1_level_83,Unnamed: 2_level_83,Unnamed: 3_level_83,Unnamed: 4_level_83,Unnamed: 5_level_83,Unnamed: 6_level_83,0.067,0.08,0.046,0.025,0.016,0.008,0.007,0.015,0.013,0.011,0.006,0.006,0.013,0.018,0.014,0.012,0.016,0.006,0.005,0.006,0.007
▁process,Unnamed: 1_level_84,Unnamed: 2_level_84,Unnamed: 3_level_84,Unnamed: 4_level_84,Unnamed: 5_level_84,Unnamed: 6_level_84,Unnamed: 7_level_84,0.051,0.069,0.03,0.02,0.012,0.007,0.021,0.013,0.01,0.007,0.008,0.02,0.02,0.011,0.011,0.016,0.008,0.005,0.008,0.009
▁of,Unnamed: 1_level_85,Unnamed: 2_level_85,Unnamed: 3_level_85,Unnamed: 4_level_85,Unnamed: 5_level_85,Unnamed: 6_level_85,Unnamed: 7_level_85,Unnamed: 8_level_85,0.049,0.041,0.024,0.011,0.006,0.018,0.014,0.009,0.006,0.005,0.011,0.013,0.009,0.008,0.007,0.004,0.003,0.003,0.004
▁selecting,Unnamed: 1_level_86,Unnamed: 2_level_86,Unnamed: 3_level_86,Unnamed: 4_level_86,Unnamed: 5_level_86,Unnamed: 6_level_86,Unnamed: 7_level_86,Unnamed: 8_level_86,Unnamed: 9_level_86,0.074,0.089,0.051,0.013,0.051,0.033,0.021,0.014,0.017,0.026,0.025,0.01,0.014,0.017,0.01,0.006,0.01,0.011
▁the,Unnamed: 1_level_87,Unnamed: 2_level_87,Unnamed: 3_level_87,Unnamed: 4_level_87,Unnamed: 5_level_87,Unnamed: 6_level_87,Unnamed: 7_level_87,Unnamed: 8_level_87,Unnamed: 9_level_87,Unnamed: 10_level_87,0.05,0.048,0.014,0.029,0.023,0.017,0.011,0.011,0.011,0.007,0.004,0.006,0.005,0.005,0.004,0.005,0.003
▁nom,Unnamed: 1_level_88,Unnamed: 2_level_88,Unnamed: 3_level_88,Unnamed: 4_level_88,Unnamed: 5_level_88,Unnamed: 6_level_88,Unnamed: 7_level_88,Unnamed: 8_level_88,Unnamed: 9_level_88,Unnamed: 10_level_88,Unnamed: 11_level_88,0.045,0.073,0.045,0.029,0.022,0.014,0.021,0.017,0.008,0.004,0.007,0.008,0.009,0.005,0.01,0.006
ine,Unnamed: 1_level_89,Unnamed: 2_level_89,Unnamed: 3_level_89,Unnamed: 4_level_89,Unnamed: 5_level_89,Unnamed: 6_level_89,Unnamed: 7_level_89,Unnamed: 8_level_89,Unnamed: 9_level_89,Unnamed: 10_level_89,Unnamed: 11_level_89,Unnamed: 12_level_89,0.032,0.018,0.012,0.01,0.008,0.006,0.007,0.008,0.008,0.007,0.005,0.006,0.006,0.005,0.006
es,Unnamed: 1_level_90,Unnamed: 2_level_90,Unnamed: 3_level_90,Unnamed: 4_level_90,Unnamed: 5_level_90,Unnamed: 6_level_90,Unnamed: 7_level_90,Unnamed: 8_level_90,Unnamed: 9_level_90,Unnamed: 10_level_90,Unnamed: 11_level_90,Unnamed: 12_level_90,Unnamed: 13_level_90,0.046,0.06,0.04,0.021,0.028,0.038,0.023,0.01,0.016,0.014,0.014,0.009,0.014,0.013
▁for,Unnamed: 1_level_91,Unnamed: 2_level_91,Unnamed: 3_level_91,Unnamed: 4_level_91,Unnamed: 5_level_91,Unnamed: 6_level_91,Unnamed: 7_level_91,Unnamed: 8_level_91,Unnamed: 9_level_91,Unnamed: 10_level_91,Unnamed: 11_level_91,Unnamed: 12_level_91,Unnamed: 13_level_91,Unnamed: 14_level_91,0.042,0.067,0.043,0.034,0.03,0.011,0.006,0.008,0.006,0.005,0.005,0.006,0.006
▁the,Unnamed: 1_level_92,Unnamed: 2_level_92,Unnamed: 3_level_92,Unnamed: 4_level_92,Unnamed: 5_level_92,Unnamed: 6_level_92,Unnamed: 7_level_92,Unnamed: 8_level_92,Unnamed: 9_level_92,Unnamed: 10_level_92,Unnamed: 11_level_92,Unnamed: 12_level_92,Unnamed: 13_level_92,Unnamed: 14_level_92,Unnamed: 15_level_92,0.045,0.059,0.028,0.018,0.006,0.004,0.005,0.005,0.004,0.006,0.004,0.003
▁US,Unnamed: 1_level_93,Unnamed: 2_level_93,Unnamed: 3_level_93,Unnamed: 4_level_93,Unnamed: 5_level_93,Unnamed: 6_level_93,Unnamed: 7_level_93,Unnamed: 8_level_93,Unnamed: 9_level_93,Unnamed: 10_level_93,Unnamed: 11_level_93,Unnamed: 12_level_93,Unnamed: 13_level_93,Unnamed: 14_level_93,Unnamed: 15_level_93,Unnamed: 16_level_93,0.056,0.052,0.033,0.013,0.008,0.009,0.009,0.01,0.01,0.009,0.009
▁presidential,Unnamed: 1_level_94,Unnamed: 2_level_94,Unnamed: 3_level_94,Unnamed: 4_level_94,Unnamed: 5_level_94,Unnamed: 6_level_94,Unnamed: 7_level_94,Unnamed: 8_level_94,Unnamed: 9_level_94,Unnamed: 10_level_94,Unnamed: 11_level_94,Unnamed: 12_level_94,Unnamed: 13_level_94,Unnamed: 14_level_94,Unnamed: 15_level_94,Unnamed: 16_level_94,Unnamed: 17_level_94,0.062,0.057,0.013,0.006,0.011,0.01,0.013,0.011,0.014,0.008
▁election,Unnamed: 1_level_95,Unnamed: 2_level_95,Unnamed: 3_level_95,Unnamed: 4_level_95,Unnamed: 5_level_95,Unnamed: 6_level_95,Unnamed: 7_level_95,Unnamed: 8_level_95,Unnamed: 9_level_95,Unnamed: 10_level_95,Unnamed: 11_level_95,Unnamed: 12_level_95,Unnamed: 13_level_95,Unnamed: 14_level_95,Unnamed: 15_level_95,Unnamed: 16_level_95,Unnamed: 17_level_95,Unnamed: 18_level_95,0.046,0.039,0.015,0.015,0.015,0.017,0.013,0.017,0.017
",",Unnamed: 1_level_96,Unnamed: 2_level_96,Unnamed: 3_level_96,Unnamed: 4_level_96,Unnamed: 5_level_96,Unnamed: 6_level_96,Unnamed: 7_level_96,Unnamed: 8_level_96,Unnamed: 9_level_96,Unnamed: 10_level_96,Unnamed: 11_level_96,Unnamed: 12_level_96,Unnamed: 13_level_96,Unnamed: 14_level_96,Unnamed: 15_level_96,Unnamed: 16_level_96,Unnamed: 17_level_96,Unnamed: 18_level_96,Unnamed: 19_level_96,0.046,0.047,0.03,0.023,0.012,0.013,0.011,0.021
▁focusing,Unnamed: 1_level_97,Unnamed: 2_level_97,Unnamed: 3_level_97,Unnamed: 4_level_97,Unnamed: 5_level_97,Unnamed: 6_level_97,Unnamed: 7_level_97,Unnamed: 8_level_97,Unnamed: 9_level_97,Unnamed: 10_level_97,Unnamed: 11_level_97,Unnamed: 12_level_97,Unnamed: 13_level_97,Unnamed: 14_level_97,Unnamed: 15_level_97,Unnamed: 16_level_97,Unnamed: 17_level_97,Unnamed: 18_level_97,Unnamed: 19_level_97,Unnamed: 20_level_97,0.063,0.056,0.038,0.017,0.019,0.016,0.028
▁on,Unnamed: 1_level_98,Unnamed: 2_level_98,Unnamed: 3_level_98,Unnamed: 4_level_98,Unnamed: 5_level_98,Unnamed: 6_level_98,Unnamed: 7_level_98,Unnamed: 8_level_98,Unnamed: 9_level_98,Unnamed: 10_level_98,Unnamed: 11_level_98,Unnamed: 12_level_98,Unnamed: 13_level_98,Unnamed: 14_level_98,Unnamed: 15_level_98,Unnamed: 16_level_98,Unnamed: 17_level_98,Unnamed: 18_level_98,Unnamed: 19_level_98,Unnamed: 20_level_98,Unnamed: 21_level_98,0.048,0.059,0.031,0.025,0.024,0.033
▁the,Unnamed: 1_level_99,Unnamed: 2_level_99,Unnamed: 3_level_99,Unnamed: 4_level_99,Unnamed: 5_level_99,Unnamed: 6_level_99,Unnamed: 7_level_99,Unnamed: 8_level_99,Unnamed: 9_level_99,Unnamed: 10_level_99,Unnamed: 11_level_99,Unnamed: 12_level_99,Unnamed: 13_level_99,Unnamed: 14_level_99,Unnamed: 15_level_99,Unnamed: 16_level_99,Unnamed: 17_level_99,Unnamed: 18_level_99,Unnamed: 19_level_99,Unnamed: 20_level_99,Unnamed: 21_level_99,Unnamed: 22_level_99,0.043,0.033,0.021,0.022,0.018
▁Republican,Unnamed: 1_level_100,Unnamed: 2_level_100,Unnamed: 3_level_100,Unnamed: 4_level_100,Unnamed: 5_level_100,Unnamed: 6_level_100,Unnamed: 7_level_100,Unnamed: 8_level_100,Unnamed: 9_level_100,Unnamed: 10_level_100,Unnamed: 11_level_100,Unnamed: 12_level_100,Unnamed: 13_level_100,Unnamed: 14_level_100,Unnamed: 15_level_100,Unnamed: 16_level_100,Unnamed: 17_level_100,Unnamed: 18_level_100,Unnamed: 19_level_100,Unnamed: 20_level_100,Unnamed: 21_level_100,Unnamed: 22_level_100,Unnamed: 23_level_100,0.064,0.095,0.056,0.03
▁and,Unnamed: 1_level_101,Unnamed: 2_level_101,Unnamed: 3_level_101,Unnamed: 4_level_101,Unnamed: 5_level_101,Unnamed: 6_level_101,Unnamed: 7_level_101,Unnamed: 8_level_101,Unnamed: 9_level_101,Unnamed: 10_level_101,Unnamed: 11_level_101,Unnamed: 12_level_101,Unnamed: 13_level_101,Unnamed: 14_level_101,Unnamed: 15_level_101,Unnamed: 16_level_101,Unnamed: 17_level_101,Unnamed: 18_level_101,Unnamed: 19_level_101,Unnamed: 20_level_101,Unnamed: 21_level_101,Unnamed: 22_level_101,Unnamed: 23_level_101,Unnamed: 24_level_101,0.055,0.039,0.023
▁Democratic,Unnamed: 1_level_102,Unnamed: 2_level_102,Unnamed: 3_level_102,Unnamed: 4_level_102,Unnamed: 5_level_102,Unnamed: 6_level_102,Unnamed: 7_level_102,Unnamed: 8_level_102,Unnamed: 9_level_102,Unnamed: 10_level_102,Unnamed: 11_level_102,Unnamed: 12_level_102,Unnamed: 13_level_102,Unnamed: 14_level_102,Unnamed: 15_level_102,Unnamed: 16_level_102,Unnamed: 17_level_102,Unnamed: 18_level_102,Unnamed: 19_level_102,Unnamed: 20_level_102,Unnamed: 21_level_102,Unnamed: 22_level_102,Unnamed: 23_level_102,Unnamed: 24_level_102,Unnamed: 25_level_102,0.049,0.048
▁parties,Unnamed: 1_level_103,Unnamed: 2_level_103,Unnamed: 3_level_103,Unnamed: 4_level_103,Unnamed: 5_level_103,Unnamed: 6_level_103,Unnamed: 7_level_103,Unnamed: 8_level_103,Unnamed: 9_level_103,Unnamed: 10_level_103,Unnamed: 11_level_103,Unnamed: 12_level_103,Unnamed: 13_level_103,Unnamed: 14_level_103,Unnamed: 15_level_103,Unnamed: 16_level_103,Unnamed: 17_level_103,Unnamed: 18_level_103,Unnamed: 19_level_103,Unnamed: 20_level_103,Unnamed: 21_level_103,Unnamed: 22_level_103,Unnamed: 23_level_103,Unnamed: 24_level_103,Unnamed: 25_level_103,Unnamed: 26_level_103,0.044
.,Unnamed: 1_level_104,Unnamed: 2_level_104,Unnamed: 3_level_104,Unnamed: 4_level_104,Unnamed: 5_level_104,Unnamed: 6_level_104,Unnamed: 7_level_104,Unnamed: 8_level_104,Unnamed: 9_level_104,Unnamed: 10_level_104,Unnamed: 11_level_104,Unnamed: 12_level_104,Unnamed: 13_level_104,Unnamed: 14_level_104,Unnamed: 15_level_104,Unnamed: 16_level_104,Unnamed: 17_level_104,Unnamed: 18_level_104,Unnamed: 19_level_104,Unnamed: 20_level_104,Unnamed: 21_level_104,Unnamed: 22_level_104,Unnamed: 23_level_104,Unnamed: 24_level_104,Unnamed: 25_level_104,Unnamed: 26_level_104,Unnamed: 27_level_104


In [10]:
def clean_token(token):
    processed_token = token.replace("▁", " ")  
    return processed_token

# Aggregate the attribution scores for each input sentence
tok_out = res.aggregate()
prompt_last_index = tok_out[0].attr_pos_start

input_sequences = [clean_token(t.token) for t in tok_out[0].target[2:prompt_last_index-1]]
cleaned_sequences = []
for seq in input_sequences:
    processed_seq = seq.replace("<0x0A>", " ").strip()
    cleaned_sequences.append(processed_seq)

attr_scores = tok_out[0].target_attributions[2:prompt_last_index-1].tolist()
assert(len(cleaned_sequences) == len(attr_scores))

# Note: we only consider the maximum attribution score for each sentence
sent_scores = dict()
for seq_ix, seq in enumerate(cleaned_sequences):
    sent_scores[seq] = max(attr_scores[seq_ix])

# Extract top K important sentences
sorted_sent_scores = dict(sorted(sent_scores.items(), key=lambda x: x[1], reverse=True))
top_k_sents = list(sorted_sent_scores.keys())[:3]

print(top_k_sents)
for sent in top_k_sents:
    print(sent_scores[sent])

# Store both the attributed sentences and their aggregated scores
attributed_sents = []
for sent in top_k_sents:
    attributed_sents.append(
        {
            "input_sequence": sent,
            "score": sent_scores[sent]
        }
    )

print(attributed_sents)

['For most of US history, party nominees have been decided by political power brokers and deal-makers behind closed doors.', "34 Number of delegates Ted Cruz received in Colorado, which held conventions to pick delegates 759 Trump total delegate count 529 Cruz total delegate count Mr Trump is leading the race for the Republican nomination, but it's starting to feel like he's not winning.", 'Here are answers to four pertinent questions as the nomination battles approach its final months.']
0.09215372055768967
0.020475581288337708
0.01849883608520031
[{'input_sequence': 'For most of US history, party nominees have been decided by political power brokers and deal-makers behind closed doors.', 'score': 0.09215372055768967}, {'input_sequence': "34 Number of delegates Ted Cruz received in Colorado, which held conventions to pick delegates 759 Trump total delegate count 529 Cruz total delegate count Mr Trump is leading the race for the Republican nomination, but it's starting to feel like he'