## Mistral Attribution Patching

In [1]:
from nnsight import LanguageModel
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np
import pickle
import sys
from accelerate import infer_auto_device_map
import matplotlib.pyplot as plt

In [2]:
model = LanguageModel("/home/gridsan/arunas/models/mistralai/Mistral-7B-v0.1/",  load_in_8bit=True, dispatch=True, device_map='auto') # Load the model

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [3]:
og = pd.read_csv('/home/gridsan/arunas/broca/data-gen/ngs.csv')
og.columns

Index(['ita', 'ita-r-1-null_subject', 'ita-r-2-subordinate', 'ita-r-3-passive',
       'ita-u-1-negation', 'ita-u-2-invert', 'ita-u-3-gender', 'en',
       'en-r-1-subordinate', 'en-r-2-passive', 'en-u-1-negation',
       'en-u-2-inversion', 'en-u-3-qsubordinate', 'en-u-4-wh', 'it',
       'it-r-1-null_subject', 'it-r-2-passive', 'it-r-3-subordinate',
       'it-u-1-negation', 'it-u-2-invert', 'it-u-3-gender', 'jp-r-1-sov',
       'jap-r-1-sov', 'jp-r-2-passive', 'jap-r-2-passive',
       'jp-r-3-subordinate', 'jp-u-1-negation', 'jap-u-1-negation',
       'jp-u-2-invert', 'jap-u-2-invert', 'jp-u-3-past-tense', 'ng-ita',
       'ng-ita-r-1-null_subject', 'ng-ita-r-2-subordinate',
       'ng-ita-r-3-passive', 'ng-ita-u-1-negation', 'ng-ita-u-2-invert',
       'ng-ita-u-3-gender', 'ng-en', 'ng-en-r-1-subordinate',
       'ng-en-r-2-passive', 'ng-en-u-1-negation', 'ng-en-u-2-inversion',
       'ng-en-u-3-qsubordinate', 'ng-en-u-4-wh', 'ng-it',
       'ng-it-r-1-null_subject', 'ng-it-r-2-pa

In [4]:
def get_prompt_from_df(filename):
    data = list(pd.read_csv(filename)['prompt'])
    data = [sentence.strip() for sentence in data]
    data = [sentence for sentence in data if not sentence == '']
    data = [sentence.replace('</s>', '\n') for sentence in data]
    golds = [sentence.strip().split("\n")[-1].strip().split('A:')[-1].strip() for sentence in data]
    data = [sentence[: -len(golds[idx])].strip() for idx, sentence in enumerate(data)]
    return data, golds

In [5]:
# sType=sys.argv[1]
sType='en'

In [57]:
mlp_effects_cache = torch.zeros((model.config.num_hidden_layers, model.config.hidden_size))
attn_effects_cache = torch.zeros((model.config.num_hidden_layers, model.config.hidden_size))

def attrPatching(fullPrompt, gold):
    attn_layer_cache_prompt = {}
    mlp_layer_cache_prompt = {}
    
    attn_layer_cache_patch = {}
    mlp_layer_cache_patch = {}
    if (gold == 'Yes'):
        predictionExample = fullPrompt[fullPrompt[:-2].rfind(':')+1:-2].strip()
        patch = og[og[sType] == predictionExample][f"ng-{sType}"].iloc[0]
        patchPrompt = fullPrompt.replace(predictionExample, patch)
    else:
        patchPrompt = fullPrompt
        patch = fullPrompt[fullPrompt[:-2].rfind(':')+1:-2].strip()
        predictionExample = og[og[f"ng-{sType}"] == patch][sType].iloc[0]
        fullPrompt = patchPrompt.replace(patch, predictionExample)
        gold = "Yes"

    notGold = "No"
    gold = model.tokenizer(gold)["input_ids"]
    notGold = model.tokenizer(notGold)["input_ids"]
    
    # with torch.no_grad():
    #     model.model.layers[31].mlp.down_proj.weight[1239] = torch.zeros_like(model.model.layers[31].mlp.down_proj.weight[1239])

    with model.forward(inference=False) as runner:
        print(model.model.layers[31].self_attn.o_proj.weight[1239].shape)
        with runner.invoke(fullPrompt) as invoker:
            for layer in range(len(model.model.layers)):
                self_attn = model.model.layers[layer].self_attn.o_proj.output
                mlp = model.model.layers[layer].mlp.down_proj.output
    
                attn_layer_cache_prompt[layer] = {"forward": self_attn.detach().save(), "backward": self_attn.grad.detach().save()}
                mlp_layer_cache_prompt[layer] = {"forward": mlp.detach().save(), "backward": mlp.grad.detach().save()}
            
            logits = model.lm_head.output[:, -1, notGold] - model.lm_head.output[:, -1, gold]
            loss = logits.sum()
            loss.backward(retain_graph=False)
    
    with model.forward(inference=False) as runner:
        with runner.invoke(patchPrompt) as invoker:
            for layer in range(len(model.model.layers)):
                self_attn = model.model.layers[layer].self_attn.o_proj.output
                mlp = model.model.layers[layer].mlp.down_proj.output

                attn_layer_cache_patch[layer] = {"forward": self_attn.detach().save()}
                mlp_layer_cache_patch[layer] = {"forward": mlp.detach().save()}
    
    for layer in range(len(model.model.layers)):
        # print(attn_layer_cache_patch[layer]['forward'].shape)
        mlp_effects = mlp_layer_cache_prompt[layer]["backward"].value * (mlp_layer_cache_patch[layer]["forward"].value - mlp_layer_cache_prompt[layer]["forward"].value)
        attn_effects = attn_layer_cache_prompt[layer]["backward"].value * (attn_layer_cache_patch[layer]["forward"].value - attn_layer_cache_prompt[layer]["forward"].value)

        mlp_effects = mlp_effects[:, -1, :] # batch, token, hidden_states
        attn_effects = attn_effects[:, -1, :] # batch, token, hidden_states

        mlp_effects_cache[layer] += mlp_effects[0].cpu()
        attn_effects_cache[layer] += attn_effects[0].cpu()


In [58]:
prompts, golds = get_prompt_from_df(f'/home/gridsan/arunas/broca/llama/experiments/llama-classification-new-prompt-det-{sType}.csv')
for prompt,gold in tqdm(zip(prompts[:1], golds[:1])):
    attrPatching(prompt, gold)

0it [00:00, ?it/s]

torch.Size([4096])


1it [00:18, 18.54s/it]


In [46]:
mlp_effects_cache /= len(prompts)
attn_effects_cache /= len(prompts)
mlp_effects_cache.size()

torch.Size([32, 4096])

In [55]:
mlp_effects_cache = torch.nan_to_num(mlp_effects_cache)
attn_effects_cache = torch.nan_to_num(attn_effects_cache)

In [59]:
mlp_effects_cache[31][1239]

tensor(0.0168)

In [33]:
flattened_effects_cache = mlp_effects_cache.view(-1)
print(flattened_effects_cache.shape[-1])
top_neurons = flattened_effects_cache.topk(k=int((0.01 * flattened_effects_cache.shape[-1])))
two_d_indices = torch.cat((((top_neurons[1] // mlp_effects_cache.shape[1]).unsqueeze(1)), ((top_neurons[1] % mlp_effects_cache.shape[1]).unsqueeze(1))), dim=1)

131072


In [None]:
with open(f'mlp/new-prompt-{sType}.pkl', 'wb') as f:
    pickle.dump(two_d_indices, f)

In [52]:
flattened_effects_cache = attn_effects_cache.view(-1)
top_neurons = flattened_effects_cache.topk(k=40)
two_d_indices = torch.cat((((top_neurons[1] // attn_effects_cache.shape[1]).unsqueeze(1)), ((top_neurons[1] % attn_effects_cache.shape[1]).unsqueeze(1))), dim=1)

In [26]:
with open(f'attn/new-prompt-{sType}.pkl', 'wb') as f:
    pickle.dump(two_d_indices, f)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

In [41]:
a = model.model.layers[0].self_attn.o_proj.weight
a
# for layer_idx in range(num_layers):
#     attention_layer = model.model.layers[layer_idx].self_attn
#     print(attention_layer.num_heads, attention_layer.o_proj.out_features)
#     num_heads = attention_layer.o_proj.out_features // attention_layer.num_heads
#     print(f"Layer {layer_idx}: {num_heads} attention heads")

Parameter containing:
tensor(..., device='meta', size=(4096, 4096), requires_grad=True)

### Delete this

In [5]:
from nnsight import LanguageModel
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoConfig
import torch
import pandas as pd
from tqdm import tqdm
import pickle
import argparse
import yaml
import os
import json
parser = argparse.ArgumentParser()

parser.add_argument('--config_file', type=str, help='path to the model training config file, found in broca/config')
parser.add_argument('--stype', type=int, help='structure type idx. Can range from 0-30')

args = { "config_file": "/home/gridsan/arunas/broca/configs/mistral-atp-config", "stype": 7}
with open(args['config_file'], 'r') as f:
    config_file = yaml.safe_load(f)

print(json.dumps(config_file, indent=4))
PREFIX = config_file["prefix"]
MODEL_NAME = config_file["model_name"]
MODEL_PATH = config_file["model_path"]
DATA_PATH = config_file["data_path"]
PROMPT_FILES_PATH = config_file["prompt_files_path"]
PATCH_PICKLES_PATH = config_file["patch_pickles_path"]
PATCH_PICKLES_SUBPATH = config_file["patch_pickles_sub_path"]

if (MODEL_NAME == "llama"):
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    
    config = AutoConfig.from_pretrained(MODEL_PATH, cache_dir=MODEL_CACHE_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, config=config, device_map="auto", padding_side="left", cache_dir=MODEL_CACHE_PATH)
    
    tokenizer.pad_token = tokenizer.eos_token
    
    model = LanguageModel(MODEL_PATH,  quantization_config=nf4_config, tokenizer=tokenizer, device_map='auto', cache_dir=MODEL_CACHE_PATH) # Load the model
elif (MODEL_NAME == "mistral"):
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    
    config = AutoConfig.from_pretrained(MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, config=config, device_map="auto", padding_side="left")
    tokenizer.pad_token = tokenizer.eos_token
    
    model = LanguageModel(MODEL_PATH,  quantization_config=nf4_config, tokenizer=tokenizer, device_map='auto') # Load the model
    
model.requires_grad_(True)
og = pd.read_csv(DATA_PATH)
types = [col for col in og.columns if not 'ng-' in col]

{
    "model_name": "mistral",
    "model_path": "/home/gridsan/arunas/models/mistralai/Mistral-7B-v0.1",
    "prefix": "/home/gridsan/arunas/",
    "data_path": "/home/gridsan/arunas/broca/data-gen/ngs.csv",
    "prompt_files_path": "/home/gridsan/arunas/broca/mistral/experiments/new-prompt/",
    "patch_pickles_path": "/home/gridsan/arunas/broca/mistral/atp/patches/",
    "patch_pickles_sub_path": "all-neurons"
}


In [6]:
def get_prompt_from_df(filename):
    data = list(pd.read_csv(filename)['prompt'])
    data = [sentence.strip() for sentence in data]
    data = [sentence for sentence in data if not sentence == '']
    data = [sentence.replace('</s>', '\n') for sentence in data]
    golds = [sentence.strip().split("\n")[-1].strip().split('A:')[-1].strip() for sentence in data]
    data = [sentence[: -len(golds[idx])].strip() for idx, sentence in enumerate(data)]
    return data, golds

In [9]:
sType = types[args['stype']]

mlp_effects_cache = torch.zeros((model.config.num_hidden_layers, model.config.hidden_size)).to("cuda")
attn_effects_cache = torch.zeros((model.config.num_hidden_layers, model.config.hidden_size)).to("cuda")

def attrPatching(fullPrompt, gold, idx):
    attn_layer_cache_prompt = {}
    mlp_layer_cache_prompt = {}

    attn_layer_cache_patch = {}
    mlp_layer_cache_patch = {}

    if gold == 'Yes':
        predictionExample = fullPrompt[fullPrompt[:-2].rfind(':')+1:-2].strip()
        patch = og.iloc[idx][f"ng-{sType}"]
        patchPrompt = fullPrompt.replace(predictionExample, patch)
    else:
        patchPrompt = fullPrompt
        patch = fullPrompt[fullPrompt[:-2].rfind(':')+1:-2].strip()
        predictionExample = og.iloc[idx][sType]
        fullPrompt = patchPrompt.replace(patch, predictionExample)
        gold = "Yes"

    if model.tokenizer(fullPrompt, return_tensors="pt").input_ids.shape[-1] != \
        model.tokenizer(patchPrompt, return_tensors="pt").input_ids.shape[-1]:
        return

    notGold = "No"
    gold = model.tokenizer(gold)["input_ids"]
    notGold = model.tokenizer(notGold)["input_ids"]
    with model.forward(inference=False) as runner:
        with runner.invoke(fullPrompt) as invoker:
            for layer in range(len(model.model.layers)):
                self_attn = model.model.layers[layer].self_attn.o_proj.output
                mlp = model.model.layers[layer].mlp.down_proj.output
                mlp.retain_grad()
                self_attn.retain_grad()
    
                attn_layer_cache_prompt[layer] = {"forward": self_attn.save()} # "backward": self_attn.grad.detach().save()}
                mlp_layer_cache_prompt[layer] = {"forward": mlp.save()}# "backward": mlp.grad.detach().save()}
    
        logits = model.lm_head.output.save()
    loss = logits.value[:, -1, notGold] - logits.value[:, -1, gold]
    loss = loss.sum()
    loss.backward()

    with model.forward(inference=False) as runner:
        with runner.invoke(patchPrompt) as invoker:
            for layer in range(len(model.model.layers)):
                self_attn = model.model.layers[layer].self_attn.o_proj.output
                mlp = model.model.layers[layer].mlp.down_proj.output
    
                attn_layer_cache_patch[layer] = {"forward": self_attn.save()}
                mlp_layer_cache_patch[layer] = {"forward": mlp.save()}

    for layer in range(len(model.model.layers)):
        mlp_effects = (mlp_layer_cache_prompt[layer]["forward"].value.grad * (mlp_layer_cache_patch[layer]["forward"].value - mlp_layer_cache_prompt[layer]["forward"].value)).detach()
        attn_effects = (attn_layer_cache_prompt[layer]["forward"].value.grad * (attn_layer_cache_patch[layer]["forward"].value - attn_layer_cache_prompt[layer]["forward"].value)).detach()

        mlp_effects = mlp_effects[0, -1, :] # batch, token, hidden_states
        attn_effects = attn_effects[0, -1, :] # batch, token, hidden_states

        mlp_effects_cache[layer] += mlp_effects
        attn_effects_cache[layer] += attn_effects

prompts, golds = get_prompt_from_df(f'{PROMPT_FILES_PATH}/{sType}.csv')
for idx,(prompt,gold) in tqdm(enumerate(zip(prompts, golds))):
    attrPatching(prompt, gold, idx)
    if idx > 10:
        break

mlp_effects_cache /= len(prompts)
attn_effects_cache /= len(prompts)

with open(f'{PATCH_PICKLES_PATH}/mlp/{PATCH_PICKLES_SUBPATH}/{sType}.pkl', 'wb') as f:
    pickle.dump(mlp_effects_cache, f)

with open(f'{PATCH_PICKLES_PATH}/attn/{PATCH_PICKLES_SUBPATH}{sType}.pkl', 'wb') as f:
    pickle.dump(attn_effects_cache, f)

11it [01:45,  9.63s/it]


In [4]:
from nnsight import LanguageModel
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoConfig
import torch
import pandas as pd
from tqdm import tqdm
import pickle
import argparse
import yaml
import os
import json
parser = argparse.ArgumentParser()

parser.add_argument('--config_file', type=str, help='path to the model training config file, found in broca/config')
parser.add_argument('--stype', type=int, help='structure type idx. Can range from 0-30')

args = parser.parse_args()
with open(args.config_file, 'r') as f:
    config_file = yaml.safe_load(f)

# args = { "config_file": "/mnt/align4_drive/arunas/broca/configs/mistral-atp-config", "stype": 23 }
# with open(args["config_file"], 'r') as f:
#    config_file = yaml.safe_load(f)

print(json.dumps(config_file, indent=4))
PREFIX = config_file["prefix"]
MODEL_NAME = config_file["model_name"]
MODEL_PATH = config_file["model_path"]
DATA_PATH = config_file["data_path"]
PROMPT_FILES_PATH = config_file["prompt_files_path"]
PATCH_PICKLES_PATH = config_file["patch_pickles_path"]
PATCH_PICKLES_SUBPATH = config_file["patch_pickles_sub_path"]

og = pd.read_csv(DATA_PATH)
types = [col for col in og.columns if not 'ng-' in col]
sType = types[args.stype]

sType = types[0]

if (not os.path.exists(f"{PATCH_PICKLES_PATH}/attn/{PATCH_PICKLES_SUBPATH}/{sType}.pkl") or not os.path.exists(f"{PATCH_PICKLES_PATH}/mlp/{PATCH_PICKLES_SUBPATH}/{sType}.pkl")):
    print(f"Running for {sType}")
    
    if (MODEL_NAME == "llama"):
        os.environ["HF_TOKEN"] = config_file["hf_token"]
        nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        config = AutoConfig.from_pretrained(MODEL_PATH, cache_dir=MODEL_CACHE_PATH)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, config=config, device_map="auto", padding_side="left", cache_dir=MODEL_CACHE_PATH)
        
        tokenizer.pad_token = tokenizer.eos_token
        model = LanguageModel(MODEL_PATH,  quantization_config=nf4_config, tokenizer=tokenizer, device_map='auto', cache_dir=MODEL_CACHE_PATH) # Load the model

    elif (MODEL_NAME == "mistral"):
        config = AutoConfig.from_pretrained(MODEL_PATH)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, config=config, device_map="auto", padding_side="left")
        tokenizer.pad_token = tokenizer.eos_token
        model = LanguageModel(MODEL_PATH,  tokenizer=tokenizer, device_map='auto') # Load the model
        
    model.requires_grad_(True)

    def get_prompt_from_df(filename):
        data = list(pd.read_csv(filename)['prompt'])
        questions = list(pd.read_csv(filename)['q'])
        golds = list(pd.read_csv(filename)['gold'])
        return data, questions, golds

    mlp_effects_cache = torch.zeros((model.config.num_hidden_layers, model.config.hidden_size)).to("cuda")
    attn_effects_cache = torch.zeros((model.config.num_hidden_layers, model.config.hidden_size)).to("cuda")

    def attrPatching(cleanPrompt, q, gold, idx):
        attn_layer_cache_prompt = {}
        mlp_layer_cache_prompt = {}

        attn_layer_cache_patch = {}
        mlp_layer_cache_patch = {}

        if gold == 'Yes':
            testQ = q
            patch = og[og[sType] == testQ][f"ng-{sType}"].head(1).item()
            patchPrompt = cleanPrompt.replace(testQ, patch)
        else:
            patchPrompt = cleanPrompt
            testQ = q
            clean = og[og[f"ng-{sType}"] == testQ][sType].head(1).item()
            cleanPrompt = patchPrompt.replace(testQ, clean)
            gold = "Yes"

        if model.tokenizer(cleanPrompt, return_tensors="pt").input_ids.shape[-1] != \
            model.tokenizer(patchPrompt, return_tensors="pt").input_ids.shape[-1]:
            return

        notGold = "No"
        gold = model.tokenizer(gold)["input_ids"]
        notGold = model.tokenizer(notGold)["input_ids"]
        
        with model.trace(cleanPrompt, scan=False, validate=False) as tracer:
            for layer in range(len(model.model.layers)):
                self_attn = model.model.layers[layer].self_attn.o_proj.output
                mlp = model.model.layers[layer].mlp.down_proj.output
                mlp.retain_grad()
                self_attn.retain_grad()

                attn_layer_cache_prompt[layer] = {"forward": self_attn.save()} # "backward": self_attn.grad.detach().save()}
                mlp_layer_cache_prompt[layer] = {"forward": mlp.save()}# "backward": mlp.grad.detach().save()}

            logits = model.lm_head.output.save()
        loss = logits.value[:, -1, notGold] - logits.value[:, -1, gold]
        loss = loss.sum()
        loss.backward()

        with model.trace(patchPrompt, scan=False, validate=False) as tracer:
            for layer in range(len(model.model.layers)):
                self_attn = model.model.layers[layer].self_attn.o_proj.output
                mlp = model.model.layers[layer].mlp.down_proj.output

                attn_layer_cache_patch[layer] = {"forward": self_attn.save()}
                mlp_layer_cache_patch[layer] = {"forward": mlp.save()}

        for layer in range(len(model.model.layers)):
            mlp_effects = (mlp_layer_cache_prompt[layer]["forward"].value.grad * (mlp_layer_cache_patch[layer]["forward"].value - mlp_layer_cache_prompt[layer]["forward"].value)).detach()
            attn_effects = (attn_layer_cache_prompt[layer]["forward"].value.grad * (attn_layer_cache_patch[layer]["forward"].value - attn_layer_cache_prompt[layer]["forward"].value)).detach()

            mlp_effects = mlp_effects[0, -1, :] # batch, token, hidden_states
            attn_effects = attn_effects[0, -1, :] # batch, token, hidden_states

            mlp_effects_cache[layer] += mlp_effects.to(mlp_effects_cache[layer].get_device())
            attn_effects_cache[layer] += attn_effects.to(mlp_effects_cache[layer].get_device())

    prompts, questions, golds = get_prompt_from_df(f'{PROMPT_FILES_PATH}/{sType}.csv')
    for idx,(prompt, q, gold) in tqdm(enumerate(zip(prompts,questions, golds))):
        attrPatching(prompt, q, gold, idx)

    mlp_effects_cache /= len(prompts)
    attn_effects_cache /= len(prompts)

    with open(f'{PATCH_PICKLES_PATH}/mlp/{PATCH_PICKLES_SUBPATH}/{sType}.pkl', 'wb') as f:
        pickle.dump(mlp_effects_cache, f)

    with open(f'{PATCH_PICKLES_PATH}/attn/{PATCH_PICKLES_SUBPATH}/{sType}.pkl', 'wb') as f:
        pickle.dump(attn_effects_cache, f)


{
    "model_name": "mistral",
    "model_path": "mistralai/Mistral-7B-v0.1",
    "prefix": "/mnt/align4_drive/arunas/",
    "data_path": "/mnt/align4_drive/arunas/broca/data-gen/ngs.csv",
    "prompt_files_path": "/mnt/align4_drive/arunas/broca/mistral/experiments/new-prompt-prologue-random-seed/",
    "patch_pickles_path": "/mnt/align4_drive/arunas/broca/mistral/atp/patches/",
    "patch_pickles_sub_path": "all-neurons-new-prompt-prologue-random-seed"
}
Running for ita


0it [00:00, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
15it [00:31,  2.07s/it]


KeyboardInterrupt: 