In [1]:
import pandas as pd
import json
from llama_4bit_wrapper import import_llama
from llama_4bit_wrapper.core import Matmul4BitOptions
import numpy as np
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm
import os

## Reading data

In [2]:
# Load the evaluation rows; the "variables" column is stored as JSON text in the CSV,
# so decode it into Python objects (dicts, judging by the preview below).
df_data = pd.read_csv("llama-reasoning.csv")
df_data["variables"] = df_data["variables"].apply(json.loads)
df_data.head()

Unnamed: 0,question,variables,target,dataset,formatter,subset,split
0,This question refers to the following informat...,{'A': 'The ideas of personal liberty and natio...,A,mmlu,mmlu,mmlu-high_school_european_history,train
1,This question refers to the following informat...,"{'A': 'Capitalist', 'B': 'Scientific', 'C': 'C...",C,mmlu,mmlu,mmlu-high_school_european_history,train
2,This question refers to the following informat...,{'A': 'They served as a catalyst for the growt...,A,mmlu,mmlu,mmlu-high_school_european_history,train
3,This question refers to the following informat...,{'A': 'give the English king a new position of...,D,mmlu,mmlu,mmlu-high_school_european_history,train
4,This question refers to the following informat...,{'A': 'His domination of the nobility left him...,D,mmlu,mmlu,mmlu-high_school_european_history,test


In [3]:
# Inspect only the rows that belong to the gsm8k dataset.
df_data.loc[df_data["dataset"] == "gsm8k"]

Unnamed: 0,question,variables,target,dataset,formatter,subset,split
828,Artemis is making tea for a party. She knows h...,{'chain_of_thoughts': 'She is making 72 ounces...,9,gsm8k,gsm8k,gsm8k,validation
829,It's Ava's birthday party. Her parents bought ...,{'chain_of_thoughts': 'The four bags of Reese'...,99,gsm8k,gsm8k,gsm8k,train
830,Lee mows one lawn and charges $33. Last week h...,{'chain_of_thoughts': '33 * 16 = $<<33*16=528>...,558,gsm8k,gsm8k,gsm8k,validation
831,Carly collected 7 starfish with 5 arms each an...,{'chain_of_thoughts': 'First find the total nu...,49,gsm8k,gsm8k,gsm8k,train
832,Hannah sold 40 pieces of cookies for $0.8 each...,{'chain_of_thoughts': 'Hannah's earnings from ...,79,gsm8k,gsm8k,gsm8k,train
...,...,...,...,...,...,...,...
1123,Bud makes homemade macaroni and cheese once a ...,{'chain_of_thoughts': 'The gruyere cheese is t...,520,gsm8k,gsm8k,gsm8k,test
1124,A farm has 10 2-legged animals and 15 4-legged...,{'chain_of_thoughts': '2-legged animals have 2...,40,gsm8k,gsm8k,gsm8k,test
1125,The recent floods in Mamou’s country have left...,"{'chain_of_thoughts': 'On Friday, Saturday and...",1218,gsm8k,gsm8k,gsm8k,test
1126,Jake is shopping at a clothing store. The stor...,{'chain_of_thoughts': 'The cost of a T-shirt a...,36,gsm8k,gsm8k,gsm8k,test


In [4]:
# Prompt templates: one row per formatter name, with the template string in "text".
df_formatters = pd.read_csv("formatters.csv")
df_formatters.head()

Unnamed: 0,formatter,text
0,bbh-boolean_expressions,USER: Calculate the following expression {ques...
1,bbh-boolean_expressions-no-cot,USER: Calculate the following expression {ques...
2,bbh-boolean_expressions-cot,USER: Calculate the following expression {ques...
3,bbh-causal_judgement,USER: {question}. <s> ASSISTANT: Let's think s...
4,bbh-causal_judgement-cot,USER: {question}. <s> ASSISTANT: Let's think s...


## Applying formatters

In [5]:
def row_apply_formatters(row, df_formatters, suffix):
    """Render the prompt template selected by one data row.

    The template is the ``text`` value of the first ``df_formatters`` row whose
    ``formatter`` equals ``row["formatter"] + suffix``. Its placeholders are
    filled from the row's own fields plus the entries of ``row["variables"]``;
    when a name appears in both, the ``variables`` entry wins.

    Raises:
        IndexError: if no formatter with the requested name exists.
    """
    template_name = row["formatter"] + suffix
    is_requested = df_formatters["formatter"] == template_name
    template = df_formatters.loc[is_requested, "text"].values[0]

    # Row fields first, then the per-row variables on top of them.
    fields = dict(row)
    fields.update(row["variables"])
    return template.format(**fields)

In [6]:
def apply_formatters(df_data, df_formatters, suffix):
    """Render one prompt per row of ``df_data``.

    Args:
        df_data: rows to format; each row is passed to ``row_apply_formatters``.
        df_formatters: table of prompt templates.
        suffix: string appended to each row's formatter name to pick the template
            variant (the notebook uses "", "-no-cot" and "-cot").

    Returns:
        A list of prompt strings in the row order of ``df_data``; an empty list
        when ``df_data`` has no rows.
    """
    # With no rows, DataFrame.apply(axis=1) does not produce a Series of results,
    # so the .tolist() below would fail; answer the empty case directly.
    if len(df_data) == 0:
        return []
    return df_data.apply(
        lambda row: row_apply_formatters(row, df_formatters, suffix),
        axis=1
    ).tolist()

In [7]:
# Number of questions in each split.
df_data.groupby("split")[["question"]].count()

Unnamed: 0_level_0,question
split,Unnamed: 1_level_1
test,565
train,479
validation,548


In [8]:
# Preview the first three prompts rendered with the "-no-cot" template variant.
apply_formatters(
    df_data,
    df_formatters,
    "-no-cot"
)[:3]

['USER: This question refers to the following information.\nRead the following excerpt.\nThe revolutionary seed had penetrated into every country and spread more or less. It was greatly developed under the régime of the military despotism of Bonaparte. His conquests displaced a number of laws, institutions, and customs; broke through bonds sacred among all nations, strong enough to resist time itself; which is more than can be said of certain benefits conferred by these innovators.\nThe monarchs will fulfil the duties imposed upon them by Him who, by entrusting them with power, has charged them to watch over the maintenance of justice, and the rights of all, to avoid the paths of error, and tread firmly in the way of truth. Placed beyond the passions which agitate society, it is in days of trial chiefly that they are called upon to despoil realities of their false appearances, and to show themselves as they are, fathers invested with the authority belonging by right to the heads of fam

## Searching for last "ASSISTANT: " answer

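To compute perplexity on the final answer only, I first need to locate that answer inside the tokenized prompt. The function below returns the start index of the last occurrence of a token subsequence (here, the tokens of the final "ASSISTANT:" marker) within a token sequence.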

In [9]:
def last_subsequence_start(sequence, subsequence):
    """Return the start index of the last occurrence of ``subsequence`` in ``sequence``.

    Both arguments only need ``len()`` and integer indexing, so lists and
    1-D arrays work alike.

    Args:
        sequence: the items to search in.
        subsequence: the contiguous run of items to look for.

    Returns:
        The index in ``sequence`` where the last match begins, or -1 when there
        is no match. An empty ``subsequence`` matches at the very end, so
        ``len(sequence)`` is returned (the same convention as ``str.rfind("")``).
    """
    seq_len = len(sequence)
    subseq_len = len(subsequence)

    # Try candidate start positions from the rightmost possible one downwards,
    # so the first hit is the last occurrence. The range is empty when the
    # subsequence is longer than the sequence.
    for start in range(seq_len - subseq_len, -1, -1):
        if all(sequence[start + offset] == subsequence[offset] for offset in range(subseq_len)):
            return start

    return -1

In [10]:
# [2, 3] occurs at indices 1 and 4; the later occurrence is the one reported.
last_subsequence_start([1, 2, 3, 4, 2, 3, 5], [2, 3])

4

## Loading LLAMA

In [11]:
# Local model directory (passed as config_path when loading) and the 4-bit
# safetensors weight file inside it (passed as model_path).
LLAMA_MODEL = "./Llama-2-13B-chat-GPTQ-localmodels/"
LLAMA_WEIGHTS = "./Llama-2-13B-chat-GPTQ-localmodels/gptq_model-4bit-128g.safetensors"

In [12]:
# import_llama returns nine objects; only the model loader, model_to_half and
# AMPWrapper are kept, the rest are discarded. model_to_half is not used in the
# cells shown here.
# The CUDA autograd backend is selected and the Triton one is disabled.
_, _, load_llama_model_4bit_low_ram, _, model_to_half, _, _, _, AMPWrapper = import_llama(
    use_flash_attention=False,
    use_xformers=False,
    autograd_4bit_cuda=True,
    autograd_4bit_triton=False,
    matmul4bit_options=Matmul4BitOptions.NO_ACT_ORDER | Matmul4BitOptions.ALGORYTHM_FASTER,
)

Triton not found. Please run "pip install triton".
Using CUDA implementation.


In [13]:
# Load the model and its tokenizer from the local paths defined above.
# groupsize=128 and bits=4 agree with the weight file name ("4bit-128g").
# NOTE(review): seqlen=2048 is presumably the context length the loader
# configures - confirm against llama_4bit_wrapper.
llama, tokenizer = load_llama_model_4bit_low_ram(
    config_path=LLAMA_MODEL,
    model_path=LLAMA_WEIGHTS,
    groupsize=128,
    half=True,
    device_map='auto',
    seqlen=2048,
    is_v1_model=False,
    bits=4,
)

Loading Model ...


The safetensors archive passed at ./Llama-2-13B-chat-GPTQ-localmodels/gptq_model-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


Converted as Half.


You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Loaded the model in 6.83 seconds.


In [14]:
# The batch helpers below encode with padding=True and later strip padding by
# comparing against tokenizer.pad_token_id, so a pad id must be set; id 0 is used.
# NOTE(review): presumably the tokenizer ships without a pad token - confirm.
tokenizer.pad_token_id = 0

In [15]:
# Wrap the model with the AMPWrapper returned by import_llama.
# NOTE(review): apply_forward / apply_generate presumably patch llama.forward and
# llama.generate to run under automatic mixed precision - confirm in the wrapper.
amp_wrapper = AMPWrapper(llama)
amp_wrapper.apply_forward()
amp_wrapper.apply_generate()

## Checking prompts

In [16]:
# Token ids of the "ASSISTANT:" marker, used to locate the final answer in a prompt.
# The first id returned by encode() is dropped.
# NOTE(review): presumably that first id is the BOS token the tokenizer prepends - confirm.
last_assistant_response_tokens = tokenizer.encode("ASSISTANT:")[1:]

In [17]:
# Show each marker token id next to its token string.
for token in last_assistant_response_tokens:
    print(tokenizer.convert_ids_to_tokens([token]), token)

['▁A'] 319
['SS'] 1799
['IST'] 9047
['ANT'] 13566
[':'] 29901


In [18]:
# Sanity check: every "-no-cot" prompt, once tokenized, contains the marker tokens
# (last_subsequence_start returns -1 when the marker is absent).
assert all([
    last_subsequence_start(tokenizer.encode(text), last_assistant_response_tokens) != -1
    for text in apply_formatters(
        df_data,
        df_formatters,
        "-no-cot"
    )
])

In [19]:
# Same check for the prompts rendered with the unsuffixed formatter.
assert all([
    last_subsequence_start(tokenizer.encode(text), last_assistant_response_tokens) != -1
    for text in apply_formatters(
        df_data,
        df_formatters,
        ""
    )
])

In [20]:
# Same check for the "-cot" prompts. An empty chain_of_thoughts column is added
# first so the row carries that field when the template is formatted.
# NOTE(review): presumably the "-cot" templates reference {chain_of_thoughts} - confirm in formatters.csv.
assert all([
    last_subsequence_start(tokenizer.encode(text), last_assistant_response_tokens) != -1
    for text in apply_formatters(
        df_data.assign(chain_of_thoughts=""),
        df_formatters,
        "-cot"
    )
])

## Calculating answers perplexity

In [21]:
def get_text_length_sorted_dataframe(texts):
    """Build a frame of ``texts`` ordered from shortest to longest.

    Columns of the result:
        text   - the original string
        length - its length in characters
        index  - its position in ``texts``, so the input order can be restored
                 later with ``sort_values("index")``
    """
    frame = pd.DataFrame({"text": texts})
    frame = frame.assign(
        length=frame["text"].str.len(),
        index=list(range(len(texts))),
    )
    return frame.sort_values(by="length")

In [22]:
def get_batch_perplexities(llama, tokenizer, batch_texts, last_assistant_response_tokens):
    """Compute the perplexity of the final answer of every text in a batch.

    For each text the answer is the token span that follows the last occurrence
    of ``last_assistant_response_tokens`` and stops before the final token of the
    text. The perplexity is ``exp(-mean(log p(token)))`` over that span, where
    each token's probability is read from the model output one position earlier.

    Only the logits of the answer positions are normalised, in float32 and via
    ``log_softmax``. This avoids copying the full batch x sequence x vocabulary
    distribution to the CPU, and avoids ``log(0) = -inf`` when a probability
    underflows in reduced precision.

    Args:
        llama: causal language model; called as ``llama(**batch)`` and expected
            to expose ``.logits`` on the result and ``.device`` on itself.
        tokenizer: tokenizer providing ``batch_encode_plus``.
        batch_texts: list of prompt strings, each containing the marker.
        last_assistant_response_tokens: token ids of the answer marker.

    Returns:
        A list with one float perplexity per input text (NaN when the answer
        span is empty).

    Raises:
        ValueError: if the marker tokens are not found in a tokenized text.
    """
    batch_size = len(batch_texts)
    batch = tokenizer.batch_encode_plus(
        batch_texts,
        return_tensors="pt",
        padding=True,
    )
    batch = {
        key: value.to(llama.device)
        for key, value in batch.items()
    }
    llama.eval()
    with torch.no_grad():
        logits = llama(**batch).logits
    input_ids_np = batch["input_ids"].detach().cpu().numpy()
    attention_mask_np = batch["attention_mask"].detach().cpu().numpy()
    marker_len = len(last_assistant_response_tokens)
    perplexities = []
    for item_index in range(batch_size):
        # Column positions of this item's real (non-padding) tokens.
        item_positions = np.flatnonzero(attention_mask_np[item_index])
        item_input_ids = input_ids_np[item_index][item_positions]
        answer_start_token_index = last_subsequence_start(item_input_ids, last_assistant_response_tokens)
        if answer_start_token_index == -1:
            # Slicing with -1 would silently score the wrong span.
            raise ValueError(
                f"Answer marker tokens not found in batch item {item_index}"
            )
        first_answer_token = answer_start_token_index + marker_len
        # Exclude the final token of the text (the ending <s> token).
        answer_labels = item_input_ids[first_answer_token:-1]
        # The output at position t scores the token at t + 1, so the positions are
        # shifted one to the left of the labels: the last position scores a token
        # past the end, the one before it scores the excluded final token.
        answer_positions = item_positions[first_answer_token - 1:-2]
        answer_logits = logits[
            item_index,
            torch.as_tensor(answer_positions, dtype=torch.long, device=logits.device),
        ]
        answer_log_probas = F.log_softmax(answer_logits.float(), dim=-1)
        label_tensor = torch.as_tensor(answer_labels, dtype=torch.long, device=logits.device)
        token_log_probas = answer_log_probas.gather(-1, label_tensor.unsqueeze(-1)).squeeze(-1)
        perplexities.append(float(torch.exp(-token_log_probas.mean())))
    return perplexities

In [23]:
def get_answers_perplexity(llama, tokenizer, texts, last_assistant_response_tokens, batch_size):
    """Compute the answer perplexity of every text, returned in input order.

    Texts are sorted by character length and cut into consecutive batches of
    ``batch_size``, so each batch holds texts of similar length. The batches are
    visited in a fixed shuffled order (``random_state=42``). The tokenizer's
    ``padding_side`` is set to ``'right'`` and left that way.

    Returns:
        A float32 numpy array with one perplexity per entry of ``texts``.
    """
    tokenizer.padding_side = 'right'
    sorted_frame = get_text_length_sorted_dataframe(texts)

    n_batches = int(np.ceil(len(sorted_frame) / batch_size))
    batch_order = pd.Series(np.arange(n_batches, dtype=np.int32))
    batch_order = batch_order.sample(n_batches, random_state=42).values

    # Filled by position in the length-sorted frame, not by original position.
    scores = np.zeros([len(texts)], dtype=np.float32)
    for batch_no in tqdm(batch_order):
        first_row = batch_no * batch_size
        chunk = sorted_frame.iloc[first_row : first_row + batch_size]
        chunk_scores = get_batch_perplexities(
            llama, tokenizer, chunk["text"].tolist(), last_assistant_response_tokens
        )
        scores[first_row : first_row + len(chunk)] = chunk_scores

    # Attach positionally, then undo the length sort to get back the input order.
    restored = sorted_frame.assign(perplexity=scores).sort_values("index")
    return restored["perplexity"].values

## Calculate initial answers perplexity

In [24]:
def get_iteration_no_cot_perplexities(step, llama, tokenizer, df_data, df_formatters,
                                      last_assistant_response_tokens, batch_size):
    """Return ``df_data`` with a ``no_cot_perplexity`` column, cached on disk per step.

    The result lives in ``cached-{step}-no-cot-perplexities.csv``. When that file
    is missing, the "-no-cot" prompts are rendered and scored, and the frame is
    written out with ``variables`` serialised as JSON. In every case the returned
    frame is what is read back from the file, with ``variables`` decoded again.
    The caller's ``df_data`` is never modified.
    """
    cache_path = f"cached-{step}-no-cot-perplexities.csv"

    if not os.path.exists(cache_path):
        scored = df_data.copy()
        prompts = apply_formatters(scored, df_formatters, "-no-cot")
        scored["no_cot_perplexity"] = get_answers_perplexity(
            llama,
            tokenizer,
            prompts,
            last_assistant_response_tokens,
            batch_size,
        )
        scored["variables"] = scored["variables"].apply(json.dumps)
        scored.to_csv(cache_path, index=False)

    cached = pd.read_csv(cache_path)
    cached["variables"] = cached["variables"].apply(json.loads)
    return cached

In [25]:
# Step 0: score the "-no-cot" prompts, or reuse cached-0-no-cot-perplexities.csv if it exists.
# df_data is rebound to the frame read back from that cache file.
df_data = get_iteration_no_cot_perplexities(
    step=0,
    llama=llama,
    tokenizer=tokenizer,
    df_data=df_data,
    df_formatters=df_formatters,
    last_assistant_response_tokens=last_assistant_response_tokens,
    batch_size=4
)
df_data.head()

Unnamed: 0,question,variables,target,dataset,formatter,subset,split,no_cot_perplexity
0,This question refers to the following informat...,{'A': 'The ideas of personal liberty and natio...,A,mmlu,mmlu,mmlu-high_school_european_history,train,1444.3512
1,This question refers to the following informat...,"{'A': 'Capitalist', 'B': 'Scientific', 'C': 'C...",C,mmlu,mmlu,mmlu-high_school_european_history,train,2706.0737
2,This question refers to the following informat...,{'A': 'They served as a catalyst for the growt...,A,mmlu,mmlu,mmlu-high_school_european_history,train,2104.2961
3,This question refers to the following informat...,{'A': 'give the English king a new position of...,D,mmlu,mmlu,mmlu-high_school_european_history,train,5377.225
4,This question refers to the following informat...,{'A': 'His domination of the nobility left him...,D,mmlu,mmlu,mmlu-high_school_european_history,test,11120.245


In [26]:
# Mean answer perplexity over all rows, without any chain of thought.
df_data["no_cot_perplexity"].mean()

2680.1576499624375

## Generate chain-of-thoughts

In [27]:
# Generation settings read by get_batch_chains_of_thought:
# passed to generate() as penalty_alpha and top_k respectively.
CONTRASTIVE_SEARCH_ALPHA = 0.6
CONTRASTIVE_SEARCH_TOP_K = 4
# Upper bound on the number of newly generated tokens per prompt.
COT_GENERATION_MAX_NEW_TOKENS = 64
# Tokens kept free in the context window after the generated text; the generation
# budget is reduced when prompt + new tokens + this reserve would exceed
# max_position_embeddings.
COT_ANSWER_MINIMUM_TOKENS = 5

In [28]:
def get_batch_chains_of_thought(llama, tokenizer, batch_texts):
    """Generate a chain-of-thought continuation for every prompt in a batch.

    Prompts are encoded with padding and continued with ``llama.generate`` using
    the module-level CONTRASTIVE_SEARCH_* settings. At most
    COT_GENERATION_MAX_NEW_TOKENS tokens are generated, fewer when the longest
    prompt leaves less room while keeping COT_ANSWER_MINIMUM_TOKENS free in the
    model's context window.

    The attention mask and the pad token id are passed to ``generate`` explicitly,
    so padding positions are not attended to and finished sequences are filled
    with the same pad id that is stripped afterwards.

    Args:
        llama: model exposing ``generate``, ``config.max_position_embeddings``
            and ``device``.
        tokenizer: tokenizer with ``pad_token_id`` set; the caller is expected to
            have chosen the padding side.
        batch_texts: list of prompt strings.

    Returns:
        A list with one string per prompt: the decoded new tokens, cut at the
        first "ASSISTANT:" or "USER:" and stripped of surrounding whitespace.
        Every entry is an empty string when the prompts leave no room to
        generate.
    """
    batch_size = len(batch_texts)
    batch = tokenizer.batch_encode_plus(
        batch_texts,
        return_tensors="pt",
        padding=True,
    )
    batch = {
        key: value.to(llama.device)
        for key, value in batch.items()
    }
    max_position_embeddings = llama.config.max_position_embeddings
    max_sequence_length = batch["attention_mask"].sum(dim=-1).max().item()
    max_new_tokens = COT_GENERATION_MAX_NEW_TOKENS
    if max_sequence_length + max_new_tokens + COT_ANSWER_MINIMUM_TOKENS > max_position_embeddings:
        max_new_tokens = max_position_embeddings - max_sequence_length - COT_ANSWER_MINIMUM_TOKENS
    if max_new_tokens <= 0:
        # The longest prompt already fills the context window; nothing can be
        # generated, and generate() would be handed a non-positive budget.
        return [""] * batch_size
    llama.eval()
    with torch.no_grad():
        generation = llama.generate(batch["input_ids"],
                                    attention_mask=batch["attention_mask"],
                                    pad_token_id=tokenizer.pad_token_id,
                                    penalty_alpha=CONTRASTIVE_SEARCH_ALPHA,
                                    top_k=CONTRASTIVE_SEARCH_TOP_K,
                                    max_new_tokens=max_new_tokens,
                                    use_cache=True)
        generation = generation.detach().cpu().numpy()
    # Each output row is the padded prompt followed by the new tokens, so the new
    # tokens start at the padded prompt width for every row.
    prompt_width = batch["input_ids"].shape[1]
    result = []
    for item_index in range(batch_size):
        item_generation_tokens = generation[item_index][prompt_width:]
        # Drop the padding appended after a sequence finished early.
        item_generation_tokens = item_generation_tokens[item_generation_tokens != tokenizer.pad_token_id]
        item_generation_text = tokenizer.decode(item_generation_tokens)
        # Keep only the text before the model starts a new dialogue turn.
        item_generation_text_cleaned = item_generation_text.split("ASSISTANT:")[0].split("USER:")[0].strip()
        result.append(item_generation_text_cleaned)
    return result

In [29]:
def get_chains_of_thought(llama, tokenizer, texts, batch_size):
    """Generate a chain of thought for every prompt, returned in input order.

    Prompts are sorted by character length and cut into consecutive batches of
    ``batch_size``; the batches are visited in a fixed shuffled order
    (``random_state=42``). The tokenizer's ``padding_side`` is set to ``'left'``
    and left that way.

    Returns:
        A numpy array with one generated string per entry of ``texts``.
    """
    tokenizer.padding_side = 'left'
    sorted_frame = get_text_length_sorted_dataframe(texts)

    n_batches = int(np.ceil(len(sorted_frame) / batch_size))
    batch_order = pd.Series(np.arange(n_batches, dtype=np.int32))
    batch_order = batch_order.sample(n_batches, random_state=42).values

    # One slot per row of the length-sorted frame, filled batch by batch.
    outputs = [None] * len(texts)
    for batch_no in tqdm(batch_order):
        first_row = batch_no * batch_size
        chunk_texts = sorted_frame.iloc[first_row : first_row + batch_size]["text"].tolist()
        produced = get_batch_chains_of_thought(llama, tokenizer, chunk_texts)
        for offset in range(len(chunk_texts)):
            outputs[first_row + offset] = produced[offset]

    # Attach positionally, then undo the length sort to get back the input order.
    sorted_frame["chain_of_thoughts"] = outputs
    restored = sorted_frame.sort_values("index")
    return restored["chain_of_thoughts"].values

In [34]:
def get_iteration_chains_of_thoughts(step, llama, tokenizer, df_data, df_formatters, batch_size):
    """Return ``df_data`` with a ``chain_of_thoughts`` column, cached on disk per step.

    The result lives in ``cached-{step}-chains-of-thoughts.csv``. When that file
    is missing, the prompts are rendered with the unsuffixed formatter, a chain
    of thought is generated for each, and the frame is written out with
    ``variables`` serialised as JSON. In every case the returned frame is what is
    read back from the file, with ``variables`` decoded again. The caller's
    ``df_data`` is never modified.

    An empty generated chain is written to the CSV as an empty field, which
    ``read_csv`` loads as NaN; those are turned back into ``""`` so the column
    always holds strings.
    """
    df_data = df_data.copy()
    fname = f"cached-{step}-chains-of-thoughts.csv"
    if not os.path.exists(fname):
        df_data["chain_of_thoughts"] = get_chains_of_thought(
            llama,
            tokenizer,
            apply_formatters(
                df_data,
                df_formatters,
                "",
            ),
            batch_size,
        )
        df_data["variables"] = df_data["variables"].apply(json.dumps)
        df_data.to_csv(fname, index=False)
    df_data = pd.read_csv(fname)
    df_data["variables"] = df_data["variables"].apply(json.loads)
    if "chain_of_thoughts" in df_data.columns:
        # Restore empty generations that the CSV round trip turned into NaN.
        df_data["chain_of_thoughts"] = df_data["chain_of_thoughts"].fillna("")
    return df_data

In [36]:
# Step 0: generate chains of thought, or reuse cached-0-chains-of-thoughts.csv if it exists.
# df_data is rebound to the frame read back from that cache file.
# NOTE(review): the recorded run of this cell stopped with a CUDA out-of-memory
# error at batch_size=4 (see the output below).
df_data = get_iteration_chains_of_thoughts(
    step=0,
    llama=llama,
    tokenizer=tokenizer,
    df_data=df_data,
    df_formatters=df_formatters,
    batch_size=4
)
df_data.head()

  0%|          | 0/398 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 222.00 MiB (GPU 0; 11.00 GiB total capacity; 35.88 GiB already allocated; 0 bytes free; 36.76 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF