In [1]:
from transformers import AutoTokenizer

from redditqa.dataset import load_reddit_dataset
from redditqa.dataset.preprocessing import links

from tqdm import tqdm

from trl.trainer import ConstantLengthDataset

import importlib

  from .autonotebook import tqdm as notebook_tqdm
2023-08-21 16:07:41.583199: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_checkpoint = "meta-llama/Llama-2-7b-chat-hf"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.add_special_tokens({"pad_token":"<pad>"})

1

In [4]:
tokenizer.eos_token, tokenizer.bos_token, tokenizer.pad_token, tokenizer.unk_token

('</s>', '<s>', '<pad>', '<unk>')

In [5]:
tokenizer.eos_token_id, tokenizer.bos_token_id, tokenizer.pad_token_id, tokenizer.unk_token_id

(2, 1, 32000, 0)

In [6]:
x = tokenizer('This is a test.' + tokenizer.eos_token)
x

{'input_ids': [1, 910, 338, 263, 1243, 29889, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [7]:
tokenizer.decode(2)

'</s>'

In [8]:
for id in x.input_ids:
    print(tokenizer.decode(id))

<s>
This
is
a
test
.
</s>


In [9]:
def prepare_sample_text(example):
    """Prepare the text from a sample of the dataset."""
    submission_title = example["question_title"]
    comments = example["answers"]
    comments = sorted(comments, key=lambda k: k["answer_score"])
    answer = comments[-1]["answer_body"]
    text = f"Question: {submission_title}\nAnswer: {answer}"
    return text


In [10]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens


In [11]:
seq_length = 1024

# Load the dataset
dataset_dict = load_reddit_dataset(pairs=False)

train_data = dataset_dict["train"]
valid_data = dataset_dict["eval"]

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/train/cache-757f0ee80690267b.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-cba55e4212677d14.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/test/cache-4b02acf4a2882148.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/train/cache-233e21c9955b1701.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-71c7c533e04253a7.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/test/cache-6b6a8e5cca31b0f8.arrow


In [20]:
train_data_iter = iter(train_data)
next(train_data_iter)
next(train_data_iter)
next(train_data_iter)
next(train_data_iter)
next(train_data_iter)
row = next(train_data_iter)
print(row['question_title'])
for answer in row['answers']:
    print(answer['answer_score'], '-', answer['answer_body'])

Why are there capital and lowercase letters? Where did this all begin?
11 - True ELI5 response:  People used to write in all capitals.  Over time people got lazy and found alternative ways to write letters, which we call lowercase.  However, for important words and the beginning of a passage of writing, people still liked to make those parts stand out with capital letters.  After enough time, it became conventional to have rules regarding what is capitalized and what isn't.  In the end, we decided, as a community, on certain rules that we should all follow when writing.
10 - Capital letters were the original form, used for engraving and carving into stone. As writing with pen and paper took off, the capital letter forms evolved into forms which were quicker to write. 
5 - the terms "Upper case" and "Lower case"  are leftovers from when the printing press, linotype, and letterpress dominated the printing industry. For a very long time metal move-able type ([LINK]) was extremely expensiv

In [18]:
# Estimate the average number of characters per token in the dataset
chars_per_token = chars_token_ratio(train_data, tokenizer)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

# Create constant length datasets
train_dataset = ConstantLengthDataset(
    tokenizer,
    train_data,
    formatting_func=prepare_sample_text,
    infinite=True,
    seq_length=seq_length,
    chars_per_token=chars_per_token,
)
valid_dataset = ConstantLengthDataset(
    tokenizer,
    valid_data,
    formatting_func=prepare_sample_text,
    infinite=False,
    seq_length=seq_length,
    chars_per_token=chars_per_token,
)

  0%|          | 0/400 [00:00<?, ?it/s]

100%|██████████| 400/400 [00:01<00:00, 200.54it/s]

The character to token ratio of the dataset is: 3.98





In [21]:
row = next(iter(train_dataset))
row

{'input_ids': tensor([11422, 29892, 11131,  ...,   263,  3353, 29889]),
 'labels': tensor([11422, 29892, 11131,  ...,   263,  3353, 29889])}

In [22]:
list(row['input_ids'])

[tensor(11422),
 tensor(29892),
 tensor(11131),
 tensor(7347),
 tensor(29920),
 tensor(631),
 tensor(29892),
 tensor(2992),
 tensor(29889),
 tensor(3115),
 tensor(1556),
 tensor(310),
 tensor(1232),
 tensor(267),
 tensor(2996),
 tensor(472),
 tensor(278),
 tensor(1407),
 tensor(1095),
 tensor(310),
 tensor(670),
 tensor(6413),
 tensor(322),
 tensor(278),
 tensor(1232),
 tensor(267),
 tensor(1434),
 tensor(393),
 tensor(540),
 tensor(263),
 tensor(854),
 tensor(3192),
 tensor(29889),
 tensor(4602),
 tensor(292),
 tensor(29871),
 tensor(29945),
 tensor(3064),
 tensor(338),
 tensor(884),
 tensor(451),
 tensor(263),
 tensor(4802),
 tensor(5376),
 tensor(746),
 tensor(366),
 tensor(5401),
 tensor(29871),
 tensor(29945),
 tensor(29953),
 tensor(3064),
 tensor(29889),
 tensor(29871),
 tensor(2),
 tensor(1),
 tensor(894),
 tensor(29901),
 tensor(3750),
 tensor(437),
 tensor(278),
 tensor(8278),
 tensor(1603),
 tensor(671),
 tensor(3546),
 tensor(11251),
 tensor(11719),
 tensor(2012),
 tensor(3

In [23]:
print(tokenizer.decode(row['input_ids']))

eman, Joe Frazier, etc. Also most of loses came at the very end of his career and the loses before that he avenged. Losing 5 times is also not a big deal when you win 56 times. </s><s> Question: Why do the USA still use electoral vote instead of popular vote?
Answer: Say you have 50 buildings in a neighborhood. Forty Five of them are single family homes, with families ranging from 2 to 5 people. So about 160 people. Then you got five buildings that are frat houses with 40 people each, so 200 people. All houses in the neighborhood are the same size, but since there are five properties that have a majority of the neighborhood population, the can ban together and make all the rules. Does that sound fair to you? </s><s> Question: How can sound from a vinyl record contain all those different frequencies at once?
Answer: &gt; How does a record combines all these different frequencies in that single groove?

Sound is vibration, pressure waves in the air. The air can only have one pressure at 