In [3]:
%pip install datasets==2.0.0
import os
import re
import json
import requests
from pathlib import Path
from urllib.parse import urlparse

Note: you may need to restart the kernel to use updated packages.


In [4]:
with open("./data/homer/homer.json", "r") as f:
    sources = json.load(f)
sources

[{'title': 'The Iliad, by Homer',
  'source': '6130-0.txt',
  'start': 4133,
  'end': 23285,
  'ignore': {'startswith': ['[Illustration:', 'BOOK', 'ARGUMENT'],
   'isupper': True},
  'replace': {'\\[\\d+\\]': ''}},
 {'title': 'The Odyssey, by Homer',
  'source': 'pg1727.txt',
  'start': 740,
  'end': 10833,
  'ignore': {'startswith': 'BOOK', 'isupper': True}}]

In [5]:
def check(s, ignore):
    r = []
    for a in ignore.keys():
        if not hasattr(str, a):
            continue

        fn = getattr(str, a)
        if type(ignore[a]) == str:
            r += [fn(s, ignore[a])]

        elif type(ignore[a]) == list:
            r += [fn(s, i) for i in ignore[a]]

        elif type(ignore[a]) == bool and ignore[a]:
            r += [fn(s)]

    return any(r)

In [6]:
def substitute(s, replace):
    for a in replace.keys():
        s = re.sub(a, replace[a], s)
    return s

In [7]:
def load(
    title="",
    source="",
    start=0,
    end=100,
    ignore={},
    replace={},
    base_path: Path = Path("."),
):
    print(f"processing '{title}'")

    # load text
    file = base_path.resolve().absolute() / source
    print(f"Using {file}", end="... ")
    with open(str(file), "r", encoding="utf-8") as f:
        text = f.read()

    lines = text.encode("ascii", errors="ignore").decode("ascii").split("\n")[start:end]

    # cleaned sentences
    sentences = [
        f"{s.strip()}."
        for s in " ".join(
            [
                substitute(item, replace).strip()
                for item in lines
                if len(item) > 0 and not check(item, ignore)
            ]
        ).split(".")
    ]
    print("done!")
    return sentences


In [8]:
with open("./data/homer/homer.raw.txt", "w") as f:
    for source_id in [0, 1]:
        text =  load(**sources[source_id], base_path=Path('./data/homer'))
        for line in text:
            print(line, file=f)

processing 'The Iliad, by Homer'
Using /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/code/Users/alzeltov/locutus/data/homer/6130-0.txt... done!
processing 'The Odyssey, by Homer'
Using /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/code/Users/alzeltov/locutus/data/homer/pg1727.txt... done!


## Fine-tuning model 

Based on https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py

In [9]:
root_dir = Path("./model").resolve()
config_path = root_dir / "config"
model_path = root_dir / "weights"
tokenizer_path = root_dir / "tokenizer"
cache_dir = root_dir / ".cache"
output_dir = root_dir / ".outputs"
data_path = Path("./data/homer").resolve() / "homer.raw.txt"

In [10]:
# load dataset and process it
from datasets import load_dataset

data_files = {}
dataset_args = {}
data_files["train"] = str(data_path)
extension = "text"
dataset_args["keep_linebreaks"] = True
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=cache_dir, **dataset_args)

# train:val split = 80:20
validation_split_percentage = 20
raw_datasets["validation"] = load_dataset(
    extension,
    data_files=data_files,
    split=f"train[:{validation_split_percentage}%]",
    cache_dir=cache_dir,
    **dataset_args,
)
raw_datasets["train"] = load_dataset(
    extension,
    data_files=data_files,
    split=f"train[{validation_split_percentage}%:]",
    cache_dir=cache_dir,
    **dataset_args,
)

  from .autonotebook import tqdm as notebook_tqdm
2022/06/07 14:59:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
Using custom data configuration default-37e9673ffbb3a58d


Downloading and preparing dataset text/default to /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/code/Users/alzeltov/locutus/model/.cache/text/default-37e9673ffbb3a58d/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...
Dataset text downloaded and prepared to /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/code/Users/alzeltov/locutus/model/.cache/text/default-37e9673ffbb3a58d/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load pretrained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast_tokenizer=True, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(model_path, config=config_path, cache_dir=cache_dir)
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [12]:
from transformers import TextGenerationPipeline

# generate text from prefix before fine-tuning
device = -1 if model.device.type == "cpu" else model.device.index
text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=device)
print(text_generator("The war in")[0]["generated_text"])
print(text_generator("The market in America")[0]["generated_text"])

2022/06/07 15:00:18 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [13]:
# Preprocessing the datasets.
# First we tokenize all the texts.
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    return tokenizer(examples[text_column_name])

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=column_names,
    desc="Running tokenizer on dataset",
)

#block_size = tokenizer.model_max_length
block_size = 256

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]Running tokenizer on dataset: 100%|██████████| 2/2 [00:00<00:00, 17.30ba/s]Running tokenizer on dataset: 100%|██████████| 2/2 [00:00<00:00, 17.16ba/s]
Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]Running tokenizer on dataset: 100%|██████████| 1/1 [00:00<00:00, 14.45ba/s]


In [14]:
from itertools import chain

# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    desc=f"Grouping texts in chunks of {block_size}",
)

train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]

Grouping texts in chunks of 256:   0%|          | 0/2 [00:00<?, ?ba/s]Grouping texts in chunks of 256:  50%|█████     | 1/2 [00:00<00:00,  7.49ba/s]Grouping texts in chunks of 256: 100%|██████████| 2/2 [00:00<00:00, 13.16ba/s]
Grouping texts in chunks of 256:   0%|          | 0/1 [00:00<?, ?ba/s]Grouping texts in chunks of 256: 100%|██████████| 1/1 [00:00<00:00,  7.25ba/s]Grouping texts in chunks of 256: 100%|██████████| 1/1 [00:00<00:00,  7.21ba/s]


In [15]:
def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits.argmax(dim=-1)

from datasets import load_metric
metric = load_metric("accuracy")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # preds have the same shape as the labels, after the argmax(-1) has been calculated
    # by preprocess_logits_for_metrics but we need to shift the labels
    labels = labels[:, 1:].reshape(-1)
    preds = preds[:, :-1].reshape(-1)
    return metric.compute(predictions=preds, references=labels)

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]Downloading builder script: 3.19kB [00:00, 2.35MB/s]                   


In [16]:
from transformers import TrainingArguments

# initialize traing arguments
training_args = TrainingArguments(
    output_dir=str(output_dir), 
    do_train=True, 
    do_eval=True,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    eval_accumulation_steps = 1,
    num_train_epochs = 20
)

In [17]:
from transformers import Trainer, default_data_collator, is_torch_tpu_available

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    # Data collator will default to DataCollatorWithPadding, so we change it.
    data_collator=default_data_collator,
    compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
    if training_args.do_eval and not is_torch_tpu_available()
    else None,
)

In [18]:
last_checkpoint = None

In [19]:
# train
if training_args.do_train:
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    trainer.save_model()  # Saves the tokenizer too for easy upload

    metrics = train_result.metrics

    max_train_samples = len(train_dataset)
   
    metrics["train_samples"] = min(max_train_samples, len(train_dataset))

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

***** Running training *****
  Num examples = 220
  Num Epochs = 20
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1100


Step,Training Loss
500,2.7717
1000,1.898


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Attempted to log scalar metric loss:
2.7717
Attempted to log scalar metric learning_rate:
2.7272727272727273e-05
Attempted to log scalar metric epoch:


Saving model checkpoint to /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/code/Users/alzeltov/locutus/model/.outputs/checkpoint-500
Configuration saved in /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/code/Users/alzeltov/locutus/model/.outputs/checkpoint-500/config.json
Model weights saved in /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/code/Users/alzeltov/locutus/model/.outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/code/Users/alzeltov/locutus/model/.outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/code/Users/alzeltov/locutus/model/.outputs/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/code/Users/alzeltov/locutus/model/.outputs/checkpoint-1000
Configuration saved in /mnt/batch/tasks/shared/LS_root/mounts/clusters/gpuciaz/

In [20]:
import math

# eval
if training_args.do_eval:
    metrics = trainer.evaluate()
    max_eval_samples = len(eval_dataset)
    metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
    perplexity = math.exp(metrics["eval_loss"])
    metrics["perplexity"] = perplexity

metrics

***** Running Evaluation *****
  Num examples = 56
  Batch size = 4


Attempted to log scalar metric eval_loss:
3.7507991790771484
Attempted to log scalar metric eval_accuracy:
0.3490896358543417
Attempted to log scalar metric eval_runtime:
2.8389
Attempted to log scalar metric eval_samples_per_second:
19.726
Attempted to log scalar metric eval_steps_per_second:
4.932
Attempted to log scalar metric epoch:
20.0


{'eval_loss': 3.7507991790771484,
 'eval_accuracy': 0.3490896358543417,
 'eval_runtime': 2.8389,
 'eval_samples_per_second': 19.726,
 'eval_steps_per_second': 4.932,
 'epoch': 20.0,
 'eval_samples': 56,
 'perplexity': 42.555077541588325}

In [21]:
from transformers import TextGenerationPipeline

# generate text from prefix after fine-tuning
device = -1 if model.device.type == "cpu" else model.device.index
text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=device)

x = text_generator("The war in")
y = text_generator("The market in America")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [23]:
device

0

In [22]:
x, y

([{'generated_text': 'The war in the Aeaean island raged on for two days and three nights, and on the'}],
 [{'generated_text': 'The market in America is so great that it breeds many vagabonds, and it breeds many drug'}])