In [5]:
!pip install transformers==4.28.0 tokenizers datasets accelerate



In [6]:
import tensorflow as tf
import glob
import os
import shutil
import tqdm
import random
import matplotlib.pyplot as plt
import torch
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer

# Ask TensorFlow which GPUs are visible; as the last expression of the cell
# the resulting list is displayed below.
tf.config.list_physical_devices("GPU")

  from .autonotebook import tqdm as notebook_tqdm
2024-02-16 12:15:18.715624: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-16 12:15:18.716454: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-16 12:15:18.716608: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# Load Dataset

In [7]:
dataset_file = "dataset.txt"

# How many files to load.
file_number = 100

# Clone the repo.
!git clone https://github.com/vilmibm/lovecraftcorpus
    
# Find all the files.
paths = glob.glob("lovecraftcorpus/*.txt")

# Do not use all.
paths = paths[:file_number]
print(sorted(paths))

# each line is a sample in the dataset
# in this case, each line is a paragraph
# Merge.
# TODO: make more sophisticated to deal with short paragraphs.
with open(dataset_file, "w") as output_file:
    for path in paths:
        for line in open(path, "r"):
            for split in line.split("\n"):
                split = split.strip()
                if split != "":
                    print(split, file=output_file)

# Delete repo.
!rm -rf lovecraftcorpus

# Done.
print("Corpus downloaded.")

Cloning into 'lovecraftcorpus'...
remote: Enumerating objects: 74, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 74 (delta 0), reused 3 (delta 0), pack-reused 70[K
Receiving objects: 100% (74/74), 1.12 MiB | 5.46 MiB/s, done.
['lovecraftcorpus/alchemist.txt', 'lovecraftcorpus/arthur_jermyn.txt', 'lovecraftcorpus/azathoth.txt', 'lovecraftcorpus/beast.txt', 'lovecraftcorpus/beyond_wall_of_sleep.txt', 'lovecraftcorpus/book.txt', 'lovecraftcorpus/celephais.txt', 'lovecraftcorpus/charles_dexter_ward.txt', 'lovecraftcorpus/clergyman.txt', 'lovecraftcorpus/colour_out_of_space.txt', 'lovecraftcorpus/cool_air.txt', 'lovecraftcorpus/crawling_chaos.txt', 'lovecraftcorpus/cthulhu.txt', 'lovecraftcorpus/dagon.txt', 'lovecraftcorpus/descendent.txt', 'lovecraftcorpus/doorstep.txt', 'lovecraftcorpus/dreams_in_the_witch.txt', 'lovecraftcorpus/dunwich.txt', 'lovecraftcorpus/erich_zann.txt', 'lovecraftcorpus/ex_oblivione.txt', 'lo

# Prepare Datasets

In [8]:
# Load the merged corpus file with the "text" loader; the result is shown below
raw_datasets = load_dataset("text", data_files=dataset_file)
raw_datasets
# a single "train" split with 4371 rows, each with one "text" feature

Generating train split: 4371 examples [00:00, 185606.71 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4371
    })
})

In [9]:
raw_datasets["train"][666]
# with Hugging Face datasets, every sample is a dictionary
# a sample from this dataset is one paragraph of the corpus
# the only feature (key) of this dataset is "text"

{'text': "No word was spoken amidst the distant sound that grew nearer and nearer, but as I followed the memory-face's mad stare along that cursed shaft of light to its source, the source whence also the whining came, I, too, saw for an instant what it saw, and fell with ringing ears in that fit of shrieking epilepsy which brought the lodgers and the police. Never could I tell, try as I might, what it actually was that I saw; nor could the still face tell, for although it must have seen more than I did, it will never speak again. But always I shall guard against the mocking and insatiate Hypnos, lord of sleep, against the night sky, and against the mad ambitions of knowledge and philosophy."}

# Goal

Generate new text

# Steps

1. Load Dataset
2. Prepare Datasets
3. Encode Text
4. Tokenize Text
5. Build Model



# Create Tokenizer

In [10]:
# Create an empty BPE (byte-pair encoding) tokenizer and its trainer
# BPE is a subword tokenization: the trainer learns which character sequences to merge into tokens
tokenizer = Tokenizer(BPE(unk_token="[UNK]")) # "[UNK]" is the token used for unknown input
trainer = BpeTrainer(vocab_size=5_000, special_tokens=["[UNK]", "[PAD]"])
# pre-tokenizer: splits the raw text into pieces before BPE is applied
# (Whitespace splits on whitespace and also separates punctuation from words)
tokenizer.pre_tokenizer = Whitespace()

# Batch samples to speed up process
def batch_iterator(batch_size=1000, dataset=None):
    """Yield the texts of a dataset split in consecutive batches.

    Because of ``yield`` this function is a generator: calling it returns an
    iterator that produces one batch (a list of strings) at a time instead of
    building all batches up front.

    Args:
        batch_size: number of samples per yielded batch; the last batch may
            be shorter.
        dataset: split to iterate over. It must support ``len()`` and slicing
            that returns a mapping with a ``"text"`` entry. Defaults to
            ``raw_datasets["train"]`` so existing calls keep working.

    Yields:
        The ``"text"`` entries of the next ``batch_size`` samples.
    """
    if dataset is None:
        # Resolved lazily (on first iteration), exactly like the global lookup before.
        dataset = raw_datasets["train"]
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size]["text"]

# Train the tokenizer on the batched texts
# `length` passes the total number of samples
# (presumably only used for progress reporting - TODO confirm)
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(raw_datasets["train"]))

# Save the trained tokenizer to disk
# a model is only usable together with its tokenizer, so both have to be shipped/downloaded
tokenizer.save("tokenizer.json")

# Reload the saved file wrapped in PreTrainedTokenizerFast (the tokenizer class
# from transformers) and register "[PAD]" as its padding token
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") 
tokenizer.add_special_tokens({"pad_token": "[PAD]"})






0

In [11]:
# sample sentence written in the style of H.P. Lovecraft
text = "In his house at R'lyeh, dead Cthulhu waits dreaming."

# Tokenize the text
# the tokenizer splits the text into tokens and returns a dictionary-like result
tokenizer(text)

# input_ids are the token ids (one integer per token)
# the input_ids are what is fed to the model

# token_type_ids are used to distinguish different sequences in the same input
# (all 0 here, there is only one sequence)

# attention_mask tells the model which positions are real tokens (1)
# and which are padding that should be ignored (0)

{'input_ids': [368, 169, 470, 100, 44, 6, 118, 4359, 9, 830, 3474, 1012, 282, 3509, 11], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
# sample sentence written in the style of H.P. Lovecraft
text = "In his house at R'lyeh, dead Cthulhu waits dreaming."

# Tokenize the text once and keep only the token indices
token_ids = tokenizer(text)["input_ids"]
print(token_ids) # token indices

# Decode every index on its own to see which piece of text each token stands for
tokens = [tokenizer.decode([index]) for index in token_ids]
print(tokens)

[368, 169, 470, 100, 44, 6, 118, 4359, 9, 830, 3474, 1012, 282, 3509, 11]
['In', 'his', 'house', 'at', 'R', "'", 'ly', 'eh', ',', 'dead', 'Cthulhu', 'wa', 'its', 'dreaming', '.']


# Tokenize Dataset

In [13]:
# maximum number of tokens per sample; start with 256
# it is passed as max_length to the tokenizer below, where longer samples are truncated
sequence_length = 256

# receives a dictionary with a "text" entry, returns a dictionary with "input_ids"
def tokenize_function(example):
    """Convert the ``"text"`` entry of ``example`` into token ids.

    The text is truncated to at most ``sequence_length`` tokens and padded by
    the tokenizer. Only the ``input_ids`` are returned; every other field
    produced by the tokenizer is dropped.
    """
    encoding = tokenizer(
        example["text"],
        max_length=sequence_length,
        truncation=True,
        padding=True,
    )
    return dict(input_ids=encoding["input_ids"])

# tokenize the entire dataset in batches (batched=True)
# the original columns are removed, so only what tokenize_function returns is kept
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=raw_datasets["train"].column_names)

Map: 100%|██████████| 4371/4371 [00:00<00:00, 12201.74 examples/s]


In [14]:
# print sample number 666 of the tokenized dataset
# a tokenized sample is a dictionary that holds the input_ids
print(tokenized_datasets["train"][666])
 

{'input_ids': [1004, 2376, 127, 3391, 1394, 93, 1254, 584, 128, 1134, 3237, 102, 3237, 9, 195, 109, 35, 1480, 93, 1328, 10, 607, 6, 71, 357, 104, 240, 1022, 128, 1487, 4493, 103, 297, 111, 282, 2108, 9, 93, 2108, 2932, 1492, 93, 121, 1569, 361, 9, 35, 9, 540, 9, 382, 148, 94, 2222, 291, 113, 382, 9, 102, 1701, 152, 1188, 107, 2237, 92, 128, 1862, 103, 4436, 304, 442, 2500, 182, 942, 93, 3322, 59, 261, 102, 93, 1974, 11, 4244, 234, 35, 628, 9, 742, 109, 35, 413, 9, 291, 113, 2378, 127, 128, 35, 382, 24, 420, 234, 93, 514, 607, 628, 9, 148, 2316, 113, 394, 233, 519, 305, 365, 35, 330, 9, 113, 586, 483, 1338, 437, 11, 528, 879, 35, 1186, 2153, 894, 93, 4683, 102, 560, 100, 61, 227, 34, 77, 68, 124, 71, 9, 64, 1322, 103, 1072, 9, 894, 93, 340, 986, 9, 102, 894, 93, 357, 4167, 1688, 103, 1720, 102, 4074, 77, 11, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

# Collate the Data

In [15]:
# the data collator batches individual samples together: the data pump for training
# collate means to collect and combine
# mlm=False turns masked language modelling off, i.e. plain next-token (causal) language modelling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Create the Model

In [16]:
# Seed PyTorch so that the random weight initialisation below is reproducible
torch.manual_seed(42)

model_config = GPT2Config(
    # len(tokenizer) counts the full vocabulary including added special tokens;
    # tokenizer.vocab_size would leave added tokens out and the embedding could end up too small
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id, # the token id for padding
    # NOTE(review): n_ctx looks like a legacy duplicate of n_positions - confirm it is still needed
    n_ctx=sequence_length, # context length
    n_positions=sequence_length, # number of positions, i.e. the maximum sequence length
    n_embd=512, # embedding dimension
    n_head=8, # number of heads in the multi-head attention
    n_layer=6, # number of transformer blocks
)

model = GPT2LMHeadModel(model_config)
model

# How to read the printed model:
# wte: word token embeddings, one vector per vocabulary entry
# wpe: word position embeddings, one vector per position (sequence length: 256)
# drop: dropout with p=0.1
# dropout is a regularization technique; it helps to prevent overfitting
# ln_1 / ln_2: GPT-2 normalises BEFORE attention and MLP, whereas the original
#   Transformer normalises after them
# LayerNorm means layer normalisation: each token vector is rescaled to mean 0 and
#   standard deviation 1 (followed by a learned scale and shift); this stabilises training
# Conv1D: despite the name this is not a sliding convolution here; in the Hugging Face
#   GPT-2 implementation it works like a linear (fully connected) layer with transposed weights

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(5000, 512)
    (wpe): Embedding(256, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=512, out_features=5000, bias=False)
)

# Create Trainer and Train
# Save Model and Tokenizer

In [21]:
# Directory for checkpoints, the final model and the tokenizer
output_path = "output"

# Batch size experiments (per device, e.g. per GPU):
#   16 -> took 8 minutes, only around 3GB of VRAM in use
#   32 -> took 7.5 minutes, only around 6GB of VRAM in use
#   46 -> also took 7.5 minutes (current setting)
training_args = TrainingArguments(
    per_device_train_batch_size=46,
    num_train_epochs=10, # number of passes over the training data
    overwrite_output_dir=True, # replace whatever is already in the output directory
    output_dir=output_path,
)

# The Trainer connects model, training arguments, training data and data collator
trainer = Trainer(
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    args=training_args,
    model=model,
)

# Run the training
trainer.train()

# Store tokenizer and model side by side in the output directory
tokenizer.save_pretrained(output_path)
model.save_pretrained(output_path)



 52%|█████▏    | 500/960 [03:55<03:44,  2.05it/s]

{'loss': 4.8177, 'learning_rate': 2.3958333333333334e-05, 'epoch': 5.21}


100%|██████████| 960/960 [07:35<00:00,  2.11it/s]

{'train_runtime': 455.0859, 'train_samples_per_second': 96.048, 'train_steps_per_second': 2.109, 'train_loss': 4.731611124674479, 'epoch': 10.0}





In [22]:
# The model is still in training mode after trainer.train(), so dropout would
# randomly disturb the predictions; switch to evaluation mode before generating.
model.eval()

# Encode the conditioning tokens and move them to the device the model lives on
# (works with and without a GPU, unlike a hard-coded .cuda()).
input_ids = tokenizer.encode("The most merciful thing in the world, I think, is the inability of the human mind to correlate all its contents.", return_tensors="pt").to(model.device)
print(input_ids)

# Generate more tokens.
# max_length counts the conditioning tokens as well; do_sample draws the next token
# at random, and a temperature below 1 makes the likely tokens even more likely.
generated_ids = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,
    temperature=0.5
)
generated_sequence = tokenizer.decode(generated_ids[0], clean_up_tokenization_spaces=True)
print(generated_sequence)

tensor([[ 184,  325, 3454,  205,   92,   93,  552,    9,   35,  678,    9,  114,
           93,   92, 3974,  103,   93,  577,  609,  111,  421,  695,  227,  156,
          282, 4911,   11]], device='cuda:0')
The most merciful thing in the world, I think, is the in ability of the human mind to cor rel ate all its contents. I was a moment, and made a half - place which an old man's only because of the most of the world was a certain con stell ations. I was not that I was in the house, and not known to the ancient, but I was not even if the first time before. The thing was a strange, and I saw that I was no
