In [None]:
# Authenticate with the Hugging Face Hub; the TrainingArguments cells further down
# set push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

In [15]:
import transformers

# Record the installed transformers version next to the results below.
print(transformers.__version__)

4.44.2


In [4]:
from transformers.utils import send_example_telemetry

# NOTE(review): going by its name, this reports usage of this example notebook
# (notebook id and framework are passed below) — drop the cell if telemetry is unwanted.
send_example_telemetry("language_modeling_from_scratch_notebook", framework="pytorch")

In [16]:
from datasets import load_dataset
# Load the raw WikiText-2 corpus; splits are accessed by name (e.g. datasets["train"]).
# NOTE(review): the variable shares its name with the `datasets` library; later cells
# call datasets.map(...), so the name is left unchanged.
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')

In [17]:
# Peek at a single training record; each record is a dict with a "text" field.
datasets["train"][10]

{'text': ' The game \'s battle system , the BliTZ system , is carried over directly from Valkyira Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills unique to each character . They are divided into " Personal Potential " , which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede

In [18]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random selection of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show (default 10).

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct positions in a single call; this replaces the
    # draw-and-retry loop, which slowed down as num_examples approached len(dataset).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        # Class-label columns hold integer ids; show the human-readable names instead.
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [None]:
# Render randomly chosen training rows (the helper's default is 10 examples).
show_random_elements(datasets["train"])

In [19]:
# model_checkpoint names the configuration the model is built from and is also used in
# the training output directory name; tokenizer_checkpoint names the tokenizer to load.
model_checkpoint = "gpt2"
tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer"

In [20]:
from transformers import AutoTokenizer
    
# Load the tokenizer named by tokenizer_checkpoint; tokenize_function reads this global.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)



In [21]:
def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` column of a batch."""
    texts = examples["text"]
    return tokenizer(texts)

In [22]:
# Tokenize every split in batches, dropping the raw "text" column so that only the
# tokenizer outputs remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

In [23]:
# Inspect one tokenized row; the "text" column was removed by the map call above.
tokenized_datasets["train"][1]

{'input_ids': [238, 8576, 9441, 2987, 238, 252],
 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [24]:
# Length, in tokens, of every chunk produced by group_texts. The commented-out
# alternative would use the tokenizer's maximum length instead.
# block_size = tokenizer.model_max_length
block_size = 128

In [25]:
def group_texts(examples, *, chunk_size=None):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size chunks.

    Args:
        examples: Mapping of column name to a list of per-text sequences
            (for example ``input_ids`` and ``attention_mask``). Must contain
            ``"input_ids"``.
        chunk_size: Length of each output chunk. Keyword-only; when omitted the
            notebook-level ``block_size`` is used.

    Returns:
        dict: The same keys as ``examples``, each holding a list of chunks of
        exactly ``chunk_size`` items, plus ``"labels"``, a shallow copy of the
        ``"input_ids"`` chunk list.
    """
    from itertools import chain  # local import keeps this cell self-contained

    if chunk_size is None:
        chunk_size = block_size
    # Concatenate all texts. chain flattens in linear time, whereas sum(lists, [])
    # re-copies the growing accumulator for every text (quadratic in batch size).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [26]:
# Re-chunk every split into block_size-token examples. Each batch of 1000 rows is
# concatenated before splitting, so the leftover tokens are dropped once per batch
# rather than once per text.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [27]:
# Decode one grouped example back to text to sanity-check the chunking.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

2024-09-16 08:51:16.624934: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-16 08:51:16.640090: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-16 08:51:16.645638: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-16 08:51:16.672082: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


' the " Nameless ", a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven ". \n The game began development in 2010, carrying over a large portion of the work done on Valkyria Chronicles II. While it retained the standard features of the series, it also underwent multiple adjustments, such as making the game more forgiving for series newcomers. Character designer Raita Honjou and composer Hitoshi Sakimoto both returned from previous entries, along with Valkyria Chronicles II director Takeshi Ozawa. A large'

In [28]:
from transformers import AutoConfig, AutoModelForCausalLM

# Only the configuration of model_checkpoint is fetched; the model is then built from
# that config with from_config rather than from_pretrained.
# NOTE(review): per the transformers docs, from_config does not load pretrained
# weights, which matches the high initial training loss logged below — confirm.
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelForCausalLM.from_config(config)

In [29]:
from transformers import Trainer, TrainingArguments

In [30]:
from transformers import Trainer, TrainingArguments

# Training configuration for the causal-LM run. The first positional argument is the
# output directory; push_to_hub=True requires the Hub login from the top of the notebook.
training_args = TrainingArguments(
    f"{model_checkpoint}-wikitext2",
    # `eval_strategy` replaces `evaluation_strategy`, which is deprecated in the
    # transformers release this notebook runs on (4.44.2).
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True
)

# Only TrainingArguments is instantiated here; the Trainer itself is created in a later cell.
print("TrainingArguments and Trainer are successfully imported and initialized.")

TrainingArguments and Trainer are successfully imported and initialized.




In [31]:
# Re-creates the same training configuration as the previous cell (output directory
# named after model_checkpoint, evaluation once per epoch, results pushed to the Hub).
training_args = TrainingArguments(
    f"{model_checkpoint}-wikitext2",
    # `eval_strategy` replaces `evaluation_strategy`, which is deprecated in the
    # transformers release this notebook runs on (4.44.2).
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True
)



In [32]:
# No data_collator is passed for the causal-LM run: every example already has the same
# length (block_size) and carries a "labels" column added by group_texts.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [40]:
# Run training; the log below shows an evaluation pass at the end of each epoch.
trainer.train()

  0%|          | 0/7038 [00:00<?, ?it/s]

{'loss': 7.9152, 'grad_norm': 3.553107738494873, 'learning_rate': 1.85791418016482e-05, 'epoch': 0.21}
{'loss': 7.291, 'grad_norm': 3.7394213676452637, 'learning_rate': 1.7158283603296393e-05, 'epoch': 0.43}
{'loss': 7.1878, 'grad_norm': 4.056800842285156, 'learning_rate': 1.573742540494459e-05, 'epoch': 0.64}
{'loss': 7.0958, 'grad_norm': 3.535494565963745, 'learning_rate': 1.4316567206592783e-05, 'epoch': 0.85}


  0%|          | 0/252 [00:00<?, ?it/s]

{'eval_loss': 7.054767608642578, 'eval_runtime': 81.853, 'eval_samples_per_second': 24.544, 'eval_steps_per_second': 3.079, 'epoch': 1.0}
{'loss': 7.0564, 'grad_norm': 3.712791919708252, 'learning_rate': 1.2895709008240979e-05, 'epoch': 1.07}
{'loss': 6.9955, 'grad_norm': 3.3751204013824463, 'learning_rate': 1.1474850809889173e-05, 'epoch': 1.28}
{'loss': 6.9726, 'grad_norm': 3.619595766067505, 'learning_rate': 1.0053992611537368e-05, 'epoch': 1.49}
{'loss': 6.928, 'grad_norm': 4.003269672393799, 'learning_rate': 8.633134413185564e-06, 'epoch': 1.71}
{'loss': 6.9071, 'grad_norm': 3.7433595657348633, 'learning_rate': 7.21227621483376e-06, 'epoch': 1.92}


  0%|          | 0/252 [00:00<?, ?it/s]

{'eval_loss': 6.889445781707764, 'eval_runtime': 82.07, 'eval_samples_per_second': 24.479, 'eval_steps_per_second': 3.071, 'epoch': 2.0}
{'loss': 6.9086, 'grad_norm': 3.597738742828369, 'learning_rate': 5.791418016481955e-06, 'epoch': 2.13}
{'loss': 6.8901, 'grad_norm': 3.7075469493865967, 'learning_rate': 4.3705598181301515e-06, 'epoch': 2.34}
{'loss': 6.9027, 'grad_norm': 3.412635564804077, 'learning_rate': 2.949701619778346e-06, 'epoch': 2.56}
{'loss': 6.873, 'grad_norm': 3.7385330200195312, 'learning_rate': 1.5288434214265418e-06, 'epoch': 2.77}
{'loss': 6.855, 'grad_norm': 3.9254696369171143, 'learning_rate': 1.0798522307473716e-07, 'epoch': 2.98}


  0%|          | 0/252 [00:00<?, ?it/s]

{'eval_loss': 6.872516632080078, 'eval_runtime': 82.0328, 'eval_samples_per_second': 24.49, 'eval_steps_per_second': 3.072, 'epoch': 3.0}
{'train_runtime': 7380.452, 'train_samples_per_second': 7.626, 'train_steps_per_second': 0.954, 'train_loss': 7.0544835031286635, 'epoch': 3.0}


TrainOutput(global_step=7038, training_loss=7.0544835031286635, metrics={'train_runtime': 7380.452, 'train_samples_per_second': 7.626, 'train_steps_per_second': 0.954, 'total_flos': 3703423157830656.0, 'train_loss': 7.0544835031286635, 'epoch': 3.0})

In [41]:
import math
# Perplexity is reported as exp(eval_loss), evaluated on the validation split that was
# passed to the Trainer as eval_dataset.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/252 [00:00<?, ?it/s]

Perplexity: 958.98


In [42]:
# trainer.push_to_hub()
# NOTE(review): GPT2Tokenizer and GPT2LMHeadModel are imported but not used in this cell.
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Assuming `trainer` is your Trainer instance and `model` is your trained model
# NOTE(review): this literal equals f"{model_checkpoint}-wikitext2" only while
# model_checkpoint == "gpt2". The recorded output (a vocab.txt here, BertLMHeadModel when
# the directory is reloaded) suggests the cell was last run with a BERT-style model bound
# to `model` — confirm by re-running from a fresh kernel.
output_dir = "gpt2-wikitext2"

# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('gpt2-wikitext2/tokenizer_config.json',
 'gpt2-wikitext2/special_tokens_map.json',
 'gpt2-wikitext2/vocab.txt',
 'gpt2-wikitext2/added_tokens.json',
 'gpt2-wikitext2/tokenizer.json')

In [44]:
# NOTE(review): GPT2Tokenizer and GPT2LMHeadModel are imported but not used in this cell;
# AutoTokenizer and AutoModelForCausalLM come from imports in earlier cells.
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer and model from the directory
# (this rebinds the notebook-level `tokenizer` and `model` to the reloaded copies)
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir)

# Set the model to evaluation mode
model.eval()

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


BertLMHeadModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [55]:
import torch
# Example input text
# NOTE(review): the prompt is Chinese while the training text shown earlier is English
# WikiText — confirm this is the intended probe.
input_text = "据进行编目"

# Tokenize the input text. Calling the tokenizer (rather than .encode) also yields the
# attention mask, and .to(model.device) keeps the tensors on the same device as the
# model, as the final generation cell of this notebook already does.
encoded_prompt = tokenizer(input_text, return_tensors='pt').to(model.device)
input_ids = encoded_prompt["input_ids"]

# Generate text using the model
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=encoded_prompt["attention_mask"],
        max_length=50,
    )

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

行sssssssssssssssssssssssssssssssssssssssssss


In [58]:
# Example input text
input_text = "Once upon a time"

# Tokenize the input text. Calling the tokenizer (rather than .encode) also yields the
# attention mask, and .to(model.device) keeps the tensors on the same device as the
# model, as the final generation cell of this notebook already does.
encoded_prompt = tokenizer(input_text, return_tensors='pt').to(model.device)
input_ids = encoded_prompt["input_ids"]

# Generate text with specific constraints (e.g., maximum length, no repetition)
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=encoded_prompt["attention_mask"],
        max_length=50,
        no_repeat_ngram_size=2,
    )

# Decode the generated text
conditional_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(conditional_text)

Once upon a time = = the the = of of the of to to of The The the.. the was was the on on the to the The of was of in in the a a the in of. to in @ @ the


# Masked language modeling

In [59]:
# Switch to the masked-LM setup.
# NOTE(review): this rebinds model_checkpoint and tokenizer_checkpoint, so re-running any
# earlier causal-LM cell after this one would pick up the BERT values.
model_checkpoint = "bert-base-cased"
tokenizer_checkpoint = "sgugger/bert-like-tokenizer"

In [60]:
# tokenize_function reads the notebook-level `tokenizer`, so rebinding it here switches
# the tokenizer used by the map call on the next line.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])



In [61]:
# Re-chunk the newly tokenized splits into block_size-token examples, replacing the
# causal-LM version of lm_datasets. group_texts also adds a "labels" column.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [62]:
from transformers import AutoConfig, AutoModelForMaskedLM

# Build a masked-LM model from the configuration of model_checkpoint, using from_config
# rather than from_pretrained. This rebinds the notebook-level `model`.
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_config(config)

In [63]:
# Training configuration for the masked-LM run.
# NOTE(review): the output directory is still called "test-clm" although this run trains
# a masked LM; it is left unchanged so existing paths keep working.
training_args = TrainingArguments(
    "test-clm",
    # `eval_strategy` replaces `evaluation_strategy`, which is deprecated in the
    # transformers release this notebook runs on (4.44.2).
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
    # `hub_model_id` replaces the deprecated `push_to_hub_model_id`; same repository name.
    hub_model_id=f"{model_checkpoint}-wikitext2",
)



In [64]:
from transformers import DataCollatorForLanguageModeling
# Collator for masked-LM batches, configured with the current tokenizer and a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [65]:
# Unlike the causal-LM Trainer earlier in the notebook, this one is given the masked-LM
# data collator defined in the previous cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)

In [66]:
# Run masked-LM training; the log below shows an evaluation pass at the end of each epoch.
trainer.train()

  0%|          | 0/7038 [00:00<?, ?it/s]

{'loss': 7.9214, 'grad_norm': 3.4579579830169678, 'learning_rate': 1.85791418016482e-05, 'epoch': 0.21}
{'loss': 7.2878, 'grad_norm': 3.68815541267395, 'learning_rate': 1.7158283603296393e-05, 'epoch': 0.43}
{'loss': 7.1849, 'grad_norm': 4.345504283905029, 'learning_rate': 1.573742540494459e-05, 'epoch': 0.64}
{'loss': 7.0972, 'grad_norm': 3.71974515914917, 'learning_rate': 1.4316567206592783e-05, 'epoch': 0.85}


  0%|          | 0/252 [00:00<?, ?it/s]

{'eval_loss': 7.050873279571533, 'eval_runtime': 82.8823, 'eval_samples_per_second': 24.239, 'eval_steps_per_second': 3.04, 'epoch': 1.0}
{'loss': 7.0542, 'grad_norm': 3.750218629837036, 'learning_rate': 1.2895709008240979e-05, 'epoch': 1.07}
{'loss': 6.9946, 'grad_norm': 3.282567024230957, 'learning_rate': 1.1474850809889173e-05, 'epoch': 1.28}
{'loss': 6.9717, 'grad_norm': 3.709643602371216, 'learning_rate': 1.0053992611537368e-05, 'epoch': 1.49}
{'loss': 6.9263, 'grad_norm': 4.0946855545043945, 'learning_rate': 8.633134413185564e-06, 'epoch': 1.71}
{'loss': 6.9057, 'grad_norm': 3.7841737270355225, 'learning_rate': 7.21227621483376e-06, 'epoch': 1.92}


  0%|          | 0/252 [00:00<?, ?it/s]

{'eval_loss': 6.894300937652588, 'eval_runtime': 82.7732, 'eval_samples_per_second': 24.271, 'eval_steps_per_second': 3.044, 'epoch': 2.0}
{'loss': 6.9079, 'grad_norm': 3.681596279144287, 'learning_rate': 5.791418016481955e-06, 'epoch': 2.13}
{'loss': 6.8879, 'grad_norm': 3.6207330226898193, 'learning_rate': 4.3705598181301515e-06, 'epoch': 2.34}
{'loss': 6.9009, 'grad_norm': 3.381568193435669, 'learning_rate': 2.949701619778346e-06, 'epoch': 2.56}
{'loss': 6.8736, 'grad_norm': 3.830254316329956, 'learning_rate': 1.5288434214265418e-06, 'epoch': 2.77}
{'loss': 6.8534, 'grad_norm': 4.066505432128906, 'learning_rate': 1.0798522307473716e-07, 'epoch': 2.98}


  0%|          | 0/252 [00:00<?, ?it/s]

{'eval_loss': 6.870739459991455, 'eval_runtime': 82.8158, 'eval_samples_per_second': 24.259, 'eval_steps_per_second': 3.043, 'epoch': 3.0}
{'train_runtime': 7385.0729, 'train_samples_per_second': 7.621, 'train_steps_per_second': 0.953, 'train_loss': 7.053630003911523, 'epoch': 3.0}


TrainOutput(global_step=7038, training_loss=7.053630003911523, metrics={'train_runtime': 7385.0729, 'train_samples_per_second': 7.621, 'train_steps_per_second': 0.953, 'total_flos': 3703423157830656.0, 'train_loss': 7.053630003911523, 'epoch': 3.0})

In [68]:
# Put the trained model in evaluation mode; the cell output is the returned module's repr.
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [70]:
# NOTE(review): GPT2Tokenizer and GPT2LMHeadModel are imported but not used in this cell.
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Assuming `trainer` is your Trainer instance and `model` is your trained model
# NOTE(review): the directory name starts with "gpt2" although the model saved here is the
# BertForMaskedLM shown in the previous cell's output — consider renaming.
output_dir_masked = "gpt2-wikimaskedtext2"

# Save the model and tokenizer
model.save_pretrained(output_dir_masked)
tokenizer.save_pretrained(output_dir_masked)

('gpt2-wikimaskedtext2/tokenizer_config.json',
 'gpt2-wikimaskedtext2/special_tokens_map.json',
 'gpt2-wikimaskedtext2/vocab.txt',
 'gpt2-wikimaskedtext2/added_tokens.json',
 'gpt2-wikimaskedtext2/tokenizer.json')

In [72]:
# Display the model architecture (same repr as printed by model.eval() above).
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [75]:
# NOTE(review): `model` at this point is the BertForMaskedLM trained above. Calling
# generate() on a masked-LM head is not a meaningful check of what it learned (the
# recorded output is mostly repeated punctuation and stop words). Predicting a
# tokenizer.mask_token position would exercise the masked-LM objective instead.
# `torch` is imported in an earlier cell, not here.

# Move the model to the appropriate device (e.g., GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Example input text
input_text = "Once upon a time"

# Tokenize the input text and move the input tensor to the same device as the model
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
print(input_ids)

# Convert tensor to list for decoding
# (round-trips the ids back to text to confirm the tokenizer handles the prompt)
input_ids_list = input_ids[0].tolist()
print(tokenizer.decode(input_ids_list, skip_special_tokens=True))

# Generate text with specific constraints (e.g., maximum length, no repetition)
with torch.no_grad():
    output = model.generate(input_ids, max_length=50, no_repeat_ngram_size=2)

# Decode the generated text
conditional_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(conditional_text)

tensor([[   2, 6127, 3654,   69, 1814,    3]], device='cuda:0')
Once upon a time
Once upon a time.,, and, the, of, to, in the the of the in of of in,. the to the and the. of to of and of. in in to in and in. to to and
