In [46]:
import pandas as pd
import os
from huggingface_hub import notebook_login
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
import math
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling

In [36]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [37]:
block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [1]:
notebook_login()

Login successful
Your token has been saved to /Users/amanmalhotra/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [9]:
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')

Reusing dataset wikitext (/Users/amanmalhotra/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
datasets["train"][10]

{'text': ' The game \'s battle system , the BliTZ system , is carried over directly from Valkyira Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills unique to each character . They are divided into " Personal Potential " , which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede

In [45]:
model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /Users/amanmalhotra/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vo

In [21]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [22]:
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/316M [00:00<?, ?B/s]

In [23]:
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-wikitext2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

In [24]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)

Cloning https://huggingface.co/amanm27/distilroberta-base-finetuned-wikitext2 into local empty directory.


In [26]:
trainer.train()

***** Running training *****
  Num examples = 19242
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 7218


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [27]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 1988
  Batch size = 8


KeyboardInterrupt: 

In [28]:
trainer.push_to_hub()

Saving model checkpoint to distilroberta-base-finetuned-wikitext2
Configuration saved in distilroberta-base-finetuned-wikitext2/config.json
Model weights saved in distilroberta-base-finetuned-wikitext2/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 32.0k/313M [00:00<?, ?B/s]

Upload file runs/Feb23_19-54-06_DN0a1f23a3.SUNet/1645674928.8981209/events.out.tfevents.1645674928.DN0a1f23a3.…

Upload file training_args.bin: 100%|##########| 2.98k/2.98k [00:00<?, ?B/s]

Upload file runs/Feb23_19-54-06_DN0a1f23a3.SUNet/events.out.tfevents.1645674928.DN0a1f23a3.SUNet.16919.0: 100%…

To https://huggingface.co/amanm27/distilroberta-base-finetuned-wikitext2
   c1cea99..9629b2f  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Masked Language Modeling', 'type': 'fill-mask'}}
To https://huggingface.co/amanm27/distilroberta-base-finetuned-wikitext2
   9629b2f..0c26c6b  main -> main



'https://huggingface.co/amanm27/distilroberta-base-finetuned-wikitext2/commit/9629b2f1b30abecddec5ef008f64bd046d7c9656'

In [39]:
datasets = load_dataset("text", data_files={"train": "data/sports_article_data.csv", "validation": "data/sports_article_data.csv"})

Using custom data configuration default-677d50789c81c7ed


Downloading and preparing dataset text/default to /Users/amanmalhotra/.cache/huggingface/datasets/text/default-677d50789c81c7ed/0.0.0/08f6fb1dd2dab0a18ea441c359e1d63794ea8cb53e7863e6edf8fc5655e47ec4...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /Users/amanmalhotra/.cache/huggingface/datasets/text/default-677d50789c81c7ed/0.0.0/08f6fb1dd2dab0a18ea441c359e1d63794ea8cb53e7863e6edf8fc5655e47ec4. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
datasets["train"][10]

{'text': "  \t''Coming to the US Championships is a good time and there's always a positive energy,'' Davis said. ''It's such an honor to come here. I think being able to perform really well pushes us to put out a better performance, and we are honored to get the results that we did.''"}

In [48]:
model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model_checkpoint = "amanm27/distilroberta-base-finetuned-wikitext2"
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /Users/amanmalhotra/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vo

In [49]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [50]:
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/amanm27/distilroberta-base-finetuned-wikitext2/resolve/main/config.json from cache at /Users/amanmalhotra/.cache/huggingface/transformers/a6a882d5d1a2ee9673436c279fe1ec71c9364160672ab16cea39d98c2e9e0d15.7152f0feab0c1d24e831cc37e93970481f0d9b60bba32a29dd60e95baee093d4
Model config RobertaConfig {
  "_name_or_path": "amanm27/distilroberta-base-finetuned-wikitext2",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size"

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

storing https://huggingface.co/amanm27/distilroberta-base-finetuned-wikitext2/resolve/main/pytorch_model.bin in cache at /Users/amanmalhotra/.cache/huggingface/transformers/1f28788352fccaf6cdeaf359d3f73d75d6b54be098858718c578fda8a3453f4a.391e9e4f75f2e86e7997cdeea7de8d4919266746108891ccab664066b93c4d7e
creating metadata file for /Users/amanmalhotra/.cache/huggingface/transformers/1f28788352fccaf6cdeaf359d3f73d75d6b54be098858718c578fda8a3453f4a.391e9e4f75f2e86e7997cdeea7de8d4919266746108891ccab664066b93c4d7e
loading weights file https://huggingface.co/amanm27/distilroberta-base-finetuned-wikitext2/resolve/main/pytorch_model.bin from cache at /Users/amanmalhotra/.cache/huggingface/transformers/1f28788352fccaf6cdeaf359d3f73d75d6b54be098858718c578fda8a3453f4a.391e9e4f75f2e86e7997cdeea7de8d4919266746108891ccab664066b93c4d7e
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from the model checkpoint at amanm27/

In [51]:
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-pretrained-sports-articles",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [52]:
print(model_name)

distilroberta-base-finetuned-wikitext2


In [53]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [54]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)

Cloning https://huggingface.co/amanm27/distilroberta-base-finetuned-wikitext2-pretrained-sports-articles into local empty directory.


In [None]:
trainer.train()

In [None]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
trainer.push_to_hub()