In [143]:
from datasets import load_dataset

# importing the super glue dataset to be used
dataset = load_dataset("super_glue", 'boolq')
dataset

Found cached dataset super_glue (/Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3270
    })
    test: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3245
    })
})

In [144]:
dataset['train'][0]

{'question': 'do iran and afghanistan speak the same language',
 'passage': 'Persian language -- Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.',
 'idx': 0,
 'label': 1}

In [145]:
# processing the dataset by adding an end of text token
def add_end_of_text(example):
    example['question'] =  example['question'] + '<|endoftext|>'
    return example

# removing the columns that are not used for training
dataset = dataset.remove_columns(['passage', 'idx', 'label'])
dataset = dataset.map(add_end_of_text)

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-77bfb92a16c5c185.arrow
Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-ab06fa24cb859d42.arrow
Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-d0853c599a9afbd5.arrow


In [146]:
dataset['train'][0]

{'question': 'do iran and afghanistan speak the same language<|endoftext|>'}

In [147]:
from transformers import AutoTokenizer

# using the bert-base-uncased tokenizer
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

loading configuration file config.json from cache at /Users/alyssahuang/.cache/huggingface/hub/models--bert-base-uncased/snapshots/bdb420bf56ef3f72ee07cd75ab6df1b765b6012a/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/alyssahuang/.cache/huggingface/hub/models--bert-base-uncased/snapshots/bdb420bf56ef3f72ee07cd75ab6df1b765b6012a

In [148]:
sequence = ("This tokenizer is being applied in CS197 at"
            "Harvard.<|endoftext|>")
tokens = tokenizer.tokenize(sequence)
print(tokens)

['this', 'token', '##izer', 'is', 'being', 'applied', 'in', 'cs', '##19', '##7', 'at', '##har', '##vard', '.', '<', '|', 'end', '##oft', '##ex', '##t', '|', '>']


In [149]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[2023, 19204, 17629, 2003, 2108, 4162, 1999, 20116, 16147, 2581, 2012, 8167, 25911, 1012, 1026, 1064, 2203, 15794, 10288, 2102, 1064, 1028]


In [150]:
sequence = ("This tokenizer is being applied in CS197 at"
            "Harvard.<|endoftext|>")
tokenizer(sequence)

{'input_ids': [101, 2023, 19204, 17629, 2003, 2108, 4162, 1999, 20116, 16147, 2581, 2012, 8167, 25911, 1012, 1026, 1064, 2203, 15794, 10288, 2102, 1064, 1028, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [151]:
# tokenizing the dataset

def tokenize_function(examples):
    return tokenizer(examples["question"], truncation=True)

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["question"]
)

print(dataset["train"][0])
print(tokenized_datasets["train"][0])

 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-923411d8bbbe382a.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-3ee5e9a8c88b7881.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-67a3b6ae4e1321fa.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-065f3050738a8059.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-f2a07997690d009d.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-b14a308bdde09ad5.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-e731dfa0532744e8.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-210eddd22c326f33.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-05ca96cd0f46201f.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-883cc88edef3242a.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-b9d1fa571b08e1a5.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-9eef89538f84bcd3.arrow


{'question': 'do iran and afghanistan speak the same language<|endoftext|>'}
{'input_ids': [101, 2079, 4238, 1998, 7041, 3713, 1996, 2168, 2653, 1026, 1064, 2203, 15794, 10288, 2102, 1064, 1028, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [152]:
block_size = 128

def group_texts(examples):
    # repeat concatenation for input_ids and other keys
    concatenated_examples = {k: sum(examples[k], []) for k in
                            examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size

    # populate each of input_ids and other keys 
    result = {
        k: [t[i : i + block_size] for i in range(0,
            total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # add labels because we'll need it as the output
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

       

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-ca45029399eec698.arrow
Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-b9cb6f4a85fb29f4.arrow
Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-0478206cb14da8e3.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-42d369508436ebe6.arrow


     

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-e896d5d717350d63.arrow


  

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-858caec903a35bf0.arrow


 

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-811fa39bea7ae256.arrow
Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-713c1e3c8b39b1cb.arrow


      

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-b5b76fb14d32a40e.arrow


  

Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-cbeb95bf9a0b72b8.arrow
Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-cb90b1234cdde197.arrow
Loading cached processed dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-9c62a79729f56276.arrow


In [153]:
print(lm_datasets['train']['input_ids'][0])

[101, 2079, 4238, 1998, 7041, 3713, 1996, 2168, 2653, 1026, 1064, 2203, 15794, 10288, 2102, 1064, 1028, 102, 101, 2079, 2204, 3520, 8486, 5794, 4277, 4047, 2216, 2040, 2393, 2012, 2019, 4926, 1026, 1064, 2203, 15794, 10288, 2102, 1064, 1028, 102, 101, 2003, 3645, 3185, 9338, 2112, 1997, 3645, 6827, 2015, 1026, 1064, 2203, 15794, 10288, 2102, 1064, 1028, 102, 101, 2003, 9530, 25969, 3258, 5649, 5699, 1996, 2168, 2004, 9898, 2098, 5699, 1026, 1064, 2203, 15794, 10288, 2102, 1064, 1028, 102, 101, 2003, 6422, 23074, 3784, 1996, 2168, 2004, 3712, 20026, 1026, 1064, 2203, 15794, 10288, 2102, 1064, 1028, 102, 101, 2064, 2017, 2224, 21480, 4003, 2012, 20383, 5358, 2276, 1026, 1064, 2203, 15794, 10288, 2102, 1064, 1028, 102, 101, 2097, 2045, 2022, 1037, 2161, 1018, 1997]


In [154]:
tokenizer.decode(lm_datasets['train']['input_ids'][0])

'[CLS] do iran and afghanistan speak the same language < | endoftext | > [SEP] [CLS] do good samaritan laws protect those who help at an accident < | endoftext | > [SEP] [CLS] is windows movie maker part of windows essentials < | endoftext | > [SEP] [CLS] is confectionary sugar the same as powdered sugar < | endoftext | > [SEP] [CLS] is elder scrolls online the same as skyrim < | endoftext | > [SEP] [CLS] can you use oyster card at epsom station < | endoftext | > [SEP] [CLS] will there be a season 4 of'

In [155]:
small_train_dataset = \
    lm_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = \
    lm_datasets["validation"].shuffle(seed=42).select(range(100))

Loading cached shuffled indices for dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-c64d2ad2068f6601.arrow
Loading cached shuffled indices for dataset at /Users/alyssahuang/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-76b467cc786e8ce4.arrow


In [156]:
from transformers import AutoModelForCausalLM, AutoConfig, TrainingArguments, Trainer

# using a causal language model, xlnet-base-cased
# we use the model's structure without any pretrained weights
config = AutoConfig.from_pretrained('xlnet-base-cased')
model = AutoModelForCausalLM.from_config(config)

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

loading configuration file config.json from cache at /Users/alyssahuang/.cache/huggingface/hub/models--xlnet-base-cased/snapshots/593a21e8b79948a7f952811aa44f37d76e23d586/config.json
Model config XLNetConfig {
  "_name_or_path": "xlnet-base-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.22.1",

In [157]:
# training and saving the model in Hugging Face's model hub
training_args = TrainingArguments(
    output_dir="./models"
    f"{model_checkpoint}-super-glue-boolq",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
/Users/alyssahuang/Documents/GitHub/CS197/CS197 Pset2/./modelsbert-base-uncased-super-glue-boolq is already a clone of https://huggingface.co/alyssahuang02/modelsbert-base-uncased-super-glue-boolq. Make sure you pull the latest changes with `repo.git_pull()`.
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  0%|          | 0/39 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.851529598236084, 'eval_runtime': 19.4801, 'eval_samples_per_second': 5.133, 'eval_steps_per_second': 0.667, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6558276414871216, 'eval_runtime': 19.1004, 'eval_samples_per_second': 5.236, 'eval_steps_per_second': 0.681, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


  0%|          | 0/13 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.6062944531440735, 'eval_runtime': 22.0154, 'eval_samples_per_second': 4.542, 'eval_steps_per_second': 0.59, 'epoch': 3.0}
{'train_runtime': 261.9374, 'train_samples_per_second': 1.145, 'train_steps_per_second': 0.149, 'train_loss': 1.793543693346855, 'epoch': 3.0}


TrainOutput(global_step=39, training_loss=1.793543693346855, metrics={'train_runtime': 261.9374, 'train_samples_per_second': 1.145, 'train_steps_per_second': 0.149, 'train_loss': 1.793543693346855, 'epoch': 3.0})

In [158]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


  0%|          | 0/13 [00:00<?, ?it/s]

Perplexity: 1.83


In [159]:
tokenizer.save_pretrained(f"{model_checkpoint}-super-glue-boolq")
model.push_to_hub(f"{model_checkpoint}-super-glue-boolq")

tokenizer config file saved in bert-base-uncased-super-glue-boolq/tokenizer_config.json
Special tokens file saved in bert-base-uncased-super-glue-boolq/special_tokens_map.json
Configuration saved in bert-base-uncased-super-glue-boolq/config.json
Model weights saved in bert-base-uncased-super-glue-boolq/pytorch_model.bin
Uploading the following files to alyssahuang02/bert-base-uncased-super-glue-boolq: config.json,pytorch_model.bin


In [160]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(f"./{model_checkpoint}-super-glue-boolq")
tokenizer = AutoTokenizer.from_pretrained(f"./{model_checkpoint}-super-glue-boolq")


loading configuration file ./bert-base-uncased-super-glue-boolq/config.json
Model config XLNetConfig {
  "_name_or_path": "./bert-base-uncased-super-glue-boolq",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "untie_r": true,
  "use_mems_eval": true,
  "use_mems_tr

In [161]:
# testing our model on an example
start_text = ("A speedrun is a playthrough of a video game, \
or section of a video game, with the goal of \
completing it as fast as possible. Speedruns \
often follow planned routes, which may incorporate sequence \
breaking, and might exploit glitches that allow sections to \
be skipped or completed more quickly than intended. ")

prompt = "What is the"
inputs = tokenizer(
     start_text + prompt,
     add_special_tokens=False,
     return_tensors="pt")["input_ids"]

prompt_length = len(tokenizer.decode(inputs[0]))
outputs = model.generate(
     inputs,
     max_length=100,
     do_sample=True,
     top_k=50,
     top_p=0.95,
     temperature=0.9,
     num_return_sequences=3)

generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1:]

print(tokenizer.decode(outputs[0]))

the goal of life ο immigrant demographic art empirical paler empiricaltions oblique™ silenceection michaels aside? nasty ferrer demographic oblique michaelsuto dinah concessions ginger intrinsic ministerial 44th meiji immigrantlli talking?? immigrant concessions 44th denmark photographicection talking 44thection denmarkה nasty concessions imperative talking ভ believed third believed wadetorslatinguto sequelה nasty ministerial photographic uncertainty edotedlating ferrer meiji?lliuto ᴬ jack ভ meijiaire nastyaire ভ perpetualaire intrinsicぬgradeex demographicgradelli denmark flu erica wilmingtonurized
