In [23]:
from datasets import Dataset
from transformers import Trainer
from transformers import AutoModelForMaskedLM, DistilBertForMaskedLM
from transformers import AutoTokenizer, DistilBertTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd

In [6]:
model_checkpoint = "distilbert-base-cased"
model = DistilBertForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint, do_lower_case=False)

In [25]:
SEED = 0
BATCH_SIZE = 16
LEARNING_RATE = 2e-5 
LR_WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01

### Dataset prep

In [20]:
with open('train.txt', 'r') as f:
    lines = f.readlines()
    
df = pd.DataFrame({"text": [line for line in lines]})
df_train, df_valid = train_test_split(
    df, test_size=0.15, random_state=SEED
)

train_dataset = Dataset.from_pandas(df_train[['text']].dropna())
valid_dataset = Dataset.from_pandas(df_valid[['text']].dropna())

In [22]:
def tokenize_function(row):
    return tokenizer(
        row['text'],
        padding='max_length',
        return_special_tokens_mask=True)
  
column_names = dataset.column_names

train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=column_names,
)

valid_dataset = valid_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=column_names,
)

  0%|          | 0/182 [00:00<?, ?ba/s]

  0%|          | 0/33 [00:00<?, ?ba/s]

In [31]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
steps_per_epoch = int(len(train_dataset) / BATCH_SIZE)

training_args = TrainingArguments(
    output_dir='./distilbert-joke',
    logging_dir='./LMlogs',             
    num_train_epochs=2,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=LR_WARMUP_STEPS,
    # save_steps=steps_per_epoch,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE, 
    evaluation_strategy='steps',
    save_strategy='steps',
    eval_steps=1000,
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model='loss', 
    greater_is_better=False,
    seed=SEED
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("./finetuned-distilbert/")

The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: __index_level_0__, special_tokens_mask. If __index_level_0__, special_tokens_mask are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 181718
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 22716
  Number of trainable parameters = 65812036
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
1000,1.4778,2.481643
2000,2.3422,2.192749
3000,2.2646,2.14557
4000,2.2516,2.096352
5000,2.1527,2.073568
6000,2.1541,2.033172
7000,2.0959,2.010656
8000,2.1009,1.987986
9000,2.0748,1.953148
10000,2.0549,1.941496


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: __index_level_0__, special_tokens_mask. If __index_level_0__, special_tokens_mask are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 32068
  Batch size = 16
Saving model checkpoint to ./distilbert-joke/checkpoint-1000
Configuration saved in ./distilbert-joke/checkpoint-1000/config.json
Model weights saved in ./distilbert-joke/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./distilbert-joke/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./distilbert-joke/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: __index_level_0__, special_tokens_mask. If __index_level_0__, special_tokens_mask are not expected by