In [5]:
from transformers import RobertaTokenizerFast, BertTokenizerFast, DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('/mnt/transcriber/Call_Scoring/sa_tokenizer/', add_special_tokens=True)

In [6]:
from transformers import DistilBertConfig

config = DistilBertConfig(
    vocab_size=tokenizer.vocab_size,
)

In [7]:
from transformers import DistilBertForMaskedLM
model = DistilBertForMaskedLM(config=config)

In [8]:
from torch.utils.data import Dataset, random_split
class MLMDataSet(Dataset):
    def __init__(self, paths, tokenizer):
        self.sentences = []
        for file in paths:
            with open(file, 'r') as f:
                sent = f.readlines()
                self.sentences.extend([line.strip('\n') for line in sent])
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.sentences)
    def __getitem__(self, idx):
        return self.tokenizer(self.sentences[idx])

In [9]:
import random
random.seed(1000)
from pathlib import Path
paths = [str(x) for x in Path('/mnt/transcriber/Call_Scoring/transcriptions/csr_ch/train').glob("**/*.txt")]
mini_paths = random.sample(paths, 1000)
mlm_ds = MLMDataSet(mini_paths, tokenizer)

train_size = int(0.8 * len(mlm_ds))
test_size = len(mlm_ds) - train_size
train_dataset, dev_dataset = random_split(mlm_ds, [train_size, test_size])

In [11]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [7]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./mlm_test",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_steps=500,
    eval_steps=500, 
    save_total_limit=2,
    prediction_loss_only=True,
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset = dev_dataset,
)

In [8]:
%%time
trainer.train()

***** Running training *****
  Num examples = 59060
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 11076


Step,Training Loss,Validation Loss
500,6.0026,5.384438
1000,5.1958,5.051957
1500,4.9596,4.8143
2000,4.6333,4.444946
2500,4.4449,4.241751
3000,4.2076,4.037235
3500,4.0664,3.882293
4000,3.8853,3.803676
4500,3.778,3.701168
5000,3.6839,3.612546


***** Running Evaluation *****
  Num examples = 14765
  Batch size = 16
Saving model checkpoint to ./mlm_test/checkpoint-500
Configuration saved in ./mlm_test/checkpoint-500/config.json
Model weights saved in ./mlm_test/checkpoint-500/pytorch_model.bin
Deleting older checkpoint [mlm_test/checkpoint-3500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 14765
  Batch size = 16
Saving model checkpoint to ./mlm_test/checkpoint-1000
Configuration saved in ./mlm_test/checkpoint-1000/config.json
Model weights saved in ./mlm_test/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [mlm_test/checkpoint-4000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 14765
  Batch size = 16
Saving model checkpoint to ./mlm_test/checkpoint-1500
Configuration saved in ./mlm_test/checkpoint-1500/config.json
Model weights saved in ./mlm_test/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [mlm_test/checkpoint-500] due to args.save_to

CPU times: user 1d 10h 44min 23s, sys: 16h 59min 7s, total: 2d 3h 43min 31s
Wall time: 3h 17min 32s


TrainOutput(global_step=11076, training_loss=3.8806873333932, metrics={'train_runtime': 11852.5587, 'train_samples_per_second': 14.949, 'train_steps_per_second': 0.934, 'total_flos': 1749275025814080.0, 'train_loss': 3.8806873333932, 'epoch': 3.0})

In [9]:
trainer.save_model("./mlm_test")

Saving model checkpoint to ./mlm_test
Configuration saved in ./mlm_test/config.json
Model weights saved in ./mlm_test/pytorch_model.bin


In [10]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./mlm_test",
    tokenizer=tokenizer
)

loading configuration file ./mlm_test/config.json
Model config DistilBertConfig {
  "_name_or_path": "./mlm_test",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "vocab_size": 22044
}

loading configuration file ./mlm_test/config.json
Model config DistilBertConfig {
  "_name_or_path": "./mlm_test",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  

In [13]:
str = "And may go ahead and get your data birth please [MASK] you have you previously been with safe auto in the past."
fill_mask(str)

[{'score': 0.2619188129901886,
  'token': 111,
  'token_str': 'have',
  'sequence': 'and may go ahead and get your data birth please have you have you previously been with safe auto in the past.'},
 {'score': 0.08725928515195847,
  'token': 379,
  'token_str': 'help',
  'sequence': 'and may go ahead and get your data birth please help you have you previously been with safe auto in the past.'},
 {'score': 0.07615751028060913,
  'token': 177,
  'token_str': 'are',
  'sequence': 'and may go ahead and get your data birth please are you have you previously been with safe auto in the past.'},
 {'score': 0.05747812241315842,
  'token': 700,
  'token_str': 'assist',
  'sequence': 'and may go ahead and get your data birth please assist you have you previously been with safe auto in the past.'},
 {'score': 0.053029175847768784,
  'token': 92,
  'token_str': 'do',
  'sequence': 'and may go ahead and get your data birth please do you have you previously been with safe auto in the past.'}]

In [2]:
from transformers import AutoModel

In [3]:
model = AutoModel.from_pretrained("./mlm_test")

Some weights of the model checkpoint at ./mlm_test were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
model.evaluate()

AttributeError: 'DistilBertModel' object has no attribute 'evaluate'

In [17]:
from transformers import Trainer, TrainingArguments

paths = [str(x) for x in Path('/mnt/transcriber/Call_Scoring/transcriptions/csr_ch/test').glob("**/*.txt")]
mini_paths = random.sample(paths, 1000)
mlm_ds = MLMDataSet(mini_paths, tokenizer)

training_args = TrainingArguments(
    output_dir="./mlm_test",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_steps=500,
    eval_steps=500, 
    save_total_limit=2,
    prediction_loss_only=True,
    do_train=False,
    do_eval=True,
    evaluation_strategy='steps'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset = mlm_ds,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [16]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 7512
  Batch size = 16


{'eval_loss': 9.542838096618652,
 'eval_runtime': 206.4027,
 'eval_samples_per_second': 36.395,
 'eval_steps_per_second': 2.277}