# **Introduction**

In [1]:
# Supress the cell output.
import os
print(os.getcwd())
%%capture
# Install the right packages.
!pip install transformers datasets

D:\dl4nlp\dl4nlp


UsageError: Line magic function `%%capture` not found.


## **Datasets**

Import the datasets and write them to a text file in the right format.

In [3]:
import io,os
from datasets import load_dataset
import torch

def dd_to_file(dialogues, path_texts_file):
#     if not os.path.exists(path_texts_file):
#         os.mkdir(os.path.join(os.getcwd(),path_texts_file))
    texts = []
    # Add all dataset splits together for pretraining.
    for split in dialogues:
        for d in dialogues[split]:
            sentence = '\n'.join(d['dialog'])
            texts.append(sentence)
    # Move list to single string.
    all_texts = '\n\n'.join(texts)
    # Send all texts string to single file.
    print(os.getcwd())
    io.open(file='./'+path_texts_file, mode='w', encoding='utf-8').write(all_texts)
    # Print when done.
    print(f'DailyDialog data file saved in `{path_texts_file}`\n')

dd = load_dataset('daily_dialog')
dd_to_file(dd, '/content/dd.txt')


def swag_to_file(dialogues, path_texts_file):
    texts = []
    # Add all dataset splits together for pretraining.
    for split in dialogues:
        # Except for 'test', no labels available.
        if split == 'test':
            continue
        for d in dialogues[split]:
            ending = 'ending' + str(d['label'])
            sentence = d['sent1'] + '\n' + d['sent2'] + d[ending]
            texts.append(sentence)
    # Move list to single string.
    all_texts = '\n\n'.join(texts)
    # Send all texts string to single file.
    io.open(file='./'+path_texts_file, mode='w', encoding='utf-8').write(all_texts)
    # Print when done.
    print(f'SWAG data file saved in `{path_texts_file}`\n' )

swag = load_dataset('swag', 'regular')
swag_to_file(swag, '/content/swag.txt')

Using custom data configuration default
Reusing dataset daily_dialog (C:\Users\Redux Gamer\.cache\huggingface\datasets\daily_dialog\default\1.0.0\c03444008e9508b8b76f1f6793742d37d5e5f83364f8d573c2747bff435ea55c)


  0%|          | 0/3 [00:00<?, ?it/s]

D:\dl4nlp\dl4nlp
DailyDialog data file saved in `/content/dd.txt`



Reusing dataset swag (C:\Users\Redux Gamer\.cache\huggingface\datasets\swag\regular\0.0.0\9640de08cdba6a1469ed3834fcab4b8ad8e38caf5d1ba5e7436d8b1fd067ad4c)


  0%|          | 0/3 [00:00<?, ?it/s]

SWAG data file saved in `/content/swag.txt`



In [4]:
from transformers import BertTokenizer, DataCollatorForLanguageModeling

model_checkpoint = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=True,
    mlm_probability= 0.15
)



**NOTE:** If you get memory errors, during training, lower either `batch_size` (later in the notebook) or `block_size`. 

`block_size` is the maximum sequence length after which sentences get truncated.
It can be changed per dataset.

**SO** check per dataset what the maximum sequence length is, use the next power of 2 or something.

In [6]:
from transformers import TextDatasetForNextSentencePrediction

dataset_dd = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path="./content/dd.txt",
    block_size = 256
)

dataset_swag = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path="./content/swag.txt",
    block_size = 256
)

# **Pretraining**

## **Pretraining schemes**

*Before* we finetune on the multiple choice task, there are 5 different pretraining schemes:

0.   Original pretraining (`model_pt`)
1.   Continued pretraining on DailyDialog (`model_cpt_dd`)
2.   Continued pretraining on SWAG (`model_cpt_swag`)
3.   Continued pretraining on DailyDialog and then SWAG (`model_cpt_dd_swag`)
4.   Continued pretraining on SWAG and then DailyDialog (`model_cpt_swag_dd`)

**NOTE**:

Always run the origininal pretraining as it loads in the originally pretrained model for the other pretraining schemes. And depending on the experiment, run another pretraining in the corresponding section.


### **Original pretraining**

In [7]:
from transformers import BertForPreTraining

# First load in the original pretrained bert model.
model_pt = BertForPreTraining.from_pretrained(model_checkpoint)
for params in model_pt.parameters():
      params.requires_grad = False

for index, layer in enumerate(model_pt.bert.encoder.layer):
      if index >=10: #total of 11 bert layers.
        for params in layer.parameters():
              params.requires_grad = True

from transformers import Trainer, TrainingArguments


Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Continued pretraining on DailyDialog**

In [8]:
training_args = TrainingArguments(
    output_dir= os.path.join(os.getcwd(),'content','cpt'),
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,

    
    save_strategy='epoch',
#     save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model_pt,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_dd,
)

trainer.train()
trainer.save_model("cpt_dd_model")

***** Running training *****
  Num examples = 24631
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 15395


Step,Training Loss
500,2.2627
1000,2.1576
1500,2.1141
2000,2.0656
2500,2.0602
3000,2.0073
3500,1.9971
4000,1.942
4500,1.9322
5000,1.905


Saving model checkpoint to D:\dl4nlp\dl4nlp\content\cpt\checkpoint-3079
Configuration saved in D:\dl4nlp\dl4nlp\content\cpt\checkpoint-3079\config.json
Model weights saved in D:\dl4nlp\dl4nlp\content\cpt\checkpoint-3079\pytorch_model.bin
Saving model checkpoint to D:\dl4nlp\dl4nlp\content\cpt\checkpoint-6158
Configuration saved in D:\dl4nlp\dl4nlp\content\cpt\checkpoint-6158\config.json
Model weights saved in D:\dl4nlp\dl4nlp\content\cpt\checkpoint-6158\pytorch_model.bin
Saving model checkpoint to D:\dl4nlp\dl4nlp\content\cpt\checkpoint-9237
Configuration saved in D:\dl4nlp\dl4nlp\content\cpt\checkpoint-9237\config.json
Model weights saved in D:\dl4nlp\dl4nlp\content\cpt\checkpoint-9237\pytorch_model.bin
Saving model checkpoint to D:\dl4nlp\dl4nlp\content\cpt\checkpoint-12316
Configuration saved in D:\dl4nlp\dl4nlp\content\cpt\checkpoint-12316\config.json
Model weights saved in D:\dl4nlp\dl4nlp\content\cpt\checkpoint-12316\pytorch_model.bin
Saving model checkpoint to D:\dl4nlp\dl4nlp\c

### **Continued pretraining on SWAG**

In [6]:
training_args = TrainingArguments(
    output_dir= os.path.join(os.getcwd(),'content','cpt','swag'),
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
#     save_steps=500,
    
    save_strategy='epoch',
#     save_total_limit=2,
    prediction_loss_only=True,
)
print(os.getcwd())    
trainer = Trainer(
    model=model_pt,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_swag,
)

trainer.train()
# trainer.save_model("cpt_swag_model_meh")

C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp


***** Running training *****
  Num examples = 140904
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 88065


Step,Training Loss
500,2.7577
1000,2.4337
1500,2.311
2000,2.3462
2500,2.2733
3000,2.2541
3500,2.2434
4000,2.2124
4500,2.2384
5000,2.1926


Saving model checkpoint to C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag\checkpoint-17613
Configuration saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag\checkpoint-17613\config.json
Model weights saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag\checkpoint-17613\pytorch_model.bin
Saving model checkpoint to C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag\checkpoint-35226
Configuration saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag\checkpoint-35226\config.json
Model weights saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag\checkpoint-35226\pytorch_model.bin
Saving model checkpoint to C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag\checkpoint-52839
Configuration saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag\checkpoint-52839\config.json
Model weights saved 

TrainOutput(global_step=88065, training_loss=1.7047069378242456, metrics={'train_runtime': 3964.0161, 'train_samples_per_second': 177.729, 'train_steps_per_second': 22.216, 'total_flos': 1.9761744944254176e+16, 'train_loss': 1.7047069378242456, 'epoch': 5.0})

### **Continued pretraining on DailyDialog and then SWAG**

**Note** that you must have first run the pretraining on DailyDialog.

In [7]:
config_fn = 'cpt_dd_model/config.json'
state_dict_fn = 'cpt_dd_model/pytorch_model.bin'
model_cpt_dd = BertForPreTraining.from_pretrained(None, config=config_fn, state_dict=torch.load(state_dict_fn))

training_args = TrainingArguments(
    output_dir= os.path.join(os.getcwd(),'content','cpt','dd_swag'),
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
#     save_steps=500,
    save_strategy='epoch',
#     save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model_cpt_dd,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_swag,
)

trainer.train()
trainer.save_model("cpt_dd_swag_model")

loading configuration file cpt_dd_model/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

All model checkpoint weights were used when initializing BertForPreTraining.

All the weights of BertForPreTraining were initialized from the model checkpoint at None.
If your task is similar to the task the model of the checkpoint was trained on, you can al

Step,Training Loss
500,2.8453
1000,2.5193
1500,2.3708
2000,2.3971
2500,2.3362
3000,2.2957
3500,2.3022
4000,2.2494
4500,2.2758
5000,2.2473


Saving model checkpoint to C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\dd_swag\checkpoint-17613
Configuration saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\dd_swag\checkpoint-17613\config.json
Model weights saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\dd_swag\checkpoint-17613\pytorch_model.bin
Saving model checkpoint to C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\dd_swag\checkpoint-35226
Configuration saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\dd_swag\checkpoint-35226\config.json
Model weights saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\dd_swag\checkpoint-35226\pytorch_model.bin
Saving model checkpoint to C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\dd_swag\checkpoint-52839
Configuration saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\dd_swag\checkpoint-52839\config.j

### **Continued pretraining on SWAG and then DailyDialog**

**Note** that you must have first run the pretraining on SWAG.

In [8]:
config_fn = 'cpt_swag_model/config.json'
state_dict_fn = 'cpt_swag_model/pytorch_model.bin'
model_cpt_swag = BertForPreTraining.from_pretrained(None, config=config_fn, state_dict=torch.load(state_dict_fn))

training_args = TrainingArguments(
    output_dir= os.path.join(os.getcwd(),'content','cpt','swag_dd'),
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
#     save_steps=1000,
    
    save_strategy='epoch',
#     save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model_cpt_swag,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_dd,
)

trainer.train()
trainer.save_model("cpt_swag_dd_model")

loading configuration file cpt_swag_model/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

All model checkpoint weights were used when initializing BertForPreTraining.

All the weights of BertForPreTraining were initialized from the model checkpoint at None.
If your task is similar to the task the model of the checkpoint was trained on, you can 

Step,Training Loss
500,2.353
1000,2.1244
1500,2.0436
2000,1.9897
2500,1.9312
3000,1.9043
3500,1.8816
4000,1.8105
4500,1.7093
5000,1.6968


Saving model checkpoint to C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag_dd\checkpoint-3671
Configuration saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag_dd\checkpoint-3671\config.json
Model weights saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag_dd\checkpoint-3671\pytorch_model.bin
Saving model checkpoint to C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag_dd\checkpoint-7342
Configuration saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag_dd\checkpoint-7342\config.json
Model weights saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag_dd\checkpoint-7342\pytorch_model.bin
Saving model checkpoint to C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag_dd\checkpoint-11013
Configuration saved in C:\Users\Redux Gamer\Desktop\EC2021\ES_2021_G96\dl4nlp\content\cpt\swag_dd\checkpoint-11013\config.json
Mo

# **Finetuning**

Depending on which model you want to finetune upon, change the folder names in `config_fn` and `state_dict_fn`. 


```
# This example below loads in the model, pretrained only on DailyDialog
config_fn = 'cpt_dd_model/config.json'
state_dict_fn = 'cpt_dd_model/pytorch_model.bin'
model_ft = BertForMultipleChoice.from_pretrained(None, config=config_fn, state_dict=torch.load(state_dict_fn))
```



In [20]:
from transformers import BertForMultipleChoice

# This is how to initialize the continued-pretrained model for the multiple choice task.
config_fn = 'cpt_dd_model/config.json'
state_dict_fn = 'cpt_dd_model/pytorch_model.bin'
model_ft = BertForMultipleChoice.from_pretrained(None, config=config_fn, state_dict=torch.load(state_dict_fn))

loading configuration file cpt_dd_model/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

Some weights of the model checkpoint at None were not used when initializing BertForMultipleChoice: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions