# Initial Setups


## (Google Colab use only)

In [1]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    drive.mount('/content/drive')
    
    # If there's a package I need to install separately, do it here
    #!pip install pyro-ppl

    # cd to the appropriate working directory under my Google Drive
    %cd 'drive/My Drive/cs696ds_lexalytics/Language Model Finetuning'
    
    # Install packages specified in requirements
    !pip install -r requirements.txt
    
    # List the directory contents
    !ls

In [2]:
# IPython reloading magic
%load_ext autoreload
%autoreload 2

## Experiment ID

**NOTE**: The following `experiment_id` MUST BE CHANGED in order to avoid overwriting the files from other experiments!!!!!!

In [3]:
# We will use the following string ID to identify this particular (training) experiments
# in directory paths and other settings
experiment_id = 'lm_further_pretraining_bert_yelp_restaurants'

## Package Imports

In [4]:
import sys
import os
import random

import numpy as np
import torch
import transformers
import datasets

import utils

# Random seed settings
random_seed = 696
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Print version information
print("Python version: " + sys.version)
print("NumPy version: " + np.__version__)
print("PyTorch version: " + torch.__version__)
print("Transformers version: " + transformers.__version__)

Python version: 3.7.10 (default, Feb 26 2021, 18:47:35) 
[GCC 7.3.0]
NumPy version: 1.19.2
PyTorch version: 1.7.1
Transformers version: 4.3.3


## PyTorch GPU settings

In [5]:
if torch.cuda.is_available():
    # https://github.com/UKPLab/sentence-transformers/issues/413
    torch.multiprocessing.set_start_method('spawn')
    
    # https://github.com/pytorch/pytorch/issues/37377
    os.environ['MKL_SERVICE_FORCE_INTEL'] = "1"
    
    torch_device = torch.device('cuda')

    # Set this to True to make your output immediately reproducible
    # Note: https://pytorch.org/docs/stable/notes/randomness.html
    torch.backends.cudnn.deterministic = False
    
    # Disable 'benchmark' mode: Set this False if you want to measure running times more fairly
    # Note: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
    torch.backends.cudnn.benchmark = True
    
    # Faster Host to GPU copies with page-locked memory
    use_pin_memory = True 

    # CUDA libraries version information
    print("CUDA Version: " + str(torch.version.cuda))
    print("cuDNN Version: " + str(torch.backends.cudnn.version()))
    print("CUDA Device Name: " + str(torch.cuda.get_device_name()))
    print("CUDA Capabilities: "+ str(torch.cuda.get_device_capability()))
else:
    torch_device = torch.device('cpu')
    use_pin_memory = False

print()
print("PyTorch device selected:", torch_device)

CUDA Version: 11.0
cuDNN Version: 8005
CUDA Device Name: GeForce GTX TITAN X
CUDA Capabilities: (5, 2)

PyTorch device selected: cuda


# Further pre-training

## Load the BERT-base model

In [6]:
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased", cache_dir='./bert_base_cache')
model = transformers.AutoModelForMaskedLM.from_pretrained("bert-base-uncased", cache_dir='./bert_base_cache')

model = model.to(torch_device)
model = torch.nn.DataParallel(model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Load the Yelp restaurants dataset

In [7]:
yelp = datasets.load_dataset(
    './dataset_scripts/yelp_restaurants',
    data_files={
        'train': 'dataset_files/yelp_restaurants/yelp_academic_dataset_review.json',
        'restaurant_ids': 'dataset_files/yelp_restaurants/restaurantIDs.txt',
    },
    cache_dir='./dataset_cache')

Using custom data configuration default


Downloading and preparing dataset yelp_restaurants/default-ceef9e04fb6ab232 (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to ./dataset_cache/yelp_restaurants/default-ceef9e04fb6ab232/0.0.0/1a6412d257d1f098c2e7a0ac3a450b82eb6f35095fa78cc3ff87d9a1b65368f4...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset yelp_restaurants downloaded and prepared to ./dataset_cache/yelp_restaurants/default-ceef9e04fb6ab232/0.0.0/1a6412d257d1f098c2e7a0ac3a450b82eb6f35095fa78cc3ff87d9a1b65368f4. Subsequent calls will reuse this data.


In [8]:
data_yelp_train = yelp['train']

In [9]:
print("Number of training data:", len(data_yelp_train))

Number of training data: 2152007


In [10]:
# Check out how individual data points look like
print(data_yelp_train[696])

{'id': 'EUgMR__LDQYjdD6qlWgrlw', 'stars': 1.0, 'text': "Burgers is bland and tasteless, relies on toppings. So expensive. Hot dog is tiny and terribly salty. Fries are similar to Harveys, don't compare to New York Fries. Would not go back."}


### Preprocessing: Encode the text with Tokenizer

In [11]:
train_dataset_pretraining = data_yelp_train.map(
    lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=256),
    remove_columns=data_yelp_train.column_names,
    batched=True, num_proc=8)











## Pre-train further

### Training settings

In [12]:
# Our new 'smart' masking
collator = utils.DataCollatorForSmartMLM(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    prob_replace_with_mask=0.8,
    prob_replace_with_random=0.1,
)

In [13]:
training_args = transformers.TrainingArguments(
    output_dir=os.path.join('.', 'progress', experiment_id, 'results'), # output directory
    overwrite_output_dir=True,
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=32,  # 32 * 4 GPUs = 128 Total
    evaluation_strategy='epoch',
    warmup_steps=5000,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=os.path.join('.', 'progress', experiment_id, 'logs'), # directory for storing logs
    seed=random_seed,
    fp16=True,
    fp16_opt_level='O2',
    fp16_backend='amp',
    prediction_loss_only=True,
    load_best_model_at_end=True,
    dataloader_num_workers=8,
    dataloader_pin_memory=False
)

In [14]:
print(training_args.n_gpu)

4


In [15]:
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=collator, # do the masking on the go
    train_dataset=train_dataset_pretraining,
)

### Training loop

In [16]:
%%time
trainer.train()

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/bseoh/miniconda3/envs/zeroshotatsc/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/bseoh/miniconda3/envs/zeroshotatsc/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/mnt/nfs/work1/696ds-s21/bseoh/lexalytics/utils/data_collator_smart_mlm.py", line 80, in __call__
    batch = self.tokenizer.pad(examples, return_tensors="pt")
  File "/home/bseoh/miniconda3/envs/zeroshotatsc/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 2641, in pad
    "You should supply an encoding or a list of encodings to this method"
ValueError: You should supply an encoding or a list of encodings to this methodthat includes input_ids, but you provided []


### Save the model to the local directory

In [None]:
trainer.save_model(os.path.join('.', 'trained_models', experiment_id))

In [None]:
tokenizer.save_pretrained(os.path.join('.', 'trained_models', experiment_id))

## LM Evaluation

In [None]:
eval_results = trainer.evaluate()

In [None]:
print(eval_results)

perplexity = np.exp(eval_results["eval_loss"])

print(perplexity)

## Playing with my own input sentences

In [None]:
example = f"""The {tokenizer.mask_token} of {tokenizer.mask_token} is awful, but its {tokenizer.mask_token} is fantastic."""

example_encoded = tokenizer.encode(example, add_special_tokens=True, return_tensors="pt").to(torch_device)

# Let's decode this back just to see how they were actually encoded
example_tokens = []

for id in example_encoded[0]:
    example_tokens.append(tokenizer.convert_ids_to_tokens(id.item()))

print(example_tokens)

In [None]:
example_prediction = model(example_encoded)

example_prediction_argmax = torch.argmax(example_prediction[0], dim=-1)[0]

print(example_prediction_argmax)

print(tokenizer.decode(example_prediction_argmax))