In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# get the environment ready
!pip install transformers
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Collecting transformers
  Downloading transformers-4.12.4-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.2 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 62.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, tr

Get the data ready for model

In [3]:
# load and process mom & son data
import pandas as pd
import numpy
# Load the dataset into a pandas dataframe.
df_mom = pd.read_csv("/content/drive/MyDrive/iss/NLP/perfectmom/data/dialog_mom", header=0, names=['index','mom_sentence','sentiment','sentiment_score'])
df_son = pd.read_csv("/content/drive/MyDrive/iss/NLP/perfectmom/data/dialog_son", header=0, names=['index','son_sentence','sentiment','sentiment_score'])

# Display the first 10 rows from the data.
df_son.head(10)
sentences_labels = df_mom.mom_sentence.values
sentences=df_son.son_sentence.values



# Tokenize all of the sentences and map the tokens to thier word IDs.
from transformers import EncoderDecoderModel, BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

input_ids = []
labels=[]

for sent in sentences:
  encoded_dict=tokenizer(sent,padding = 'max_length',truncation=True, max_length=16,return_tensors="pt").input_ids
  # encoded_dict=tokenizer(sent,return_tensors="pt").input_ids
  input_ids.append(encoded_dict)
for label in sentences_labels:
  encoded_dict=tokenizer(label,padding = 'max_length',truncation=True, max_length=16,return_tensors="pt").input_ids
  # encoded_dict=tokenizer(label,return_tensors="pt").input_ids
  labels.append(encoded_dict)

input_ids = torch.cat(input_ids, dim=0)
labels = torch.cat(labels, dim=0)

print(input_ids[51])
print(labels[51])


Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

tensor([ 101, 5643, 5643, 6624,  749,  720,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0])
tensor([ 101,  679, 6206,  102,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])


In [4]:
#Split Dataset for Training and Validation
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids,labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

3,110 training samples
  164 validation samples


In [5]:
# create a datalogger
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

Get Model and the setting done

In [7]:
# get pretrained model
from transformers import EncoderDecoderModel,AdamW
import torch
import numpy as np
import time
import datetime

epochs = 2

model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-chinese', 'bert-base-chinese') # initialize Bert2Bert from pre-trained checkpoints
model.cuda()

# training
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.


# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
print(total_steps)

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertLMHeadModel: ['cls.seq_relationship.bias', 'cls.seq_relations

196


Get the training done

In [8]:
import random
import numpy as np

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()


# For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        # b_input_mask = batch[1].to(device)
        b_labels = batch[1].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        outputs = model(input_ids=b_input_ids, labels=b_labels)
        model

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += outputs.loss.item()

        # Perform a backward pass to calculate the gradients.
        outputs.loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        # b_input_mask = batch[1].to(device)
        b_labels = batch[1].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            # outputs = model(b_input_ids, 
            #                        token_type_ids=None, 
            #                        attention_mask=b_input_mask,
            #                        labels=b_labels)
            outputs = model(input_ids=b_input_ids, labels=b_labels)
            loss, logits = outputs.loss, outputs.logits
        total_eval_loss += loss.item()

    #     # # Accumulate the validation loss.
    #     # total_eval_loss += outputs.loss.item()

    #     # # Move logits and labels to CPU
    #     # logits = outputs.logits.detach().cpu().numpy()
    #     # label_ids = b_labels.to('cpu').numpy()
        
    #     # # Move logits and labels to CPU
    #     # logits = outputs.logits.detach().cpu().numpy()
    #     # label_ids = b_labels.to('cpu').numpy()

    #     # # Calculate the accuracy for this batch of test sentences, and
    #     # # accumulate it over all batches.
    #     total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # # Report the final accuracy for this validation run.
    # avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    # print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # # Calculate the average loss over all of the batches.
    # avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # # Measure how long the validation run took.
    # validation_time = format_time(time.time() - t0)
    
    # print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    # print("  Validation took: {:}".format(validation_time))

    # # Record all statistics from this epoch.
    # training_stats.append(
    #     {
    #         'epoch': epoch_i + 1,
    #         'Training Loss': avg_train_loss,
    #         'Valid. Loss': avg_val_loss,
    #         'Valid. Accur.': avg_val_accuracy,
    #         'Training Time': training_time,
    #         'Validation Time': validation_time
    #     }
    # )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))       

# model.save_pretrained("bert2bert")
# model = EncoderDecoderModel.from_pretrained("bert2bert")
# input_ids = tokenizer("今天是个好日子", return_tensors="pt").input_ids
# generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)



Training...




  Batch    40  of     98.    Elapsed: 0:00:08.
  Batch    80  of     98.    Elapsed: 0:00:16.

  Average training loss: 4.46
  Training epcoh took: 0:00:20

Running Validation...

Training...
  Batch    40  of     98.    Elapsed: 0:00:08.
  Batch    80  of     98.    Elapsed: 0:00:16.

  Average training loss: 2.81
  Training epcoh took: 0:00:20

Running Validation...

Training complete!
Total training took 0:00:40 (h:mm:ss)


In [9]:
model.save_pretrained("bert2bert_mom_son")
model1 = EncoderDecoderModel.from_pretrained("bert2bert_mom_son")

In [16]:
input_ids = tokenizer("我姥姥最近怎么样", return_tensors="pt").input_ids
generated = model1.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)

tokenizer.decode(generated[0], skip_special_tokens=True)
tokenizer.convert_ids_to_tokens(generated[0])

['[PAD]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]']

In [None]:
# from transformers import EncoderDecoderModel, BertTokenizer
# import torch

# tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-chinese', 'bert-base-chinese') # initialize Bert2Bert from pre-trained checkpoints

# # training
# model.config.decoder_start_token_id = tokenizer.cls_token_id
# model.config.pad_token_id = tokenizer.pad_token_id
# model.config.vocab_size = model.config.decoder.vocab_size

# input_ids = tokenizer("今天是个好日子", return_tensors="pt").input_ids
# labels = tokenizer("你说的对", return_tensors="pt").input_ids
# outputs = model(input_ids=input_ids, labels=labels)
# loss, logits = outputs.loss, outputs.logits

# # save and load from pretrained
# model.save_pretrained("bert2bert_sample")
# model = EncoderDecoderModel.from_pretrained("bert2bert_sample")

# # generation
# generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
# tokenizer.convert_ids_to_tokens(generated[0])
# tokenizer.decode(generated[0], skip_special_tokens=True)



load data and enbe