<a href="https://colab.research.google.com/github/alexgaskell10/NLP_Translation/blob/master/notebooks/stuff.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# ! pip install transformers



In [0]:
import torch
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.nn import Linear
from torch import optim
torch.set_printoptions(4)

**Loading BERT**

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
with torch.no_grad():
    hidden_states = model(input_ids)  # Models outputs are now tuples
    last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples

print(len(hidden_states), last_hidden_states.shape)

2 torch.Size([1, 9, 768])


In [0]:
# Download and unzip the data
from os.path import exists
if not exists('ende_data.zip'):
    !wget -O ende_data.zip https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d
    !unzip ende_data.zip

In [14]:
# Check the files
import io

#English-German
print("---EN-DE---")
print()

with open("./train.ende.src", "r") as ende_src:
    print("Source: ",ende_src.readline())
with open("./train.ende.mt", "r") as ende_mt:
    print("Translation: ",ende_mt.readline())
with open("./train.ende.scores", "r") as ende_scores:
    print("Score: ",ende_scores.readline())


---EN-DE---

Source:  José Ortega y Gasset visited Husserl at Freiburg in 1934.

Translation:  1934 besuchte José Ortega y Gasset Husserl in Freiburg.

Score:  1.1016968715664406



In [0]:
# Load data into variables
with open("./train.ende.src", "r") as ende_src:
    en_train = ende_src.read().split('\n')[:-1]
with open("./train.ende.mt", "r") as ende_src:
    de_train = ende_src.read().split('\n')[:-1]
with open("./train.ende.scores", "r") as ende_src:
    train_scores = [float(x) for x in ende_src.read().split('\n')[:-1]]

In [0]:
# Convert input sequences to correct format

# Tokenize English
inputs_en = []
max_len_en = 0
    
for i in range(len(en_train)):
    seq = en_train[i][:-1]
    input_ids = torch.tensor([tokenizer.encode(seq, add_special_tokens=True)])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    inputs_en.append(input_ids)
    if input_ids.shape[-1] > max_len_en:
        max_len_en = input_ids.shape[-1]

# Tokenize German
inputs_de = []
max_len_de = 0

for i in range(len(en_train)):
    seq = de_train[i][:-1]
    input_ids = torch.tensor([tokenizer.encode(seq, add_special_tokens=True)])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    inputs_de.append(input_ids)
    if input_ids.shape[-1] > max_len_de:
        max_len_de = input_ids.shape[-1]

# Combine tokens into single tensor
inp_tensor = torch.zeros((len(en_train), max_len_en + max_len_de - 2), requires_grad=True)      # <-- -2 because special tokens are not necessary at beginning of German sequence

for i in range(len(inputs_en)):
    # Add English tokens
    en_tokens = inputs_en[i].squeeze()
    inp_tensor[i, : len(en_tokens)] = en_tokens

    # Add German tokens
    de_tokens = inputs_de[i][:,2:].squeeze()      # <-- ignore first 2 tokens as these are special tokens and unnecessary in this case
    inp_tensor[i, max_len_en : max_len_en + len(de_tokens)] = de_tokens


In [0]:
USE_GPU = True
dtype = torch.float32 

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = model.to(device=device)
inp_tensor = inp_tensor.to(device=device, dtype=torch.long)


# batches = torch.split(inp_tensor, 1000, dim=0)
# list_bert_embs = []
# for X in batches:
#     with torch.no_grad():
#         last_hidden_states = model(X)[0]    # <-- take word embeddings ([1] gives sentence embeddings)

#     list_bert_embs.append(last_hidden_states)

#     print(last_hidden_states.shape)

#     torch.cuda.empty_cache()

# bert_embs = torch.cat(list_bert_embs, dim=0)

In [52]:
class BertRegressor:
    def __init__(self, input_shape):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased').to(device)
        self.linear = Linear(in_features = input_shape[-1] * 768,                   # <-- 768 is the dim of Bert embedding
                             out_features = 1, bias = True).to(device)

    def forward(self, X):
        X = self.bert(X)[0].view(X.shape[0], -1)
        print(X.requires_grad)
        preds = self.linear(X)
        return preds

    def loss(self, scores, pred_scores):
        y = torch.tensor(scores).view(-1,1).to(device)
        mse = torch.sum((pred_scores - y)**2) / len(scores)
        return mse

    def zero_grad(self):
        self.bert.zero_grad()
        self.linear.zero_grad()

    def __call__(self, X):
        return self.forward(X)
        

X = inp_tensor[:10]
y = train_scores[:10]
br = BertRegressor(X.shape)

optimizer = AdamW(br.linear.parameters(), lr = 2e-5, eps = 1e-8)
optimizer_ = AdamW(br.bert.parameters(), lr = 2e-5, eps = 1e-8)

# with torch.no_grad():
#     embs = br.bert(X)[0].view(X.shape[0], -1)

# embs.requires_grad = True

for i in range(10):
    br.zero_grad()
    torch.cuda.empty_cache()

    # y_pred = br.linear(embs)
    y_pred = br(X)
    loss = br.loss(y, y_pred)
    print(loss, br.linear.weight.data.max(), br.linear.weight.data.min())

    loss.backward()
    optimizer.step()
    optimizer_.step()

torch.cuda.empty_cache()

True
tensor(0.9442, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0031, device='cuda:0') tensor(-0.0031, device='cuda:0')
True
tensor(2.6090, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0032, device='cuda:0') tensor(-0.0032, device='cuda:0')
True
tensor(1.1504, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0032, device='cuda:0') tensor(-0.0032, device='cuda:0')
True
tensor(0.4209, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0032, device='cuda:0') tensor(-0.0032, device='cuda:0')
True
tensor(0.2937, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0032, device='cuda:0') tensor(-0.0032, device='cuda:0')
True
tensor(0.1886, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0032, device='cuda:0') tensor(-0.0032, device='cuda:0')
True
tensor(0.0701, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0032, device='cuda:0') tensor(-0.0032, device='cuda:0')
True
tensor(0.0442, device='cuda:0', grad_fn=<DivBackward0>) tensor(0.0032, device='cuda:0') tensor(-0.0033, device='c

In [46]:
for i in br.linear.parameters():
    print(i.next())

AttributeError: ignored

In [0]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    

In [0]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

In [0]:
# Use 90% for training and 10% for validation.
X_train, X_test, y_train, y_test = train_test_split(input_ids, labels, random_state=1, test_size=0.1)

# # Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# optimizer = AdamW(model.parameters(),
#                   lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
#                   eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
#                 )


# # Number of training epochs (authors recommend between 2 and 4)
# epochs = 4

# # Total number of training steps is number of batches * number of epochs.
# total_steps = len(train_dataloader) * epochs

# # Create the learning rate scheduler.
# scheduler = get_linear_schedule_with_warmup(optimizer, 
#                                             num_warmup_steps = 0,
#                                             num_training_steps = total_steps)