# Finetuning BERT 🚀

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/89/f07e7a884072ad37b1b6b1578637ab36152e0251d74abb950d967a59904e/transformers-4.3.1-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 8.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 18.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/fd/5b/44baae602e0a30bcc53fbdbc60bd940c15e143d252d658dfdefce736ece5/tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 54.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=defd8be01d3

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
import gc

tqdm.pandas()

In [None]:
# Pick the compute device once: the CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and which one sits at index 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
!nvidia-smi

Tue Feb  9 14:07:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P8    11W /  70W |     10MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import logging
# Send all log records to `log_bert.log`, appending across runs (filemode 'a').
# The root logger is set to DEBUG, so debug records emitted by third-party
# libraries (e.g. HTTP connection / file-lock messages) also end up in the file.
logging.basicConfig(
    filename="log_bert.log",
    filemode='a',
    # e.g. "14:08:36 INFO <message>"
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%H:%M:%S',
    level=logging.DEBUG
)

# Load data

In [None]:
!gdown "https://drive.google.com/uc?id=1RqwwBkarpAEZ0Zs1SVxndjI5mhtwZre5"

Downloading...
From: https://drive.google.com/uc?id=1RqwwBkarpAEZ0Zs1SVxndjI5mhtwZre5
To: /content/abstract_test_full.csv
919MB [00:09, 94.5MB/s]


In [None]:
!gdown "https://drive.google.com/uc?id=1w8cCfCd9A_Ph6jIVTs34pVZMOhKTiX0m"

Downloading...
From: https://drive.google.com/uc?id=1w8cCfCd9A_Ph6jIVTs34pVZMOhKTiX0m
To: /content/abstract_train_full.csv
103MB [00:00, 126MB/s]  


In [None]:
# Read the two CSV files fetched by the `gdown` cells above.
# `index_col=0` uses the first CSV column as the DataFrame index.
df_train = pd.read_csv("abstract_train_full.csv", index_col=0)
df_test = pd.read_csv("abstract_test_full.csv", index_col=0)

  mask |= (ar1 == a)


In [None]:
# Replace missing values by empty strings so the tokenizer always receives a str.
# The results are assigned back instead of using `inplace=True`: calling
# `fillna(..., inplace=True)` on a selected column (`df_test["abstract"]`) is a
# chained in-place mutation that pandas does not guarantee to propagate to
# `df_test` (it never does when Copy-on-Write is enabled).
df_train = df_train.fillna("")
# Only the abstracts are filled for the test set: `get_tokens_labels` relies on
# NaN values in `h_index` to recognise unlabeled (test) data.
df_test["abstract"] = df_test["abstract"].fillna("")

In [None]:
df_train

Unnamed: 0,authorID,abstract,h_index
0,7248981,fuelled bring internet things concept real int...,11.0
1,7248981,recent advances mobile devices network technol...,11.0
2,7248981,several research groups working designing new ...,11.0
3,7248981,next generation internet provide ubiquitous co...,11.0
4,7248981,recent huge trend towards running network inte...,11.0
...,...,...,...
231235,2908220509,,1.0
231236,2908220509,,1.0
231237,2908220509,,1.0
231238,2908220509,,1.0


In [None]:
df_test

Unnamed: 0,authorID,abstract,h_index
0,1036332,underground utility conveyance may precisely l...,
1,1036332,invention relates method system wireless netwo...,
2,1036332,present invention system method searching larg...,
3,1036332,method apparatus calculating engineered capaci...,
4,1036332,method apparatus detecting abnormal calling ac...,
...,...,...,...
2081145,2908506980,,
2081146,2908506980,,
2081147,2908506980,,
2081148,2908506980,,


# Hyperparameters

In [None]:
from transformers import BertModel, BertTokenizer, RobertaTokenizer, RobertaModel, AutoTokenizer, AutoModel

# Registry of supported backbones.
# Maps a short model name to a tuple
#   (model class, tokenizer class, pretrained weights identifier)
# which is unpacked in the "Create model and tokenizer" cell and passed to
# `from_pretrained`.
MODELS = {
    "bert-base-uncased": (BertModel, BertTokenizer, "bert-base-uncased"),
    "bert-large-uncased": (BertModel, BertTokenizer, "bert-large-uncased"),
    "roberta-base": (RobertaModel, RobertaTokenizer, "roberta-base"),
    "roberta-large": (RobertaModel, RobertaTokenizer, "roberta-large"),
    "scibert_uncased": (AutoModel, AutoTokenizer, "allenai/scibert_scivocab_uncased")
}

In [None]:
# Hyperparameters
# Number of training rows to keep (the first N rows of df_train are used).
N = 50
# Worker processes used by the DataLoaders (4 * num_gpus, as recommended by the
# PyTorch team). Evaluates to 0 when no GPU is visible.
NUM_WORKERS = 4*torch.cuda.device_count()
# Maximum length of a tokenized sequence (used for padding and truncation).
MAX_TOKEN_LENGTH = 512
# Key into the MODELS registry selecting the backbone to fine-tune.
MODEL_NAME = "scibert_uncased"
# True: first train only the head layers (backbone frozen), then the whole network.
# False: train the whole network directly.
USE_DUAL_TRAINING = True
# Whether to use per-layer decreasing learning rates for the transformer
# (see _get_optimization_params).
USE_LR_SCHEME = False
# Keyword arguments forwarded to fit_and_eval for the head-only phase.
HEAD_PARAMS = {
    "epochs": 3,
    "batch_size": 32,
    "lr": 1e-4
}
# Keyword arguments forwarded to fit_and_eval for the full-network phase.
# `lr_transfo` / `lr_decay` are the base learning rate and per-layer decay
# factor of the transformer when USE_LR_SCHEME is enabled.
BODY_PARAMS = {
    "epochs": 2,
    "batch_size": 6,
    "lr": 5e-5,
    "lr_transfo": 3e-5,
    "lr_decay": 0.95
}
# Betas for AdamW.
ADAMW_BETAS = (0.9, 0.999)
# Number of test rows tokenized at once by batch_predict: the test DataFrame is
# too large to tokenize in one go, so it is processed in roughly 1000 chunks.
BATCH_SIZE_DF = len(df_test) // 1000
# Batch size of the DataLoader used for prediction on the test set.
BATCH_SIZE_TEST = 128

In [None]:
# Snapshot of every hyperparameter of this run, written to the log file so that
# results in `log_bert.log` can be matched to the configuration that produced them.
hyperparams = {
    "N": N,
    "NUM_WORKERS": NUM_WORKERS,
    "MAX_TOKEN_LENGTH": MAX_TOKEN_LENGTH,
    "MODEL_NAME": MODEL_NAME,
    "USE_DUAL_TRAINING": USE_DUAL_TRAINING,
    "USE_LR_SCHEME": USE_LR_SCHEME,
    "HEAD_PARAMS": HEAD_PARAMS,
    "BODY_PARAMS": BODY_PARAMS,
    "ADAMW_BETAS": ADAMW_BETAS,
    "BATCH_SIZE_DF": BATCH_SIZE_DF,
    "BATCH_SIZE_TEST": BATCH_SIZE_TEST
}
logging.info(f"Parameters for the current run: {hyperparams}")

## Create model and tokenizer

In [None]:
# Resolve the classes and weights identifier registered under MODEL_NAME,
# then load the pretrained tokenizer and backbone.
model_class, tokenizer_class, pretrained_weights = MODELS[MODEL_NAME]
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
# output_hidden_states=True: the custom head consumes the per-layer hidden
# states (see _produce_hidden_state_combination), not only the final output.
base_model = model_class.from_pretrained(
    pretrained_weights, output_hidden_states=True
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




In [None]:
# Keep only the first N training rows (quick experiments).
# NOTE: this overwrites df_train; re-run the data loading cell to get the full set back.
df_train = df_train.iloc[:N]

# Tokenize

In [None]:
def tokenize_sentence(sentence, tokenizer, **tokenizer_kwargs):
    """Encode a single sentence with ``tokenizer.encode_plus``.

    Args:
        sentence: The text to encode.
        tokenizer: Any object exposing an ``encode_plus`` method.
        **tokenizer_kwargs: Forwarded unchanged to ``encode_plus``.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for this sentence.
    """
    encode = tokenizer.encode_plus
    return encode(sentence, **tokenizer_kwargs)

In [None]:
from torch.utils.data import TensorDataset, random_split

def get_tokens_labels(df, tokenizer, progress=True, **tokenizer_kwargs):
    """Tokenize the ``abstract`` column of ``df`` and wrap it in a TensorDataset.

    Args:
        df: DataFrame with an ``abstract`` column (texts) and an ``h_index``
            column (regression targets, NaN for unlabeled rows).
        tokenizer: Tokenizer passed to ``tokenize_sentence``.
        progress: When True, show a tqdm progress bar for both passes.
        **tokenizer_kwargs: Forwarded to the tokenizer. The encoded
            ``input_ids`` are concatenated with ``torch.cat``, so the tokenizer
            must return tensors (e.g. ``return_tensors="pt"``).

    Returns:
        ``TensorDataset(input_ids)`` when at least one ``h_index`` value is NaN
        (test mode), otherwise ``TensorDataset(input_ids, labels)`` with float32
        labels.
    """
    def _with_progress(items):
        # Wrap in a progress bar only when requested.
        return tqdm(items) if progress else items

    sentences = df["abstract"].values

    # First pass: encode every abstract.
    encodings = []
    for text in _with_progress(sentences):
        encodings.append(tokenize_sentence(text, tokenizer, **tokenizer_kwargs))

    # Second pass: keep only the token ids and stack them along the batch axis.
    id_rows = []
    for encoding in _with_progress(encodings):
        id_rows.append(encoding["input_ids"])
    input_ids = torch.cat(id_rows, dim=0)

    # Any missing label means we are dealing with the (unlabeled) test set.
    has_missing_labels = df["h_index"].isna().sum() > 0
    if has_missing_labels:
        return TensorDataset(input_ids)

    labels = torch.tensor(df["h_index"].values, dtype=torch.float32)
    # Show the first example, raw and as token ids, as a sanity check.
    print('Original: ', sentences[0])
    print('Token IDs:', input_ids[0])
    return TensorDataset(input_ids, labels)

In [None]:
# Tokenizer options shared by training and prediction.
tokenizer_kwargs = {
    "add_special_tokens": True, # Add '[CLS]' and '[SEP]'
    "max_length": MAX_TOKEN_LENGTH,           # Pad & truncate all sentences. TODO: to be changed
    "padding": "max_length",    # Pad every sequence up to max_length.
    "return_attention_mask": True,   # Construct attn. masks.
    "return_tensors": "pt",     # Return pytorch tensors.
    "truncation": True          # Cut sequences longer than max_length.
}

logging.info(f"Tokenizer arguments: {tokenizer_kwargs}")
# Tokenize the training DataFrame into a TensorDataset of (input_ids, labels).
dataset = get_tokens_labels(df_train, tokenizer, **tokenizer_kwargs)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Original:  fuelled bring internet things concept real internet engineering task force working standard allows vast number smart objects deployed local wireless sensor networks using huge address space data information harvesting security point open security threats local network cryptography techniques applied front line defence deterrent easily broken weak secure nature lowpan devices wireless compromised nodes could lead insider attacks without detected cryptography intrusion detection system primarily needed second line defence monitor network operations raise alarm case paper analyses potential security threats reviews current solutions countering discovers three novel security namely rank local repair resource depleting seriously affecting routing protocol lossy routing protocol used establish network new ids concept introduced countermeasure method securing routing protocol lossy network topology internal qos potential research works also presented provide baseline reference res

# Split and DataLoaders


## Train/validation split

In [None]:
# Create a 90-10 train-validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
# A dedicated, explicitly seeded generator is used because the global seeds are
# only set further down in the notebook: without it, the split (and therefore
# the validation loss) would change from one run to the next.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

logging.info('{:>5,} training samples'.format(train_size))
logging.info('{:>5,} validation samples'.format(val_size))

   45 training samples
    5 validation samples


## Custom Batch Sampler to speed up the training (courtesy of Theo le s)

In [None]:
from torch.utils.data.sampler import BatchSampler

class LenMatchBatchSampler(BatchSampler):
    """
    Custom PyTorch batch sampler that generates batches of similar length.

    Samples are bucketed by their number of padding tokens (token id 0) and a
    batch is emitted as soon as a bucket holds ``batch_size`` indices. Samples
    left in partially filled buckets are then batched together.
    Used alongside ``trim_tensors``, it helps speed up training.

    The wrapped sampler must expose ``data_source`` such that
    ``data_source[idx][0]`` is the tensor of token ids of sample ``idx``.
    """

    # Two samples share a bucket when their padding counts fall in the same
    # multiple of this width.
    BUCKET_WIDTH = 64

    def __iter__(self):
        # bucket key -> indices waiting for their batch to fill up. A dict is
        # used instead of a fixed-size list: it has no upper bound on the key
        # and avoids the `[[]] * n` pitfall of every slot sharing one list.
        buckets = {}
        yielded = 0

        for idx in self.sampler:
            n_padding = int(torch.sum(self.sampler.data_source[idx][0] == 0))
            key = n_padding // self.BUCKET_WIDTH

            bucket = buckets.setdefault(key, [])
            bucket.append(idx)

            if len(bucket) == self.batch_size:
                # Start a fresh list for this bucket; the full one is handed out.
                buckets[key] = []
                yield bucket
                yielded += 1

        # Batch whatever is left, walking the buckets from least to most padded.
        batch = []
        leftover = [idx for key in sorted(buckets) for idx in buckets[key]]

        for idx in leftover:
            batch.append(idx)
            if len(batch) == self.batch_size:
                yielded += 1
                yield batch
                batch = []

        if len(batch) > 0 and not self.drop_last:
            yielded += 1
            yield batch

        assert len(self) == yielded, "produced an inccorect number of batches. expected %i, but yielded %i" %(len(self), yielded)



def trim_tensors(tokens, min_len=10):
    """
    Cut trailing columns of a batch of token ids to shorten the padding.

    The kept width is the largest number of non-zero ids found in any row,
    but never less than ``min_len``. Slicing beyond the actual width simply
    returns the batch unchanged.
    This speeds up training for RNNs and Transformers.
    """
    non_padding_per_row = (tokens != 0).sum(dim=1)
    longest_row = int(non_padding_per_row.max())
    keep = longest_row if longest_row >= min_len else min_len
    return tokens[:, :keep]

# Model classes

In [None]:
# Here you have to be explicit about your hs blending strategy: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
def _produce_hidden_state_combination(hidden_states):
    # shape of hidden state is (num_transformers_layers, batch_size, seq_len, hidden_dim)
    embedding = hidden_states[0].mean(1)
    *_, h9, h10, h11, h12, _ = hidden_states
    concat_last_four = torch.cat((h9, h10, h11, h12), dim=1)
    mean_last_four = concat_last_four.mean(1)
    hs_concat = torch.cat((embedding, mean_last_four), dim=1)
    return hs_concat

In [None]:
# Here you have to be explicit about what layers you want for the final pooler
def _produce_pooler(pooler_in_features, pooler_out_features, device=device):
    """Build the pooling head placed on top of the blended hidden states.

    The head is Linear -> Tanh -> Dropout(0.4) -> Linear -> Tanh -> Dropout(0.2)
    with a 512-unit intermediate layer, moved to ``device``.

    Args:
        pooler_in_features: Size of the input feature vector.
        pooler_out_features: Size of the output feature vector.
        device: Device the layers are moved to.

    Returns:
        The head as an ``nn.Sequential``.
    """
    hidden_width = 512
    layers = [
        nn.Linear(pooler_in_features, hidden_width),
        nn.Tanh(),
        nn.Dropout(0.4),
        nn.Linear(hidden_width, pooler_out_features),
        nn.Tanh(),
        nn.Dropout(0.2),
    ]
    head = nn.Sequential(*layers)
    return head.to(device)

In [None]:
import torch.nn as nn

class TransformerModel(nn.Module):
    """Pretrained transformer backbone followed by a custom regression head.

    The head is made of ``h_index_pooler`` (see ``_produce_pooler``) and a final
    linear layer ``h_index_top_layer``.

    Args:
        base_model: Pretrained backbone. It is called with ``return_dict=False``
            and must return ``(sequence_output, pooled_output, hidden_states)``.
        num_classes: Output size of the final linear layer.
        pooler_in_features: Input size of the head. When falsy, the output size
            of the backbone's own pooler is used.
        pooler_out_features: Output size of the pooling head.
    """

    def __init__(
        self, base_model, num_classes=1, pooler_in_features=None, pooler_out_features=256
    ):
        super().__init__()
        self.base_model = base_model
        # Default to the width of the backbone's pooler when no size is given.
        self.pooler_in_features = (
            pooler_in_features or self.base_model.pooler.dense.out_features
        )
        self.h_index_pooler = _produce_pooler(
            self.pooler_in_features, pooler_out_features
        )
        self.h_index_top_layer = nn.Linear(pooler_out_features, num_classes)

    def forward(self, ids):
        """Predict from a batch of token ids; id 0 is treated as padding.

        Returns the head output with dimension 1 squeezed, i.e. one value per
        sample when ``num_classes`` is 1.
        """
        # Positions holding a real token (non-zero id) are attended to.
        padding_mask = ids > 0
        _, _, hidden_states = self.base_model(
            ids,
            attention_mask=padding_mask,
            return_dict=False
        )
        # One blended feature vector per sample.
        features = _produce_hidden_state_combination(hidden_states)
        pooled = self.h_index_pooler(features)
        head_output = self.h_index_top_layer(pooled)
        return torch.squeeze(head_output, 1)

# Training loop

In [None]:
import time
import datetime
import random

def format_time(elapsed):
    """Format a duration given in seconds as ``h:mm:ss``.

    The duration is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return "{}".format(duration)

In [None]:
# Helpers to freeze / unfreeze a network or some of its layers
def freeze(model):
    """Disable gradient computation for every parameter of ``model``."""
    for weight in model.parameters():
        weight.requires_grad_(False)

def unfreeze(modem):
    """Enable gradient computation for every parameter of the given model.

    The parameter is named ``modem`` (kept as-is so keyword callers keep working).
    """
    for weight in modem.parameters():
        weight.requires_grad_(True)

def unfreeze_layer(model, name):
    """Enable gradients for the parameters whose qualified name contains ``name``.

    Matching is a plain substring test on the names yielded by
    ``model.named_parameters()``; other parameters are left untouched.
    """
    selected = [
        weight
        for qualified_name, weight in model.named_parameters()
        if name in qualified_name
    ]
    for weight in selected:
        weight.requires_grad = True

In [None]:
def _get_optimization_params(
    model, lr=1e-3, weight_decay=0, lr_transfo=3e-5, lr_decay=1, use_lr_scheme=USE_LR_SCHEME
):
    """Build the parameter specification handed to the optimizer.

    Args:
        model: Model exposing the backbone as ``model.base_model`` with its
            blocks in ``model.base_model.encoder.layer``.
        lr: Learning rate of everything that is not a backbone block/embedding
            (custom head, poolers).
        weight_decay: Weight decay applied to all parameters except biases and
            LayerNorm weights.
        lr_transfo: Learning rate of the last backbone block.
        lr_decay: Multiplicative decay applied per block going down the
            backbone; the embeddings get ``lr_transfo * lr_decay ** nb_blocks``.
        use_lr_scheme: When False, the model parameters are returned as-is and
            every other argument is ignored.

    Returns:
        ``model.parameters()`` when ``use_lr_scheme`` is False, otherwise a list
        with one parameter group (``params``, ``weight_decay``, ``lr``) per
        parameter.
    """
    if not use_lr_scheme:
        return model.parameters()
    opt_params = []
    no_decay = ["bias", "LayerNorm.weight"]
    nb_blocks = len(model.base_model.encoder.layer)

    for n, p in model.named_parameters():
        wd = 0 if any(nd in n for nd in no_decay) else weight_decay

        # The backbone is registered as `base_model`, so its parameters are
        # named "base_model.<...>". The former test only looked for
        # "transformer", never matched, and gave every parameter the head
        # learning rate. "transformer" is still accepted for models that name
        # their backbone that way.
        is_backbone = n.startswith("base_model.") or "transformer" in n

        if is_backbone and "pooler" not in n:
            lr_ = lr_transfo
            if "base_model.embeddings" in n or "transformer.embeddings" in n:
                # Embeddings sit below every block: strongest decay.
                lr_ = lr_transfo * lr_decay ** (nb_blocks)
            else:
                for i in range(nb_blocks):
                    # The trailing dot prevents "layer.1." from matching "layer.11.".
                    if f"layer.{i}." in n:
                        lr_ = lr_transfo * lr_decay ** (nb_blocks - 1 - i)
                        break
        else:
            lr_ = lr

        opt_params.append({
         "params": [p], 
         "weight_decay": wd,
         'lr':lr_,
        })
    return opt_params

In [None]:
# Set the seed value all over the place to make this reproducible:
# Python's `random`, NumPy, and PyTorch (CPU and every CUDA device).
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Ask cuDNN for deterministic algorithms and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
def fit(model, optimizer, scheduler, criterion, train_dataloader, start_time):
    """Train ``model`` for one epoch and return the average batch loss.

    Args:
        model: Model called on the trimmed batch of token ids.
        optimizer: Optimizer stepped after every batch.
        scheduler: Learning-rate scheduler stepped after every batch.
        criterion: Loss called as ``criterion(predictions, labels)``.
        train_dataloader: Yields batches whose first item holds the token ids
            and second item the labels.
        start_time: Reference ``time.time()`` value for the elapsed-time report.

    Returns:
        Sum of the batch losses divided by ``len(train_dataloader)``.
    """
    batch_losses = []
    # Do not carry gradients over from a previous epoch.
    optimizer.zero_grad()
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 100 batches (nothing is reported at step 0).
        if step and step % 100 == 0:
            elapsed = format_time(time.time() - start_time)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            logging.info('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        raw_ids, raw_labels = batch[0], batch[1]
        labels = raw_labels.to(device)
        # Batches group samples of similar length, so trimming the shared
        # padding before the transfer makes the forward pass cheaper.
        input_ids = trim_tensors(raw_ids).to(device)

        predictions = model(input_ids)
        loss = criterion(predictions, labels)
        batch_losses.append(loss.item())
        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        optimizer.zero_grad()

    # Average loss over all the batches of the epoch.
    return sum(batch_losses) / len(train_dataloader)

In [None]:
def validate(model, criterion, validation_dataloader):
    """Evaluate ``model`` on the validation set and return the average batch loss.

    The model is switched to eval mode and no gradients are computed.
    NOTE(review): unlike ``fit``, the token ids are not passed through
    ``trim_tensors`` here, so the model sees the fully padded sequences.

    Args:
        model: Model called on the batch of token ids.
        criterion: Loss called as ``criterion(predictions, labels)``.
        validation_dataloader: Yields batches whose first item holds the token
            ids and second item the labels.

    Returns:
        Sum of the batch losses divided by ``len(validation_dataloader)``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids = batch[0].to(device)
            labels = batch[1].to(device)

            predictions = model(input_ids)
            batch_losses.append(criterion(predictions, labels).item())

    # Average loss over all the batches.
    return sum(batch_losses) / len(validation_dataloader)

In [None]:
import random
import numpy as np
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup

def fit_and_eval(
    model, 
    train_dataset, 
    val_dataset,
    adamw_betas=ADAMW_BETAS, 
    epochs=2, 
    batch_size=16, 
    lr=5e-5,
    weight_decay=0, 
    lr_transfo=3e-5, 
    lr_decay=1
):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Progress is printed and mirrored to the log. When training is over the
    model and optimizer states are saved to ``model.ckpt`` (overwritten on
    every call).

    Args:
        model: Model to train (in place).
        train_dataset: Dataset of (input_ids, labels) used for training.
            Incomplete batches are dropped.
        val_dataset: Dataset of (input_ids, labels) used for validation.
        adamw_betas: Betas of the AdamW optimizer.
        epochs: Number of epochs.
        batch_size: Batch size for both training and validation.
        lr: Learning rate (of the non-backbone parameters when the layer-wise
            learning-rate scheme is enabled).
        weight_decay: Weight decay of the optimizer.
        lr_transfo: Backbone learning rate for the layer-wise scheme.
        lr_decay: Per-layer decay factor for the layer-wise scheme.

    Returns:
        None.
    """
    def _report(message):
        # Every status line goes both to the notebook output and to the log.
        print(message)
        logging.info(message)

    total_t0 = time.time()

    # Loss (mean reduction by default), on the same device as the model inputs.
    criterion = nn.L1Loss().to(device)

    # Training batches group samples of similar length (see LenMatchBatchSampler).
    len_sampler = LenMatchBatchSampler(
        RandomSampler(train_dataset), 
        batch_size=batch_size, 
        drop_last=True
    )
    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=len_sampler,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )
    # For validation the order doesn't matter, so we just read sequentially.
    validation_dataloader = DataLoader(
        val_dataset,
        shuffle=False,
        batch_size=batch_size,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )

    # Optimization params. The learning-rate settings received by this function
    # are forwarded explicitly; they used to be dropped, so the layer-wise
    # scheme silently ran with the defaults of _get_optimization_params.
    opt_params = _get_optimization_params(
        model,
        lr=lr,
        weight_decay=weight_decay,
        lr_transfo=lr_transfo,
        lr_decay=lr_decay,
    )
    optimizer = AdamW(
        opt_params, lr=lr, betas=adamw_betas, weight_decay=weight_decay
    )

    # Linear decay of the learning rate over all training steps, no warmup.
    num_warmup_steps = 0
    num_training_steps = int(epochs * len(train_dataloader))    
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps
    )
    for epoch_i in range(epochs):
        print("")
        _report('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        _report('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()
        avg_train_loss = fit(
            model, optimizer, scheduler, criterion, train_dataloader, t0
        )                    
        training_time = format_time(time.time() - t0)

        print("")
        _report("  Average training loss: {0:.2f}".format(avg_train_loss))
        _report("  Training epoch took: {:}".format(training_time))

        # After each training epoch, measure the performance on the validation set.
        print("")
        _report("Running Validation...")
        t0 = time.time()
        avg_val_loss = validate(model, criterion, validation_dataloader)
        validation_time = format_time(time.time() - t0)

        _report("  Validation Loss: {0:.2f}".format(avg_val_loss))
        _report("  Validation took: {:}".format(validation_time))

    # Release cached GPU memory and collect garbage before saving.
    torch.cuda.empty_cache()
    gc.collect()
    print("")
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
    logging.info("Training complete!")
    logging.info("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
    # Save a checkpoint. 'iter' records the configured total number of epochs
    # (head + body phases), not the epochs run by this particular call.
    torch.save({
        'iter': int(HEAD_PARAMS["epochs"]) + int(BODY_PARAMS["epochs"]),
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, "model.ckpt")


In [None]:
def training_eval_loop(
    model,
    train_dataset,
    val_dataset,
    head_params=HEAD_PARAMS,
    body_params=BODY_PARAMS,
    use_dual_training=USE_DUAL_TRAINING,
):
    """Fine-tune ``model``, optionally warming up the head first.

    Args:
        model: Model to train (in place).
        train_dataset: Training dataset passed to ``fit_and_eval``.
        val_dataset: Validation dataset passed to ``fit_and_eval``.
        head_params: Keyword arguments of ``fit_and_eval`` for the head-only
            phase.
        body_params: Keyword arguments of ``fit_and_eval`` for the phase where
            all layers are trained.
        use_dual_training: When True, first train only ``h_index_top_layer``
            and ``h_index_pooler`` with everything else frozen.

    Returns:
        The trained model (same object as ``model``).
    """
    if use_dual_training:
        print("Training the head pooler layer")
        # Freeze everything, then re-enable only the custom head layers.
        freeze(model)
        for layer in ['h_index_top_layer', 'h_index_pooler']:
            unfreeze_layer(model, layer)
        num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f'-> {num_parameters} trainable parameters\n')
        # Use the settings received as argument (the globals were used before,
        # which made the `head_params` argument a no-op).
        fit_and_eval(model, train_dataset, val_dataset, **head_params)
    print('\n- Training all layers: ')
    unfreeze(model)
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'-> {n_parameters} trainable parameters\n')
    fit_and_eval(model, train_dataset, val_dataset, **body_params)
    return model

In [None]:
# trying the concat strategy (last 4 hidden layers)
# The head input is twice the backbone hidden size because
# _produce_hidden_state_combination concatenates two hidden-size vectors
# (embedding-layer mean and the mean over four encoder layers).
pooler_in_features = base_model.pooler.dense.out_features*2
model = TransformerModel(
    base_model, pooler_in_features=pooler_in_features
)

In [None]:
# Move the model to the selected device (GPU when available, CPU otherwise)
# and visualize it. `.to(device)` keeps the CPU fallback chosen at the top of
# the notebook working, whereas `.cuda()` fails without a GPU.
model.to(device)

TransformerModel(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [None]:
trained_model = training_eval_loop(
    model,
    train_dataset, 
    val_dataset
)

Training the head pooler layer
-> 918529 trainable parameters


Training...

  Average training loss: 15.51
  Training epoch took: 0:00:01

Running Validation...
  Validation Loss: 10.83
  Validation took: 0:00:00

Training...

  Average training loss: 12.30
  Training epoch took: 0:00:01

Running Validation...
  Validation Loss: 10.46
  Validation took: 0:00:00

Training...

  Average training loss: 13.69
  Training epoch took: 0:00:01

Running Validation...
  Validation Loss: 10.28
  Validation took: 0:00:00

Training complete!
Total training took 0:00:05 (h:mm:ss)

- Training all layers: 
-> 110836993 trainable parameters


Training...

  Average training loss: 11.76
  Training epoch took: 0:00:02

Running Validation...
  Validation Loss: 8.43
  Validation took: 0:00:00

Training...

  Average training loss: 11.97
  Training epoch took: 0:00:01

Running Validation...
  Validation Loss: 8.05
  Validation took: 0:00:00

Training complete!
Total training took 0:00:04 (h:mm:ss)


In [None]:
!cat log_bert.log

14:08:36 INFO Parameters for the current run: {'N': 50, 'NUM_WORKERS': 4, 'MAX_TOKEN_LENGTH': 512, 'MODEL_NAME': 'scibert_uncased', 'USE_DUAL_TRAINING': True, 'USE_LR_SCHEME': False, 'HEAD_PARAMS': {'epochs': 3, 'batch_size': 32, 'lr': 0.0001}, 'BODY_PARAMS': {'epochs': 2, 'batch_size': 6, 'lr': 5e-05, 'lr_transfo': 3e-05, 'lr_decay': 0.95}, 'ADAMW_BETAS': (0.9, 0.999), 'BATCH_SIZE_DF': 2081, 'BATCH_SIZE_TEST': 128}
14:08:36 DEBUG Starting new HTTPS connection (1): huggingface.co:443
14:08:36 DEBUG https://huggingface.co:443 "HEAD /allenai/scibert_scivocab_uncased/resolve/main/config.json HTTP/1.1" 200 0
14:08:36 DEBUG Attempting to acquire lock 139986204815144 on /root/.cache/huggingface/transformers/858852fd2471ce39075378592ddc87f5a6551e64c6825d1b92c8dab9318e0fc3.03ff9e9f998b9a9d40647a2148a202e3fb3d568dc0f170dda9dda194bab4d5dd.lock
14:08:36 INFO Lock 139986204815144 acquired on /root/.cache/huggingface/transformers/858852fd2471ce39075378592ddc87f5a6551e64c6825d1b92c8dab9318e0fc3.03ff

# Save torch model to disk and convert it to tensorflow for embedding extraction


In [None]:
!pip install onnx
# For onnx-tensorflow, you may want to refer to the installation guide here: https://github.com/onnx/onnx-tensorflow
!git clone https://github.com/onnx/onnx-tensorflow.git
%cd onnx-tensorflow
!pip install -e .
%cd ..

fatal: destination path 'onnx-tensorflow' already exists and is not an empty directory.
/content/onnx-tensorflow/onnx-tensorflow
Obtaining file:///content/onnx-tensorflow/onnx-tensorflow
Installing collected packages: onnx-tf
  Found existing installation: onnx-tf 1.7.0
    Can't uninstall 'onnx-tf'. No files were found to uninstall.
  Running setup.py develop for onnx-tf
Successfully installed onnx-tf


In [None]:
!pip install tensorflow==2.4.0



In [None]:
from torch.onnx import export

import os

model_onnx_path = "./models/model.onnx"
# Make sure the target directory exists before exporting into it.
os.makedirs(os.path.dirname(model_onnx_path), exist_ok=True)

# Trace the backbone with one real training example (batch of one).
dummy_input_ids = train_dataset[0][0].unsqueeze(0).to(device)
dummy_input = (
    dummy_input_ids,
    # A genuine padding mask (1 for real tokens, 0 for padding), built like in
    # TransformerModel.forward; the raw token ids used to be passed here.
    (dummy_input_ids > 0).long(),
)
input_names = ["input_ids", "attention_mask"]
output_names = ["logit"]
export(
    trained_model.base_model, 
    dummy_input, 
    model_onnx_path, 
    input_names=input_names, 
    output_names=output_names,
    opset_version=11
)

  position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
  input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors


In [None]:
# `onnx` and onnx-tensorflow are installed a few cells above but were never
# imported, so this cell raised a NameError on a fresh kernel.
import onnx
from onnx_tf.backend import prepare

# Load ONNX model and convert to TensorFlow format
model_onnx = onnx.load('./models/model.onnx')

tf_rep = prepare(model_onnx)

# Export model as .pb file
tf_rep.export_graph('./models/model_simple.pb')

In [None]:
def batch_predict(
    model, 
    df_test, 
    batch_size_df=BATCH_SIZE_DF, 
    batch_size_test=BATCH_SIZE_TEST, 
    tokenizer=tokenizer, 
    tokenizer_kwargs=tokenizer_kwargs
):
    """Predict on a large DataFrame, tokenizing it chunk by chunk.

    Args:
        model: Trained model, called on batches of token ids.
        df_test: DataFrame to predict on (see ``get_tokens_labels`` for the
            expected columns).
        batch_size_df: Number of consecutive rows tokenized at once.
        batch_size_test: Batch size used to feed the model.
        tokenizer: Tokenizer used to encode the abstracts.
        tokenizer_kwargs: Keyword arguments forwarded to the tokenizer.

    Returns:
        Tensor of shape (len(df_test), 1) holding the predictions in row order.
    """
    n = len(df_test)
    # Consecutive rows share a group id: 0 for the first `batch_size_df` rows, etc.
    size_groups = np.arange(n) // batch_size_df
    # Predictions are collected per batch and concatenated once at the end,
    # instead of re-allocating a growing tensor after every batch.
    prediction_chunks = []
    model.eval()

    for i, batch_df in tqdm(df_test.groupby(size_groups)):
        test_dataset = get_tokens_labels(
            batch_df, tokenizer, progress=False, **tokenizer_kwargs
        )
        print(f"Token encoded for batch {i}")
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=batch_size_test,
            shuffle=False,  # keep the row order so predictions align with df_test
            num_workers=NUM_WORKERS,
            pin_memory=True
        )
        with torch.no_grad():
            for ids in test_dataloader:
                # Use the selected device rather than a hard-coded `.cuda()`.
                pred = model(ids[0].to(device))
                # (batch_size,) -> (batch_size, 1)
                prediction_chunks.append(pred.view(-1, 1))

    if not prediction_chunks:
        # Nothing to predict: keep returning an empty (0, 1) tensor.
        return torch.empty((0, 1)).to(device)
    return torch.cat(prediction_chunks)

In [None]:
predictions = batch_predict(
    model, df_test
)

In [None]:
# `predictions` has shape (n, 1); flatten it so that a plain 1-D array is
# assigned to the column.
df_test["h_index"] = predictions.detach().cpu().numpy().ravel()

In [None]:
# Write the predictions to disk (`DataFrame` has no `.csv` method; the writer
# is `to_csv`).
df_test.to_csv("predictions.csv")