##Installations and imports

In [1]:
!pip install datasets==1.0.1
!pip install transformers==3.1.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets==1.0.1
  Downloading datasets-1.0.1-py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 31.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 71.4 MB/s 
Installing collected packages: xxhash, datasets
Successfully installed datasets-1.0.1 xxhash-3.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==3.1.0
  Downloading transformers-3.1.0-py3-none-any.whl (884 kB)
[K     |████████████████████████████████| 884 kB 30.1 MB/s 
Collecting tokenizers==0.8.1.rc2
  Downloading tokenizers-0.8.1rc2-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 59.0 MB/s 
[?25hCollecting sentencepiece!=0.1.92
  Downloading sentencepiece

In [2]:
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric

os.environ["TOKENIZERS_PARALLELISM"] = "false"

PyTorch version 1.12.1+cu113 available.
TensorFlow version 2.8.2 available.


It is recommended to use GPU for faster processing for this code. 

In [3]:
# Check that we are using 100% of GPU memory footprint support libraries/code
# from https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip -q install gputil
!pip -q install psutil
!pip -q install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
  process = psutil.Process(os.getpid())
  print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
  print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

  Building wheel for gputil (setup.py) ... [?25l[?25hdone
Gen RAM Free: 12.3 GB  | Proc size: 759.0 MB
GPU RAM Free: 15109MB | Used: 0MB | Util   0% | Total 15109MB


In [4]:
# Load the MRPC dataset (train, validation and test)
dataset = load_dataset('glue', 'mrpc')

https://raw.githubusercontent.com/huggingface/datasets/1.0.1/datasets/glue/glue.py not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/tmpcg5qsj7r


Downloading:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

storing https://raw.githubusercontent.com/huggingface/datasets/1.0.1/datasets/glue/glue.py in cache at /root/.cache/huggingface/datasets/8a575b8341116e1cf0f3a928d70f7ee7ec1aee5e6620744d30d5331a2be68979.d804a9b67563ab7de5bb068d5eccc0eff8cf0849041ad2e8afff1beb8a14544d.py
creating metadata file for /root/.cache/huggingface/datasets/8a575b8341116e1cf0f3a928d70f7ee7ec1aee5e6620744d30d5331a2be68979.d804a9b67563ab7de5bb068d5eccc0eff8cf0849041ad2e8afff1beb8a14544d.py
https://raw.githubusercontent.com/huggingface/datasets/1.0.1/datasets/glue/dataset_infos.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/tmpwo6v3zhc


Downloading:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

storing https://raw.githubusercontent.com/huggingface/datasets/1.0.1/datasets/glue/dataset_infos.json in cache at /root/.cache/huggingface/datasets/e9129383f6197a6e76ba55cb5a1dfc2cc28e085b4585b48a3cf8978344805837.03e16a366649d9f5c63e615ccbc58466c013cc8677c5be1b52636c46e597c13c
creating metadata file for /root/.cache/huggingface/datasets/e9129383f6197a6e76ba55cb5a1dfc2cc28e085b4585b48a3cf8978344805837.03e16a366649d9f5c63e615ccbc58466c013cc8677c5be1b52636c46e597c13c
Checking /root/.cache/huggingface/datasets/8a575b8341116e1cf0f3a928d70f7ee7ec1aee5e6620744d30d5331a2be68979.d804a9b67563ab7de5bb068d5eccc0eff8cf0849041ad2e8afff1beb8a14544d.py for additional imports.
Creating main folder for dataset https://raw.githubusercontent.com/huggingface/datasets/1.0.1/datasets/glue/glue.py at /root/.cache/huggingface/modules/datasets_modules/datasets/glue
Creating specific version folder for dataset https://raw.githubusercontent.com/huggingface/datasets/1.0.1/datasets/glue/glue.py at /root/.cache/hugg

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/930e9d141872db65102cabb9fa8ac01c11ffc8a1b72c2e364d8cdda4610df542...


Dataset not on Hf google storage. Downloading and preparing it from source
  0%|          | 0/3 [00:00<?, ?it/s]Couldn't get ETag version for url https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc
https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmplsf1lhcg


Downloading:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

storing https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc in cache at /root/.cache/huggingface/datasets/downloads/8d220c9428aab35412988ca4af82113e71078cfb86c00cf98b8d2ff0af54d19f
creating metadata file for /root/.cache/huggingface/datasets/downloads/8d220c9428aab35412988ca4af82113e71078cfb86c00cf98b8d2ff0af54d19f
 33%|███▎      | 1/3 [00:01<00:03,  1.93s/it]https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmp74hywwfo


Downloading: 0.00B [00:00, ?B/s]

storing https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt in cache at /root/.cache/huggingface/datasets/downloads/7c6c4f66e416181b62e136ddd5834ec10afe3aac4f7a327b81ca74025ea69529
creating metadata file for /root/.cache/huggingface/datasets/downloads/7c6c4f66e416181b62e136ddd5834ec10afe3aac4f7a327b81ca74025ea69529
 67%|██████▋   | 2/3 [00:09<00:05,  5.12s/it]https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmpby6jjate


Downloading: 0.00B [00:00, ?B/s]

storing https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt in cache at /root/.cache/huggingface/datasets/downloads/d0f75e90c732a9847ec38471fddece4ebcaad09dd1958467e2b00c6a3cbd31a9
creating metadata file for /root/.cache/huggingface/datasets/downloads/d0f75e90c732a9847ec38471fddece4ebcaad09dd1958467e2b00c6a3cbd31a9
100%|██████████| 3/3 [00:12<00:00,  4.02s/it]
Downloading took 0.0 min
Checksum Computation took 0.0 min
All the checksums matched successfully for dataset source files
Generating split train


0 examples [00:00, ? examples/s]

Done writing 3668 examples in 943851 bytes /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/930e9d141872db65102cabb9fa8ac01c11ffc8a1b72c2e364d8cdda4610df542.incomplete/glue-train.arrow.
Generating split validation


0 examples [00:00, ? examples/s]

Done writing 408 examples in 105887 bytes /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/930e9d141872db65102cabb9fa8ac01c11ffc8a1b72c2e364d8cdda4610df542.incomplete/glue-validation.arrow.
Generating split test


0 examples [00:00, ? examples/s]

Done writing 1725 examples in 442418 bytes /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/930e9d141872db65102cabb9fa8ac01c11ffc8a1b72c2e364d8cdda4610df542.incomplete/glue-test.arrow.
All the splits matched successfully.
Constructing Dataset for split train, validation, test, from /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/930e9d141872db65102cabb9fa8ac01c11ffc8a1b72c2e364d8cdda4610df542


Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/930e9d141872db65102cabb9fa8ac01c11ffc8a1b72c2e364d8cdda4610df542. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 225.96it/s]


Read the train, validation and test files produced from the previous notebook. Pass the csv files as it is. 

In [5]:

# Transform data into pandas dataframes
df_train = pd.read_csv('train.csv')
df_val = pd.read_csv('val.csv')
df_test = pd.read_csv('test.csv')


print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(172, 4)
(24, 4)
(50, 4)


In [6]:
print(df_train.head())

             tweet_id  label  \
0  910523364154003456      1   
1  910523388598280192      0   
2  910523398144397312      0   
3  910523398144397312      0   
4  910523398144397312      0   

                                           sentence1  \
0  RT @FCBarcelona: Our solidarity with the victi...   
1  Mexico earthquake: Many children killed at pri...   
2  RT @AmichaiStein1: #BREAKING: Israel search &a...   
3  RT @AmichaiStein1: #BREAKING: Israel search &a...   
4  RT @AmichaiStein1: #BREAKING: Israel search &a...   

                                           sentence2  
0  A group of men in uniform standing in front of...  
1         A city street filled with lots of traffic.  
2  A group of people standing outside of a large ...  
3  A large jetliner sitting on top of an airport ...  
4  A large jetliner sitting on top of an airport ...  


Custom dataset class for sentence pair classification expecting sentence 1 which is the orignal tweet text and sentene 2 which is the caption produced by the image in natural language.

In [7]:
class CustomDataset(Dataset):

    def __init__(self, data, maxlen, with_labels=True, bert_model='albert-base-v2'):

        self.data = data  # pandas dataframe
        #Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)  

        self.maxlen = maxlen
        self.with_labels = with_labels 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        # Selecting sentence1 and sentence2 at the specified index in the data frame
        sent1 = str(self.data.loc[index, 'sentence1'])
        sent2 = str(self.data.loc[index, 'sentence2'])

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(sent1, sent2, 
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=self.maxlen,  
                                      return_tensors='pt')  # Return torch.Tensor objects
        
        token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  # binary tensor with "0" for padded values and "1" for the other values
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_labels:  # True if the dataset has labels
            label = self.data.loc[index, 'label']
            return token_ids, attn_masks, token_type_ids, label  
        else:
            return token_ids, attn_masks, token_type_ids

Sentence Pair classifer module, comments for each set of important lines.

In [8]:
class SentencePairClassifier(nn.Module):

    def __init__(self, bert_model="albert-base-v2", freeze_bert=False):
        super(SentencePairClassifier, self).__init__()
        #  Instantiating BERT-based model object
        self.bert_layer = AutoModel.from_pretrained(bert_model)

        #  Fix the hidden-state size of the encoder outputs (If you want to add other pre-trained models here, search for the encoder output size)
        if bert_model == "albert-base-v2":  # 12M parameters
            hidden_size = 768
        elif bert_model == "albert-large-v2":  # 18M parameters
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":  # 60M parameters
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M parameters
            hidden_size = 4096
        elif bert_model == "bert-base-uncased": # 110M parameters
            hidden_size = 768

        # Freeze bert layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Classification layer
        self.cls_layer = nn.Linear(hidden_size, 1)

        self.dropout = nn.Dropout(p=0.1)

    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''

        # Feeding the inputs to the BERT-based model to obtain contextualized representations
        cont_reps, pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)

        # Feeding to the classifier layer the last layer hidden-state of the [CLS] token further processed by a
        # Linear Layer and a Tanh activation. The Linear layer weights were trained from the sentence order prediction (ALBERT) or next sentence prediction (BERT)
        # objective during pre-training.
        logits = self.cls_layer(self.dropout(pooler_output))

        return logits

In [9]:
def set_seed(seed):
    """ Set all seeds to make results reproducible """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    

def evaluate_loss(net, device, criterion, dataloader):
    net.eval()

    mean_loss = 0
    count = 0

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            count += 1

    return mean_loss / count

In [10]:
print("Creation of the models' folder...")
!mkdir models

Creation of the models' folder...


##Training

Trainer function for the bert. 

In [11]:
def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):

    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):

            # Converting to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
    
            # Enables autocasting for the forward pass (model + loss)
            with autocast():
                # Obtaining the logits from the model
                logits = net(seq, attn_masks, token_type_ids)

                # Computing loss
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

            # Backpropagating the gradients
            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()


            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0


        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the model
    path_to_model='models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()

These are the set of hyper-parameters for which we obtained the best results for each disaster. 

In [12]:
bert_model = 'albert-base-v2'  # 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2' 'bert-base-uncased', ...
#bert_model = "bert-base-uncased"
freeze_bert = False  # if True, freeze the encoder weights and only update the classification layer weights
maxlen = 180  # maximum length of the tokenized input sentence pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
bs = 16  # batch size
iters_to_accumulate = 2  # the gradient accumulation adds gradients over an effective batch of size : bs * iters_to_accumulate. If set to "1", you get the usual batch size
lr = 2e-5  # learning rate
epochs = 10  # number of training epochs

Training and Validation

In [13]:
#  Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)
# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)

net.to(device)

criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0 # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

Reading training data...


Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Reading validation data...


  cpuset_checked))


Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

 18%|█▊        | 2/11 [00:03<00:14,  1.56s/it]


Iteration 2/11 of epoch 1 complete. Loss : 0.26391735672950745 


 45%|████▌     | 5/11 [00:04<00:02,  2.05it/s]


Iteration 4/11 of epoch 1 complete. Loss : 0.30020371079444885 


 64%|██████▎   | 7/11 [00:04<00:01,  3.03it/s]


Iteration 6/11 of epoch 1 complete. Loss : 0.3083043843507767 


 73%|███████▎  | 8/11 [00:04<00:00,  3.45it/s]


Iteration 8/11 of epoch 1 complete. Loss : 0.2875303030014038 


100%|██████████| 11/11 [00:05<00:00,  4.56it/s]


Iteration 10/11 of epoch 1 complete. Loss : 0.28562261164188385 


100%|██████████| 11/11 [00:05<00:00,  1.97it/s]
100%|██████████| 2/2 [00:00<00:00,  4.33it/s]



Epoch 1 complete! Validation Loss : 0.5343971252441406
Best validation loss improved from inf to 0.5343971252441406



 18%|█▊        | 2/11 [00:00<00:02,  3.32it/s]


Iteration 2/11 of epoch 2 complete. Loss : 0.20602890104055405 


 45%|████▌     | 5/11 [00:01<00:01,  4.43it/s]


Iteration 4/11 of epoch 2 complete. Loss : 0.2719835788011551 


 55%|█████▍    | 6/11 [00:01<00:01,  4.57it/s]


Iteration 6/11 of epoch 2 complete. Loss : 0.2762701213359833 


 73%|███████▎  | 8/11 [00:01<00:00,  4.74it/s]


Iteration 8/11 of epoch 2 complete. Loss : 0.24665208160877228 


100%|██████████| 11/11 [00:02<00:00,  5.18it/s]


Iteration 10/11 of epoch 2 complete. Loss : 0.2594084143638611 


100%|██████████| 11/11 [00:02<00:00,  4.26it/s]
100%|██████████| 2/2 [00:00<00:00,  4.28it/s]



Epoch 2 complete! Validation Loss : 0.4680213928222656
Best validation loss improved from 0.5343971252441406 to 0.4680213928222656



 18%|█▊        | 2/11 [00:00<00:02,  3.29it/s]


Iteration 2/11 of epoch 3 complete. Loss : 0.20162618160247803 


 45%|████▌     | 5/11 [00:01<00:01,  4.42it/s]


Iteration 4/11 of epoch 3 complete. Loss : 0.21968196332454681 


 55%|█████▍    | 6/11 [00:01<00:01,  4.57it/s]


Iteration 6/11 of epoch 3 complete. Loss : 0.25409989058971405 


 73%|███████▎  | 8/11 [00:01<00:00,  4.72it/s]


Iteration 8/11 of epoch 3 complete. Loss : 0.2084341198205948 


100%|██████████| 11/11 [00:02<00:00,  5.19it/s]


Iteration 10/11 of epoch 3 complete. Loss : 0.24271684885025024 


100%|██████████| 11/11 [00:02<00:00,  4.26it/s]
100%|██████████| 2/2 [00:00<00:00,  4.34it/s]



Epoch 3 complete! Validation Loss : 0.4114875793457031
Best validation loss improved from 0.4680213928222656 to 0.4114875793457031



 18%|█▊        | 2/11 [00:00<00:02,  3.26it/s]


Iteration 2/11 of epoch 4 complete. Loss : 0.18133975565433502 


 36%|███▋      | 4/11 [00:01<00:01,  4.17it/s]


Iteration 4/11 of epoch 4 complete. Loss : 0.16891849786043167 


 55%|█████▍    | 6/11 [00:01<00:01,  4.55it/s]


Iteration 6/11 of epoch 4 complete. Loss : 0.22978641837835312 


 73%|███████▎  | 8/11 [00:01<00:00,  4.66it/s]


Iteration 8/11 of epoch 4 complete. Loss : 0.19059621542692184 


100%|██████████| 11/11 [00:02<00:00,  5.14it/s]


Iteration 10/11 of epoch 4 complete. Loss : 0.21268032491207123 


100%|██████████| 11/11 [00:02<00:00,  4.23it/s]
100%|██████████| 2/2 [00:00<00:00,  4.39it/s]



Epoch 4 complete! Validation Loss : 0.37967681884765625
Best validation loss improved from 0.4114875793457031 to 0.37967681884765625



 27%|██▋       | 3/11 [00:00<00:02,  3.86it/s]


Iteration 2/11 of epoch 5 complete. Loss : 0.17478064447641373 


 36%|███▋      | 4/11 [00:01<00:01,  4.15it/s]


Iteration 4/11 of epoch 5 complete. Loss : 0.13914604112505913 


 55%|█████▍    | 6/11 [00:01<00:01,  4.52it/s]


Iteration 6/11 of epoch 5 complete. Loss : 0.1914094239473343 


 73%|███████▎  | 8/11 [00:01<00:00,  4.68it/s]


Iteration 8/11 of epoch 5 complete. Loss : 0.1499720737338066 


100%|██████████| 11/11 [00:02<00:00,  5.11it/s]


Iteration 10/11 of epoch 5 complete. Loss : 0.17792263627052307 


100%|██████████| 11/11 [00:02<00:00,  4.21it/s]
100%|██████████| 2/2 [00:00<00:00,  4.39it/s]



Epoch 5 complete! Validation Loss : 0.3709430694580078
Best validation loss improved from 0.37967681884765625 to 0.3709430694580078



 18%|█▊        | 2/11 [00:00<00:02,  3.26it/s]


Iteration 2/11 of epoch 6 complete. Loss : 0.14912666380405426 


 36%|███▋      | 4/11 [00:01<00:01,  4.15it/s]


Iteration 4/11 of epoch 6 complete. Loss : 0.11970631778240204 


 55%|█████▍    | 6/11 [00:01<00:01,  4.52it/s]


Iteration 6/11 of epoch 6 complete. Loss : 0.1631123647093773 


 73%|███████▎  | 8/11 [00:01<00:00,  4.64it/s]


Iteration 8/11 of epoch 6 complete. Loss : 0.1267596334218979 


100%|██████████| 11/11 [00:02<00:00,  5.11it/s]


Iteration 10/11 of epoch 6 complete. Loss : 0.14769907295703888 


100%|██████████| 11/11 [00:02<00:00,  4.22it/s]
100%|██████████| 2/2 [00:00<00:00,  4.38it/s]



Epoch 6 complete! Validation Loss : 0.3459453582763672
Best validation loss improved from 0.3709430694580078 to 0.3459453582763672



 18%|█▊        | 2/11 [00:00<00:02,  3.09it/s]


Iteration 2/11 of epoch 7 complete. Loss : 0.12276577949523926 


 36%|███▋      | 4/11 [00:01<00:01,  4.08it/s]


Iteration 4/11 of epoch 7 complete. Loss : 0.09985687583684921 


 55%|█████▍    | 6/11 [00:01<00:01,  4.49it/s]


Iteration 6/11 of epoch 7 complete. Loss : 0.13953622430562973 


 73%|███████▎  | 8/11 [00:01<00:00,  4.65it/s]


Iteration 8/11 of epoch 7 complete. Loss : 0.10477857291698456 


100%|██████████| 11/11 [00:02<00:00,  5.16it/s]


Iteration 10/11 of epoch 7 complete. Loss : 0.12025092542171478 


100%|██████████| 11/11 [00:02<00:00,  4.17it/s]
100%|██████████| 2/2 [00:00<00:00,  4.42it/s]



Epoch 7 complete! Validation Loss : 0.35581398010253906


 18%|█▊        | 2/11 [00:00<00:02,  3.30it/s]


Iteration 2/11 of epoch 8 complete. Loss : 0.09998871386051178 


 36%|███▋      | 4/11 [00:01<00:01,  4.17it/s]


Iteration 4/11 of epoch 8 complete. Loss : 0.0824718065559864 


 55%|█████▍    | 6/11 [00:01<00:01,  4.51it/s]


Iteration 6/11 of epoch 8 complete. Loss : 0.1080535277724266 


 73%|███████▎  | 8/11 [00:01<00:00,  4.68it/s]


Iteration 8/11 of epoch 8 complete. Loss : 0.08282199501991272 


100%|██████████| 11/11 [00:02<00:00,  5.12it/s]


Iteration 10/11 of epoch 8 complete. Loss : 0.0981053002178669 


100%|██████████| 11/11 [00:02<00:00,  4.22it/s]
100%|██████████| 2/2 [00:00<00:00,  4.32it/s]



Epoch 8 complete! Validation Loss : 0.33405303955078125
Best validation loss improved from 0.3459453582763672 to 0.33405303955078125



 18%|█▊        | 2/11 [00:00<00:02,  3.13it/s]


Iteration 2/11 of epoch 9 complete. Loss : 0.08209061622619629 


 36%|███▋      | 4/11 [00:01<00:01,  4.05it/s]


Iteration 4/11 of epoch 9 complete. Loss : 0.07204627804458141 


 55%|█████▍    | 6/11 [00:01<00:01,  4.49it/s]


Iteration 6/11 of epoch 9 complete. Loss : 0.09179127216339111 


 73%|███████▎  | 8/11 [00:01<00:00,  4.69it/s]


Iteration 8/11 of epoch 9 complete. Loss : 0.07367435842752457 


100%|██████████| 11/11 [00:02<00:00,  5.16it/s]


Iteration 10/11 of epoch 9 complete. Loss : 0.08745480328798294 


100%|██████████| 11/11 [00:02<00:00,  4.18it/s]
100%|██████████| 2/2 [00:00<00:00,  4.38it/s]



Epoch 9 complete! Validation Loss : 0.3435192108154297


 18%|█▊        | 2/11 [00:00<00:02,  3.11it/s]


Iteration 2/11 of epoch 10 complete. Loss : 0.07387953996658325 


 36%|███▋      | 4/11 [00:01<00:01,  4.10it/s]


Iteration 4/11 of epoch 10 complete. Loss : 0.06321243196725845 


 55%|█████▍    | 6/11 [00:01<00:01,  4.51it/s]


Iteration 6/11 of epoch 10 complete. Loss : 0.08855355158448219 


 73%|███████▎  | 8/11 [00:01<00:00,  4.68it/s]


Iteration 8/11 of epoch 10 complete. Loss : 0.070979755371809 


100%|██████████| 11/11 [00:02<00:00,  5.16it/s]


Iteration 10/11 of epoch 10 complete. Loss : 0.07611594349145889 


100%|██████████| 11/11 [00:02<00:00,  4.18it/s]
100%|██████████| 2/2 [00:00<00:00,  4.25it/s]



Epoch 10 complete! Validation Loss : 0.3383808135986328
The model has been saved in models/albert-base-v2_lr_2e-05_val_loss_0.33405_ep_8.pt


##Prediction

In [14]:
print("Creation of the results' folder...")
!mkdir results

Creation of the results' folder...


In [15]:
def get_probs_from_logits(logits):
    """
    Converts a tensor of logits into an array of probabilities by applying the sigmoid function
    """
    probs = torch.sigmoid(logits.unsqueeze(-1))
    return probs.detach().cpu().numpy()

def test_prediction(net, device, dataloader, with_labels=True, result_file="results/output.txt"):
    """
    Predict the probabilities on a dataset with or without labels and print the result in a file
    """
    net.eval()
    w = open(result_file, 'w')
    probs_all = []

    with torch.no_grad():
        if with_labels:
            for seq, attn_masks, token_type_ids, _ in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()
        else:
            for seq, attn_masks, token_type_ids in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()

    w.writelines(str(prob)+'\n' for prob in probs_all)
    w.close()

  For this cell to run, pass the path of the model file in /models directory

In [17]:
path_to_model = '/content/models/albert-base-v2_lr_2e-05_val_loss_0.33405_ep_8.pt'  
# path_to_model = '/content/models/...'  # You can add here your trained model

path_to_output_file = 'results/output.txt'

print("Reading test data...")
test_set = CustomDataset(df_test, maxlen, bert_model)
test_loader = DataLoader(test_set, batch_size=bs, num_workers=5)

model = SentencePairClassifier(bert_model)
if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

print()
print("Loading the weights of the model...")
model.load_state_dict(torch.load(path_to_model))
model.to(device)

print("Predicting on test data...")
test_prediction(net=model, device=device, dataloader=test_loader, with_labels=True,  # set the with_labels parameter to False if your want to get predictions on a dataset without labels
                result_file=path_to_output_file)
print()
print("Predictions are available in : {}".format(path_to_output_file))

Reading test data...

Loading the weights of the model...
Predicting on test data...


100%|██████████| 4/4 [00:00<00:00,  5.80it/s]


Predictions are available in : results/output.txt





##Evaluation

In [18]:
path_to_output_file = 'results/output.txt'  # path to the file with prediction probabilities

labels_test = df_test['label']  # true labels

probs_test = pd.read_csv(path_to_output_file, header=None)[0]  # prediction probabilities
threshold = 0.4   # you can adjust this threshold for your own dataset
preds_test=(probs_test>=threshold).astype('uint8') # predicted labels using the above fixed threshold

metric = load_metric("glue", "mrpc")

https://raw.githubusercontent.com/huggingface/datasets/1.0.1/metrics/glue/glue.py not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/tmp6urjx6j_


Downloading:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

storing https://raw.githubusercontent.com/huggingface/datasets/1.0.1/metrics/glue/glue.py in cache at /root/.cache/huggingface/datasets/50d5843bbbbd80c47809bc76a5b03c0fd87d068509b0060103ae8182e4f5cfb9.ec871b06a00118091ec63eff0a641fddcb8d3c7cd52e855bbb2be28944df4b82.py
creating metadata file for /root/.cache/huggingface/datasets/50d5843bbbbd80c47809bc76a5b03c0fd87d068509b0060103ae8182e4f5cfb9.ec871b06a00118091ec63eff0a641fddcb8d3c7cd52e855bbb2be28944df4b82.py
Checking /root/.cache/huggingface/datasets/50d5843bbbbd80c47809bc76a5b03c0fd87d068509b0060103ae8182e4f5cfb9.ec871b06a00118091ec63eff0a641fddcb8d3c7cd52e855bbb2be28944df4b82.py for additional imports.
Creating main folder for metric https://raw.githubusercontent.com/huggingface/datasets/1.0.1/metrics/glue/glue.py at /root/.cache/huggingface/modules/datasets_modules/metrics/glue
Creating specific version folder for metric https://raw.githubusercontent.com/huggingface/datasets/1.0.1/metrics/glue/glue.py at /root/.cache/huggingface/mod

In [19]:
# Compute the accuracy and F1 scores
metric._compute(predictions=preds_test, references=labels_test)

{'accuracy': 0.86, 'f1': 0.7586206896551724}