## Installing dependencies

In [None]:
# Mount Google Drive at /content/drive so the dataset files read later in
# this notebook are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |▍                               | 10kB 19.5MB/s eta 0:00:01[K     |▊                               | 20kB 6.5MB/s eta 0:00:01[K     |█▏                              | 30kB 7.7MB/s eta 0:00:01[K     |█▌                              | 40kB 7.7MB/s eta 0:00:01[K     |█▉                              | 51kB 7.0MB/s eta 0:00:01[K     |██▎                             | 61kB 7.5MB/s eta 0:00:01[K     |██▋                             | 71kB 7.9MB/s eta 0:00:01[K     |███                             | 81kB 8.4MB/s eta 0:00:01[K     |███▍                            | 92kB 7.9MB/s eta 0:00:01[K     |███▊                            | 102kB 8.1MB/s eta 0:00:01[K     |████                            | 112kB 8.1MB/s eta 0:00:01[K     |████▌                           | 122kB 8.1M

## Importing modules

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import tensorflow as tf
import torch
import pandas as pd

import random
import numpy as np
from sklearn.metrics import classification_report, f1_score
from transformers import get_linear_schedule_with_warmup
from tensorflow import summary
import time
import datetime
%load_ext tensorboard

In [None]:
# Ask TensorFlow for the name of the attached GPU device.
device_name = tf.test.gpu_device_name()

# Guard clause: anything other than the first GPU slot aborts the notebook.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))



Found GPU at: /device:GPU:0


In [None]:
# Choose the torch device once; later cells move tensors onto `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
  print("There are %d GPU(s) available" % torch.cuda.device_count())
  print("We will use the GPU:", torch.cuda.get_device_name(0))
else:
  print("No GPU available, using CPU")

There are 1 GPU(s) available
We will use the GPU: Tesla V100-SXM2-16GB


In [None]:
# Reading data
# The three SST-2 splits sit in the same Drive folder; naming the folder once
# means relocating the data is a one-line change.
DATA_DIR = '/content/drive/My Drive/SST-2'

train = pd.read_csv(DATA_DIR + '/train.tsv', sep='\t')
dev = pd.read_csv(DATA_DIR + '/dev.tsv', sep='\t')
test = pd.read_csv(DATA_DIR + '/test.tsv', sep='\t')

## Pre-processing

In [None]:
# Tokenizer
# Loads the tokenizer for the "distilroberta-base" checkpoint. `process`
# reads this module-level name directly rather than taking it as an argument.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=480.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [None]:
# Pre-processing
def process(df, batch, method, max_length=128):
  """Tokenize the sentences of `df` and wrap the tensors in a DataLoader.

  Relies on the module-level `tokenizer`.

  Args:
    df: DataFrame with a `sentence` column, plus a `label` column when
      `method == 'train'`.
    batch: Batch size of the returned DataLoader.
    method: 'train' attaches the labels and shuffles the samples; any other
      value builds an unlabeled loader that keeps the rows in `df` order.
    max_length: Length every encoded sentence is truncated / padded to.

  Returns:
    A DataLoader whose batches are
    (input_ids, attention_mask, token_type_ids) followed by labels when
    `method == 'train'`.
  """
  with_labels = method == 'train'
  sentences = df.sentence.values

  # Tokenizing: one (1, max_length) tensor per sentence, stacked afterwards.
  input_ids = []
  attention_masks = []
  token_type_ids = []

  for sent in sentences:
    encoded_dict = tokenizer.encode_plus(sent,
                                         add_special_tokens=True,
                                         max_length = max_length,
                                         truncation = True,
                                         padding = 'max_length',
                                         return_attention_mask = True,
                                         return_token_type_ids = True,
                                         return_tensors = 'pt')

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    token_type_ids.append(encoded_dict['token_type_ids'])

  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  token_type_ids = torch.cat(token_type_ids, dim=0)

  if with_labels:
    labels = torch.tensor(df.label.values)
    dataset = TensorDataset(input_ids, attention_masks, token_type_ids, labels)
    sampler = RandomSampler(dataset)
  else:
    dataset = TensorDataset(input_ids, attention_masks, token_type_ids)
    # Without labels the only way to match a prediction to its sentence is
    # by position, so the loader must iterate in the original row order.
    sampler = SequentialSampler(dataset)

  return DataLoader(dataset,
                    sampler = sampler,
                    batch_size = batch)

In [None]:
def do_prediction(model, dataloader, device):
  """
  Run `model` over every batch of `dataloader` and return predicted class ids.

  Args:
    model: Module called as
      model(input_ids, token_type_ids=..., attention_mask=...); the first
      element of what it returns is used as the logits tensor.
    dataloader: Iterable of batches ordered
      (input_ids, attention_mask, token_type_ids, ...). Tensors after the
      third (e.g. labels) are ignored.
    device: torch device the input tensors are moved to.

  Returns:
    DataFrame with a single `predictions` column holding the arg-max class
    of each sample, in iteration order. Empty when the dataloader yields
    no batches.
  """
  batch_logits = []

  model.eval()
  # No gradients are needed for inference; skipping them saves memory.
  with torch.no_grad():
    for batch in dataloader:
      # Only the three model inputs are moved to the device and unpacked.
      b_input_ids, b_input_mask, b_token_type_ids = (
          t.to(device) for t in batch[:3])

      outputs = model(b_input_ids,
                      token_type_ids=b_token_type_ids,
                      attention_mask=b_input_mask)

      # Keep the logits on the CPU as NumPy so they can be concatenated.
      batch_logits.append(outputs[0].detach().cpu().numpy())

  # np.concatenate rejects an empty list, so handle "no batches" explicitly.
  if not batch_logits:
    return pd.DataFrame({'predictions': []})

  # Combine the results across all batches, then pick the higher-scoring label.
  flat_preds = np.argmax(np.concatenate(batch_logits, axis=0), axis=1)

  return pd.DataFrame({'predictions': flat_preds.tolist()})
  


## Model Building

In [None]:
# Training batches of 32; the 'train' mode attaches the label tensor.
train_dataloader = process(train, 32, 'train')

In [None]:
# The dev split also goes through the 'train' path so each batch carries
# labels, which the validation loop reads.
dev_dataloader = process(dev, 32, 'train')

In [None]:
# Sequence-classification model initialised from the distilroberta-base checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base")
# Move to the device selected earlier instead of calling .cuda()
# unconditionally, so this cell agrees with that cell's CPU fallback.
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=331070498.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [None]:
# AdamW over every model parameter. `lr` is the starting learning rate;
# the scheduler created in the next cell is attached to this optimizer.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8)

In [None]:
epochs = 4

# The training loop steps the scheduler once per batch, so the schedule
# spans every batch of every epoch.
total_steps = len(train_dataloader) * epochs
# Warm up over the first 1% of steps. The scheduler documents an integer
# step count, so truncate rather than pass a float.
warmup_steps = int(total_steps * 0.01)

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

In [None]:
def flat_accuracy(preds, labels):
  """Fraction of samples whose arg-max class matches the label.

  Args:
    preds: 2-D array of per-class scores, one row per sample.
    labels: NumPy array of class ids; flattened before comparison.

  Returns:
    Number of matching positions divided by the number of labels.
  """
  predicted = np.argmax(preds, axis=1).flatten()
  expected = labels.flatten()

  n_correct = np.sum(predicted == expected)
  return n_correct / len(expected)

In [None]:
def format_time(elapse):
  """
  Render a duration given in seconds as the string form of a timedelta
  (e.g. 0:01:05), after rounding to whole seconds.
  """
  whole_seconds = int(round(elapse))
  delta = datetime.timedelta(seconds=whole_seconds)
  return str(delta)

In [None]:
# Seed the Python, NumPy and torch (CPU and all-GPU) random generators.
# NOTE(review): the model is instantiated in an earlier cell, before these
# seeds are set, so its newly initialised classifier weights are not covered.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


# One dict of summary metrics per epoch.
training_stats = []

total_t0 = time.time()
# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {} of {}. Elapse: {:}'.format(step, len(train_dataloader), elapsed))

        # `batch` contains four pytorch tensors, each copied to `device`:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: token type ids
        #   [3]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_token_type_ids = batch[2].to(device)
        b_labels = batch[3].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=b_token_type_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        # Index the result instead of tuple-unpacking it, matching how
        # do_prediction reads model outputs. With labels supplied the loss
        # comes first.
        loss = outputs[0]

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Cap the gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("Running Validation...")
    t0 = time.time()

    total_eval_loss = 0
    # Raw outputs for the whole dev set. Accuracy and F1 are computed once at
    # the end, because averaging per-batch scores lets a short final batch or
    # a single-class batch skew the result.
    eval_logits = []
    eval_labels = []

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    for batch in dev_dataloader:

        # Same four-tensor layout as the training batches.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_token_type_ids = batch[2].to(device)
        b_labels = batch[3].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=b_token_type_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        loss, logits = outputs[0], outputs[1]

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU and keep them for the final metrics.
        eval_logits.append(logits.detach().cpu().numpy())
        eval_labels.append(b_labels.to('cpu').numpy())

    eval_logits = np.concatenate(eval_logits, axis=0)
    eval_labels = np.concatenate(eval_labels, axis=0)

    # Report the final accuracy and F1 for this validation run.
    avg_val_accuracy = flat_accuracy(eval_logits, eval_labels)
    avg_f1_score = f1_score(eval_labels,
                            np.argmax(eval_logits, axis=1).flatten(),
                            average='weighted')
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    print("  F1-Score: {0:.2f}".format(avg_f1_score))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(dev_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Valid. F1': avg_f1_score,
            'Training Time': training_time,
            'Validation Time': validation_time,
        }
    )

print("")
print("Training complete!")
print("Total training took {:}".format(format_time(time.time() - total_t0)))


Training...
  Batch 40 of 2105. Elapse: 0:00:05
  Batch 80 of 2105. Elapse: 0:00:10
  Batch 120 of 2105. Elapse: 0:00:14
  Batch 160 of 2105. Elapse: 0:00:19
  Batch 200 of 2105. Elapse: 0:00:24
  Batch 240 of 2105. Elapse: 0:00:28
  Batch 280 of 2105. Elapse: 0:00:33
  Batch 320 of 2105. Elapse: 0:00:38
  Batch 360 of 2105. Elapse: 0:00:42
  Batch 400 of 2105. Elapse: 0:00:47
  Batch 440 of 2105. Elapse: 0:00:51
  Batch 480 of 2105. Elapse: 0:00:56
  Batch 520 of 2105. Elapse: 0:01:01
  Batch 560 of 2105. Elapse: 0:01:05
  Batch 600 of 2105. Elapse: 0:01:10
  Batch 640 of 2105. Elapse: 0:01:15
  Batch 680 of 2105. Elapse: 0:01:19
  Batch 720 of 2105. Elapse: 0:01:24
  Batch 760 of 2105. Elapse: 0:01:29
  Batch 800 of 2105. Elapse: 0:01:33
  Batch 840 of 2105. Elapse: 0:01:38
  Batch 880 of 2105. Elapse: 0:01:43
  Batch 920 of 2105. Elapse: 0:01:47
  Batch 960 of 2105. Elapse: 0:01:52
  Batch 1000 of 2105. Elapse: 0:01:57
  Batch 1040 of 2105. Elapse: 0:02:01
  Batch 1080 of 2105. Ela

In [None]:
import os

# Everything is written into the current working directory.
output_dir = os.getcwd()
os.makedirs(output_dir, exist_ok=True)
print("Saving model to {}".format(output_dir))

# Save trained model, configuration and tokenizer.
# If the model has been wrapped so that the real network is exposed as
# `.module`, save that. The check must look for the same attribute it then
# reads: the previous test for 'model' could pick the `.module` branch on an
# object that has no `.module`.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to /content


('/content/vocab.json',
 '/content/merges.txt',
 '/content/special_tokens_map.json',
 '/content/added_tokens.json')

## Test Set

In [None]:
# NOTE(review): `partial` is not used in any cell visible here — confirm before removing.
from functools import partial
# Any method other than 'train' makes `process` build a loader without labels.
testing = process(test, 32, 'test')

In [None]:
test_preds = do_prediction(model, testing, device)

# do_prediction returns a one-column DataFrame. Copy that column by position
# rather than assigning the whole frame to a single column.
# NOTE(review): this is row-aligned only if `testing` iterates in the same
# order as the rows of `test`.
assert len(test_preds) == len(test), "expected one prediction per test row"
test['preds'] = test_preds['predictions'].values

In [None]:
# Peek at the first rows to eyeball the newly added `preds` column.
test.head()

Unnamed: 0,index,sentence,preds
0,0,uneasy mishmash of styles and genres .,0
1,1,this film 's relationship to actual tension is...,1
2,2,"by the end of no such thing the audience , lik...",1
3,3,director rob marshall went out gunning to make...,1
4,4,lathan and diggs have considerable personal ch...,1


In [None]:
# Share of each predicted class over the test rows (normalize=True gives fractions).
test.preds.value_counts(normalize=True)

1    0.529929
0    0.470071
Name: preds, dtype: float64