https://mccormickml.com/2019/07/22/BERT-fine-tuning/

## Imports and Downloading Data

In [2]:
import tensorflow as tf
import torch

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

ModuleNotFoundError: No module named 'tensorflow'

In [3]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [4]:
!pip3 install wget

[33mYou are using pip version 18.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [5]:
import wget
import os

print('Downloading dataset...')

# The URL for the dataset zip file.
url = 'https://competitions.codalab.org/my/datasets/download/4db8bf21-def7-4a86-99f5-7b23d5691bb3'

# Download the file (if we haven't already)
if not os.path.exists('multi-fc/'):
    !mkdir multi-fc
    wget.download(url, 'multi-fc/multi-fc.zip')
    !unzip multi-fc/multi-fc.zip -d multi-fc/

Downloading dataset...


## Reading in Data

In [6]:
import pandas as pd
import csv

# Load the dataset into a pandas dataframe.
df = pd.read_csv("multi-fc/train.tsv", delimiter='\t', header=None, quoting=csv.QUOTE_NONE, \
                 names= ['claimID', 'claim', 'label', 'claimURL', 'reason', 'categories', 'speaker', \
                  'checker', 'tags', 'articleTitle', 'publishDate', 'claimDate', 'entities'])

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 2 random rows from the data.
df.sample(2)

Number of training sentences: 27,940



Unnamed: 0,claimID,claim,label,claimURL,reason,categories,speaker,checker,tags,articleTitle,publishDate,claimDate,entities
20860,pomt-00316,"""Kavanaugh accuser’s brother’s firm linked to ...",pants on fire!,/facebook-fact-checks/statements/2018/sep/21/b...,Christine Blasey Ford’s brother’s job history ...,,Bloggers,,,,2018-09-21T09:58:59,2018-09-17,['None']
10987,tron-02120,Inspirational story titled “The Room”,fiction!,https://www.truthorfiction.com/theroom/,,inspirational,,,,Inspirational story titled “The Room”,"Mar 16, 2015",,['None']


In [7]:
# Contains empty claim
indexNames = df[ df['claimID'] == 'bove-00197' ].index
 
# Delete these row indexes from dataFrame
df.drop(indexNames , inplace=True)
sentences = df.claim.values
labels = df.label.values

In [8]:
import numpy as np

In [9]:
len(np.unique(labels))

116

### Adding Evidence Snippets to Claims

In [10]:
'''
pre_instances[] is a list of claim+snippet pairs

claimsnippet_labels[] is a list of the labels (since every claim has one label and is being expanded to claim+label 
pairs), essentially just an expanded list of the original labels

[SEP] token is used not as a BERT separator token, but so that when we tokenize we can properly split up the 
claim and snippet to pass into encode_plus as separate arguments (see tokenizer section for context)

[UNK] token in the exception is used for claims that do not have evidence snippets, BERT can handle this inherently


'''

'\npre_instances[] is a list of claim+snippet pairs\n\nclaimsnippet_labels[] is a list of the labels (since every claim has one label and is being expanded to claim+label \npairs), essentially just an expanded list of the original labels\n\n[SEP] token is used not as a BERT separator token, but so that when we tokenize we can properly split up the \nclaim and snippet to pass into encode_plus as separate arguments (see tokenizer section for context)\n\n[UNK] token in the exception is used for claims that do not have evidence snippets, BERT can handle this inherently\n\n\n'

In [11]:
pre_instances = []
claimsnippet_labels = []
count = 0
for a in range(len(list(df.claim))):
    claim,claimID,label = list(df.claim)[a], list(df.claimID)[a], list(df.label)[a]
    try:
        f=open("multi-fc/snippets/{claimID}".format(claimID=claimID), "r")
        for line in f.readlines():
            split = line.split("\t")
            snippet = split[2]
            pre_instance =  claim +" [SEP] "+snippet
            pre_instances.append(pre_instance)
            claimsnippet_labels.append(label)
    except FileNotFoundError:
            pre_instance = claim + "[SEP]" + "[UNK]"
            pre_instances.append(pre_instance)
            claimsnippet_labels.append(label)

    

In [12]:
pre_instances[0]

'"Six out of 10 of the highest unemployment rates are also in so-called right to work states." [SEP] May 8, 2013 ... Ron Maag and Kristina Roegner, claiming that "six out of 10 of the highest  unemployment rates are also in so-called right to work states.'

### Encoding Labels and Importing Tokenizer

In [13]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
elongated_labels = le.fit_transform(claimsnippet_labels)

In [14]:
!pip3 install transformers

Collecting transformers
  Using cached https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl
Collecting tokenizers==0.5.2 (from transformers)
  Using cached https://files.pythonhosted.org/packages/f5/d7/a3882b2d36991f613b749fc5e305cccc345ec9d6ab0621ad7e7bf1be8691/tokenizers-0.5.2.tar.gz
  Installing build dependencies ... [?25ldone
Collecting sacremoses (from transformers)
  Using cached https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz
Collecting click (from sacremoses->transformers)
  Using cached https://files.pythonhosted.org/packages/dd/c0/4d8f43a9b16e289f36478422031b8a63b54b6ac3b1ba605d602f10dd54d6/click-7.1.1-py2.py3-none-any.whl
Collecting joblib (from sacremoses->transformers)
  Using cached https://files.pythonhosted.org/packages/28/5c/cf6a2b65a321c4a209efcdf64c2689efae2cb62661f8f6f4bb28547cf1bf/joblib-0.14

In [15]:
from transformers import BertTokenizer
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


### Viewing Example

In [16]:
# Print the original sentence.
print(' Original: ', pre_instances[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(pre_instances[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(pre_instances[0])))

 Original:  "Six out of 10 of the highest unemployment rates are also in so-called right to work states." [SEP] May 8, 2013 ... Ron Maag and Kristina Roegner, claiming that "six out of 10 of the highest  unemployment rates are also in so-called right to work states.
Tokenized:  ['"', 'six', 'out', 'of', '10', 'of', 'the', 'highest', 'unemployment', 'rates', 'are', 'also', 'in', 'so', '-', 'called', 'right', 'to', 'work', 'states', '.', '"', '[SEP]', 'may', '8', ',', '2013', '.', '.', '.', 'ron', 'ma', '##ag', 'and', 'kristina', 'roe', '##gne', '##r', ',', 'claiming', 'that', '"', 'six', 'out', 'of', '10', 'of', 'the', 'highest', 'unemployment', 'rates', 'are', 'also', 'in', 'so', '-', 'called', 'right', 'to', 'work', 'states', '.']
Token IDs:  [1000, 2416, 2041, 1997, 2184, 1997, 1996, 3284, 12163, 6165, 2024, 2036, 1999, 2061, 1011, 2170, 2157, 2000, 2147, 2163, 1012, 1000, 102, 2089, 1022, 1010, 2286, 1012, 1012, 1012, 6902, 5003, 8490, 1998, 28802, 20944, 10177, 2099, 1010, 6815, 20

### Tokenizer

Now that our instances are pre_processed, we can pass them into the BERT tokenizer

In [17]:
'''
every "sent" in pre_sentences is broken up into its constituent claim and snippet using the dummu SEP token
      AKA .split("SEP")
'''

'\nevery "sent" in pre_sentences is broken up into its constituent claim and snippet using the dummu SEP token\n      AKA .split("SEP")\n'

In [18]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in pre_instances:
    
    claim, snippet = sent.split("[SEP]")[0], sent.split("[SEP]")[1]
   
    encoded_dict = tokenizer.encode_plus(
                        claim, #claim to encode
                        snippet,# snippet to encode
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                    )

    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
elongated_labels = torch.tensor(elongated_labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', pre_instances[0])
print('Token IDs:', input_ids[0])

Original:  "Six out of 10 of the highest unemployment rates are also in so-called right to work states." [SEP] May 8, 2013 ... Ron Maag and Kristina Roegner, claiming that "six out of 10 of the highest  unemployment rates are also in so-called right to work states.
Token IDs: tensor([  101,  1000,  2416,  2041,  1997,  2184,  1997,  1996,  3284, 12163,
         6165,  2024,  2036,  1999,  2061,  1011,  2170,  2157,  2000,  2147,
         2163,  1012,  1000,   102,  2089,  1022,  1010,  2286,  1012,  1012,
         1012,  6902,  5003,  8490,  1998, 28802, 20944, 10177,  2099,  1010,
         6815,  2008,  1000,  2416,  2041,  1997,  2184,  1997,  1996,  3284,
        12163,  6165,  2024,  2036,  1999,  2061,  1011,  2170,  2157,  2000,
         2147,  2163,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,

In [19]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [20]:
# Select a batch size for training. 
batch_size = 32

# Create an iterator of our data with torch DataLoader 
train_data = TensorDataset(input_ids, attention_masks, elongated_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [21]:
!pip install pytorch-pretrained-bert pytorch-nlp



In [22]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=116)
model.cpu()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [23]:
# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

t_total value of -1 results in schedule not being applied


In [24]:
from tqdm import tqdm, trange

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
  
# Store our loss and accuracy for plotting
train_loss_set = []
# Number of training epochs 
epochs = 4

# BERT training loop
for _ in trange(epochs, desc="Epoch"):  
  
  ## TRAINING
  
  # Set our model to training mode
    model.train()  
  # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
  # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to CPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        train_loss_set.append(loss.item())    
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# plot training performance
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_loss_set)
plt.show()