# RoBERTA - Claims + All Snippets


The code has been adapted from the notebook By Chris McCormick and Nick Ryan
https://mccormickml.com/2019/07/22/BERT-fine-tuning/


## 1. Using Colab GPU for Training


In [0]:
!nvidia-smi

Fri May 15 23:01:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8     7W /  75W |      0MiB /  7611MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [0]:
import torch
device = torch.device("cpu")

In [2]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P40


In [3]:
torch.cuda.empty_cache()

In [4]:
!pip install transformers



# 2. Loading MultiFC Dataset


## Download & Extract

We'll use the `wget` package to download the dataset to the Colab instance's file system. 

In [5]:
!pip install wget



In [6]:
import wget
import os

print('Downloading dataset...')

# The URL for the dataset zip file.
url = 'https://competitions.codalab.org/my/datasets/download/4db8bf21-def7-4a86-99f5-7b23d5691bb3'

# Download the file (if we haven't already)
if not os.path.exists('multi-fc/'):
    !mkdir multi-fc
    wget.download(url, 'multi-fc/multi-fc.zip')
    !unzip multi-fc/multi-fc.zip -d multi-fc/

Downloading dataset...


## Parse

In [7]:
import pandas as pd
import csv

# Load the dataset into a pandas dataframe.
train_df = pd.read_csv("multi-fc/train.tsv", delimiter='\t', header=None, quoting=csv.QUOTE_NONE, \
                 names= ['claimID', 'claim', 'label', 'claimURL', 'reason', 'categories', 'speaker', \
                  'checker', 'tags', 'articleTitle', 'publishDate', 'claimDate', 'entities'])

dev_df = pd.read_csv("multi-fc/dev.tsv", delimiter='\t', header=None, quoting=csv.QUOTE_NONE, \
                 names= ['claimID', 'claim', 'label', 'claimURL', 'reason', 'categories', 'speaker', \
                  'checker', 'tags', 'articleTitle', 'publishDate', 'claimDate', 'entities'])

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(train_df.shape[0]))

# Report the number of sentences.
print('Number of dev sentences: {:,}\n'.format(dev_df.shape[0]))

# Display 2 random rows from the data.
train_df.head()

Number of training sentences: 27,940

Number of dev sentences: 3,493



Unnamed: 0,claimID,claim,label,claimURL,reason,categories,speaker,checker,tags,articleTitle,publishDate,claimDate,entities
0,pomt-03627,"""Six out of 10 of the highest unemployment rat...",half-true,/ohio/statements/2013/may/06/chris-redfern/ohi...,When a couple of Statehouse Republicans prepar...,,Chris Redfern,,,,2013-05-06T06:00:00,2013-04-30,['None']
1,pomt-09611,"""No Democratic campaign for (Fla.) governor ha...",true,/florida/statements/2010/jan/15/alex-sink/flor...,Florida's leading Republican candidate for gov...,,Alex Sink,,,,2010-01-15T13:59:00,2010-01-06,['None']
2,tron-00214,Forward an email for Jasmine,fiction!,https://www.truthorfiction.com/jasmine/,,9-11-attack,,,,Forward an email for Jasmine,"Mar 17, 2015",,['None']
3,snes-04484,Pope Francis endorsed Donald Trump for president.,false,https://www.snopes.com/fact-check/pope-francis...,,Junk News,,Dan Evon,,"Pope Francis Shocks World, Endorses Donald Tru...",10 July 2016,,['None']
4,pomt-06704,Says Ron Paul insisted FEMA should be shut down.,true,/texas/statements/2011/sep/03/maureen-dowd/mau...,Commenting on the federal response to Hurrican...,,Maureen Dowd,,,,2011-09-03T06:00:00,2011-08-30,['None']


In [8]:
def generate_domain_df(df):
    getDomain = lambda x: x[:4]
    return df['claimID'].apply(getDomain).reset_index().rename(columns={"claimID": "domain"})

In [9]:
def sample_df(df, sample_frac=0.1):
    getDomain = lambda x: x[:4]
    df['domain'] = df['claimID'].apply(getDomain)
    df = df.groupby('domain').apply(lambda x: x.sample(frac = sample_frac))
    df.reset_index(drop=True,inplace=True)
    df.drop(columns=['domain'],inplace=True)
    return df

In [10]:
def drop_malformed(df):
    return df.dropna(subset=['claim']).reset_index()

In [11]:
def get_snippets(claimID):
  snippets = []
  try:
    f=open("multi-fc/snippets/{claimID}".format(claimID=claimID), "r")
    for line in f.readlines():
        split = line.split("\t")
        snippet = split[2]
        snippets.append(snippet)
    return snippets
  except FileNotFoundError:
    ##claimID did not have any snippets (we have 3875 / len(train) instances of this)
    return snippets


In [12]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(train_df['label'])

LabelEncoder()

In [13]:
# train_df = sample_df(train_df)

In [14]:
#gives original train and dev DFs with just the bad claims dropped
train_df = drop_malformed(train_df)
dev_df = drop_malformed(dev_df)

In [15]:
##gives df of domain for every claim
train_domains = generate_domain_df(train_df)
dev_domains = generate_domain_df(dev_df)

##maps domain name to id from 0 to domains - 1 
domain_to_domain_id = {k: v for v, k in enumerate(train_domains.domain.unique())}

## replaces domain name column with domain id
train_domains = train_domains.replace({'domain': domain_to_domain_id})
dev_domains = dev_domains.replace({'domain': domain_to_domain_id})

In [16]:
## creates dataframe with claim, index, domain id
train_claims = pd.concat([train_df[['claim']], train_domains], axis=1)
dev_claims = pd.concat([dev_df[['claim']], dev_domains], axis=1)


In [17]:
##gives dfs with label, index, domain_id
train_labels = pd.concat([train_df['label'], train_domains], axis=1)
dev_labels = pd.concat([dev_df['label'], dev_domains], axis=1)

##creates dictionary mapping domain_id to all possible labels within domain
train_labels_map_to_domain = train_labels.groupby('domain')['label'].apply(set).apply(list).to_dict()
dev_labels_map_to_domain = dev_labels.groupby('domain')['label'].apply(set).apply(list).to_dict()

##maps possible labels within to domain to 0 to num_labels-1
for domain in train_labels_map_to_domain:
    train_labels_map_to_domain[domain] = {k: v for v, k in enumerate(train_labels_map_to_domain[domain])} 



## creates list of size len(train) and len(dev) where each label is mapped to its' value within its' domain as per previous code
original_train_labels = train_labels
train_labels = []
original_dev_labels = dev_labels
dev_labels = []
for index, row in original_train_labels.iterrows():
    train_labels.append(train_labels_map_to_domain[row.domain][row.label])
for index, row in original_dev_labels.iterrows():
    dev_labels.append(train_labels_map_to_domain[row.domain][row.label])

In [18]:
## creates dfs with label_id, index, domain per row (claim) and dictionary mapping
train_lab_dom = pd.concat([pd.DataFrame(train_labels, columns=['label_id']), train_domains], axis=1)
train_domain_label_map = train_lab_dom.groupby('domain')['label_id'].apply(set).to_dict()

dev_lab_dom = pd.concat([pd.DataFrame(dev_labels, columns=['label_id']), dev_domains], axis=1)
dev_domain_label_map = dev_lab_dom.groupby('domain')['label_id'].apply(set).to_dict()

domain_label_map = train_domain_label_map

In [19]:
domain_label_map

{0: {0, 1, 2, 3, 4, 5, 6, 7, 8},
 1: {0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26},
 2: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
 3: {0, 1, 2, 3, 4, 5, 6, 7},
 4: {0, 1, 2},
 5: {0, 1, 2, 3, 4, 5},
 6: {0, 1, 2, 3, 4, 5},
 7: {0, 1, 2, 3},
 8: {0, 1, 2, 3, 4, 5},
 9: {0, 1, 2, 3, 4, 5, 6},
 10: {0, 1, 2, 3, 4, 5, 6},
 11: {0, 1},
 12: {0, 1, 2, 3, 4, 5, 6},
 13: {0, 1, 2},
 14: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
 15: {0, 1, 2, 3},
 16: {0, 1, 2, 3, 4, 5},
 17: {0, 1, 2},
 18: {0, 1, 2, 3, 4, 5, 6},
 19: {0, 1, 2, 3, 4},
 20: {0, 1, 2},
 21: {0, 1, 2},
 22: {0, 1},
 23: {0, 1, 2, 3},
 24: {0, 1, 2},
 25: {0, 1, 2}}

# 3. Tokenization & Input Formatting


In [20]:
from transformers import RobertaTokenizer

# Load the BERT tokenizer.
print('Loading Roberta tokenizer...')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Loading Roberta tokenizer...


## Tokenize Dataset

The transformers library provides a helpful `encode` function which will handle most of the parsing and data prep steps for us.

Before we are ready to encode our text, though, we need to decide on a **maximum sentence length** for padding / truncating to.

The below cell will perform one tokenization pass of the dataset in order to measure the maximum sentence length.

In [21]:
tokenizer.unk_token

'<unk>'

In [22]:
tokenizer.unk_token == '<unk>'

True

In [23]:
max_len = 0

# For every sentence...
for index, row in train_claims.iterrows():
    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    claimID = train_df.iloc[index]['claimID']
    snippets = get_snippets(claimID)
    if(not snippets):
      input_ids = tokenizer.encode(text=row.claim,text_pair = tokenizer.unk_token, add_special_tokens=True)
      max_len = max(max_len, len(input_ids))
    else:
      for snippet in snippets:
        if len(snippet)>0:
            input_ids = tokenizer.encode(text=row.claim,text_pair = snippet, add_special_tokens=True) 
            max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (629 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1310 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (581 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  1315


In [24]:
snippets = [tokenizer.unk_token]
for snippet in snippets:
  print(snippet)

<unk>


In [25]:
from torch.utils.data import TensorDataset
from collections import defaultdict

def encode_claims(df_claims, en_labels, input_seq_length=512,train=True):
    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids = defaultdict(list)
    attention_masks = defaultdict(list)
    labels = defaultdict(list)
    token_type_ids = defaultdict(list)
    claim_ids = defaultdict(list)

    datasets = {}
    domains_to_tensors = {}

    # For every sentence...
    for index, row in df_claims.iterrows():

        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.

        if(train):
            claimID = train_df.iloc[index]['claimID']
        else:
            claimID = dev_df.iloc[index]['claimID']

        snippets = get_snippets(claimID)

        if(not snippets):
            snippets = [tokenizer.unk_token]
    
        for snippet in snippets:
            if len(snippet)>0:
                encoded_dict = tokenizer.encode_plus(
                            row.claim,
                            text_pair = snippet,             # Sentence to encode.
                            add_special_tokens = True,       # Add '[CLS]' and '[SEP]'
                            max_length = input_seq_length, # Pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,    # Construct attn. masks.
                            return_tensors = 'pt', 
                            return_token_type_ids = True          # Return pytorch tensors.
                        )

        # Add the encoded sentence to the list.    
                input_ids[row.domain].append(encoded_dict['input_ids'])

            # And its attention mask (simply differentiates padding from non-padding).
                attention_masks[row.domain].append(encoded_dict['attention_mask'])
                
                if en_labels is not None:
                    labels[row.domain].append(en_labels[index])

            # do the same with token_type_ids
                token_type_ids[row.domain].append(encoded_dict['token_type_ids'])
                #labels[row.domain].append(en_labels[index])

                claim_ids[row.domain].append(int(''.join(x for x in claimID if x.isdigit())))

            
    for domain in domain_label_map:
        # Convert the lists into tensors.
        input_temp = torch.cat(input_ids[domain], dim=0)
        attention_masks_temp = torch.cat(attention_masks[domain], dim=0)
        token_type_ids_temp = torch.cat(token_type_ids[domain],dim=0)
        labels_temp = torch.tensor(labels[domain])
        claim_ids_temp = torch.tensor(claim_ids[domain])
        
        # print(input_temp.shape, attention_masks_temp.shape, token_type_ids_temp.shape,labels_temp.shape, claim_ids_temp.shape)
        if labels_temp.size(0)!= 0:
            datasets[domain] = TensorDataset(input_temp, attention_masks_temp,token_type_ids_temp,labels_temp,claim_ids_temp)
        else:
            datasets[domain] = TensorDataset(input_temp, attention_masks_temp,token_type_ids_temp,claim_ids_temp)

    return datasets

In [26]:
train_datasets = encode_claims(train_claims, train_labels,train=True)
dev_datasets = encode_claims(dev_claims, dev_labels,train=False)

## Training & Validation Split


We'll also create an iterator for our dataset using the torch DataLoader class. This helps save on memory during training because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into memory.

In [27]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 16
batch_size_val = 1

train_dataloaders = {}
validation_dataloaders = {}

for idx, domain in enumerate(domain_label_map):

    # Create the DataLoaders for our training and validation sets.
    # We'll take training samples in random order. 
    train_dataloaders[domain] = DataLoader(
                train_datasets[domain],  # The training samples.
                sampler = RandomSampler(train_datasets[domain]), # Select batches randomly
                batch_size = batch_size # Trains with this batch size.
            )

    # For validation the order doesn't matter, so we'll just read them sequentially.
    validation_dataloaders[domain] = DataLoader(
                dev_datasets[domain], # The validation samples.
                sampler = SequentialSampler( dev_datasets[domain]), # Pull out batches sequentially.
                batch_size = batch_size_val # Evaluate with this batch size.
            )

# 4. Train Our Classification Model

Now that our input data is properly formatted, it's time to fine tune the model. 

## RoBertaForSequenceClassification

In [28]:
from torch import nn

class MultiLinear(nn.Module):
    def __init__(self, hidden_size, domain_label_map):
        super(MultiLinear, self).__init__()
        self.linear_models = {}
        for domain in domain_label_map:
            self.linear_models[domain] = nn.Linear(hidden_size, len(domain_label_map[domain])).to(device)

    def forward(self, domain, input):
        return self.linear_models[domain].forward(input)

In [29]:
from transformers import RobertaForSequenceClassification, RobertaModel
from torch.nn import CrossEntropyLoss

class RobertaForSequenceClassificationMultiDomain(RobertaForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = MultiLinear(config.hidden_size, domain_label_map)
        self.init_weights()
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, \
        inputs_embeds=None, \
        m_labels=None,
        m_domain=None):
        """
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import BertTokenizer, BertForSequenceClassification
        import torch

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, logits = outputs[:2]

        """

        outputs = self.roberta(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(domain, pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        
        if self.training:

            loss_fct = CrossEntropyLoss()

            loss = loss_fct(logits.view(-1, len(domain_label_map[m_domain.item()])), m_labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [30]:
from transformers import RobertaForSequenceClassification, AdamW, RobertaConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = RobertaForSequenceClassificationMultiDomain.from_pretrained(
    "roberta-base", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = len(le.classes_), # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda();

In [31]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The Roberta model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The Roberta model has 199 different named parameters.

==== Embedding Layer ====

roberta.embeddings.word_embeddings.weight               (50265, 768)
roberta.embeddings.position_embeddings.weight             (514, 768)
roberta.embeddings.token_type_embeddings.weight             (1, 768)
roberta.embeddings.LayerNorm.weight                           (768,)
roberta.embeddings.LayerNorm.bias                             (768,)

==== First Transformer ====

roberta.encoder.layer.0.attention.self.query.weight       (768, 768)
roberta.encoder.layer.0.attention.self.query.bias             (768,)
roberta.encoder.layer.0.attention.self.key.weight         (768, 768)
roberta.encoder.layer.0.attention.self.key.bias               (768,)
roberta.encoder.layer.0.attention.self.value.weight       (768, 768)
roberta.encoder.layer.0.attention.self.value.bias             (768,)
roberta.encoder.layer.0.attention.output.dense.weight     (768, 768)
roberta.encoder.layer.0.attention.output.dense.bias         

## Optimizer & Learning Rate Scheduler

In [32]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


In [33]:
from transformers import get_linear_schedule_with_warmup

epochs = 4

total_steps = 0
for domain in domain_label_map:
    # Total number of training steps is [number of batches] x [number of epochs]. 
    # (Note that this is not the same as the number of training samples).
    total_steps += len(train_dataloaders[domain]) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

## Training Loop

In [34]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [35]:
from sklearn.metrics import f1_score

def f1_macro(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat, average='macro')

def f1_micro(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat, average='micro')

Helper function for formatting elapsed times as `hh:mm:ss`


In [36]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


We're ready to kick off the training!

In [37]:
from sklearn.metrics import precision_recall_fscore_support

In [38]:
def f_score_calculation(scores,labels):
    '''
    scores: dict of domains as keys and values as another dictionary with claims as keys and predicted classes as values
    labels: dict of domains as keys and values as another dictionary with claims as keys and actual classes as values

    Returns:
    f1_micro: dictionary of macro f1 scores for each domain
    f1_micro: dictionary of micro f1 scores for each domain
    '''
    f1_macro = {}
    f1_micro = {}
    
    for dom in labels.keys():
        y_pred = [stats.mode(scores[dom][k])[0][0] for k in scores[dom].keys()]
        y_true = [labels[dom][k] for k in scores[dom].keys()]
        f1_macro[dom] = precision_recall_fscore_support(y_true,y_pred,average='macro')[2]
        f1_micro[dom] = precision_recall_fscore_support(y_true,y_pred,average='micro')[2]
    return f1_micro, f1_macro

In [41]:
# scores[2]

In [42]:
# len(scores.keys())

In [44]:
# torch.sum(current_input == b_input_ids[:,:5])==5

In [45]:
# logits[0]

In [46]:
# use when required
model.load_state_dict(torch.load('model.pth'))

<All keys matched successfully>

In [None]:
import random
import numpy as np
import random
from collections import defaultdict
from scipy import stats

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()
best_val = -10000

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    domains = list(domain_label_map.keys())
    random.shuffle(domains)

    dataloaders_iters = {}
    dataloader_batch_tracker = {}
    for domain in train_dataloaders:
        dataloaders_iters[domain] = iter(train_dataloaders[domain])
        dataloader_batch_tracker[domain] = 0
    train_domains_list = domains.copy()
    total_train_loss = 0
    step = 0

    while train_domains_list:
        domain = random.choice(train_domains_list)
        dataloader_batch_tracker[domain] += 1
        try:
            batch = next(dataloaders_iters[domain])
        except StopIteration: 
            train_domains_list.remove(domain)
            continue

        step += 1
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(dataloader_batch_tracker[domain], len(train_dataloaders[domain]), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains four pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: token type ids
        #   [3]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_token_type_ids = batch[2].to(device)
        b_labels = batch[3].to(device)

        b_domain = torch.tensor([domain]).to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        loss, logits = model(b_input_ids, 
                            token_type_ids=b_token_type_ids, 
                            attention_mask=b_input_mask, 
                            m_labels=b_labels,
                            m_domain=b_domain)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / step         

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

# ========================================
#               Validation
# ========================================
# After the completion of each training epoch, measure our performance on
# our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    scores = {k:defaultdict(list) for k in domain_label_map.keys()} #scores is a dictionary with key as domaid_id and the value as a defaultdict with claims as the keys and predictions list as the values
    labels = {k:defaultdict(int) for k in domain_label_map.keys()} #labels is a doctionary with key as domain_id and the values are the true labels


    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # F1 Scores
    f1_macro_score = 0
    f1_micro_score = 0

    for domain in validation_dataloaders:
        dataloaders_iters[domain] = iter(validation_dataloaders[domain])
        dataloader_batch_tracker[domain] = 0
    step = 0
    validation_domains_list = domains.copy()

    while validation_domains_list:
        step += 1
        domain = random.choice(validation_domains_list)

        try:
            batch = next(dataloaders_iters[domain])
        except StopIteration: 
            validation_domains_list.remove(domain)
            continue

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: token type ids 
        #   [3]: labels 
        b_input_ids = batch[0].to(device)
        #print(b_input_ids)
        b_input_mask = batch[1].to(device)
        b_token_type_ids = batch[2].to(device)
        b_labels = batch[3].to(device)
        b_domain = torch.tensor([domain]).to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            (loss, logits) = model(b_input_ids, 
                                token_type_ids=b_token_type_ids, 
                                attention_mask=b_input_mask,
                                m_labels=b_labels,
                                m_domain=b_domain)



        s = str(b_input_ids[0,:5].cpu().tolist()) #converting list to an immutable object so that it can be used as the keys of a hash table
        scores[b_domain.item()][s].append(torch.argmax(logits[0]).item()) 
        labels[b_domain.item()][s] = b_labels[0].item()

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        f1_macro_score += f1_macro(logits, label_ids) 
        f1_micro_score += f1_micro(logits, label_ids) 

    # Accuracy for vote on validation method
    #print("Accuracy for VOTE ON VALIDATION method = {0:.2f}".format(np.mean([stats.mode[v] for k,v in score.items()])))
    micro_f1_vote,macro_f1_vote = f_score_calculation(scores,labels)
    avg_macro_f1_vote = np.mean(list(macro_f1_vote.values()))
    avg_micro_f1_vote = np.mean(list(micro_f1_vote.values()))
    
    if avg_micro_f1_vote>best_val:
        torch.save(model.state_dict(),'model.pth')
        best_val = avg_micro_f1_vote
    
    print("F1 Macro VOTE: {0:.2f}".format(avg_macro_f1_vote))
    print("F1 Micro VOTE: {0:.2f}".format(avg_micro_f1_vote))
    print('By domain, micro VOTE')
    print(micro_f1_vote)
    print('By domain, macro VOTE')
    print(macro_f1_vote)
    
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / step
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Report the final f1 macro score for this validation run.
    avg_val_f1_macro = f1_macro_score / step
    print("  F1 Macro: {0:.2f}".format(avg_val_f1_macro))

    # Report the final f1 micro score for this validation run.
    avg_val_f1_micro = f1_micro_score / step
    print("  F1 Micro: {0:.2f}".format(avg_val_f1_micro))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / step

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

Let's view the summary of the training process.

In [0]:
import pandas as pd

# Display floats with two decimal places.
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

# 5. Performance On Test Set

### Data Preparation



We'll need to apply all of the same steps that we did for the training data to prepare our test data set.

In [49]:
test_df = pd.read_csv("multi-fc/test.tsv", delimiter='\t', header=None, quoting=csv.QUOTE_NONE, \
                 names= ['claimID', 'claim', 'claimURL', 'reason', 'categories', 'speaker', \
                  'checker', 'tags', 'articleTitle', 'publishDate', 'claimDate', 'entities'])

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(test_df.shape[0]))

index_to_add = test_df.index[test_df.isnull()['claim'] == True].tolist()
test_df = drop_malformed(test_df)

##gives df of domain for every claim
test_domains = generate_domain_df(test_df)
## replaces domain name column with domain id
test_domains = test_domains.replace({'domain': domain_to_domain_id})

test_claims = pd.concat([test_df[['claim','claimID']], test_domains], axis=1)

test_datasets = encode_claims(test_claims, en_labels=None, train=False)

#Claim idx domain map used to correctly sort output
test_domain_idx_map = test_domains.groupby('domain')['index'].apply(list).to_dict()

Number of test sentences: 3,491



In [None]:
# test_datasets

In [50]:
test_dataloaders = {}

for idx, domain in enumerate(domain_label_map):

    # Create the DataLoaders for our test set.
    # For test the order doesn't matter, so we'll just read them sequentially.
    
    test_dataloaders[domain] = DataLoader(
                test_datasets[domain], # The validation samples.
                sampler = SequentialSampler( test_datasets[domain]), # Pull out batches sequentially.
                batch_size = 1 # Evaluate with this batch size.
            )

## Evaluate on Test Set



With the test set prepared, we can apply our fine-tuned model to generate predictions on the test set.

In [51]:
print("Running Testing...")
import random
t0 = time.time()
predictions = []
pred_domains = []

scores = {k:defaultdict(list) for k in domain_label_map.keys()} #scores is a dictionary with key as domaid_id and the value as a defaultdict with claims as the keys and predictions list as the values
#labels = {k:defaultdict(int) for k in domain_label_map.keys()} #labels is a doctionary with key as domain_id and the values are the true labels
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
model.eval()
dataloaders_iters = {}
dataloader_batch_tracker = {}
domains = list(domain_label_map.keys())
for domain in test_dataloaders:
    dataloaders_iters[domain] = iter(test_dataloaders[domain])
    dataloader_batch_tracker[domain] = 0
step = 0
test_domains_list = domains.copy()

for domain in test_dataloaders:
    for step, batch in enumerate(test_dataloaders[domain]):
        
        pred_domains.append(domain)

            # Unpack this training batch from our dataloader. 
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using 
            # the `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids 
            #   [1]: attention masks
            #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_token_type_ids = batch[2].to(device)
        b_domain = torch.tensor([domain]).to(device)
        claim_id = batch[3].to(device)
            
            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

                # Forward pass, calculate logit predictions.
                # token_type_ids is the same as the "segment ids", which 
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here: 
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like the softmax.
            logits = model(b_input_ids, 
                                    token_type_ids=b_token_type_ids, 
                                    attention_mask=b_input_mask,
                                    m_domain=b_domain)
        
        #s = str(b_input_ids[0,:5].cpu().tolist()) #converting list to an immutable object so that it can be used as the keys of a hash table
        
        # SR - claim_id.item instead of claim_id (because claim_id is also a tensor)
        scores[b_domain.item()][claim_id.item()].append(torch.argmax(logits[0]).item()) 
        #labels[b_domain.item()][s] = b_labels[0].item()
            
#         # Accumulate the validation loss.
#     logits = outputs[1]
#   # Move logits and labels to CPU
#     logits = logits.detach().cpu().numpy()
  
#   # Store predictions and true labels
#     predictions.append(logits)




Running Testing...


In [88]:
#claim_id.item()

10525

In [1]:
# scores[0].keys()

In [60]:
import copy
from scipy import stats
from statistics import mode

# pred_dict = copy.deepcopy(scores)
pred_dict = {k: defaultdict(int) for k in domain_label_map.keys()}

# y_pred = [stats.mode(scores[dom][k])[0][0] for k in scores[dom].keys()]

for domain in test_dataloaders:
    # for claim_id in test_dataloaders[domain]:
    for batch in test_dataloaders[domain]:
        # print(domain)
        # print(stats.mode(scores[domain.values()][claim_id]))
        claim_id = batch[3].item()
        
        # pred_dict[domain][claim_id] = stats.mode(scores[domain][claim_id])
        pred_dict[domain][claim_id] = stats.mode(scores[domain][claim_id])[0][0]
        # print(pred_dict[domain][claim_id])

In [2]:
# train_labels_map_to_domain

In [67]:
preds = []
for index, row in test_claims.iterrows():
    
    domain = row.domain
    claim_id = int(''.join(x for x in row.claimID if x.isdigit()))
    p = list(train_labels_map_to_domain[domain].keys())[pred_dict[domain][claim_id]]
    preds.append(p)

In [68]:
preds.insert(index_to_add[0],'false')

In [70]:
len(preds)

3491

In [73]:
# write preds to a file

with open('test.predict', 'w') as filehandle:
    filehandle.writelines("%s\n" % pred for pred in preds)