In [1]:
import os
import sys
import json
import random
import time
import re
import datetime
import pickle

from tqdm import tqdm

import numpy as np
from localutils import *

import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel, BertTokenizer, DistilBertTokenizer, RobertaTokenizer
from transformers import get_linear_schedule_with_warmup

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaForSequenceClassification, BertForSequenceClassification, AdamW, BertConfig, DistilBertForSequenceClassification, DistilBertConfig

### Dataset Configs

In [6]:
configs = {
    'news_headline' : {
        'dataset_path': 'data/NewsHeadline_comet_autocomplete.jsonl',
        'model_name': 'distilbert',
        'model_save_point': 'kaggle-news',
        'epochs': 5,
        'test_size': 0.5
    },
    
    'semeval' : {
        'dataset_path': 'data/FullSEMEVAL_comet.jsonl', #'data/SemEval_comet_autocomplete.jsonl',
        'model_name': 'distilbert',
        'model_save_point': 'semeval',
        'epochs': 10,
        'test_size': 0.2
    },
    
    'figlang' : {
        'dataset_path': 'data/FigLang_comet_autocomplete.jsonl',
        'model_name': 'distilbert',
        'model_save_point': 'figlang',
        'epochs': 10,
        'test_size': 0.2
    },
}

In [7]:
config = configs['semeval']

In [8]:
PATH = config['dataset_path']

def load_dataset(filename = PATH):
    dataset = []
    with open(filename) as f:
        for line in f:
            entry = {}
            
            line = line.strip()
            d = json.loads(line)
            
            entry['sentence'] = d['sentence']
            entry['label'] = int(d['label'])
            entry['support'] = []
            for k in d['common_sense'].keys():
                if k == 'xWant' or k == 'xEffect':
                    entry['support'].append(d['common_sense'][k][0] if d['common_sense'][k][0] != 'none' else d['common_sense'][k][1])

            dataset.append(entry)
    return dataset

In [10]:
dataset = load_dataset()

### BERT Pre-processing

In [11]:
model_name = config['model_name']

if model_name == "distilbert":
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
elif model_name == "bert":
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
elif model_name == "roberta":
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

In [12]:
def encode_all(sentences):
    input_ids = []
    for data in sentences:
        input_ids.append(tokenizer.encode(data, add_special_tokens=True))
    return input_ids

In [13]:
MAX_LEN = 32

### Attention Mask

In [14]:
def get_attn(input_ids):
    attention_masks = []

    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)
    return attention_masks

In [15]:
def create_dataset(dataset):
    all_data = []
    
    for data in tqdm(dataset):
        input_ids = []
        
        input_ids.append(tokenizer.encode(data['sentence'].lower()))
        
        for s in data['support']:
            input_ids.append(tokenizer.encode(s.lower()))
            
        input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
        attn_mask = torch.tensor(get_attn(input_ids))
        input_ids = torch.tensor(input_ids)
        
        entry = {}
        entry['raw_sentence'] = data['sentence'] + " [SUPPORT]: " + " [SEP] ".join(data['support'])
        entry['sentence'] = input_ids[0]
        entry['sentence_mask'] = attn_mask[0]
        
        entry['support'] = input_ids[1:]
        entry['support_mask'] = attn_mask[1:]
        
        all_data.append((entry, data['label']))
    return all_data

In [16]:
dataset_list = create_dataset(dataset)

100%|██████████| 3834/3834 [00:03<00:00, 1212.76it/s]


### Training & Validation Split

In [17]:
trainset, validationset = train_test_split(dataset_list, random_state=2018, test_size=config['test_size'])

In [18]:
def save_dataset():
    with open('model/trainset-' + config['model_save_point'] +'.data', 'wb') as f:
        pickle.dump(trainset, f)

    with open('model/validationset-' + config['model_save_point'] + '.data', 'wb') as f:
        pickle.dump(validationset, f)

In [19]:
save_dataset()

### Data Loaders

In [20]:
BATCH_SIZE = 16

train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(validationset, batch_size=BATCH_SIZE, shuffle=True)

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Train Our Classification Model

In [22]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 

if model_name == "distilbert":
    model = DistilBertForSequenceClassification.from_pretrained(
                                            "distilbert-base-uncased",
                                            num_labels = 2, 
                                            output_attentions = False, 
                                            output_hidden_states = True, 
                                        )
elif model_name == "roberta":
    model = RobertaForSequenceClassification.from_pretrained(
                                            "roberta-base",
                                            num_labels = 2, 
                                            output_attentions = True, 
                                            output_hidden_states = False, 
                                        )
elif model_name == "roberta":
    model = BertForSequenceClassification.from_pretrained(
                                            "bert-base-uncased",
                                            num_labels = 2, 
                                            output_attentions = False, 
                                            output_hidden_states = False, 
                                        )
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [23]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8 
                )

total_steps = len(train_loader) * config['epochs']
    
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

### Utility

In [24]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

## Training

In [25]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [26]:
def train():
    total_loss = 0
    model.train()
    for step, (batch, labels) in enumerate(train_loader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))

        b_input_ids = batch['sentence'].to(device)
        b_input_mask = batch['sentence_mask'].to(device)
        b_labels = labels.to(device)
        model.zero_grad()        
        
        outputs = model(b_input_ids, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
                
        loss = outputs[0]

        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)            
    
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

In [27]:
def test():
    print("")
    print("Running Validation...")
    t0 = time.time()
    
    model.eval()
    
    all_predictions = []
    all_labels = []
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch, b_labels in test_loader:
        
        with torch.no_grad():        
            outputs = model(batch['sentence'].to(device), 
                            attention_mask=batch['sentence_mask'].to(device))
        
        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        prediction = list(np.argmax(logits, axis=1).flatten())
        all_predictions.extend(prediction)
        all_labels.extend(label_ids.flatten())
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    
    acc = eval_accuracy/nb_eval_steps
    print("  Accuracy: {0:.2f}".format(acc))
    
    
    matrix = confusion_matrix(all_predictions, all_labels)

    tp = matrix[0][0]
    fp = matrix[0][1]
    fn = matrix[1][0]
    tn = matrix[1][1]
    
    print("  Sarcastic Precision: {0:.4f}".format(tp/ (tp + fp)))
    print("  Sarcastic F1-score: {0:.4f}".format(2*tp / (2*tp + fn + fp)))
    print("  Sarcastic Recall: {0:.4f}".format(tp / (tp + fn))) 
    
    print()
    
    print("  Non-sarcastic Precision: {0:.4f}".format(tn / (tn + fn)))
    print("  Non-Sarcastic F1-score: {0:.4f}".format(2*tn / (2*tn + fn + fp)))
    print("  Non-sarcasm Recall: {0:.4f}".format(tn / (tn + fp)))
    
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    return acc

In [28]:
def save_model(filename = 'model/distilbert-'+ config['model_save_point'] +'.pb'):
    print('Saving model...')
    torch.save(model, filename)

In [29]:
loss_values = []
best_val_acc = 0

for epoch_i in range(0, config['epochs']):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, config['epochs']))
    print('Training...')

    t0 = time.time()
    train()
    acc = test()
    if best_val_acc < acc:
        best_val_acc = acc
        save_model()
    
    
print("")
print("Training complete!")

Training...
  Batch    40  of    192.    Elapsed: 0:00:02.
  Batch    80  of    192.    Elapsed: 0:00:03.
  Batch   120  of    192.    Elapsed: 0:00:04.
  Batch   160  of    192.    Elapsed: 0:00:06.

  Average training loss: 0.65
  Training epoch took: 0:00:07

Running Validation...
  Accuracy: 0.64
  Sarcastic Precision: 0.6677
  Sarcastic F1-score: 0.6152
  Sarcastic Recall: 0.5703

  Non-sarcastic Precision: 0.6241
  Non-Sarcastic F1-score: 0.6667
  Non-sarcasm Recall: 0.7154
  Validation took: 0:00:00
Saving model...
Training...
  Batch    40  of    192.    Elapsed: 0:00:01.
  Batch    80  of    192.    Elapsed: 0:00:03.
  Batch   120  of    192.    Elapsed: 0:00:04.
  Batch   160  of    192.    Elapsed: 0:00:05.

  Average training loss: 0.53
  Training epoch took: 0:00:07

Running Validation...
  Accuracy: 0.69
  Sarcastic Precision: 0.6697
  Sarcastic F1-score: 0.7099
  Sarcastic Recall: 0.7552

  Non-sarcastic Precision: 0.7186
  Non-Sarcastic F1-score: 0.6695
  Non-sarcasm Re