In [1]:
import os
import sys
import json
import re
import itertools

import numpy as np
import os.path as osp

import torch
import csv
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from torch.autograd import Variable

from torch import nn
import torch.nn.functional as F
from tqdm import tqdm
from random import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix

from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertModel, BertTokenizer, DistilBertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [2]:
import dgl
from dgl import DGLGraph
from dgl.data import MiniGCDataset

# Load Pytorch as backend
dgl.load_backend('pytorch')

Using backend: pytorch
Using backend: pytorch


In [3]:
from dgl.nn.pytorch import conv as dgl_conv
from dgl.data import citegrh

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Dataset Configs

In [5]:
configs = {
    'news_headline' : {
        'dataset_path': 'data/NewsHeadline_comet_autocomplete.jsonl',
        'model_name': 'distilbert',
        'model_save_point': 'kaggle-news',
        'epochs': 5,
        'test_size': 0.5
    },
    
    'semeval' : {
        'dataset_path': 'data/SemEval_comet_autocomplete.jsonl',
        'model_name': 'distilbert',
        'model_save_point': 'semeval',
        'epochs': 10,
        'test_size': 0.2
    },
    
    'figlang' : {
        'dataset_path': 'data/FigLang_comet_autocomplete.jsonl',
        'model_name': 'distilbert',
        'model_save_point': 'figlang',
        'epochs': 10,
        'test_size': 0.2
    },
}

In [6]:
config = configs['semeval']

### Dataset Load

In [7]:
def load_dataset(filename):
    data = []
    with open(filename) as f:
        for line in f:
            entry = {}
            entry['sentences'] = []
            
            line = line.strip()
            d = json.loads(line)
            
            entry['sentences'].append(d['sentence'])
            entry['label'] = int(d['label'])
            
            for k in d['common_sense'].keys():
                if k == 'xWant' or k == 'xEffect':
                    entry['sentences'].append(d['common_sense'][k])
            data.append(entry)
                
    return data    

In [8]:
dataset = load_dataset(config['dataset_path'])

### Pre-trained BERT model

In [9]:
def get_attn(input_ids):
    attention_masks = []

    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)
    return attention_masks

In [10]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
tokenizer = DistilBertTokenizer.from_pretrained(config['model_name'] + '-base-uncased', do_lower_case=True)

bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = torch.load('model/distilbert-' + config['model_save_point'] + '.pb')
bert_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

## Load dataset from pickle dump

In [11]:
def get_pickle_file(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [21]:
MAX_LEN = 128

def create_dataset_cached(dataset):
    all_data = []
    
    for data, label in tqdm(dataset):
        input_ids = []
        
        input_ids.append(data['sentence'])
        
        for s in data['support']:
            input_ids.append(s)
            
        input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
        att_mask = torch.tensor(get_attn(input_ids))
        input_ids = torch.tensor(input_ids)
        

        hidden_state = bert_model(input_ids.to(device), attention_mask=att_mask.to(device), output_hidden_states=True)['hidden_states']
        output = hidden_state[-1][:, 0, :].detach().to('cpu')
        torch.cuda.empty_cache()

        graph = dgl.graph((torch.tensor([0, 0]), 
                               torch.tensor([1, 2])))
        graph.ndata['x'] = torch.tensor(output)
        all_data.append((graph, label, data['raw_sentence']))
    return all_data

In [19]:
trainset_, validationset_ = get_pickle_file('model/trainset-' + config['model_save_point'] +'.data'), get_pickle_file('model/validationset-' + config['model_save_point'] +'.data')

In [22]:
trainset, validationset = create_dataset_cached(trainset_), create_dataset_cached(validationset_)

  graph.ndata['x'] = torch.tensor(output)
100%|██████████| 3067/3067 [00:20<00:00, 150.40it/s]
100%|██████████| 767/767 [00:05<00:00, 128.30it/s]


### Training Data generation

In [23]:
def collate(samples):
    # The input `samples` is a list of pairs
    #  (graph, label).
    graphs, labels, raw_sentence = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)
    return batched_graph, torch.tensor([labels]), raw_sentence

In [24]:
BATCH_SIZE = 16

In [25]:
train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate)
test_loader = DataLoader(validationset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate)

## Basic SAGE Model

In [26]:
class GraphSAGEModel(nn.Module):
    def __init__(self,
                 in_feats,
                 n_hidden,
                 out_dim,
                 n_layers,
                 activation,
                 dropout,
                 aggregator_type):
        super(GraphSAGEModel, self).__init__()
        self.node_count = 3
        self.layers = nn.ModuleList()
        self.index = torch.tensor([0]).to(device)

        # input layer
        self.layers.append(dgl_conv.SAGEConv(768, n_hidden, aggregator_type,
                                         feat_drop=dropout, activation=None))
        self.lin1 = nn.Linear(64, 5)
        self.lin2 = nn.Linear(5 * self.node_count, 2)

    def forward(self, g, features):
        h = features
        for layer in self.layers:
            h = layer(g, h)  
        
        x = self.lin1(h.view(-1, self.node_count, 64))
        x = self.lin2(x.view(-1, 5 * self.node_count))
        
        return F.log_softmax(x.view(-1, 2), dim=-1)

In [27]:
# Hyperparameters
n_hidden = 64
n_layers = 2
dropout = 0.5
aggregator_type = 'gcn'
n_classes = 2
in_feats = trainset[0][0].ndata['x'].shape[1]

model = GraphSAGEModel(in_feats,
                             n_hidden,
                             n_classes,
                             n_layers,
                             F.relu,
                             dropout,
                             aggregator_type)

model.to(device)

GraphSAGEModel(
  (layers): ModuleList(
    (0): SAGEConv(
      (feat_drop): Dropout(p=0.5, inplace=False)
      (fc_neigh): Linear(in_features=768, out_features=64, bias=True)
    )
  )
  (lin1): Linear(in_features=64, out_features=5, bias=True)
  (lin2): Linear(in_features=15, out_features=2, bias=True)
)

In [28]:
weight_decay = 5e-4
lr = 2e-3
neg_sample_size = 100

# use optimizer
optimizer = torch.optim.Adam(list(model.parameters()) + list(bert_model.parameters()), lr=lr, weight_decay=weight_decay)
criteria = nn.NLLLoss()

### Evaluation

In [29]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [30]:
def train(epoch):
    model.train()
    
    
    eval_accuracy = 0
    nb_eval_steps = 0
    
    for batch, (g, label, _) in enumerate(train_loader):
        g = g.to(device)
        output = model(g, g.ndata['x'])
        loss = criteria(output, label.view(-1).to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch % 40 == 0:
            print("Epoch {:d} |Batch {:d} | Loss {:.4f} ".format(epoch, batch, loss.item()))
        eval_accuracy += loss.item()
        nb_eval_steps += 1
            
    
    print("Average Training Loss: {0:.2f}".format(eval_accuracy/nb_eval_steps)) 

In [31]:
def test(test_loader):   
    print("")
    print("Running Validation...")
    
    eval_accuracy = 0
    nb_eval_steps = 0
    
    model.eval()
    
    all_predictions = []
    all_labels = []
    
    for batch, (g, label, _) in enumerate(test_loader):
        g = g.to(device)
        
        with torch.no_grad():
            output = model(g, g.ndata['x']) 
        

        logits = output.detach().cpu().numpy()
        label_ids = label.to('cpu').numpy()
        
        prediction = list(np.argmax(logits, axis=1).flatten())
        all_predictions.extend(prediction)
        all_labels.extend(label_ids.flatten())
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
        
    
    accuracy = eval_accuracy/nb_eval_steps
    
    matrix = confusion_matrix(all_predictions, all_labels)

    tp = matrix[0][0]
    fp = matrix[0][1]
    fn = matrix[1][0]
    tn = matrix[1][1]
    
    print("  Sarcastic Precision: {0:.2f}".format(tp/ (tp + fp)))
    print("  Sarcastic F1-score: {0:.2f}".format(2*tp / (2*tp + fn + fp)))
    print("  Sarcastic Recall: {0:.2f}".format(tp / (tp + fn))) 
    
    print()
    
    print("  Non-sarcastic Precision: {0:.2f}".format(tn / (tn + fn)))
    print("  Non-Sarcastic F1-score: {0:.2f}".format(2*tn / (2*tn + fn + fp)))
    print("  Non-sarcasm Recall: {0:.2f}".format(tn / (tn + fp)))


    return accuracy

In [34]:
def runner():
    best_model = model
    best_acc = 0
    
    for epoch in range(config['epochs']):
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, config['epochs']))
        print('Training...')
        train(epoch)
        acc = test(test_loader)
        if acc > best_acc:
            print('Saving ...')
            best_acc = acc
            best_model = model
            print('Done!')

    print("")
    print("Training complete!")
    return best_model, best_acc

In [35]:
runner()

Training...
Epoch 0 |Batch 0 | Loss 0.8374 
Epoch 0 |Batch 40 | Loss 0.6957 
Epoch 0 |Batch 80 | Loss 0.6677 
Epoch 0 |Batch 120 | Loss 0.7447 
Epoch 0 |Batch 160 | Loss 0.6647 
Average Training Loss: 0.70

Running Validation...
  Sarcastic Precision: 0.71
  Sarcastic F1-score: 0.43
  Sarcastic Recall: 0.31

  Non-sarcastic Precision: 0.56
  Non-Sarcastic F1-score: 0.68
  Non-sarcasm Recall: 0.87
Saving ...
Done!
Training...
Epoch 1 |Batch 0 | Loss 0.7113 
Epoch 1 |Batch 40 | Loss 0.6775 
Epoch 1 |Batch 80 | Loss 0.7139 
Epoch 1 |Batch 120 | Loss 0.7215 
Epoch 1 |Batch 160 | Loss 0.5822 
Average Training Loss: 0.67

Running Validation...
  Sarcastic Precision: 0.56
  Sarcastic F1-score: 0.58
  Sarcastic Recall: 0.61

  Non-sarcastic Precision: 0.57
  Non-Sarcastic F1-score: 0.55
  Non-sarcasm Recall: 0.53
Training...
Epoch 2 |Batch 0 | Loss 0.6270 
Epoch 2 |Batch 40 | Loss 0.6582 
Epoch 2 |Batch 80 | Loss 0.6180 
Epoch 2 |Batch 120 | Loss 0.5837 
Epoch 2 |Batch 160 | Loss 0.5532 
Avera

(GraphSAGEModel(
   (layers): ModuleList(
     (0): SAGEConv(
       (feat_drop): Dropout(p=0.5, inplace=False)
       (fc_neigh): Linear(in_features=768, out_features=64, bias=True)
     )
   )
   (lin1): Linear(in_features=64, out_features=5, bias=True)
   (lin2): Linear(in_features=15, out_features=2, bias=True)
 ),
 0.5973090277777778)