In [1]:
import os
import json
import sys
import csv
import re
import torch

import random

import spacy
nlp = spacy.load("en_core_web_sm")

from random import shuffle
from tqdm import tqdm
import numpy as np

from comet2.comet_model import PretrainedCometModel
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification

PyTorch version 1.6.0 available.
TensorFlow version 2.2.0 available.
01/27/2021 20:20:02 - INFO - pytorch_transformers.modeling_bert -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
01/27/2021 20:20:02 - INFO - pytorch_transformers.modeling_xlnet -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


### Load Kaggle Sarcasm Dataset

In [2]:
def load_kaggle_dataset(dataset_path='data/', dataset_files=None):
    """Load the Kaggle news-headline sarcasm data from JSON-lines files.

    Each non-blank line of every file is parsed as a JSON object and must
    provide the keys ``'headline'`` and ``'is_sarcastic'``.

    Args:
        dataset_path: Directory that contains the files. Defaults to
            ``'data/'``.
        dataset_files: Iterable of file names to read, in order. Defaults to
            ``Sarcasm_Headlines_Dataset.json`` followed by
            ``Sarcasm_Headlines_Dataset_v2.json``.

    Returns:
        dict with two parallel lists: ``'data'`` (headlines with leading and
        trailing double quotes stripped) and ``'label'`` (the
        ``is_sarcastic`` value of each record).
    """
    if dataset_files is None:
        dataset_files = ['Sarcasm_Headlines_Dataset.json', 'Sarcasm_Headlines_Dataset_v2.json']

    dataset = {'data': [], 'label': []}

    for file in dataset_files:
        # Explicit encoding so the result does not depend on the platform's
        # default locale.
        with open(os.path.join(dataset_path, file), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line is not a record; json.loads('') would raise.
                    continue
                entry = json.loads(line)
                dataset['data'].append(entry['headline'].strip('"'))
                dataset['label'].append(entry['is_sarcastic'])
    return dataset

### SemEval Dataset

In [3]:
# Location of the SemEval training file that load_dataset_semeval() reads
# (tab-separated, first line is a header).
SEMEVAL_PATH = 'data/SemEval2018-T3-train-taskA.txt'

def clean_string(text):
    """Normalise a raw text string.

    Applied in order: URLs are replaced by a placeholder ``ent<N>`` (``N`` is
    drawn once per call from ``random.randint(0, 1000)``, so every URL in the
    same text gets the same placeholder), ``#`` characters become spaces,
    ``:name:``-style emoji codes are removed, and runs of spaces are
    collapsed. The result is stripped of surrounding whitespace.
    """
    # Drawn eagerly, even when the text contains no URL.
    url_placeholder = 'ent' + str(random.randint(0, 1000))
    substitutions = (
        (r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', url_placeholder),
        (r'\#', ' '),         # drop the hashtag marker, keep the word
        (r'\:\S+\:', ''),     # emoji codes such as :grim_face:
        (r' +', ' '),         # squeeze repeated spaces
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def load_dataset_semeval():
    """Load the SemEval training file found at ``SEMEVAL_PATH``.

    The first line is treated as a header and skipped. Every other line is
    split on tabs: column 1 is parsed as the integer label and column 2 is
    passed through ``clean_string``. Lines with fewer than three columns
    (for instance a trailing blank line) are skipped.

    Returns:
        dict with two parallel lists: ``'data'`` (cleaned texts) and
        ``'label'`` (ints).
    """
    dataset = {'data': [], 'label': []}

    # Explicit encoding: the text column is free-form and must not depend on
    # the platform default.
    with open(SEMEVAL_PATH, encoding='utf-8') as f:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(f, None)
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 3:
                # Blank or truncated row: indexing fields[2] would raise IndexError.
                continue
            dataset['data'].append(clean_string(fields[2]))
            dataset['label'].append(int(fields[1]))
    return dataset

### ACL Shared Task

In [4]:
# Location of the JSON-lines Reddit training file that load_dataset_figlang() reads.
ACL_REDDIT_PATH = 'data/reddit/sarcasm_detection_shared_task_reddit_training.jsonl'

def load_dataset_figlang(max_words=100):
    """Load the Reddit shared-task file found at ``ACL_REDDIT_PATH``.

    Each non-blank line is parsed as a JSON object with the keys
    ``'response'`` and ``'label'``. The response is passed through
    ``clean_string``; it is kept only when it has fewer than ``max_words``
    whitespace-separated words.

    Args:
        max_words: Exclusive upper bound on the number of words in a kept
            response. Defaults to 100.

    Returns:
        dict with two parallel lists: ``'data'`` (cleaned responses) and
        ``'label'`` (1 when the record's label equals ``"SARCASM"``, else 0).
    """
    dataset = {'data': [], 'label': []}

    # Explicit encoding so parsing does not depend on the platform default.
    with open(ACL_REDDIT_PATH, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are not records; json.loads('') would raise.
                continue
            entry = json.loads(line)
            text = clean_string(entry['response'])

            if len(text.split()) < max_words:
                dataset['data'].append(text)
                dataset['label'].append(int(entry['label'] == "SARCASM"))
    return dataset

### COMET Model

In [5]:
# Instantiate the pretrained COMET model used by gather_commonsense().
# NOTE(review): device=0 presumably selects the first CUDA GPU, so this cell
# would need a GPU even though the BERT cell below falls back to CPU — confirm
# against the comet2 API.
comet_model = PretrainedCometModel(device=0)



In [6]:
# Relation names passed, one at a time, to comet_model.predict for every sentence.
CATEGORIES = ["oReact", "oEffect", "oWant", "xAttr", "xEffect", "xIntent", "xNeed", "xReact", "xWant"]

def gather_commonsense(sentence):
    """Query the COMET model once per entry of ``CATEGORIES``.

    Returns:
        dict mapping each category name to whatever
        ``comet_model.predict(sentence, category, num_beams=8)`` returns,
        in ``CATEGORIES`` order.
    """
    return {
        category: comet_model.predict(sentence, category, num_beams = 8)
        for category in CATEGORIES
    }

def get_comet_data(dataset):
    """Attach COMET predictions to every sentence of ``dataset``.

    Args:
        dataset: dict with parallel lists under ``'data'`` (texts) and
            ``'label'``.

    Returns:
        List of JSON strings, one per input text, each encoding an object
        with the keys ``'sentence'`` (the lower-cased text), ``'label'`` and
        ``'common_sense'`` (the result of ``gather_commonsense`` on the
        lower-cased text).
    """
    serialized = []
    for idx in tqdm(range(len(dataset['data']))):
        lowered = dataset['data'][idx].lower()
        record = {
            'sentence': lowered,
            'label': dataset['label'][idx],
            'common_sense': gather_commonsense(lowered),
        }
        serialized.append(json.dumps(record))
    return serialized

In [7]:
def write_content(filename, content=None):
    """Write an iterable of strings to ``filename``, one per line.

    Args:
        filename: Path of the file to (over)write.
        content: Iterable of strings to write. When omitted, the function
            falls back to a module-level variable named ``content`` (the
            only source the earlier one-argument version used).

    Raises:
        NameError: if ``content`` is omitted and no module-level ``content``
            exists.
    """
    if content is None:
        # Legacy path: resolve the global *before* opening the file so a
        # missing variable does not leave a truncated output file behind.
        try:
            content = globals()['content']
        except KeyError:
            raise NameError("name 'content' is not defined; pass it as the second argument") from None

    with open(filename, "w", encoding="utf-8") as f:
        for c in content:
            f.write(c + '\n')

In [8]:
def process_dataset(config = 'semeval'):
    """Load one dataset, add COMET predictions and save them as JSON lines.

    Args:
        config: Which dataset to process: ``'semeval'``,
            ``'news_headline'`` or ``'figlang'``. Any other value prints
            ``'Unknown dataset!'`` and returns without writing anything.

    Side effects:
        Writes ``data/SemEval_comet.jsonl``, ``data/NewsHeadline_comet.jsonl``
        or ``data/FigLang_comet.jsonl`` (one JSON string per line, as
        returned by ``get_comet_data``).
    """
    if config == 'semeval':
        dataset = load_dataset_semeval()
        write_filename = 'SemEval_comet.jsonl'

    elif config == 'news_headline':
        dataset = load_kaggle_dataset()
        write_filename = 'NewsHeadline_comet.jsonl'

    elif config == 'figlang':
        dataset = load_dataset_figlang()
        write_filename = 'FigLang_comet.jsonl'

    else:
        print('Unknown dataset!')
        return

    content = get_comet_data(dataset)

    # The records are written here, from the local variable, because
    # write_content(filename) called with only a filename looks for a
    # module-level `content` that this function never sets (NameError on a
    # fresh kernel).
    with open('data/' + write_filename, "w") as f:
        for record in content:
            f.write(record + '\n')

In [9]:
# Uncomment the calls below to run process_dataset for each dataset in turn.

# process_dataset(config = 'semeval') 
# process_dataset(config = 'news_headline') 
# process_dataset(config = 'figlang') 

### Auto complete COMET sequences

In [10]:
# Device for the BERT model and its input tensors: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# Tokenizer and masked-language-model head for 'bert-base-uncased'; both are
# used by get_sentence() to predict the token at a [MASK] position.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# output_attentions=True asks the model to return attention weights alongside the logits.
model = BertForMaskedLM.from_pretrained('bert-base-uncased', output_attentions=True)
model.to(device)
# Inference only: put the module in evaluation mode.
model.eval()

01/27/2021 20:20:10 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/somnath/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
01/27/2021 20:20:10 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/somnath/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
01/27/2021 20:20:10 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [12]:
def get_sentence(trail, subject):
    """Join ``subject`` and ``trail`` with the token BERT predicts between them.

    The text ``[CLS] <subject> [MASK] <trail>[SEP]`` is tokenized with the
    module-level ``tokenizer`` and scored by the module-level masked-LM
    ``model``; the highest-scoring vocabulary token at the ``[MASK]``
    position is used as the connecting word.

    Args:
        trail: Text placed after the predicted token.
        subject: Text placed before the predicted token.

    Returns:
        ``"<subject> <predicted token> <trail>"``.
    """
    tokens = tokenizer.tokenize('[CLS] ' + subject + ' [MASK] ' + trail +'[SEP]')
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    mask_position = tokens.index('[MASK]')

    # Single-segment input: every token gets segment id 0.
    input_ids = torch.tensor([token_ids]).to(device)
    segment_ids = torch.tensor([[0] * len(tokens)]).to(device)

    # Inference only, so no gradients are tracked; element 0 of the model
    # output holds the per-position vocabulary scores.
    with torch.no_grad():
        scores = model(input_ids, token_type_ids=segment_ids)[0]

    best_id = torch.argmax(scores[0, mask_position]).item()
    filler = tokenizer.convert_ids_to_tokens([best_id])[0]
    return ' '.join([subject, filler, trail])

In [13]:
def get_subject(text):
    """Return the first nominal subject of the first sentence of ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``; only the
    first sentence is inspected.

    Returns:
        The text of the first token whose dependency label is ``'nsubj'``,
        or ``""`` when the first sentence has no such token or the text
        yields no sentence at all (e.g. an empty string).
    """
    doc = nlp(text)
    # A default is required here: with no sentences, a bare next() raises
    # StopIteration and aborts the caller's loop.
    sentence = next(iter(doc.sents), None)
    if sentence is None:
        return ""
    for w in sentence:
        if w.dep_ == 'nsubj':
            return w.text
    return ""

def process_concepts(subject, concept_list):
    """Turn the first usable concept of ``concept_list`` into a phrase.

    Entries equal to ``'none'`` are skipped. The first remaining entry is
    returned unchanged when ``subject`` is empty; otherwise it is combined
    with ``subject`` through ``get_sentence``. Later entries are never used.

    Returns:
        The resulting string, or ``None`` when the list holds no usable entry.
    """
    usable = (concept for concept in concept_list if concept != 'none')
    for concept in usable:
        # Only the first usable concept is ever consumed.
        return concept if subject == '' else get_sentence(concept, subject)
    return None

def reprocess_dataset(filename):
    """Rewrite the COMET output of a JSON-lines file as subject-led phrases.

    Each non-blank line must be a JSON object with the keys ``'sentence'``,
    ``'label'`` and ``'common_sense'`` (a mapping from relation name to a
    list of concepts). For the relations ``xWant``, ``xNeed``, ``xIntent``
    and ``xEffect`` the concept list is reduced to a single value by
    ``process_concepts``, using the subject that ``get_subject`` extracts
    from the sentence; all other relations are dropped.

    Args:
        filename: Path of the JSON-lines file to read.

    Returns:
        List of dicts with the keys ``'sentence'``, ``'label'`` and
        ``'common_sense'``.
    """
    kept_relations = ('xWant', 'xNeed', 'xIntent', 'xEffect')

    # Count lines first so tqdm can show a total; the with-block makes sure
    # this extra handle is closed again.
    with open(filename, 'r') as f:
        num_lines = sum(1 for _ in f)

    content = []
    with open(filename) as f:
        for line in tqdm(f, total = num_lines):
            line = line.strip()
            if not line:
                # Blank lines are not records; json.loads('') would raise.
                continue
            entry = json.loads(line)
            s = get_subject(entry['sentence'])

            new_entry = {
                'sentence': entry['sentence'],
                'label': entry['label'],
                'common_sense': {
                    relation: process_concepts(s, concepts)
                    for relation, concepts in entry['common_sense'].items()
                    if relation in kept_relations
                },
            }
            content.append(new_entry)
    return content

In [14]:
def write_to_file(filename, content=None):
    """Write an iterable of JSON-serialisable records to ``filename``.

    Each record is encoded with ``json.dumps`` and written on its own line.

    Args:
        filename: Path of the file to (over)write.
        content: Iterable of records to write. When omitted, the function
            falls back to a module-level variable named ``c`` (the only
            source the earlier one-argument version used).

    Raises:
        NameError: if ``content`` is omitted and no module-level ``c`` exists.
    """
    if content is None:
        # Legacy path: resolve the global *before* opening the file so a
        # missing variable does not leave a truncated output file behind.
        try:
            content = globals()['c']
        except KeyError:
            raise NameError("name 'c' is not defined; pass the records as the second argument") from None

    with open(filename, "w", encoding="utf-8") as f:
        for line in content:
            f.write(json.dumps(line) + '\n')

In [15]:
def autocomplete_dataset(config = 'semeval'):
    """Post-process one COMET output file and save the result as JSON lines.

    Args:
        config: Which dataset to process: ``'semeval'``,
            ``'news_headline'`` or ``'figlang'``. Any other value prints
            ``'Unknown dataset!'`` and returns without writing anything.

    Side effects:
        Reads ``data/<Name>_comet.jsonl`` (the file names that
        ``process_dataset`` writes) through ``reprocess_dataset`` and writes
        ``data/<Name>_comet_autocomplete.jsonl``, one JSON object per line.
    """
    if config == 'semeval':
        # Must match the name process_dataset writes for this config;
        # the previous 'FullSEMEVAL_comet.jsonl' is not produced anywhere in this notebook.
        read_filename = 'SemEval_comet.jsonl'
        write_filename = 'SemEval_comet_autocomplete.jsonl'

    elif config == 'news_headline':
        read_filename = 'NewsHeadline_comet.jsonl'
        write_filename = 'NewsHeadline_comet_autocomplete.jsonl'

    elif config == 'figlang':
        read_filename = 'FigLang_comet.jsonl'
        write_filename = 'FigLang_comet_autocomplete.jsonl'

    else:
        print('Unknown dataset!')
        return

    content = reprocess_dataset('data/' + read_filename)

    # The records are written here, from the local variable, because
    # write_to_file(filename) called with only a filename looks for a
    # module-level `c` that this function never sets (NameError on a fresh
    # kernel).
    with open('data/' + write_filename, "w") as f:
        for record in content:
            f.write(json.dumps(record) + '\n')

In [16]:
# Uncomment the calls below to run autocomplete_dataset for each dataset in turn.

# autocomplete_dataset(config = 'semeval') 
# autocomplete_dataset(config = 'news_headline') 
# autocomplete_dataset(config = 'figlang') 