# Supplementary code package for 'Toward Informal Language Processing: Knowledge of Slang in Large Language Models'.

By: [Zhewei Sun](https://www.cs.toronto.edu/~zheweisun/)

This notebook contains experiment code for the NAACL Paper 'Toward Informal Language Processing: Knowledge of Slang in Large Language Models'. Here, we show how the contributed dataset can be used to reproduce the main experiments in the paper. We include all code used to execute the experiments on BERT-like models (this includes BERT, RoBERTa, and XLNet) and illustrate example usage with BERT.

To run this notebook, you will need the following Python packages:

- nltk
- numpy
- pandas
- pytorch
- scipy
- tqdm
- transformers

In [1]:
import io
import os
import pickle
import re
import time

from tqdm import trange

import numpy as np
import scipy.stats
import pandas as pd

from collections import defaultdict, namedtuple, Counter, defaultdict, OrderedDict

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords as sw

In [2]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification, BertForTokenClassification
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForTokenClassification
from transformers import XLNetTokenizer, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForTokenClassification
from transformers import BertTokenizerFast, RobertaTokenizerFast, XLNetTokenizerFast

from transformers import logging
logging.set_verbosity_error()

In [3]:
# Use this to specify a GPU as needed. 
# torch.cuda.set_device(1)
device = 'cuda'

In [4]:
stopwords = set(sw.words('english'))

## Load Dataset

The dataset is included in the supplmentary data package. If you downloaded the entire package from the repo, the data can be found in *../Data/*.

In [5]:
data_os = pd.read_csv('../Data/slang_OpenSub.tsv', sep='\t').fillna('')

All attributes attached to the data entries. See the paper and the README file in *../Data* for more details.

In [6]:
data_os.columns

Index(['SENTENCE', 'FULL_CONTEXT', 'SLANG_TERM', 'ANNOTATOR_CONFIDENCE',
       'MOVIE_ID', 'SENT_ID', 'REGION', 'YEAR', 'DEFINITION_SENTENCE',
       'DEFINITION_SOURCE_URL', 'LITERAL_PARAPHRASE_OF_SLANG'],
      dtype='object')

In [7]:
len(data_os)

7488

We also load annotated negative sentences (i.e. sentences that do not contain slang) for slang detection tasks.

In [8]:
data_os_neg = pd.read_csv('../Data/slang_OpenSub_negatives.tsv', sep='\t').fillna('')

In [9]:
len(data_os_neg)

17512

In [10]:
data_sents = data_os['SENTENCE'].values
data_slang = data_os['SLANG_TERM'].values
data_literal = data_os['LITERAL_PARAPHRASE_OF_SLANG'].values
data_region = data_os['REGION'].values
data_year = data_os['YEAR'].values
data_conf = data_os['ANNOTATOR_CONFIDENCE'].values

In [11]:
data_sents_neg = data_os_neg['SENTENCE'].values
data_region_neg = data_os_neg['REGION'].values
data_year_neg = data_os_neg['YEAR'].values

The following indices record the exact set of sentences used in the paper's evaluations.

In [12]:
slang_llm_inds = np.load('../Data/slang_llm_inds.npz')

## Slang Detection

### Preprocessing

In [13]:
punctuations = '!\'"#$%&()\*\+,-\./:;<=>?@[\\]^_`{|}~'

re_punc = re.compile(r"["+punctuations+r"]+")
re_punc2 = re.compile(r"[,.!;:&<>-]+")
re_punc_space = re.compile(r" ["+punctuations+r"]+ ")
re_space = re.compile(r" +")

def add_space(match_obj):
    if match_obj.group() is not None:
        return ' '+match_obj.group()+' '
    
def add_space_trail(match_obj):
    if match_obj.group() is not None:
        return match_obj.group()+' '
    
def remove_space(match_obj):
    if match_obj.group() is not None:
        return match_obj.group()[1:-1]

sents_ind = []
sents_mask = []
for i in range(len(data_sents)):
    
    if data_conf[i] < 2:
        continue
    
    tokens = re_space.sub(' ', re_punc.sub(add_space, data_sents[i])).split(' ')
    slang_pos = []
            
    for j in range(len(tokens)):
        token = tokens[j]
        if token.lower() == data_slang[i].lower():
            slang_pos.append(j)
            
    if len(slang_pos) == 0:
        tokens = re_space.sub(' ', re_punc.sub('', data_sents[i])).split(' ')
        slang_pos = []
        slang_nop = re_punc.sub('', data_slang[i])

        for j in range(len(tokens)):
            token = tokens[j]
            if token.lower() == slang_nop.lower():
                slang_pos.append(j)
        
    if len(slang_pos) == 1:
        sents_ind.append(i)
        t = tokens.copy()
        for p in slang_pos:
            t[p] = '[MASK]'
        sents_mask.append(re_punc2.sub(add_space_trail, re_punc_space.sub(remove_space, ' '.join(t))).strip())
            
sents_ind = np.asarray(sents_ind)

words_slang = data_slang[sents_ind]
words_literal = data_literal[sents_ind]
regions = data_region[sents_ind]
confs = data_conf[sents_ind]

sents_mask_pos = [sents_mask[i].index('[MASK]') for i in range(len(sents_mask))]
sents_slang = np.asarray([sents_mask[i].replace('[MASK]', words_slang[i]) for i in range(len(sents_mask))])

In [14]:
ind_neg_sample = slang_llm_inds['detect_neg']

sents_neg = []

for i in range(len(ind_neg_sample)):
    tokens = re_space.sub(' ', re_punc.sub(add_space, data_sents_neg[ind_neg_sample[i]])).split(' ')
    sents_neg.append(re_punc2.sub(add_space_trail, re_punc_space.sub(remove_space, ' '.join(tokens))).strip())
    
sents_neg = np.asarray(sents_neg)

In [15]:
N_sample = len(sents_slang)*2

data_perm = slang_llm_inds['detect']
sents_all = np.concatenate([sents_neg, sents_slang])
sents_detect = sents_all[data_perm]

pivot_tr = int(np.floor(N_sample*0.8))
pivot_dev = int(np.floor(N_sample*0.85))

tr_sents = sents_detect[:pivot_tr]
dev_sents = sents_detect[pivot_tr:pivot_dev]
te_sents = sents_detect[pivot_dev:]

train_inds = data_perm[:pivot_tr]
dev_inds = data_perm[pivot_tr:pivot_dev]
test_inds = data_perm[pivot_dev:]

In [16]:
labels_detect = np.asarray([0]*len(sents_slang)+[1]*len(sents_slang))[data_perm]

tr_labels_detect = labels_detect[:pivot_tr]
dev_labels_detect = labels_detect[pivot_tr:pivot_dev]
te_labels_detect = labels_detect[pivot_dev:]

### Experiments

#### Sentence-level Detection

In [17]:
def train_detect_classifier(tr_sents, tr_labels, dev_sents, dev_labels, save_path, model_type='bert', num_labels=2, N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False):

    if model_type.lower() == 'bert':
        tokenizer = BertTokenizer.from_pretrained("bert-large-cased")
        model = BertForSequenceClassification.from_pretrained('bert-large-cased', num_labels=num_labels).to(device)
        n_save = 4
    elif model_type.lower() == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
        model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=num_labels).to(device)
        n_save = 4
    elif model_type.lower() == 'xlnet':
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
        model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased", num_labels=num_labels).to(device)
        n_save = 4
    else:
        print("Invalid model type")
        return

    for name, param in list(model.named_parameters())[:-n_save]:
        param.requires_grad_(False)

    # Training loop

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5)

    best_dev = np.inf
    
    if verbose:
        range_fn = range
        range_fn2 = trange
    else:
        range_fn = trange
        range_fn2 = range

    for e in range_fn(N_EPOCHS):
        
        if verbose:
            print("[Training Epoch - %d]" % (e+1))
            print("")
        time.sleep(0.5)

        tr_shuf = np.random.permutation(tr_sents.shape[0])

        loss_total, loss_total_dev = 0, 0

        for b in range_fn2(0, tr_sents.shape[0], BATCH_SIZE):

            b_end = min(tr_sents.shape[0], b+BATCH_SIZE)
            batch_sample = tr_shuf[b:b_end]

            tr_sents_batch = tr_sents[batch_sample]
            tr_labels_batch = tr_labels[batch_sample]

            optimizer.zero_grad()

            tr_ids_batch = tokenizer(list(tr_sents_batch), return_tensors="pt", padding=True).to(device)
            labels = torch.tensor(tr_labels_batch).to(device)

            loss = model(**tr_ids_batch, labels=labels).loss

            loss.backward()
            optimizer.step()
            loss_total += loss.item() * batch_sample.shape[0]

            del tr_ids_batch, labels, loss
        
        if verbose:
            print("Training Loss: %.3f" % (loss_total / tr_sents.shape[0]))
            time.sleep(0.5)

        for b in range_fn2(0, dev_sents.shape[0], BATCH_SIZE):

            b_end = min(dev_sents.shape[0], b+BATCH_SIZE)
            batch_sample = np.arange(b, b_end)

            dev_sents_batch = dev_sents[batch_sample]
            dev_labels_batch = dev_labels[batch_sample]

            dev_ids_batch = tokenizer(list(dev_sents_batch), return_tensors="pt", padding=True).to(device)
            labels = torch.tensor(dev_labels_batch).to(device)

            with torch.no_grad():
                loss = model(**dev_ids_batch, labels=labels).loss

            loss_total_dev += loss.item() * batch_sample.shape[0]

            del dev_ids_batch, labels, loss
        
        if verbose:
            print("Dev Loss: %.3f" % (loss_total_dev / dev_sents.shape[0]))
        
        scheduler.step(loss_total_dev)

        if loss_total_dev < best_dev:

            best_dev = loss_total_dev
            if verbose:
                print("Best dev loss so far, saving model...")
            d = OrderedDict()
            for name, param in list(model.state_dict().items())[-n_save:]:
                d[name] = param
            torch.save(d, save_path)

        if verbose:
            print("")
        
    return model

def test_detect_classifier(te_sents, save_path, model_type='bert', num_labels=6, BATCH_SIZE = 20):
    
    if model_type.lower() == 'bert':
        tokenizer = BertTokenizer.from_pretrained("bert-large-cased")
        model = BertForSequenceClassification.from_pretrained('bert-large-cased', num_labels=num_labels).to(device)
    elif model_type.lower() == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
        model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=num_labels).to(device)
    elif model_type.lower() == 'xlnet':
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
        model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased", num_labels=num_labels).to(device)
    else:
        print("Invalid model type")
        return
    
    _ = model.load_state_dict(torch.load(save_path), strict=False)
    model.eval()

    preds = []

    for b in trange(0, te_sents.shape[0], BATCH_SIZE):

        b_end = min(te_sents.shape[0], b+BATCH_SIZE)
        batch_sample = np.arange(b, b_end)

        te_sents_batch = te_sents[batch_sample]

        te_ids_batch = tokenizer(list(te_sents_batch), return_tensors="pt", padding=True).to(device)

        with torch.no_grad():
            pred = model(**te_ids_batch).logits.argmax(axis=1)

        preds.extend(pred.cpu().numpy())

        del te_ids_batch, pred

    return np.asarray(preds)

def run_detect_experiment(tr_sents, tr_labels, dev_sents, dev_labels, te_sents, te_labels, save_path, num_labels=6, num_exp=20, model_type='bert', N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False):
    
    res = []
    acc = []
    
    for e in range(num_exp):
        print("[Experiment - %d]" % (e+1))
        print("")
        
        path = save_path+'_'+model_type+'_'+str(e+1)+'.pt'
        
        train_detect_classifier(tr_sents, tr_labels, dev_sents, dev_labels, save_path=path, model_type=model_type, num_labels=num_labels, N_EPOCHS=N_EPOCHS, BATCH_SIZE=BATCH_SIZE, verbose=verbose)
        preds = test_detect_classifier(te_sents, path, model_type, num_labels=num_labels)
        
        res.append(preds)
        acc.append(np.sum(preds==te_labels))
        
    return np.stack(res), np.asarray(acc)

def compute_detect_result(preds, te_labels):
    
    res = {}
    
    res['acc'] = np.mean([compute_acc_detect(pred, te_labels) for pred in preds])
    
    res['prec'], res['recall'], res['f1'] = np.mean(np.asarray([compute_f1_detect(pred, te_labels) for pred in preds]), axis=0)
    
    return res

def compute_acc_detect(preds, te_labels):
    return np.sum(preds==te_labels) / len(preds)

def compute_f1_detect(preds, te_labels):
    
    TP = np.sum(preds[te_labels==1]==1)
    FP = np.sum(preds[te_labels==0]==1)
    FN = np.sum(preds[te_labels==1]==0)
    
    prec = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = TP / (TP + 0.5*(FP + FN))
    
    return prec, recall, f1

Here, we run one experiment with BERT:

In [18]:
save_path = '../Results/detect_sent/'
if not os.path.exists(save_path):
    os.makedirs(save_path)
res_detect_bert, acc = run_detect_experiment(tr_sents, tr_labels_detect, dev_sents, dev_labels_detect, te_sents, te_labels_detect, save_path=save_path+'detect_sent', num_exp=1, model_type='bert', num_labels=2, N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False)

[Experiment - 1]



100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [01:45<00:00, 10.60s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 15.89it/s]


In [19]:
compute_detect_result(res_detect_bert, te_labels_detect)

{'acc': 0.7822299651567944,
 'prec': 0.7235294117647059,
 'recall': 0.8880866425992779,
 'f1': 0.7974068071312804}

#### Word-level Detection

In [20]:
def align_labels(input_tokens, sample_inds, tokenizer, model_type='bert', test_mode=False):
    
    input_ids = input_tokens.input_ids
    
    labels = np.zeros(input_ids.shape, dtype=np.int64)

    for i in range(len(sample_inds)):

        ind = sample_inds[i]
        token_len = len(tokenizer.encode(sents_all[ind]))

        if model_type == 'bert':
            labels[i][0] = -100
            labels[i][token_len-1:] = -100
        elif model_type == 'roberta':
            labels[i][0] = -100
            labels[i][token_len-1:] = -100
        elif model_type == 'xlnet':
            labels[i][-2:] = -100
            labels[i][:input_ids.shape[1]-token_len] = -100

        if ind >= N_sample // 2:

            ind = ind - N_sample // 2
            
            if model_type == 'bert':
                id_word = tokenizer.encode(words_slang[ind])[1:-1]
                m_start = len(tokenizer.encode(sents_slang[ind][:sents_mask_pos[ind]]))-1
            elif model_type == 'roberta':
                if sents_mask_pos[i] == 0:
                    id_word = tokenizer.encode(words_slang[ind])[1:-1]
                else:
                    id_word = tokenizer.encode(' '+words_slang[ind])[1:-1]
                m_start = len(tokenizer.encode(sents_slang[ind][:sents_mask_pos[ind]].strip()))-1
            elif model_type == 'xlnet':
                id_word = tokenizer.encode(words_slang[ind])[:-2]
                m_start = input_ids.shape[1] - token_len
                m_start += len(tokenizer.encode(sents_slang[ind][:sents_mask_pos[ind]]))-2
            
            for j in range(len(id_word)):
                if j == 0:
                    labels[i][m_start+j] = 1
                else:
                    labels[i][m_start+j] = 2
        
        # During testing, we don't consider subsequent subword tokens in any word so that the total number of tokens evaluated is consistent across models.
        
        if test_mode:
            
            word_ids = input_tokens.word_ids(i)
            seen_ids = set()
            
            assert len(word_ids) == len(labels[i])
            
            for j in range(len(word_ids)):
                if word_ids[j] is not None:
                    if word_ids[j] not in seen_ids:
                        seen_ids.add(word_ids[j])
                    else:
                        labels[i][j] = -1

    return torch.tensor(labels)

def train_ident_classifier(tr_sents, dev_sents, save_path, model_type='bert', num_labels=3, N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False):

    if model_type.lower() == 'bert':
        tokenizer = BertTokenizerFast.from_pretrained("bert-large-cased")
        model = BertForTokenClassification.from_pretrained('bert-large-cased', num_labels=num_labels).to(device)
        n_save = 2
    elif model_type.lower() == 'roberta':
        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large")
        model = RobertaForTokenClassification.from_pretrained("roberta-large", num_labels=num_labels).to(device)
        n_save = 2
    elif model_type.lower() == 'xlnet':
        tokenizer = XLNetTokenizerFast.from_pretrained("xlnet-large-cased")
        model = XLNetForTokenClassification.from_pretrained("xlnet-large-cased", num_labels=num_labels).to(device)
        n_save = 2
    else:
        print("Invalid model type")
        return

    for name, param in list(model.named_parameters())[:-n_save]:
        param.requires_grad_(False)
        
    # Training loop

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5)

    best_dev = np.inf

    if verbose:
        range_fn = range
        range_fn2 = trange
    else:
        range_fn = trange
        range_fn2 = range

    for e in range_fn(N_EPOCHS):

        if verbose:
            print("[Training Epoch - %d]" % (e+1))
            print("")
        time.sleep(0.5)

        tr_shuf = np.random.permutation(tr_sents.shape[0])

        loss_total, loss_total_dev = 0, 0

        for b in range_fn2(0, tr_sents.shape[0], BATCH_SIZE):

            b_end = min(tr_sents.shape[0], b+BATCH_SIZE)
            batch_sample = tr_shuf[b:b_end]

            tr_sents_batch = tr_sents[batch_sample]

            optimizer.zero_grad()

            tr_ids_batch = tokenizer(list(tr_sents_batch), return_tensors="pt", padding=True).to(device)
            labels = align_labels(tr_ids_batch, train_inds[batch_sample], tokenizer, model_type).to(device)

            loss = model(**tr_ids_batch, labels=labels).loss

            loss.backward()
            optimizer.step()
            loss_total += loss.item() * batch_sample.shape[0]

            del tr_ids_batch, labels, loss

        if verbose:
            print("Training Loss: %.3f" % (loss_total / tr_sents.shape[0]))
            time.sleep(0.5)

        for b in range_fn2(0, dev_sents.shape[0], BATCH_SIZE):

            b_end = min(dev_sents.shape[0], b+BATCH_SIZE)
            batch_sample = np.arange(b, b_end)

            dev_sents_batch = dev_sents[batch_sample]

            dev_ids_batch = tokenizer(list(dev_sents_batch), return_tensors="pt", padding=True).to(device)
            labels = align_labels(dev_ids_batch, dev_inds[batch_sample], tokenizer, model_type).to(device)

            with torch.no_grad():
                loss = model(**dev_ids_batch, labels=labels).loss

            loss_total_dev += loss.item() * batch_sample.shape[0]

            del dev_ids_batch, labels, loss

        if verbose:
            print("Dev Loss: %.3f" % (loss_total_dev / dev_sents.shape[0]))

        scheduler.step(loss_total_dev)

        if loss_total_dev < best_dev:

            best_dev = loss_total_dev
            if verbose:
                print("Best dev loss so far, saving model...")
            d = OrderedDict()
            for name, param in list(model.state_dict().items())[-n_save:]:
                d[name] = param
            torch.save(d, save_path)

        if verbose:
            print("")

    return model

def test_ident_classifier(te_sents, save_path, model_type='bert', num_labels=3, BATCH_SIZE = 20):
    
    if model_type.lower() == 'bert':
        tokenizer = BertTokenizerFast.from_pretrained("bert-large-cased")
        model = BertForTokenClassification.from_pretrained('bert-large-cased', num_labels=num_labels).to(device)
    elif model_type.lower() == 'roberta':
        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large")
        model = RobertaForTokenClassification.from_pretrained("roberta-large", num_labels=num_labels).to(device)
    elif model_type.lower() == 'xlnet':
        tokenizer = XLNetTokenizerFast.from_pretrained("xlnet-large-cased")
        model = XLNetForTokenClassification.from_pretrained("xlnet-large-cased", num_labels=num_labels).to(device)
    else:
        print("Invalid model type")
        return
    
    _ = model.load_state_dict(torch.load(save_path), strict=False)
    model.eval()

    preds = []
    gt_labels = []

    for b in trange(0, te_sents.shape[0], BATCH_SIZE):

        b_end = min(te_sents.shape[0], b+BATCH_SIZE)
        batch_sample = np.arange(b, b_end)

        te_sents_batch = te_sents[batch_sample]

        te_ids_batch = tokenizer(list(te_sents_batch), return_tensors="pt", padding=True).to(device)
        labels = align_labels(te_ids_batch, test_inds[batch_sample], tokenizer, model_type, test_mode=True)

        with torch.no_grad():
            pred = model(**te_ids_batch).logits.argmax(-1)

        preds.extend(pred.cpu().numpy())
        gt_labels.extend(labels.numpy())

        del te_ids_batch, pred

    return np.asarray(preds, dtype=object), np.asarray(gt_labels, dtype=object)

def run_ident_experiment(tr_sents, dev_sents, te_sents, save_path, num_labels=3, num_exp=20, model_type='bert', N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False):
    
    res = []
    
    for e in range(num_exp):
        print("[Experiment - %d]" % (e+1))
        print("")
        
        path = save_path+'_'+model_type+'_'+str(e+1)+'.pt'
        
        train_ident_classifier(tr_sents, dev_sents, save_path=path, model_type=model_type, num_labels=num_labels, N_EPOCHS=N_EPOCHS, BATCH_SIZE=BATCH_SIZE, verbose=verbose)
        preds, labels = test_ident_classifier(te_sents, path, model_type, num_labels=num_labels)
        
        res.append(preds)
        
    return res, labels

def compute_ident_result(preds, labels):
    
    preds_flat = [[] for i in range(len(preds))]
    labels_flat = []
    
    for i in range(len(labels)):
        label_mask = labels[i] != -100
        labels_flat.extend(labels[i][label_mask])
        for j in range(len(preds)):
            preds_flat[j].extend(preds[j][i][label_mask])
        
    preds_flat = [np.asarray(a) for a in preds_flat]
    labels_flat = np.asarray(labels_flat)
    
    res = {}
    
    res['acc'] = np.mean([compute_acc_ident(pred, labels_flat) for pred in preds_flat])
    
    res['prec'], res['recall'], res['f1'] = np.mean(np.asarray([compute_f1_ident(pred, labels_flat) for pred in preds_flat]), axis=0)
 
    
    return res

def compute_acc_ident(preds, labels):
    return np.sum(preds==labels) / len(preds)

def compute_f1_ident(preds, labels):

    T_mask = np.any([labels==1, labels==2], axis=0)

    TP = np.sum(preds[T_mask]==labels[T_mask])
    FP = np.sum(np.any([preds[labels==0]==1, preds[labels==0]==2], axis=0))
    FN = np.sum(preds[T_mask]!=labels[T_mask])
    
    prec = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = TP / (TP + 0.5*(FP + FN))
    
    return prec, recall, f1

In [21]:
save_path = '../Results/detect_word/'
if not os.path.exists(save_path):
    os.makedirs(save_path)
res_ident_bert, te_labels_bert = run_ident_experiment(tr_sents, dev_sents, te_sents, save_path=save_path+'detect_word', num_labels=3, num_exp=1, model_type='bert', N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False)

[Experiment - 1]



100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [01:44<00:00, 10.41s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 16.65it/s]


In [22]:
compute_ident_result(res_ident_bert, te_labels_bert)

{'acc': 0.9235115167318557,
 'prec': 0.7863247863247863,
 'recall': 0.5974025974025974,
 'f1': 0.6789667896678967}

## Slang Source Inference

### Preprocessing

In [23]:
np.random.seed(65488209)

data_mask = data_literal != ''

sents_all = data_sents
slang_all = data_slang
regions_all = data_region
confs_all = data_conf

punctuations = '!\'"#$%&()\*\+,-\./:;<=>?@[\\]^_`{|}~'

re_punc = re.compile(r"["+punctuations+r"]+")
re_punc2 = re.compile(r"[,.!;:&<>-]+")
re_punc_space = re.compile(r" ["+punctuations+r"]+ ")
re_space = re.compile(r" +")
re_allpuncspace = re.compile(r"^[\s"+punctuations+"]+$")

def add_space(match_obj):
    if match_obj.group() is not None:
        return ' '+match_obj.group()+' '
    
def add_space_trail(match_obj):
    if match_obj.group() is not None:
        return match_obj.group()+' '
    
def remove_space(match_obj):
    if match_obj.group() is not None:
        return match_obj.group()[1:-1]
    
context_sents = [[] for i in range(2)]
context_inds = [[] for i in range(2)]
masked_slang_sents = [[] for i in range(2)]
masked_random_sents = [[] for i in range(2)]
gt_words = [[] for i in range(2)]

for i in range(len(sents_all)):
    
    if confs_all[i] < 2:
        continue
    
    if regions_all[i] == 'US':
        region_tag = 0
    elif regions_all[i] == 'UK':
        region_tag = 1
                
    tokens = re_space.sub(' ', re_punc.sub(add_space, sents_all[i])).split(' ')
    slang_pos = []
    content_pos = []
            
    for j in range(len(tokens)):
        token = tokens[j]
        if token.lower() == slang_all[i].lower():
            slang_pos.append(j)
        elif token.lower() not in stopwords and re_allpuncspace.search(token) is None and len(token) > 0:
            content_pos.append(j)
            
    if len(slang_pos) == 0:
        tokens = re_space.sub(' ', re_punc.sub('', sents_all[i])).split(' ')
        slang_pos = []
        content_pos = []
        slang_nop = re_punc.sub('', slang_all[i])

        for j in range(len(tokens)):
            token = tokens[j]
            if token.lower() == slang_nop.lower():
                slang_pos.append(j)
            elif token.lower() not in stopwords and re_allpuncspace.search(token) is None and len(token) > 0:
                content_pos.append(j)
        
    if len(slang_pos) == 1:
        t = tokens.copy()
        for p in slang_pos:
            t[p] = '[MASK]'
        masked_slang_sents[region_tag].append(re_punc2.sub(add_space_trail, re_punc_space.sub(remove_space, ' '.join(t))).strip())
        
        t = tokens.copy()
        if len(content_pos) > 0:
            for p in np.random.choice(len(content_pos), 1, replace=False):
                t[content_pos[p]] = '[MASK]'
        elif len(tokens) - len(slang_pos) > 0:
            p = np.random.choice(len(tokens), 1, replace=False)[0]
            while p in slang_pos:
                p = np.random.choice(len(tokens), 1, replace=False)[0]
            t[p] = '[MASK]'
        else:
            print(t)
        masked_random_sents[region_tag].append(re_punc2.sub(add_space_trail, re_punc_space.sub(remove_space, ' '.join(t))).strip())
            
        context_sents[region_tag].append(re_punc2.sub(add_space_trail, re_punc_space.sub(remove_space, ' '.join(tokens))).strip())
        context_inds[region_tag].append(i)
        gt_words[region_tag].append(slang_all[i])

In [24]:
N_sample = min([len(s) for s in context_sents])
ind_region = slang_llm_inds['region']

pivot_tr = int(np.floor(N_sample*0.8))
pivot_dev = int(np.floor(N_sample*0.85))

tr_sents, dev_sents, te_sents = [], [], []
tr_labels, dev_labels, te_labels = [], [], []

for i in range(2):
    tr_sents.extend(np.asarray(context_sents[i])[ind_region[i]][:pivot_tr])
    dev_sents.extend(np.asarray(context_sents[i])[ind_region[i]][pivot_tr:pivot_dev])
    te_sents.extend(np.asarray(context_sents[i])[ind_region[i]][pivot_dev:])
    
    tr_labels.extend([i]*pivot_tr)
    dev_labels.extend([i]*(pivot_dev-pivot_tr))
    te_labels.extend([i]*(N_sample-pivot_dev))

tr_sents = np.asarray(tr_sents)
dev_sents = np.asarray(dev_sents)
te_sents = np.asarray(te_sents)

tr_labels = np.asarray(tr_labels)
dev_labels = np.asarray(dev_labels)
te_labels = np.asarray(te_labels)

# Sentences with masked out slang

tr_sents_mslg, dev_sents_mslg, te_sents_mslg = [], [], []

for i in range(2):
    tr_sents_mslg.extend(np.asarray(masked_slang_sents[i])[ind_region[i]][:pivot_tr])
    dev_sents_mslg.extend(np.asarray(masked_slang_sents[i])[ind_region[i]][pivot_tr:pivot_dev])
    te_sents_mslg.extend(np.asarray(masked_slang_sents[i])[ind_region[i]][pivot_dev:])
    
tr_sents_mslg = np.asarray(tr_sents_mslg)
dev_sents_mslg = np.asarray(dev_sents_mslg)
te_sents_mslg = np.asarray(te_sents_mslg)

# Sentences with a random content word (other than the slang) masked out

tr_sents_mrand, dev_sents_mrand, te_sents_mrand = [], [], []

for i in range(2):
    tr_sents_mrand.extend(np.asarray(masked_random_sents[i])[ind_region[i]][:pivot_tr])
    dev_sents_mrand.extend(np.asarray(masked_random_sents[i])[ind_region[i]][pivot_tr:pivot_dev])
    te_sents_mrand.extend(np.asarray(masked_random_sents[i])[ind_region[i]][pivot_dev:])
    
tr_sents_mrand = np.asarray(tr_sents_mrand)
dev_sents_mrand = np.asarray(dev_sents_mrand)
te_sents_mrand = np.asarray(te_sents_mrand)

# For RoBERTa and XLNet

tr_sents_mslg_alt = np.asarray([s.replace('[MASK]', '<mask>') for s in tr_sents_mslg])
dev_sents_mslg_alt = np.asarray([s.replace('[MASK]', '<mask>') for s in dev_sents_mslg])
te_sents_mslg_alt = np.asarray([s.replace('[MASK]', '<mask>') for s in te_sents_mslg])

tr_sents_mrand_alt = np.asarray([s.replace('[MASK]', '<mask>') for s in tr_sents_mrand])
dev_sents_mrand_alt = np.asarray([s.replace('[MASK]', '<mask>') for s in dev_sents_mrand])
te_sents_mrand_alt = np.asarray([s.replace('[MASK]', '<mask>') for s in te_sents_mrand])

### Experiments

In [25]:
def train_demo_classifier(tr_sents, tr_labels, dev_sents, dev_labels, save_path, model_type='bert', num_labels=6, N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False):

    if model_type.lower() == 'bert':
        tokenizer = BertTokenizer.from_pretrained("bert-large-cased")
        model = BertForSequenceClassification.from_pretrained('bert-large-cased', num_labels=num_labels).to(device)
        n_save = 4
    elif model_type.lower() == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
        model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=num_labels).to(device)
        n_save = 4
    elif model_type.lower() == 'xlnet':
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
        model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased", num_labels=num_labels).to(device)
        n_save = 4
    else:
        print("Invalid model type")
        return

    for name, param in list(model.named_parameters())[:-n_save]:
        param.requires_grad_(False)

    # Training loop

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5)

    best_dev = np.inf
    
    if verbose:
        range_fn = range
        range_fn2 = trange
    else:
        range_fn = trange
        range_fn2 = range

    for e in range_fn(N_EPOCHS):
        
        if verbose:
            print("[Training Epoch - %d]" % (e+1))
            print("")
        time.sleep(0.5)

        tr_shuf = np.random.permutation(tr_sents.shape[0])

        loss_total, loss_total_dev = 0, 0

        for b in range_fn2(0, tr_sents.shape[0], BATCH_SIZE):

            b_end = min(tr_sents.shape[0], b+BATCH_SIZE)
            batch_sample = tr_shuf[b:b_end]

            tr_sents_batch = tr_sents[batch_sample]
            tr_labels_batch = tr_labels[batch_sample]

            optimizer.zero_grad()

            tr_ids_batch = tokenizer(list(tr_sents_batch), return_tensors="pt", padding=True).to(device)
            labels = torch.tensor(tr_labels_batch).to(device)

            loss = model(**tr_ids_batch, labels=labels).loss

            loss.backward()
            optimizer.step()
            loss_total += loss.item() * batch_sample.shape[0]

            del tr_ids_batch, labels, loss
        
        if verbose:
            print("Training Loss: %.3f" % (loss_total / tr_sents.shape[0]))
            time.sleep(0.5)

        for b in range_fn2(0, dev_sents.shape[0], BATCH_SIZE):

            b_end = min(dev_sents.shape[0], b+BATCH_SIZE)
            batch_sample = np.arange(b, b_end)

            dev_sents_batch = dev_sents[batch_sample]
            dev_labels_batch = dev_labels[batch_sample]

            dev_ids_batch = tokenizer(list(dev_sents_batch), return_tensors="pt", padding=True).to(device)
            labels = torch.tensor(dev_labels_batch).to(device)

            with torch.no_grad():
                loss = model(**dev_ids_batch, labels=labels).loss

            loss_total_dev += loss.item() * batch_sample.shape[0]

            del dev_ids_batch, labels, loss
        
        if verbose:
            print("Dev Loss: %.3f" % (loss_total_dev / dev_sents.shape[0]))
        
        scheduler.step(loss_total_dev)

        if loss_total_dev < best_dev:

            best_dev = loss_total_dev
            if verbose:
                print("Best dev loss so far, saving model...")
            d = OrderedDict()
            for name, param in list(model.state_dict().items())[-n_save:]:
                d[name] = param
            torch.save(d, save_path)

        if verbose:
            print("")
        
    return model

def test_demo_classifier(te_sents, save_path, model_type='bert', num_labels=6, BATCH_SIZE = 20):
    
    if model_type.lower() == 'bert':
        tokenizer = BertTokenizer.from_pretrained("bert-large-cased")
        model = BertForSequenceClassification.from_pretrained('bert-large-cased', num_labels=num_labels).to(device)
    elif model_type.lower() == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
        model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=num_labels).to(device)
    elif model_type.lower() == 'xlnet':
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
        model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased", num_labels=num_labels).to(device)
    else:
        print("Invalid model type")
        return
    
    _ = model.load_state_dict(torch.load(save_path), strict=False)
    model.eval()

    preds = []

    for b in trange(0, te_sents.shape[0], BATCH_SIZE):

        b_end = min(te_sents.shape[0], b+BATCH_SIZE)
        batch_sample = np.arange(b, b_end)

        te_sents_batch = te_sents[batch_sample]

        te_ids_batch = tokenizer(list(te_sents_batch), return_tensors="pt", padding=True).to(device)

        with torch.no_grad():
            pred = model(**te_ids_batch).logits.argmax(axis=1)

        preds.extend(pred.cpu().numpy())

        del te_ids_batch, pred

    return np.asarray(preds)

def run_demo_experiment(tr_sents, tr_labels, dev_sents, dev_labels, te_sents, te_labels, save_path, num_labels=6, num_exp=20, model_type='bert', N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False):
    
    res = []
    acc = []
    
    for e in range(num_exp):
        print("[Experiment - %d]" % (e+1))
        print("")
        
        path = save_path+'_'+model_type+'_'+str(e+1)+'.pt'
        
        train_demo_classifier(tr_sents, tr_labels, dev_sents, dev_labels, save_path=path, model_type=model_type, num_labels=num_labels, N_EPOCHS=N_EPOCHS, BATCH_SIZE=BATCH_SIZE, verbose=verbose)
        preds = test_demo_classifier(te_sents, path, model_type, num_labels=num_labels)
        
        res.append(preds)
        acc.append(np.sum(preds==te_labels))
        
    return np.stack(res), np.asarray(acc)

In [26]:
save_path = '../Results/region/'
if not os.path.exists(save_path):
    os.makedirs(save_path)

Performance with full sentences:

In [27]:
res_region_bert, acc = run_demo_experiment(tr_sents, tr_labels, dev_sents, dev_labels, te_sents, te_labels, save_path=save_path+'base', num_exp=1, model_type='bert', num_labels=2, N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False)

[Experiment - 1]



100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:36<00:00,  3.64s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 15.34it/s]


In [28]:
np.mean(acc/len(te_labels))

0.6170212765957447

Performance with the slang being masked out:

In [29]:
res_regionmslg_bert, acc = run_demo_experiment(tr_sents_mslg, tr_labels, dev_sents_mslg, dev_labels, te_sents_mslg, te_labels, save_path=save_path+'mslg', num_exp=1, model_type='bert', num_labels=2, N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False)

[Experiment - 1]



100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:37<00:00,  3.72s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 14.80it/s]


In [30]:
np.mean(acc/len(te_labels))

0.5797872340425532

Performance with a random content word (non-slang) masked out:

In [31]:
res_regionmrand_bert, acc = run_demo_experiment(tr_sents_mrand, tr_labels, dev_sents_mrand, dev_labels, te_sents_mrand, te_labels, save_path=save_path+'mrand', num_exp=1, model_type='bert', num_labels=2, N_EPOCHS = 10, BATCH_SIZE = 20, verbose=False)

[Experiment - 1]



100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:36<00:00,  3.63s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 15.70it/s]


In [32]:
np.mean(acc/len(te_labels))

0.6117021276595744

## Model Interpretation

### Preprocessing

In [33]:
data_mask = data_literal != ''

sents_all = data_sents[data_mask]
slang_all = data_slang[data_mask]

punctuations = '!\'"#$%&()\*\+,-\./:;<=>?@[\\]^_`{|}~'

re_punc = re.compile(r"["+punctuations+r"]+")
re_punc2 = re.compile(r"[,.!;:&<>-]+")
re_punc_space = re.compile(r" ["+punctuations+r"]+ ")
re_space = re.compile(r" +")

def add_space(match_obj):
    if match_obj.group() is not None:
        return ' '+match_obj.group()+' '
    
def add_space_trail(match_obj):
    if match_obj.group() is not None:
        return match_obj.group()+' '
    
def remove_space(match_obj):
    if match_obj.group() is not None:
        return match_obj.group()[1:-1]

sents_mask = []
for i in range(len(sents_all)):
    tokens = re_space.sub(' ', re_punc.sub(add_space, sents_all[i])).split(' ')
    slang_pos = []
            
    for j in range(len(tokens)):
        token = tokens[j]
        if token.lower() == slang_all[i].lower():
            slang_pos.append(j)
            
    if len(slang_pos) == 0:
        tokens = re_space.sub(' ', re_punc.sub('', sents_all[i])).split(' ')
        slang_pos = []
        slang_nop = re_punc.sub('', slang_all[i])

        for j in range(len(tokens)):
            token = tokens[j]
            if token.lower() == slang_nop.lower():
                slang_pos.append(j)
        
    if len(slang_pos) == 1:
        t = tokens.copy()
        for p in slang_pos:
            t[p] = '[MASK]'
        sents_mask.append(re_punc2.sub(add_space_trail, re_punc_space.sub(remove_space, ' '.join(t))).strip())
    else:
        sents_mask.append('')
            
sents_mask = np.asarray(sents_mask)

pphr_mask = sents_mask != ''
sents_mask = sents_mask[pphr_mask]
sents_all = sents_all[pphr_mask]
words_slang = slang_all[pphr_mask]
words_literal = data_literal[data_mask][pphr_mask]
regions = data_region[data_mask][pphr_mask]
confs = data_conf[data_mask][pphr_mask]

sents_mask_pos = np.asarray([sents_mask[i].index('[MASK]') for i in range(len(sents_mask))])
sents_mask = np.asarray([sents_mask[i][:sents_mask_pos[i]+6] for i in range(len(sents_mask))])
sents_literal = np.asarray([sents_mask[i].replace('[MASK]', words_literal[i])[:sents_mask_pos[i]+len(words_literal[i])] for i in range(len(sents_mask))])
sents_slang = np.asarray([sents_mask[i].replace('[MASK]', words_slang[i])[:sents_mask_pos[i]+len(words_slang[i])] for i in range(len(sents_mask))])

# Alternative masking token used by RoBERTa and XLNet

sents_mask_alt = []
for i in range(len(sents_mask)):
    sents_mask_alt.append(sents_mask[i].replace('[MASK]', '<mask>'))
sents_mask_alt = np.asarray(sents_mask_alt)

In [34]:
ind_mlm = slang_llm_inds['usage']

### Experiments

In [35]:
def preproc_sents_bert(tokenizer, sents, gt_words, sents_mask_pos):
    
    id_words = [tokenizer.encode(word)[1:-1] for word in gt_words]

    sents_repeat = []
    for i in range(len(sents)):
        sents_repeat.extend([sents[i] for j in range(len(id_words[i]))])

    inputs = tokenizer(list(sents_repeat), return_tensors="pt", padding=True)
    input_mask = []

    c = 0
    for i in range(len(sents)):
        m_start = len(tokenizer.encode(sents[i][:sents_mask_pos[i]]))-1
        for j in range(len(id_words[i])):
            inputs.input_ids[c][m_start+j] = tokenizer.mask_token_id
            input_mask.append(i)
            c += 1

    inputs = inputs.to(device)
    input_mask = np.asarray(input_mask)

    id_gt_tokens = []
    for i in range(len(sents)):
        id_gt_tokens.extend(id_words[i])
    id_gt_tokens = np.asarray(id_gt_tokens)
    
    return inputs, input_mask, id_gt_tokens

def preproc_sents_roberta(tokenizer, sents, gt_words, sents_mask_pos):
    
    id_words = []
    for i in range(len(gt_words)):
        if sents_mask_pos[i] == 0:
            id_words.append(tokenizer.encode(gt_words[i])[1:-1])
        else:
            id_words.append(tokenizer.encode(' '+gt_words[i])[1:-1])

    sents_repeat = []
    for i in range(len(sents)):
        sents_repeat.extend([sents[i] for j in range(len(id_words[i]))])

    inputs = tokenizer(list(sents_repeat), return_tensors="pt", padding=True)
    input_mask = []

    c = 0
    for i in range(len(sents)):
        m_start = len(tokenizer.encode(sents[i][:sents_mask_pos[i]].strip()))-1
        for j in range(len(id_words[i])):
            inputs.input_ids[c][m_start+j] = tokenizer.mask_token_id
            input_mask.append(i)
            c += 1

    inputs = inputs.to(device)
    input_mask = np.asarray(input_mask)

    id_gt_tokens = []
    for i in range(len(sents)):
        id_gt_tokens.extend(id_words[i])
    id_gt_tokens = np.asarray(id_gt_tokens)
    
    return inputs, input_mask, id_gt_tokens

def preproc_sents_xlnet(tokenizer, sents, gt_words, sents_mask_pos):
    
    id_words = [tokenizer.encode(word)[:-2] for word in gt_words]

    sents_repeat = []
    for i in range(len(sents)):
        sents_repeat.extend([sents[i] for j in range(len(id_words[i]))])

    inputs = tokenizer(list(sents_repeat), return_tensors="pt", padding=True)
    input_mask = []

    c = 0
    for i in range(len(sents)):
        m_start = 0
        while inputs.input_ids[c][m_start] == 5:
            m_start += 1
        m_start += len(tokenizer.encode(sents[i][:sents_mask_pos[i]]))-2
        for j in range(len(id_words[i])):
            inputs.input_ids[c][m_start+j] = tokenizer.mask_token_id
            input_mask.append(i)
            c += 1

    inputs = inputs.to(device)
    input_mask = np.asarray(input_mask)

    id_gt_tokens = []
    for i in range(len(sents)):
        id_gt_tokens.extend(id_words[i])
    id_gt_tokens = np.asarray(id_gt_tokens)
    
    return inputs, input_mask, id_gt_tokens


def predict_logits_bert(model, inputs):

    with torch.no_grad():
        logits = model(**inputs).logits

    mask_token_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    softmax = torch.nn.functional.softmax(logits[torch.arange(logits.shape[0]), mask_token_index], dim=1)
    
    return softmax

def predict_mlm_bert(logits, id_gt_tokens):
    
    return logits[torch.arange(logits.shape[0]), id_gt_tokens]

def predict_logits_roberta(model, inputs):
    
    return predict_logits_bert(model, inputs)

def predict_mlm_roberta(logits, id_gt_tokens):
    
    return predict_mlm_bert(logits, id_gt_tokens)

def predict_logits_xlnet(model, inputs):

    perm_mask = torch.zeros((inputs.input_ids.shape[0], inputs.input_ids.shape[1], inputs.input_ids.shape[1]), dtype=torch.float).to(device)
    for i in range(inputs.input_ids.shape[0]):
        i_pad = 0
        while inputs.input_ids[i][i_pad]==5:
            i_pad += 1
        perm_mask[i, :, list(inputs.input_ids[i].flatten().cpu().numpy()).index(6)] = 1
        perm_mask[i, :i_pad, :] = 1
        perm_mask[i, :, :i_pad] = 1

    target_mapping = torch.zeros((inputs.input_ids.shape[0], 1, inputs.input_ids.shape[1]), dtype=torch.float).to(device)
    for i in range(inputs.input_ids.shape[0]):
        target_mapping[i, 0, list(inputs.input_ids[i].flatten().cpu().numpy()).index(6)] = 1

    with torch.no_grad():
        logits = model(**inputs, perm_mask=perm_mask, target_mapping=target_mapping).logits

    softmax = torch.nn.functional.softmax(logits, dim=2)
    
    return softmax

def predict_mlm_xlnet(logits, id_gt_tokens):
    
    return logits[torch.arange(logits.shape[0]), 0, id_gt_tokens]

def predict_mlm_batch(model, tokenizer, sents, id_gt_tokens, model_type='bert', BATCH_SIZE=100):
    
    if model_type.lower() == 'bert':
        logit_fn = predict_logits_bert
        mlm_fn = predict_mlm_bert
    elif model_type.lower() == 'roberta':
        logit_fn = predict_logits_roberta
        mlm_fn = predict_mlm_roberta
    elif model_type.lower() == 'xlnet':
        logit_fn = predict_logits_xlnet
        mlm_fn = predict_mlm_xlnet
    else:
        print("Invalid model type")
        return

    N_ex = len(sents)
    results = torch.empty(N_ex, dtype=torch.float64).to(device)

    for b in trange(0, N_ex, BATCH_SIZE):

        b_end = min(N_ex, b+BATCH_SIZE)

        logits = logit_fn(model, tokenizer, sents[b:b_end])
        results[b:b_end] = mlm_fn(logits, tokenizer, id_gt_tokens[b:b_end])
        
    return results
            

In [36]:
tokenizer = BertTokenizer.from_pretrained("bert-large-cased")
model = BertForMaskedLM.from_pretrained("bert-large-cased").to(device)

inputs_literal, input_mask_literal, id_gt_tokens_literal = preproc_sents_bert(tokenizer, sents_literal, words_literal, sents_mask_pos)
inputs_slang, input_mask_slang, id_gt_tokens_slang = preproc_sents_bert(tokenizer, sents_slang, words_slang, sents_mask_pos)

logits_literal = predict_logits_bert(model, inputs_literal)
prob_literal = predict_mlm_bert(logits_literal, id_gt_tokens_literal).cpu().numpy()
prob_literal = np.asarray([np.mean(prob_literal[input_mask_literal==i]) for i in range(len(sents_mask_pos))])

logits_slang = predict_logits_bert(model, inputs_slang)
prob_slang = predict_mlm_bert(logits_slang, id_gt_tokens_slang).cpu().numpy()
prob_slang = np.asarray([np.mean(prob_slang[input_mask_slang==i]) for i in range(len(sents_mask_pos))])

Mean ratio:

In [37]:
np.mean(prob_slang[ind_mlm]) / np.mean(prob_literal[ind_mlm])

0.2493036

Median ratio:

In [38]:
np.median(prob_slang[ind_mlm] / prob_literal[ind_mlm])

0.75862646