In [1]:
%reload_ext autoreload
%autoreload 2

In [354]:
from utils import *
from datasets import load_dataset, load_metric, Dataset
from transformers import AutoTokenizer
import tensorflow as tf
from nltk import word_tokenize

# tf.compat.v1.flags.DEFINE_string('f','','')

# Load Data

In [136]:
def tokenize(batch):
    '''
    Run the notebook-level `tokenizer` over the 'text' field of `batch`
    with `padding=True` and `truncation=True`, and return its result.
    '''
    encoded = tokenizer(batch['text'], truncation=True, padding=True)
    return encoded

def get_ttr_by_ids(input_ids, special_ids=(0, 101, 102)):
    '''
    Type Token Ratio (TTR) over token ids, as a percentage.
    higher -> more diversity

    Parameters
    ----------
    input_ids : array-like of int
        Token ids of any shape; np.unique flattens them before counting.
    special_ids : iterable of int, optional
        Ids left out of both the type count and the token count. The default
        (0, 101, 102) is what this function used to hard-code
        (presumably [PAD], [CLS], [SEP] of bert-base-uncased -- TODO confirm).

    Returns
    -------
    float
        100 * (#distinct ids) / (#ids) after dropping `special_ids`;
        0.0 when no ids remain, instead of dividing by zero.
    '''
    token_ids, counts = np.unique(input_ids, return_counts=True)
    # `assume_unique` is not passed: a caller-supplied `special_ids` may repeat values.
    keep = np.isin(token_ids, list(special_ids), invert=True)
    token_ids, counts = token_ids[keep], counts[keep]
    total = counts.sum()
    if total == 0:
        # Empty input, or nothing but special ids.
        return 0.0
    return len(token_ids) / total * 100

def get_ttr_by_text(text):
    '''
    Type Token Ratio (TTR) over word tokens, as a percentage.
    higher -> more diversity

    Parameters
    ----------
    text : str or iterable of str
        Documents to split with nltk's word_tokenize. A single string is
        treated as one document (it used to be iterated character by character).

    Returns
    -------
    float
        100 * (#distinct tokens) / (#tokens) over all documents;
        0.0 when there are no tokens, instead of dividing by zero.
    '''
    if isinstance(text, str):
        text = [text]
    tokens = [tok for doc in text for tok in word_tokenize(doc)]
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens) * 100

In [355]:
# Tokenizer used by `tokenize`, plus the two metrics used for the naturalness scores.

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

bertscore_metric, bleurt_metric = (
    load_metric(metric_name) for metric_name in ("bertscore", "bleurt")
)

Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: datasets.load_metric('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint C:\Users\Fabrice\.cache\huggingface\metrics\bleurt\default\downloads\extracted\0989b2f25cefa5363b32fb8de03c83fe189c82f971eabcd6248d372510de0c71\bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Performs basic checks...
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Loading model...
INFO:tensorflow:BLEURT initialized.


In [None]:
datasets = [('glue', 'sst2'), 'ag_news']

# For each dataset: load the ORIG train split and its INV / SIB variants from
# ./assets, then print BERTScore, BLEURT and type-token-ratio figures computed
# on the first n examples of each.
for d in datasets:

    # Keep the entry as listed for the progress message; for glue, `d` is
    # narrowed to the sub-task name, which is also the assets directory name.
    dataset_spec = d

    if isinstance(d, tuple) and d[0] == 'glue':
        task = 'sentiment'
        SIB_type = 'SIB'
        ORIG_train_dataset = load_dataset(d[0], d[1])['train']
        # NOTE(review): in-place rename kept as in the original; newer `datasets`
        # releases may only provide the non-in-place `rename_column` -- confirm.
        ORIG_train_dataset.rename_column_('sentence', 'text')
        d = d[1]
    else:
        task = 'topic'
        SIB_type = 'SIB-mix'
        ORIG_train_dataset = load_dataset(d)['train']

    asset_dir = "./assets/" + d + "/" + task + "/"

    print('loading {} ORIG'.format(d))
    # batch_size == dataset length, so the whole split is tokenized as one batch.
    ORIG_train_dataset_tok = ORIG_train_dataset.map(tokenize, batched=True, batch_size=len(ORIG_train_dataset))
    ORIG_train_dataset_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    # INV
    print('loading {} INV'.format(d))
    text = npy_load(asset_dir + "INV/text2.npy")
    label = npy_load(asset_dir + "INV/label2.npy")
    df = pd.DataFrame({'text': text, 'label': label})
    df.text = df.text.astype(str)
    df.label = df.label.astype(int)
    INV_train_dataset = Dataset.from_pandas(df)
    INV_train_dataset_tok = INV_train_dataset.map(tokenize, batched=True, batch_size=len(INV_train_dataset))
    INV_train_dataset_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    # SIB
    print('loading {} {}'.format(d, SIB_type))
    text = npy_load(asset_dir + SIB_type + "/text2.npy")
    label = npy_load(asset_dir + SIB_type + "/label2.npy")
    if SIB_type == 'SIB-mix':
        # Labels are stored per row as arrays here rather than as single ints.
        df = pd.DataFrame({'text': text, 'label': label.tolist()})
        df.text = df.text.astype(str)
        df.label = df.label.map(lambda y: np.array(y))
    else:
        df = pd.DataFrame({'text': text, 'label': label})
        df.text = df.text.astype(str)
        df.label = df.label.astype(int)
    SIB_train_dataset = Dataset.from_pandas(df)
    SIB_train_dataset_tok = SIB_train_dataset.map(tokenize, batched=True, batch_size=len(SIB_train_dataset))
    SIB_train_dataset_tok.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    # Use the same number of examples from every variant. This must run after
    # all three datasets are loaded: computed at the top of the loop it read
    # datasets left over from a previous iteration or kernel session.
    n = min(len(ORIG_train_dataset), len(INV_train_dataset), len(SIB_train_dataset))
    print('processing {} examples of {}...'.format(n, dataset_spec))

    # Raw texts handed to the metrics; previously these names were only
    # assigned in a later cell, so the loop depended on stale kernel state.
    ORIG = ORIG_train_dataset['text'][:n]
    INV = INV_train_dataset['text'][:n]
    SIB = SIB_train_dataset['text'][:n]
    candidates = [('ORIG', ORIG), ('INV', INV), ('SIB', SIB)]

    # Naturalness | BERTScore
    print('calculating BERTScores...')

    for name, candidate in candidates:
        scores = bertscore_metric._compute(candidate, ORIG, lang="en")
        # The last entry of the result dict is skipped (presumably the
        # non-numeric hashcode -- TODO confirm).
        for key, value in list(scores.items())[:-1]:
            print('{} - ORIG'.format(name), key, value.mean())

    # Naturalness | BLEURT Scores
    print('calculating BLEURT Scores...')

    for name, candidate in candidates:
        scores = bleurt_metric._compute(candidate, ORIG)
        print('{} - ORIG score'.format(name), np.array(scores['scores']).mean())

    # Diversity | TTR (lexical diversity)

    # by input_ids (tokenizing replaces many words with [UNK] and reduces lexical diversity)
    print('ORIG TTR: {0:0.2f} %'.format(get_ttr_by_ids(ORIG_train_dataset_tok['input_ids'][:n])))
    print('INV  TTR: {0:0.2f} %'.format(get_ttr_by_ids(INV_train_dataset_tok['input_ids'][:n])))
    print('SIB  TTR: {0:0.2f} %'.format(get_ttr_by_ids(SIB_train_dataset_tok['input_ids'][:n])))

    # by text
    print('ORIG TTR: {0:0.2f} %'.format(get_ttr_by_text(ORIG)))
    print('INV  TTR: {0:0.2f} %'.format(get_ttr_by_text(INV)))
    print('SIB  TTR: {0:0.2f} %'.format(get_ttr_by_text(SIB)))

processing 1000 examples of ('glue', 'sst2')...


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-82e6569f2ab6ac94.arrow


loading sst2 ORIG
loading sst2 INV


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


loading sst2 SIB


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


calculating BERTScores...
precision tensor(1.)
recall tensor(1.)
f1 tensor(1.)




precision tensor(0.7802)
recall tensor(0.8215)
f1 tensor(0.7993)
precision tensor(0.7920)
recall tensor(0.8252)
f1 tensor(0.8078)
calculating BLEURT Scores...
score 1.0111331019800156
score -1.5404506878353654


# Naturalness Scores

### BERTScore

In [330]:
bertscore_metric = load_metric("bertscore")

# Take the same leading slice of raw text from each of the three datasets.
n = 10000
ORIG, INV, SIB = (
    split['text'][:n]
    for split in (ORIG_train_dataset, INV_train_dataset, SIB_train_dataset)
)

In [331]:
# ORIG scored against itself; the last entry of the result dict is not printed.
scores = bertscore_metric._compute(ORIG, ORIG, lang="en")
for key in list(scores)[:-1]:
    value = scores[key]
    print(key, value.mean())

precision tensor(1.)
recall tensor(1.)
f1 tensor(1.)


In [332]:
# INV scored against ORIG; the last entry of the result dict is not printed.
scores = bertscore_metric._compute(INV, ORIG, lang="en")
for key in list(scores)[:-1]:
    value = scores[key]
    print(key, value.mean())



precision tensor(0.7802)
recall tensor(0.8215)
f1 tensor(0.7993)


In [333]:
# SIB scored against ORIG; the last entry of the result dict is not printed.
scores = bertscore_metric._compute(SIB, ORIG, lang="en")
for key in list(scores)[:-1]:
    value = scores[key]
    print(key, value.mean())

precision tensor(0.7920)
recall tensor(0.8252)
f1 tensor(0.8078)


### BLEURT

In [289]:
# Load the BLEURT metric used by the scoring cells that follow
# (the log output below reports the default BLEURT-Base checkpoint).
bleurt_metric = load_metric("bleurt")

Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: datasets.load_metric('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint C:\Users\Fabrice\.cache\huggingface\metrics\bleurt\default\downloads\extracted\0989b2f25cefa5363b32fb8de03c83fe189c82f971eabcd6248d372510de0c71\bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Performs basic checks...
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Loading model...
INFO:tensorflow:BLEURT initialized.


In [290]:
# Mean BLEURT score of ORIG against itself.
scores = bleurt_metric._compute(ORIG, ORIG)
print('score', np.mean(scores['scores']))

score 1.0065633410215378


In [291]:
# Mean BLEURT score of INV against ORIG.
scores = bleurt_metric._compute(INV, ORIG)
print('score', np.mean(scores['scores']))

score -1.563182808160782


In [292]:
# Mean BLEURT score of SIB against ORIG.
scores = bleurt_metric._compute(SIB, ORIG)
print('score', np.mean(scores['scores']))

score -1.6391020435094834


# Diversity

### Lexical Diversity

In [324]:
from nltk import word_tokenize

def get_ttr_by_ids(input_ids, special_ids=(0, 101, 102)):
    '''
    Type Token Ratio (TTR) over token ids, as a percentage.
    higher -> more diversity

    NOTE(review): a function with this name is also defined in an earlier cell
    of this notebook; consider keeping a single definition.

    Parameters
    ----------
    input_ids : array-like of int
        Token ids of any shape; np.unique flattens them before counting.
    special_ids : iterable of int, optional
        Ids left out of both the type count and the token count. The default
        (0, 101, 102) is what this function used to hard-code
        (presumably [PAD], [CLS], [SEP] of bert-base-uncased -- TODO confirm).

    Returns
    -------
    float
        100 * (#distinct ids) / (#ids) after dropping `special_ids`;
        0.0 when no ids remain, instead of dividing by zero.
    '''
    token_ids, counts = np.unique(input_ids, return_counts=True)
    # `assume_unique` is not passed: a caller-supplied `special_ids` may repeat values.
    keep = np.isin(token_ids, list(special_ids), invert=True)
    token_ids, counts = token_ids[keep], counts[keep]
    total = counts.sum()
    if total == 0:
        # Empty input, or nothing but special ids.
        return 0.0
    return len(token_ids) / total * 100

def get_ttr_by_text(text):
    '''
    Type Token Ratio (TTR) over word tokens, as a percentage.
    higher -> more diversity

    NOTE(review): a function with this name is also defined in an earlier cell
    of this notebook; consider keeping a single definition.

    Parameters
    ----------
    text : str or iterable of str
        Documents to split with nltk's word_tokenize. A single string is
        treated as one document (it used to be iterated character by character).

    Returns
    -------
    float
        100 * (#distinct tokens) / (#tokens) over all documents;
        0.0 when there are no tokens, instead of dividing by zero.
    '''
    if isinstance(text, str):
        text = [text]
    tokens = [tok for doc in text for tok in word_tokenize(doc)]
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens) * 100

In [327]:
# TTR over token ids for the first n rows of each tokenized dataset.
n = 60614
orig_ttr = get_ttr_by_ids(ORIG_train_dataset_tok['input_ids'][:n])
print('ORIG TTR: {0:0.2f} %'.format(orig_ttr))
inv_ttr = get_ttr_by_ids(INV_train_dataset_tok['input_ids'][:n])
print('INV  TTR: {0:0.2f} %'.format(inv_ttr))
sib_ttr = get_ttr_by_ids(SIB_train_dataset_tok['input_ids'][:n])
print('SIB  TTR: {0:0.2f} %'.format(sib_ttr))

ORIG TTR: 1.68 %
INV  TTR: 2.08 %
SIB  TTR: 0.92 %


In [328]:
# TTR over word tokens for the first n raw texts of each dataset.
n = 60614
orig_ttr = get_ttr_by_text(ORIG_train_dataset['text'][:n])
print('ORIG TTR: {0:0.2f} %'.format(orig_ttr))
inv_ttr = get_ttr_by_text(INV_train_dataset['text'][:n])
print('INV  TTR: {0:0.2f} %'.format(inv_ttr))
sib_ttr = get_ttr_by_text(SIB_train_dataset['text'][:n])
print('SIB  TTR: {0:0.2f} %'.format(sib_ttr))

ORIG TTR: 2.58 %
INV  TTR: 15.63 %
SIB  TTR: 2.57 %


### Entropy

In [35]:
from transformers import BertTokenizer, BertModel
import torch

In [275]:
# Entropy of a uniform categorical distribution with one outcome per vocabulary entry.
vocab_size = tokenizer.vocab_size
uniform_dist = torch.ones(vocab_size) * (1./vocab_size)
uniform_categorical = torch.distributions.Categorical(probs=uniform_dist)
max_entropy = uniform_categorical.entropy().item()
max_entropy

10.326202392578125

In [81]:
# Load BERT and run one sample sentence through it.
# Note that this rebinds the notebook-level `tokenizer`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

sample_ids = tokenizer.encode("Hello, my dog is cute")
input_ids = torch.tensor([sample_ids])  # Batch size 1
outputs = model(input_ids)

print(outputs.last_hidden_state)

tensor([[[-0.1144,  0.1937,  0.1250,  ..., -0.3827,  0.2107,  0.5407],
         [ 0.5308,  0.3207,  0.3665,  ..., -0.0036,  0.7579,  0.0388],
         [-0.4877,  0.8849,  0.4256,  ..., -0.6976,  0.4458,  0.1231],
         ...,
         [-0.7003, -0.1815,  0.3297,  ..., -0.4838,  0.0680,  0.8901],
         [-1.0355, -0.2567, -0.0317,  ...,  0.3197,  0.3999,  0.1795],
         [ 0.6080,  0.2610, -0.3131,  ...,  0.0311, -0.6283, -0.1994]]],
       grad_fn=<NativeLayerNormBackward>)


In [82]:
# Encode a second sample sentence; `output` is not referenced by any other
# cell visible in this notebook.
output = tokenizer.encode("I love NY")

In [84]:
from scipy.stats import entropy

In [96]:
# Display the last hidden state as a NumPy array, detached from the autograd graph.
outputs.last_hidden_state.detach().numpy()

array([[[-0.11437133,  0.19371387,  0.12495928, ..., -0.3826907 ,
          0.21065906,  0.5407081 ],
        [ 0.53082454,  0.32074875,  0.3664592 , ..., -0.00360663,
          0.7578603 ,  0.03884365],
        [-0.4876514 ,  0.8849247 ,  0.42556354, ..., -0.697621  ,
          0.44583374,  0.12309406],
        ...,
        [-0.70027876, -0.18150657,  0.32969624, ..., -0.4837927 ,
          0.06802326,  0.8900844 ],
        [-1.0354625 , -0.2566779 , -0.03165251, ...,  0.31974295,
          0.39990267,  0.17954752],
        [ 0.60799193,  0.2609697 , -0.31307226, ...,  0.03109809,
         -0.62827194, -0.19942497]]], dtype=float32)

In [122]:
# NOTE(review): `p` is assigned in the cell that follows this one in the notebook,
# so a top-to-bottom run reaches this call before that assignment -- confirm the
# intended cell order.
entropy(p)

2.059098

In [113]:
# Average the squeezed hidden-state array along axis 1
# (presumably one value per token -- TODO confirm).
hidden_states = outputs.last_hidden_state.detach().numpy()
p = hidden_states.squeeze().mean(axis=1)

In [114]:
# Display the averaged values.
p

array([-0.00907731, -0.00968672, -0.01235135, -0.01130538, -0.01403303,
       -0.01283057, -0.01095144, -0.01752734], dtype=float32)

In [124]:
# Rescale p so its entries sum to 1. The p displayed above is all negative,
# so the resulting ratios are positive.
x = p / p.sum()

In [125]:
# Display the rescaled values.
x

array([0.09285002, 0.09908357, 0.12633954, 0.11564054, 0.14354114,
       0.13124135, 0.11202013, 0.17928374], dtype=float32)

10.326202392578125