In [1]:
## parse scales datasets and add all scales to 'data' which is the main var we're going to be working with
data = {}
scale_datanames = ['demelo', 'crowd', 'wilkinson']

# for every dataset there will be a different key in 'data' (e.g. data['demelo'])
# for every scale in a dataset (e.g. 'attractive, unattractive**' in demelo), there will be a key with that name in data['demelo'],
# and the value is an object whose value at "adjectives" will be the list of adjectives in that scale, ordered by intensity (e.g. "[pretty, beautiful, gorgeous]").
for dataset_name in scale_datanames:
    data[dataset_name] = {}
    file_path = f"data\{dataset_name}.txt"
    file_data = open(file_path, 'r').read()
    scales = file_data.split("=== ")[1:]
    for scale in scales:
        scale_data = scale.split('\n')
        scale_name = scale_data[0]
        scale_adjectives = scale_data[1:-1]
        scale_adjectives = [x.split(' || ')[0] for x in scale_adjectives] # remove ties
        data[dataset_name][scale_name] = { "adjectives": scale_adjectives }

# example entries in demelo dataset
print("3 example scales:")
list(data["demelo"].items())[0:3]

3 example scales:


[('attractive, unattractive**',
  {'adjectives': ['plain', 'unattractive', 'ugly']}),
 ('audible, inaudible**', {'adjectives': ['quiet', 'inaudible', 'silent']}),
 ('big, little**',
  {'adjectives': ['small', 'smaller', 'midget', 'minute', 'tiny', 'micro']})]

In [2]:
## download datasets of 2 categories: reviews and news.
import datasets
from tqdm import tqdm
import random

reviews_datasets = []
news_datasets = []

# download datasets
reviews_datanames = ["imdb", "rotten_tomatoes", "yelp_review_full"] # movie / business reviews
news_datanames = ["cc_news"]

for name in reviews_datanames:
    reviews_datasets.append(datasets.load_dataset(name))
for name in news_datanames:
    news_datasets.append(datasets.load_dataset(name))

all_datasets = reviews_datasets + news_datasets

# flatten dataset splits (train/test/validation) into one set, taking only the 'text' column
print(f"flattening datasets...")
for i in tqdm(range(len(all_datasets))):
    flattened = []
    for subset_name in all_datasets[i]:
        flattened += all_datasets[i][subset_name]["text"]
        
    all_datasets[i] = flattened

all_datanames = ["reviews", "news"]
# make sure we have the same number of examples from 'reviews' and 'news' datasets (50k examples each)
all_datasets = [all_datasets[0][:15000] + all_datasets[1][:10000] + all_datasets[2][:25000], all_datasets[3][:50000]] 

# shuffle
print("shuffling datasets...")
random.seed(9001)
for i, dataset in enumerate(all_datasets):
    random.shuffle(dataset)
    print(f"dataset '{all_datanames[i]}' len: {len(dataset)}")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset imdb (C:/Users/alonmizrahi/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 20.49it/s]
Found cached dataset rotten_tomatoes (C:/Users/alonmizrahi/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)
100%|██████████| 3/3 [00:00<00:00, 93.73it/s]
Found cached dataset yelp_review_full (C:/Users/alonmizrahi/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)
100%|██████████| 2/2 [00:00<00:00,  6.53it/s]
Found cached dataset cc_news (C:/Users/alonmizrahi/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/e3d5612f02fe5f11826a0d9614328b1772e27e5d685f4ec438e7f768e4581734)
100%|██████████| 1/1 [00:00<00:00,  1.35it/s]


flattening datasets...


100%|██████████| 4/4 [00:03<00:00,  1.05it/s]

shuffling datasets...
dataset 'reviews' len: 50000
dataset 'news' len: 50000





In [3]:
## finetune BERT on cola
import os
import torch
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments

if os.path.exists("bert_finetuned_on_cola/model") and os.path.exists("bert_finetuned_on_cola/tokenizer"):
  print(f'loading finetuned model from "bert_finetuned_on_cola/model"...')
  cola_model = BertForSequenceClassification.from_pretrained("bert_finetuned_on_cola/model").to("cuda")
  cola_tokenizer = BertTokenizer.from_pretrained("bert_finetuned_on_cola/tokenizer")
else:
  model_name = "bert-base-uncased"
  print(f"loading and finetuning {model_name} on cola dataset...")
  cola_model = BertForSequenceClassification.from_pretrained(model_name).to("cuda")
  cola_tokenizer = BertTokenizer.from_pretrained(model_name)

  cola = datasets.load_dataset('linxinyuan/cola')
  cola_tokenized = cola.map(lambda examples: cola_tokenizer(examples['text'], padding=True, truncation=True), batched=True, batch_size=16)
  cola_tokenized = cola_tokenized.rename_column("label", "labels")
  cola_tokenized.set_format("torch")

  def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    accuracy = np.mean(preds == labels)
    true_positives = np.sum((labels == 1) & (preds == 1))
    false_positives = np.sum((labels == 0) & (preds == 1))
    false_negatives = np.sum((labels == 1) & (preds == 0))
      
    precision = true_positives / (true_positives + false_positives + 1e-9)
    recall = true_positives / (true_positives + false_negatives + 1e-9)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-9)
    
    # we're mostly interested in precision, that is, reducing the number of times we predict incorrect sentences as correct
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

  # Configure training arguments
  training_args = TrainingArguments(
      output_dir="./bert_finetuned_on_cola",
      evaluation_strategy="epoch",
      num_train_epochs=8,
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,
  )

  # Create Trainer instance
  trainer = Trainer(
      model=cola_model,
      args=training_args,
      train_dataset=cola_tokenized["train"],
      eval_dataset=cola_tokenized["test"],
      compute_metrics=compute_metrics,
      tokenizer=cola_tokenizer
  )

  trainer.train()

  # save model and tokenizer
  cola_model.save_pretrained('bert_finetuned_on_cola/model')
  cola_tokenizer.save_pretrained('bert_finetuned_on_cola/tokenizer')


# evaluate
print("evaluation:")
input = ["Hello, how are you?",
         "Hello are.",
         "You are very nice",
         "Very nice you.",
         "The movie was horrible",
         "The was movie horrible"]
tokens = cola_tokenizer(input, return_tensors="pt", padding=True).to('cuda')

cola_model.eval()
with torch.no_grad():
  output = cola_model(**tokens)

logits = output.logits.cpu().numpy()

predictions = np.argmax(logits, axis=-1)
predictions_text = ["correct" if x == 1 else "incorrect" for x in predictions]
for item in zip(input, predictions_text):
  print(item)

loading finetuned model from "bert_finetuned_on_cola/model"...
evaluation:
('Hello, how are you?', 'correct')
('Hello are.', 'incorrect')
('You are very nice', 'correct')
('Very nice you.', 'incorrect')
('The movie was horrible', 'correct')
('The was movie horrible', 'incorrect')


In [4]:
# helper function to flatten adjective list with potential ties
# ['pretty || beautiful', 'gorgeous'] -> ['pretty', 'beautiful', 'gorgeous']
def extract_adjectives(adjs):
    return [item for sublist in [[x] if " || " not in x else x.split(" || ") for x in adjs] for item in sublist]

extract_adjectives(['pretty || beautiful', 'gorgeous'])

['pretty', 'beautiful', 'gorgeous']

In [5]:
## parse sentences - add them to each adjective
import nltk

MAX_SENTENCES_PER_ADJ = 100
for dataset_index, dataset in enumerate(all_datasets):
    dataset_type = all_datanames[dataset_index]
    print(f"parsing sentences from '{dataset_type}' dataset...")

    for text in tqdm(dataset):
        text = text.lower().replace("\\", "").replace("<br />", "").replace("\\n\\n", ". ").replace("\\n", ". ").replace("\n", ". ")
        sentences = nltk.sent_tokenize(text)
            
        # for each sentence check all scales
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            if len(words) < 5 or len(words) > 35: continue # not too short, not too long

            for scales_dataset in data.values(): # demelo / crowd / wilkinson
                for scale in scales_dataset.values(): # '{ "adjectives": ["nearby || near", "close"] }'
                    if not dataset_type in scale: scale[dataset_type] = {} # create an obj for 'reviews' or 'news' if doesn't exist
                    adjs = extract_adjectives(scale['adjectives']) # flatten list of adjectives as it might contain ties
                    for adj in adjs:
                        if adj in scale[dataset_type] and len(scale[dataset_type][adj]) >= MAX_SENTENCES_PER_ADJ: continue
                        # check if the adj word appears in the sentence
                        if words.count(adj) != 1: continue
                        # cleaning method 1: part-of-speech check - make sure the POS of word we found in the sentence is indeed adjective
                        words_pos = nltk.pos_tag(words)
                        adj_pos = next(x for x in words_pos if x[0] == adj)[1]
                        if adj_pos != "JJ": continue # JJ = adjective
                        # add sentence to adj dictionary
                        if not adj in scale[dataset_type]: scale[dataset_type][adj] = []
                        scale[dataset_type][adj].append(sentence)


parsing sentences from 'reviews' dataset...


100%|██████████| 50000/50000 [03:33<00:00, 233.89it/s]


parsing sentences from 'news' dataset...


100%|██████████| 50000/50000 [04:56<00:00, 168.37it/s]


In [6]:
import json

# save data to json
open("data.json", mode='w', encoding="utf-8").write(json.dumps(data, sort_keys=True, indent=2))

8336448

In [7]:
# we need this for adjective substitution in sentences. unfortunately nltk doesn't provide this function
def nltk_word_detokenize(tokens):
    # Join tokens with spaces and handle punctuation
    detokenized = ' '.join(tokens)
    detokenized = detokenized.replace(" ,", ",")
    detokenized = detokenized.replace(" .", ".")
    detokenized = detokenized.replace(" !", "!")
    detokenized = detokenized.replace(" ?", "?")
    detokenized = detokenized.replace(" :", ":")
    detokenized = detokenized.replace(" ;", ";")
    # Add space after commas and periods if missing
    detokenized = detokenized.replace(",", ", ")
    detokenized = detokenized.replace(".", ". ")
    return detokenized.strip()

In [8]:
## apply cleaning methods 2,3 on parsed sentences
from transformers import BertModel
import torch
import torch.nn.functional as F

torch.set_grad_enabled(False)
tokenizer = cola_tokenizer # we use the same tokenizer (BERT's) for both cleaning method
bert = BertModel.from_pretrained("bert-base-uncased").to('cuda')
bert.eval()
MAX_SENTENCES_PER_SCALE = 20

# clean our data (get rid of most sentences)
for scales_dataset_name in data.keys(): # 'demelo' / 'crowd' / 'wilkinson'
    print(f"cleaning sentences from adjectives in dataset '{scales_dataset_name}'...")
    for scale in tqdm(data[scales_dataset_name].values()): # '{ "adjectives": ["nearby || near", "close"] }'
        for dataset_type in ['reviews', 'news']:
            if "sentences_similarities" not in scale[dataset_type]: scale[dataset_type]["sentences_similarities"] = {}
            adjs = extract_adjectives(scale['adjectives']) # flatten list of adjectives as it might contain ties
            for adj in adjs:
                if not adj in scale[dataset_type]: continue
                adj_word_token = nltk.word_tokenize(adj)[0]
                other_adjs = [x for x in adjs if x != adj]
                other_adjs_word_tokens = [nltk.word_tokenize(x)[0] for x in other_adjs]
                
                sentences = scale[dataset_type][adj]
                for sentence in sentences:
                    # generate new sentences with substituted adjectives
                    sentence_word_tokens = nltk.word_tokenize(sentence)
                    all_sentences = [sentence_word_tokens] # original sentence is the first element
                    for other_adj_word_token in other_adjs_word_tokens:
                        all_sentences.append([other_adj_word_token if x==adj_word_token else x for x in sentence_word_tokens])
                    all_sentences = [nltk_word_detokenize(x) for x in all_sentences]

                    # cleaning meathod 2: check if sentences (original sentence + and generated sentences) are grammatically correct using a bert model finetuned on cola from earlier
                    sentences_tokens = tokenizer(all_sentences, return_tensors="pt", padding=True, truncation=True).to('cuda')
                    cola_model_output = cola_model(**sentences_tokens)
                    logits = cola_model_output.logits.cpu().numpy()
                    predictions = np.argmax(logits, axis=-1)
                    if 0 in predictions: # 0 means we predicted the sentence to be not grammatically correct
                        continue # skip this sentence

                    # cleaning method 3: compare similarities of generated sentences to the original sentence, and take top k sentences
                    last_hidden_state = bert(**sentences_tokens).last_hidden_state
                    # calculate similarities of the original sentence's embedding with the substituted sentences embeddings
                    # use the embedding of BERT's CLS token as a represenation for the full sequence
                    original_embedding = last_hidden_state[0,0,:] # original sentence is the first in the batch
                    substituted_embeddings = last_hidden_state[1:,0,:]
                    similarities = [F.cosine_similarity(original_embedding, x, dim=0) for x in substituted_embeddings]
                    similarity = torch.mean(torch.tensor(similarities))
                    
                    # save similarity of this sentence with the original adjective replaced by _MASK_
                    sentence_with_mask_token = nltk_word_detokenize(["_MASK_" if x==adj_word_token else x for x in sentence_word_tokens])
                    scale[dataset_type]["sentences_similarities"][sentence_with_mask_token] = similarity.item()
                
            # get top k sentences with higest score for this datatype
            top_k_sentences = [x[0] for x in sorted(scale[dataset_type]["sentences_similarities"].items(), key=lambda x: x[1], reverse=True)[:MAX_SENTENCES_PER_SCALE]]
            scale[dataset_type]["top_sentences"] = top_k_sentences
            # clean remaining sentences for all adjectives
            for adj in adjs:
                if adj in scale[dataset_type]: del scale[dataset_type][adj]
            del scale[dataset_type]['sentences_similarities']



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


cleaning sentences from adjectives in dataset 'demelo'...


100%|██████████| 87/87 [15:41<00:00, 10.82s/it]


cleaning sentences from adjectives in dataset 'crowd'...


100%|██████████| 79/79 [12:32<00:00,  9.53s/it]


cleaning sentences from adjectives in dataset 'wilkinson'...


100%|██████████| 21/21 [04:17<00:00, 12.28s/it]


In [9]:
open("data_cleaned.json", mode='w', encoding="utf-8").write(json.dumps(data, sort_keys=True, indent=2))

1235124

In [10]:
# extract the tokens of a specific adjective a from a sequence
# we need this because a single word may sometimes get tokenized into multiple tokens
def find_sub_list(l,sl) -> tuple[int,int]:
    sll=len(sl)
    for ind in (i for i,e in enumerate(l) if e==sl[0]):
        if l[ind:ind+sll]==sl:
            return ind,ind+sll-1

In [11]:
## rank scalar adjectives with method 1
from transformers import AutoTokenizer, AutoModel

# compare BERT, DistilBERT, ALBERT, ELECTRA, RoBERTa
# note that for electra we used the generator model as it was trained on word replacement task, which is in the nature of this project
model_names = ["bert-base-uncased", "distilbert-base-uncased", "albert-base-v2", "google/electra-base-generator", "roberta-base"]

for model_name in model_names:
    print(f"evaluating '{model_name}' on ranking method 1")
    model_specific_args = { 'add_prefix_space': True } if model_name == "roberta-base" else {} # Roberta tokenizes words differently whether they are in the beginning or not, so pass this
    tokenizer = AutoTokenizer.from_pretrained(model_name, **model_specific_args)
    model = AutoModel.from_pretrained(model_name).to("cuda")
    model.eval()
    
    for scales_dataset_name in data.keys(): # 'demelo' / 'crowd' / 'wilkinson'
        print(f"evaluating on dataset '{scales_dataset_name}'...")
        for scale in tqdm(data[scales_dataset_name].values()): # '{ "adjectives": ["nearby || near", "close"] }'
            adjs = extract_adjectives(scale["adjectives"]) # flatten list of adjectives as it might contain ties
            for dataset_type in ['reviews', 'news']:
                top_sentences = scale[dataset_type]["top_sentences"]
                if len(top_sentences) == 0: continue
                if len(adjs) <= 2 : continue # we need at least 3 adjective in the scale for this method
                adjs_tokens_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(adj)) for adj in adjs]
                sum_similarities = torch.zeros(len(adjs)-1)
                
                for sentence in top_sentences:
                    adj_sentences = [sentence.replace("_MASK_", adj) for adj in adjs]
                    tokens = tokenizer(adj_sentences, return_tensors="pt", padding=True, truncation=True).to("cuda")
                    last_hidden_state = model(**tokens).last_hidden_state
                    # extract embeddings of the adjectives
                    tokens_input_ids = tokens["input_ids"].tolist()
                    adjs_ids_indices = [find_sub_list(all_tok_ids, adj_ids) for all_tok_ids, adj_ids in zip(tokens_input_ids, adjs_tokens_ids)]
                    adjs_embeddings = [embs[adj_indices[0]:adj_indices[1]+1] for embs, adj_indices in zip(last_hidden_state, adjs_ids_indices)]
                    # average embeddings in the case of a word having multiple tokens
                    adjs_averaged_embeddings = [torch.mean(emb, dim=0) for emb in adjs_embeddings]
                    # compute similarities of all adjectives with the last adjective, which is the extreme adjective in the list
                    similarities = [F.cosine_similarity(emb, adjs_averaged_embeddings[-1], dim=0) for emb in adjs_averaged_embeddings[:-1]]
                    sum_similarities += (torch.tensor(similarities) / torch.tensor(len(top_sentences), dtype=torch.float32))

                scale[dataset_type][f"eval_similarities_method1_{model_name}"] = sum_similarities.tolist()
                    


evaluating 'bert-base-uncased' on ranking method 1


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


evaluating on dataset 'demelo'...


100%|██████████| 87/87 [01:01<00:00,  1.42it/s]


evaluating on dataset 'crowd'...


100%|██████████| 79/79 [00:29<00:00,  2.64it/s]


evaluating on dataset 'wilkinson'...


100%|██████████| 21/21 [00:09<00:00,  2.17it/s]


evaluating 'distilbert-base-uncased' on ranking method 1


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


evaluating on dataset 'demelo'...


100%|██████████| 87/87 [00:31<00:00,  2.74it/s]


evaluating on dataset 'crowd'...


100%|██████████| 79/79 [00:15<00:00,  5.22it/s]


evaluating on dataset 'wilkinson'...


100%|██████████| 21/21 [00:04<00:00,  4.28it/s]


evaluating 'albert-base-v2' on ranking method 1


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


evaluating on dataset 'demelo'...


100%|██████████| 87/87 [01:03<00:00,  1.37it/s]


evaluating on dataset 'crowd'...


100%|██████████| 79/79 [00:32<00:00,  2.41it/s]


evaluating on dataset 'wilkinson'...


100%|██████████| 21/21 [00:10<00:00,  2.06it/s]


evaluating 'google/electra-base-generator' on ranking method 1


Some weights of the model checkpoint at google/electra-base-generator were not used when initializing ElectraModel: ['generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_lm_head.bias', 'generator_predictions.dense.weight', 'generator_predictions.LayerNorm.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


evaluating on dataset 'demelo'...


100%|██████████| 87/87 [00:32<00:00,  2.65it/s]


evaluating on dataset 'crowd'...


100%|██████████| 79/79 [00:16<00:00,  4.73it/s]


evaluating on dataset 'wilkinson'...


100%|██████████| 21/21 [00:05<00:00,  3.91it/s]


evaluating 'roberta-base' on ranking method 1


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


evaluating on dataset 'demelo'...


100%|██████████| 87/87 [01:01<00:00,  1.42it/s]


evaluating on dataset 'crowd'...


100%|██████████| 79/79 [00:29<00:00,  2.69it/s]


evaluating on dataset 'wilkinson'...


100%|██████████| 21/21 [00:10<00:00,  2.08it/s]


In [12]:
## rank scalar adjectives with method 2
TRAIN_SET_RATIO = 0.7

for model_name in model_names:
    print(f"evaluating '{model_name}' on ranking method 2")
    model_specific_args = { 'add_prefix_space': True } if model_name == "roberta-base" else {} # Roberta tokenizes words differently whether they are in the beginning or not, so pass this
    tokenizer = AutoTokenizer.from_pretrained(model_name, **model_specific_args)
    model = AutoModel.from_pretrained(model_name).to("cuda")
    model_hidden_size = model.config.hidden_size
    model.eval()
    
    for scales_dataset_name in data.keys(): # 'demelo' / 'crowd' / 'wilkinson'
        scales_dataset = data[scales_dataset_name]
        print(f"dataset {scales_dataset_name} contains {len(scales_dataset.values())} scales, splitting to train/test")
        train_set, test_set = np.split(list(scales_dataset.values()), [int(len(scales_dataset.values())*TRAIN_SET_RATIO)])
        print(f"calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset '{scales_dataset_name}' on {len(train_set)} train scales")
        scales_dataset_diff_vecs = torch.zeros([2, model_hidden_size]) # 2 vecs for 'reviews' and 'news'

        for scale in tqdm(train_set): # '{ "adjectives": ["nearby || near", "close"] }'
            adjs = extract_adjectives(scale["adjectives"])
            adjs = [adjs[0], adjs[-1]] # we only look at the mild intensity adjective and the extreme intensity adjective
            adjs_tokens_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(adj)) for adj in adjs]
            scale_diff_vecs = torch.zeros([2, model_hidden_size]) # for 'reviews' and 'news'
            for dataset_type_index, dataset_type in enumerate(['reviews', 'news']):
                top_sentences = scale[dataset_type]["top_sentences"]
                if len(top_sentences) == 0: continue
                for sentence in top_sentences:
                    adj_sentences = [sentence.replace("_MASK_", adj) for adj in adjs]
                    tokens = tokenizer(adj_sentences, return_tensors="pt", padding=True, truncation=True).to("cuda")
                    last_hidden_state = model(**tokens).last_hidden_state
                    # extract embeddings of the adjectives
                    tokens_input_ids = tokens["input_ids"].tolist()
                    adjs_ids_indices = [find_sub_list(all_tok_ids, adj_ids) for all_tok_ids, adj_ids in zip(tokens_input_ids, adjs_tokens_ids)]
                    adjs_embeddings = [embs[adj_indices[0]:adj_indices[1]+1] for embs, adj_indices in zip(last_hidden_state, adjs_ids_indices)]
                    # average embeddings in the case of a word having multiple tokens
                    adjs_averaged_embeddings = [torch.mean(emb, dim=0) for emb in adjs_embeddings]
                    # compute diff vec
                    sentence_diff = adjs_averaged_embeddings[1] - adjs_averaged_embeddings[0]
                    scale_diff_vecs[dataset_type_index] += (sentence_diff.to('cpu') / torch.tensor(len(top_sentences), dtype=torch.float32))
            scales_dataset_diff_vecs += (scale_diff_vecs / torch.tensor(len(train_set), dtype=torch.float32))
        
        print(f"evaluating method 2 on intensity vectors for dataset '{scales_dataset_name}' on {len(test_set)} test scales")
        for scale in tqdm(test_set): # '{ "adjectives": ["nearby || near", "close"] }'
            adjs = extract_adjectives(scale["adjectives"])
            for dataset_type_index, dataset_type in enumerate(['reviews', 'news']):
                top_sentences = scale[dataset_type]["top_sentences"]
                if len(top_sentences) == 0: continue
                adjs_tokens_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(adj)) for adj in adjs]
                sum_similarities = torch.zeros(len(adjs))
                
                for sentence in top_sentences:
                    adj_sentences = [sentence.replace("_MASK_", adj) for adj in adjs]
                    tokens = tokenizer(adj_sentences, return_tensors="pt", padding=True, truncation=True).to("cuda")
                    last_hidden_state = model(**tokens).last_hidden_state
                    # extract embeddings of the adjectives
                    tokens_input_ids = tokens["input_ids"].tolist()
                    adjs_ids_indices = [find_sub_list(all_tok_ids, adj_ids) for all_tok_ids, adj_ids in zip(tokens_input_ids, adjs_tokens_ids)]
                    adjs_embeddings = [embs[adj_indices[0]:adj_indices[1]+1] for embs, adj_indices in zip(last_hidden_state, adjs_ids_indices)]
                    # average embeddings in the case of a word having multiple tokens
                    adjs_averaged_embeddings = [torch.mean(emb, dim=0) for emb in adjs_embeddings]
                    # compute similarities of all adjectives with the intensity vector
                    similarities = [F.cosine_similarity(emb.to('cpu'), scales_dataset_diff_vecs[dataset_type_index], dim=0) for emb in adjs_averaged_embeddings]
                    sum_similarities += (torch.tensor(similarities) / torch.tensor(len(top_sentences), dtype=torch.float32))

                scale[dataset_type][f"eval_adj_diffs_method2_{model_name}"] = sum_similarities.tolist()

evaluating 'bert-base-uncased' on ranking method 2


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dataset demelo contains 87 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'demelo' on 60 train scales


100%|██████████| 60/60 [00:37<00:00,  1.59it/s]


evaluating method 2 on intensity vectors for dataset 'demelo' on 27 test scales


100%|██████████| 27/27 [00:20<00:00,  1.30it/s]


dataset crowd contains 79 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'crowd' on 55 train scales


100%|██████████| 55/55 [00:32<00:00,  1.69it/s]


evaluating method 2 on intensity vectors for dataset 'crowd' on 24 test scales


100%|██████████| 24/24 [00:17<00:00,  1.41it/s]


dataset wilkinson contains 21 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'wilkinson' on 14 train scales


100%|██████████| 14/14 [00:08<00:00,  1.63it/s]


evaluating method 2 on intensity vectors for dataset 'wilkinson' on 7 test scales


100%|██████████| 7/7 [00:04<00:00,  1.46it/s]


evaluating 'distilbert-base-uncased' on ranking method 2


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dataset demelo contains 87 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'demelo' on 60 train scales


100%|██████████| 60/60 [00:18<00:00,  3.22it/s]


evaluating method 2 on intensity vectors for dataset 'demelo' on 27 test scales


100%|██████████| 27/27 [00:10<00:00,  2.57it/s]


dataset crowd contains 79 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'crowd' on 55 train scales


100%|██████████| 55/55 [00:19<00:00,  2.89it/s]


evaluating method 2 on intensity vectors for dataset 'crowd' on 24 test scales


100%|██████████| 24/24 [00:09<00:00,  2.42it/s]


dataset wilkinson contains 21 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'wilkinson' on 14 train scales


100%|██████████| 14/14 [00:04<00:00,  2.84it/s]


evaluating method 2 on intensity vectors for dataset 'wilkinson' on 7 test scales


100%|██████████| 7/7 [00:02<00:00,  2.58it/s]


evaluating 'albert-base-v2' on ranking method 2


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dataset demelo contains 87 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'demelo' on 60 train scales


100%|██████████| 60/60 [00:39<00:00,  1.50it/s]


evaluating method 2 on intensity vectors for dataset 'demelo' on 27 test scales


100%|██████████| 27/27 [00:21<00:00,  1.24it/s]


dataset crowd contains 79 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'crowd' on 55 train scales


100%|██████████| 55/55 [00:35<00:00,  1.53it/s]


evaluating method 2 on intensity vectors for dataset 'crowd' on 24 test scales


100%|██████████| 24/24 [00:18<00:00,  1.28it/s]


dataset wilkinson contains 21 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'wilkinson' on 14 train scales


100%|██████████| 14/14 [00:09<00:00,  1.53it/s]


evaluating method 2 on intensity vectors for dataset 'wilkinson' on 7 test scales


100%|██████████| 7/7 [00:05<00:00,  1.36it/s]


evaluating 'google/electra-base-generator' on ranking method 2


Some weights of the model checkpoint at google/electra-base-generator were not used when initializing ElectraModel: ['generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_lm_head.bias', 'generator_predictions.dense.weight', 'generator_predictions.LayerNorm.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dataset demelo contains 87 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'demelo' on 60 train scales


100%|██████████| 60/60 [00:25<00:00,  2.33it/s]


evaluating method 2 on intensity vectors for dataset 'demelo' on 27 test scales


100%|██████████| 27/27 [00:11<00:00,  2.28it/s]


dataset crowd contains 79 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'crowd' on 55 train scales


100%|██████████| 55/55 [00:23<00:00,  2.34it/s]


evaluating method 2 on intensity vectors for dataset 'crowd' on 24 test scales


100%|██████████| 24/24 [00:10<00:00,  2.35it/s]


dataset wilkinson contains 21 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'wilkinson' on 14 train scales


100%|██████████| 14/14 [00:05<00:00,  2.36it/s]


evaluating method 2 on intensity vectors for dataset 'wilkinson' on 7 test scales


100%|██████████| 7/7 [00:03<00:00,  2.27it/s]


evaluating 'roberta-base' on ranking method 2


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dataset demelo contains 87 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'demelo' on 60 train scales


100%|██████████| 60/60 [00:37<00:00,  1.61it/s]


evaluating method 2 on intensity vectors for dataset 'demelo' on 27 test scales


100%|██████████| 27/27 [00:21<00:00,  1.24it/s]


dataset crowd contains 79 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'crowd' on 55 train scales


100%|██████████| 55/55 [00:34<00:00,  1.61it/s]


evaluating method 2 on intensity vectors for dataset 'crowd' on 24 test scales


100%|██████████| 24/24 [00:17<00:00,  1.35it/s]


dataset wilkinson contains 21 scales, splitting to train/test
calculating intensity vectors (1 vec for 'reviews', 1 vec for 'news') for dataset 'wilkinson' on 14 train scales


100%|██████████| 14/14 [00:09<00:00,  1.53it/s]


evaluating method 2 on intensity vectors for dataset 'wilkinson' on 7 test scales


100%|██████████| 7/7 [00:04<00:00,  1.46it/s]


In [13]:
open("data_evaluated.json", mode='w', encoding="utf-8").write(json.dumps(data, sort_keys=True, indent=2))

1507344

In [14]:
## calculate and results to 'results.json'
import itertools

METHOD1_PAIR_EQUALITY_RANGE = 0.0
METHOD2_PAIR_EQUALITY_RANGE = 0.0

results = {}

for scales_dataset_name in data.keys(): # 'demelo' / 'crowd' / 'wilkinson'
    results[scales_dataset_name] = {}
    for dataset_type in ['reviews', 'news']:
        results[scales_dataset_name][dataset_type] = {}
        for model_name in model_names:
            results[scales_dataset_name][dataset_type][model_name] = {}
            for method in ["method1", "method2"]:
                num_pairs = 0
                num_correct_comparisons = 0

                num_concordant_pairs = 0
                num_discordant_pairs = 0
                num_tied_pairs = 0
                similarities_entry_name = f"eval_similarities_method1_{model_name}" if method == "method1" else f"eval_adj_diffs_method2_{model_name}"
                pair_equality_range = METHOD1_PAIR_EQUALITY_RANGE if method == "method1" else METHOD2_PAIR_EQUALITY_RANGE
                
                for scale in data[scales_dataset_name].values():
                    if not similarities_entry_name in scale[dataset_type]: continue
                    adjs = extract_adjectives(scale['adjectives']) # flatten list of adjectives as it might contain ties
                    if method == "method1": adjs = adjs[:-1]
                    rankings = scale[dataset_type][similarities_entry_name]
                    adj_rankings = list(zip(adjs, rankings))
                    all_possible_pairs = itertools.combinations(adj_rankings, 2)
                    for pair in all_possible_pairs:
                        gold_truth = ""
                        pair0_name, pair0_rank_pred = pair[0]
                        pair1_name, pair1_rank_pred = pair[1]
                        adjectives_full_list = scale['adjectives'] # full list with ties (e.g. ["small || little", "tiny"])
                        pair0_index = [i for i, x in enumerate(adjectives_full_list) if pair0_name in x.split(" || ")]
                        pair1_index = [i for i, x in enumerate(adjectives_full_list) if pair1_name in x.split(" || ")]
                        assert len(pair0_index) == len(pair1_index) == 1
                        pair0_index = pair0_index[0]
                        pair1_index = pair1_index[0]

                        if pair1_index > pair0_index:
                            gold_truth = "bigger"
                        elif pair1_index < pair0_index:
                            gold_truth = "smaller"
                        elif pair0_index == pair1_index and "||" in adjectives_full_list[pair0_index]:
                            gold_truth = "equal"
                        assert gold_truth != ""

                        pair_pred_diff = pair1_rank_pred - pair0_rank_pred
                        pair_pred = "equal" if abs(pair_pred_diff) <= pair_equality_range else "bigger" if pair_pred_diff > 0 else "smaller"
                        # pairwise
                        if pair_pred == gold_truth: num_correct_comparisons += 1
                        # kendall
                        if pair_pred == gold_truth and (gold_truth == "bigger" or gold_truth == "smaller"):
                            num_concordant_pairs += 1
                        elif pair_pred == gold_truth:
                            num_tied_pairs +=1
                        else:
                            num_discordant_pairs += 1
                        
                        num_pairs += 1
                    
                results[scales_dataset_name][dataset_type][model_name][method] = {
                    "pairwise": num_correct_comparisons / num_pairs,
                    "kendall": (num_concordant_pairs - num_discordant_pairs) / num_pairs
                }

# round floats to 3 digits after dot
def round_floats(o):
    if isinstance(o, float): return round(o, 3)
    if isinstance(o, dict): return {k: round_floats(v) for k, v in o.items()}
    if isinstance(o, (list, tuple)): return [round_floats(x) for x in o]
    return o

print("writing results to 'results.json'")
open("results.json", mode='w', encoding="utf-8").write(json.dumps(round_floats(results), sort_keys=True, indent=2))

writing results to 'results.json


6626