In [47]:
import torch
from torch.utils.data import DataLoader, Dataset, ConcatDataset, random_split
from torch.distributions import Categorical
from torch.nn.utils.rnn import pad_sequence
import requests
import pandas as pd
import json
from datasets import load_dataset
import os
import shutil
from transformers import RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer, BertForSequenceClassification, AutoModel, logging
logging.set_verbosity_error()
import tqdm
import sys
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import random
import numpy as np
from scipy.stats import entropy, pearsonr
from scipy.special import kl_div
import matplotlib.pyplot as plt
import argparse


In [48]:
def set_seed(seed):
    '''
    Seeds the random number generators of random, numpy and torch with the same value.

    Arg:
        seed: seed value handed unchanged to each library
    '''
    # Same order as the libraries are listed above: random, numpy, torch
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

In [49]:
def globals():
    '''
    Reads the configuration file and builds the values the rest of the notebook uses as globals.

    Side effects:
        Seeds random, numpy and torch with config["seed"].
        Loads the tokenizer of the selected model via from_pretrained.

    Returns (in this order):
        device: torch.device from config["device"] if CUDA is available, otherwise the CPU device
        model_specifier: dict with "model_name", "tokenizer" class and "sequence_classification" class
        classes: dict mapping class name to its tensor index
        alt_classnames: dict mapping one-letter label abbreviations to class names
        tokenizer: tokenizer instance for the selected model
        hyperparams: dict with batch_size, epochs and lr (or the manual hyperparameter dict from the config)
        toy_run, train_eval_mode, manual_modelname, overwrite: values copied from the config
    '''
    # To avoid having to pass the same variables over and over, I set some global ones here. Keeps them in the same place.
    # Note that this means the functions will refer to values that are undefined as of function definition that will be defined
    # before function call.
    # NOTE(review): this function's name shadows the builtin globals() for the rest of the notebook.

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", help="configuration file", default="config.json")
    # args=[] ignores the real command line, so the default config path is always the one used
    args = parser.parse_args(args=[])

    # Context manager so the config file handle is closed again (it used to be left open)
    with open(args.config) as config_file:
        config = json.load(config_file)

    set_seed(config["seed"])

    if torch.cuda.is_available():
        device = torch.device(config["device"])
    else:
        device = torch.device('cpu')

    model_selection = {
    'bert_base': {
        "model_name": "bert-base-uncased",
        "tokenizer": BertTokenizer,
        "sequence_classification": BertForSequenceClassification,
    },

    'bert_large': {
        "model_name": "bert-large-uncased",
        "tokenizer": BertTokenizer,
        "sequence_classification": BertForSequenceClassification,
    },
    'roberta_large': {
        "model_name": "roberta-large",
        "tokenizer": RobertaTokenizer,
        "sequence_classification": RobertaForSequenceClassification,
    }}

    model_specifier = model_selection[config["model_id"]]
    classes = {'entailment': torch.as_tensor(0), 'neutral': torch.as_tensor(1), 'contradiction': torch.as_tensor(2)}
    alt_classnames = {'e': 'entailment', 'n': 'neutral', 'c': 'contradiction' }
    tokenizer = model_specifier['tokenizer'].from_pretrained(model_specifier['model_name'])
    toy_run = config['toy_run']
    train_eval_mode = config['train_eval_mode']
    manual_modelname = config["manual_modelname"]
    overwrite = config["overwrite"]

    # Priority: manual hyperparameters from the config, then toy-run defaults, then full-run defaults
    if config['manual_hyperparams']['use']:
        hyperparams = config['manual_hyperparams']
    elif toy_run:
        hyperparams = {
        'batch_size': 4,
        'epochs': 2,
        'lr' : 0.01
        }
    else:
        hyperparams = {
        'batch_size': 32,
        'epochs': 6,
        'lr' : 5e-5
        }


    return device, model_specifier, classes, alt_classnames, tokenizer, hyperparams, toy_run, train_eval_mode, manual_modelname, overwrite


Code to fetch the NLI variation dataset.
Evaluation assumes that this file already exists.

This cell is set to markdown (so it is not executed) because the repository already includes the data.

nli_var_url = "https://raw.githubusercontent.com/epavlick/NLI-variation-data/refs/heads/master/sentence-pair-analysis/preprocessed-data.jsonl"
nli_var_dir = "./data/NLI_variation"
nli_var_path = os.path.join(nli_var_dir, "NLI_variation_data.jsonl")
if not os.path.exists(nli_var_path):  # only touch the network when the file is missing
    os.makedirs(nli_var_dir, exist_ok=True)  # also creates ./data when it does not exist yet
    nli_var_r = requests.get(nli_var_url)
    nli_var_r.raise_for_status()  # never store an HTTP error page as the dataset
    with open(nli_var_path, "wb") as f:
        f.write(nli_var_r.content)



In [50]:
def normalise_headers(df):
    '''
    Maps the dataset-specific column names onto the shared names 'premise', 'hypothesis' and 'label'.
    The dataframe's columns are renamed in place; all other column names are left as they are.

    Arg:
        df: pandas dataframe based on any dataset used in this project
    Returns:
        The same dataframe object, now with normalised column names
    '''

    # Idea adapted from
    # https://stackoverflow.com/questions/55715572/renaming-column-in-pandas-dataframe-if-condition-is-met

    # Other data could use these names with a different meaning, in which case the renaming would be
    # semantically wrong. It works for the datasets used here.
    shared_name_for = {
        'context': 'premise',
        'sentence1': 'premise',
        'statement': 'hypothesis',
        'sentence2': 'hypothesis',
        'gold_label': 'label',
        'labels': 'label',
        'majority_label': 'label',
    }
    df.columns = [shared_name_for.get(column, column) for column in df.columns]

    return df

In [51]:
def get_df_dict(setname):
    '''
    Converts raw dataset files into pandas dataframes.

    Walks ./data/<setname> recursively, reads every .jsonl and .csv file and normalises its column names.
    Files with any other extension are skipped.

    Arg: simple setname string, e.g., 'anli' for Adversarial NLI
    Returns: dict of dataframes for the input dataset, keyed by file name without extension.
        For 'anli' the dict is nested one level deeper: {parent directory name: {file name: dataframe}}
    '''
    df_dict = {}
    dirpath = './data/' + setname

    for root, _dirs, files in os.walk(dirpath):
        for file in files:
            full_path = os.path.join(root, file)
            name, extension = os.path.splitext(full_path)
            if extension == '.jsonl':
                df = pd.read_json(path_or_buf=full_path, lines=True)
                if setname == 'chaosNLI':
                    cnli_examples = pd.json_normalize(df['example']) # p, h, label are in a single column. This makes a df with that column unpacked
                    cnli_examples = cnli_examples.drop(['uid'], axis=1) # uid is now redundantly in both the unpacked 'example' column and in the original df
                    df = df.drop(['example'], axis=1)
                    df = df.join(cnli_examples)
            elif extension == '.csv':
                df = pd.read_csv(filepath_or_buffer=full_path)
            else:
                continue # skips e.g. text files

            df = normalise_headers(df)

            # Path components are taken with os.path rather than by splitting on '/',
            # so the keys are also correct where os.path.join uses a different separator.
            subset = os.path.basename(name)

            #anli has a one-deeper filesystem (per-round datasets), so the returned dict is also deeper
            if setname == 'anli':
                round_name = os.path.basename(os.path.dirname(name))
                round_dict = df_dict.setdefault(round_name, {})
                # the first file seen for a (round, subset) pair wins
                if subset not in round_dict:
                    round_dict[subset] = df
            else:
                df_dict[subset] = df
    return df_dict


In [52]:
def add_encoded_input(tokenizer, df, max_length=None):
    '''
    Encodes every premise/hypothesis pair with the tokenizer (special tokens included) and stores the
    result in a new first column 'input_encoding' of the dataframe. The dataframe is modified in place.

    Each cell of the new column is a dict of tensors with the keys 'input_ids', 'token_type_ids'
    (token types mark belonging to premise or hypothesis) and 'attention_mask'.

    Args:
        tokenizer: tokenizer providing encode_plus
        df: dataframe with 'premise' and 'hypothesis' columns, modified in place
        max_length: max allowed sequence length
    Returns:
        The input dataframe
    '''
    # max_length is never set by the notebook and cannot be changed through the config; it is kept
    # because limiting the length is something that could be wanted.

    # An already encoded dataframe is returned untouched. This cannot happen when the notebook is run
    # top to bottom, but it protects against cells being re-run out of order.
    if 'input_encoding' in df.columns:
        return df

    tensor_keys = ('input_ids', 'token_type_ids', 'attention_mask')
    encodings = []
    for premise, hypothesis in zip(list(df['premise']), list(df['hypothesis'])):
        encoded = tokenizer.encode_plus(premise, hypothesis, max_length=max_length,
                                        return_token_type_ids=True, truncation=True)
        encodings.append({key: torch.as_tensor(encoded[key]) for key in tensor_keys})

    df.insert(0, 'input_encoding', encodings)
    return df

In [53]:
def add_encoded_labels(df):
    '''
    Inserts a 'label_tensor' column (second position) holding the class index of every label.
    The dataframe is modified in place and returned; an already encoded dataframe is returned untouched.

    Labels are looked up in the global `classes` dict, either directly by class name or through the
    abbreviations in the global `alt_classnames` dict.
    '''
    if 'label_tensor' in df.columns:
        return df

    def class_index(raw_label):
        # full class name first, abbreviation as fallback
        if raw_label in classes:
            return classes[raw_label]
        return classes[alt_classnames[raw_label]]

    label_indices = torch.as_tensor([class_index(raw_label) for raw_label in list(df.label)])
    df.insert(1, 'label_tensor', label_indices)
    return df


In [54]:
class NLIData(Dataset):
    '''
    Pytorch Dataset wrapping one NLI dataframe.

    Attributes:
        df: the dataframe including the added encoding columns
        set_specific_data: the columns of df that are not shared by all datasets
        require_label: whether the examples carry a discrete gold label
    '''
    def __init__(self, dataframe, require_label=True):
        '''
        args:
            dataframe: pandas dataframe in style of get_df_dict output
            require_label: bool indicating whether dataset has a discrete gold label
        '''
        # NLI variation has no discrete label; it uses a label distribution for each example instead

        df = add_encoded_input(tokenizer, dataframe)
        if require_label:
            df = add_encoded_labels(df)
            shared_columns = ['input_encoding', 'label_tensor', 'premise', 'hypothesis', 'label']
        else:
            shared_columns = ['input_encoding', 'premise', 'hypothesis']

        # Everything that is not shared between the datasets ends up here. A catch-all category that
        # lets this class handle the diverse columns of the input sets.
        self.set_specific_data = df.drop(columns=shared_columns)

        self.df = df
        self.require_label = require_label

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        encoding = row['input_encoding']
        item = {
            'input_encoding': encoding,
            'premise': row['premise'],
            'hypothesis': row['hypothesis'],
            'input_length': len(encoding['input_ids']),
            'unique_data': self.set_specific_data.iloc[idx],
        }
        if self.require_label:
            item['label'] = row['label']
            # wrapped in a list, so a collated batch of labels gets one row per example
            item['label_tensor'] = [row['label_tensor']]

        return item

In [55]:
def collate_fn(batch_data):
    """
    Collate function for pytorch dataloader. Its main job is padding.

    Arg:
        batch_data: list of example dicts as returned by NLIData.__getitem__
    Returns:
        dict with the zero-padded tensors 'input_ids', 'token_type_ids' and 'attention_masks',
        the list 'unique_data', and 'label_tensors' if the first example has a 'label_tensor' entry
    """

    def padded(field):
        # one (batch, longest sequence) tensor per encoding field, padded with zeros
        sequences = [example['input_encoding'][field] for example in batch_data]
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    batch = {
        'input_ids': padded('input_ids'),
        'token_type_ids': padded('token_type_ids'),
        'attention_masks': padded('attention_mask'),
        'unique_data': [example['unique_data'] for example in batch_data],
    }

    # Only the first example is inspected to decide whether the batch is labelled
    if 'label_tensor' in batch_data[0]:
        batch['label_tensors'] = torch.as_tensor([example['label_tensor'] for example in batch_data])

    return batch

In [56]:
def get_datasets():
    '''
    Gets and cleans datasets. Dataset selection is specified by notebook config modes
    (the global toy_run and train_eval_mode values).

    Returns:
        datasets: list of datasets. If dataset is split into subsets, those subsets are grouped in dicts.
            When training sets are selected: SNLI train, MNLI train, dict of per-round ANLI train sets.
            When evaluation sets are selected (appended after any training sets): dict of ChaosNLI sets,
            NLI variation set.
    '''

    sets_to_get = []
    if toy_run:
        sets_to_get.append('evalsets') # some evals used for training in this case, since this assumes the big training sets are not locally available
    else:
        if not train_eval_mode == 'eval':
            sets_to_get.append('trainsets') # trainsets = snli, mnli, anli (all rounds)
        if not train_eval_mode == 'train':
            sets_to_get.append('evalsets') # evalsets = cnli (both subsets), NLIvar

    # Notes:
    # NLIvar df will always be fetched (but its dataset not always returned) since it is used to filter trainsets for duplicates
    # Code could be restructured to require fewer identical conditionals. The reason for the structure is only that I wrote the fetching modes
    # after the fact

    def get_NLIvar_duplicate_indices(trainset_str='snli'):
        '''
        Get the train set indices of examples that also exist in NLI variation.
        Arg:
            trainset_str: string corresponding to a trainset
                valid options: 'snli', 'mnli'
        Returns:
            list of train set index labels, one per NLI variation id found in the train set
            (the first matching train row for each id); None for any other trainset_str
        '''

        # The NLI variation dataset partially uses inference pairs from SNLI and MNLI training data.
        # I remove duplicates from the training sets since the effect on the size is negligible (~100 pairs
        # per set, out of hundreds of thousands). Doing it the other way around (remove from NLIvar) would
        # remove a notable chunk out of this eval set

        # chaosNLI also draws from MNLI and SNLI, but from the dev sets, so there is no data contamination

        if trainset_str == 'snli':
            trainset = snli['snli_train']
        elif trainset_str == 'mnli':
            trainset = mnli['multinli_train']
        else:
            return None

        # One pass over the train set builds pairID -> first index label. This replaces a list
        # membership test plus a full column scan for every NLI variation id.
        first_index_by_id = {}
        for row_index, pair_id in zip(trainset.index, trainset['pairID']):
            first_index_by_id.setdefault(pair_id, row_index)

        nli_var_ids = nli_var.loc[nli_var['task'] == trainset_str, 'id']
        return [first_index_by_id[pair_id] for pair_id in nli_var_ids if pair_id in first_index_by_id]

    def column_filter():
        '''
        Removes unused dataset columns (in place)
        '''

        if 'trainsets' in sets_to_get:
            snli['snli_train'].drop(['annotator_labels', 'captionID', 'pairID', 'sentence1_binary_parse', 'sentence1_parse',
                                    'sentence2_binary_parse', 'sentence2_parse' ], axis=1, inplace=True)
            mnli['multinli_train'].drop(['annotator_labels', 'genre', 'pairID', 'promptID', 'sentence1_binary_parse',
                                        'sentence1_parse', 'sentence2_binary_parse', 'sentence2_parse'], axis=1, inplace=True)

            for round_str in anli1:
                anli1[round_str]['train'].drop(['uid', 'model_label', 'emturk', 'genre', 'reason', 'tag'], axis=1, inplace=True)


        if 'evalsets' in sets_to_get:
            cnli['chaosNLI_snli'].drop(['uid', 'label_dist', 'old_labels', 'old_label', 'source'], axis=1, inplace=True)
            cnli['chaosNLI_mnli_m'].drop(['uid', 'label_dist', 'old_labels', 'old_label', 'source'], axis=1, inplace=True)

            nli_var.drop(['task', 'original-dataset-label', 'id', 'num-NA'], axis=1, inplace=True)

    if 'trainsets' in sets_to_get:
        print('Getting SNLI dataframe...')
        snli = get_df_dict('snli')
        snli['snli_train'].drop(snli['snli_train'].loc[snli['snli_train']['label']=='-'].index, inplace=True) # some nonlabels in snli

        print('Getting MNLI dataframe...')
        mnli = get_df_dict('multinli')

        print('Getting ANLI dataframes...')
        anli1 = get_df_dict('anli')


    if 'evalsets' in sets_to_get:
        print('Getting CNLI dataframes...')
        cnli = get_df_dict('chaosNLI')

    print('Getting NLI variation dataframe...')
    nli_var = pd.read_json(path_or_buf='./data/NLI_variation/NLI_variation_data.jsonl', lines=True)

    if 'trainsets' in sets_to_get:
        # Must run before column_filter(), which drops the id columns the duplicate search relies on
        duplicate_snli_indices = get_NLIvar_duplicate_indices('snli')
        duplicate_mnli_indices = get_NLIvar_duplicate_indices('mnli')
        snli['snli_train'].drop(index=duplicate_snli_indices, inplace=True)
        snli['snli_train'].reset_index(drop=True, inplace=True)
        mnli['multinli_train'].drop(index=duplicate_mnli_indices, inplace=True)
        mnli['multinli_train'].reset_index(drop=True, inplace=True)

    column_filter()

    datasets = []

    if 'trainsets' in sets_to_get:
        print('Transforming SNLI data...')
        snli_train_dataset = NLIData(snli['snli_train'])
        datasets.append(snli_train_dataset)

        print('Transforming MNLI data...')
        mnli_train_dataset = NLIData(mnli['multinli_train'])
        datasets.append(mnli_train_dataset)


        print('Transforming ANLI data...')

        anli_training_datasets = {}
        for round_str in anli1.keys():
            print('\t', round_str+'...')
            anli_training_datasets[round_str] = NLIData(anli1[round_str]['train'])
        datasets.append(anli_training_datasets)


    if 'evalsets' in sets_to_get:
        print('Transforming CNLI data...')

        cnli_eval_datasets = {}
        for name, df in cnli.items():
            print('\t', name+'...')
            cnli_eval_datasets[name] = NLIData(df)
        datasets.append(cnli_eval_datasets)

        print('Transforming NLI variation data...')

        NLI_var_eval_dataset = NLIData(nli_var, require_label=False)
        datasets.append(NLI_var_eval_dataset)

    return  datasets
    
    

    

In [57]:
def reduce_train(dataset, len_new_data):
    '''
    Randomly shrinks a dataset by len_new_data examples.

    Args:
        dataset: original dataset
        len_new_data: number of examples to leave out (prepare_trainsets passes the length of the
            data that is added in their place)
    Returns:
        random sample of the input dataset with len(dataset) - len_new_data examples
    '''
    total = len(dataset)
    keep = total - len_new_data
    kept_sample, _left_out = random_split(dataset, [keep, total - keep])
    return kept_sample

def prepare_trainsets(snli_train_dataset, mnli_train_dataset, anli_training_datasets):
    '''
    Allocate data to each round of training. All four rounds have the same number of examples.

    Args:
        snli_train_dataset, mnli_train_dataset: snli and mnli trainsets
        anli_training_datasets: dict of per round anli trainsets with the keys 'R1', 'R2', 'R3'
    Returns:
        round1: random sample of snli + mnli, as large as all anli trainsets in the dict together
        round2: round1 sample reduced by len(anli R1), plus anli R1
        round3: round2's snli/mnli part reduced by len(anli R2), plus anli R1 and R2
        round4: anli R1, R2 and R3 only
    '''

    # This function depends on the trainsets, so it is the only part of the notebook that is not exemplified at all by a toy run

    smnli = ConcatDataset([snli_train_dataset, mnli_train_dataset])
    len_anli = sum(len(dataset) for dataset in anli_training_datasets.values())

    round1, _ = random_split(smnli, [len_anli, len(smnli) - len_anli])

    anli_r1 = anli_training_datasets['R1']
    smnli_without_r1 = reduce_train(round1, len(anli_r1))
    round2 = ConcatDataset([smnli_without_r1, anli_r1])

    anli_r2 = anli_training_datasets['R2']
    smnli_without_r1_r2 = reduce_train(smnli_without_r1, len(anli_r2))
    round3 = ConcatDataset([smnli_without_r1_r2, anli_r1, anli_r2])

    round4 = ConcatDataset([anli_r1, anli_r2, anli_training_datasets['R3']])

    print(len(round1), len(round2), len(round3), len(round4)) # same length

    return round1, round2, round3, round4

In [58]:
def print_metrics(y_pred, y_test, classes, logfile = False):

    """
    Prints accuracy, per-class f-score and macro f-score based on inputs.
    Appends them to a file instead if logfile == truthy.

    Args:
        y_pred: list of class predictions made by model
        y_test: list of gold labels of evaluation set where y_test[i] corresponds to y_pred[i]
        classes: iterable of class names, in the order of the per-class f-scores
        logfile: path to log, relative to the current working directory. Must be in an existing directory
    """
    # logfile is vestigial in final version. Nothing to change at the config-level to turn this on.
    # function is not really meant to store final data, just to track mid-training whether the model is getting better

    resultspath = None
    if logfile:
        print(logfile)
        print('Logging results to file instead of printing: see model directory')
        resultspath = os.path.join(os.getcwd(), logfile)
        print('Logging to',resultspath)

    accuracy = accuracy_score(y_test, y_pred)
    f_scores = f1_score(y_test, y_pred, average = None, zero_division=0.0)
    macro_f = f1_score(y_test, y_pred, average = "macro", zero_division=0.0)

    def write_report(out):
        # Writes the report to the given stream
        print(file=out)
        print("accuracy is", accuracy, file=out)
        print(file=out)

        for label, f_score in zip(classes, f_scores):
            print("f-score for label '{}' is {}".format(label, f_score), file=out)
        print("macro f-score is", macro_f, file=out)
        print('------------------------------------------------', file=out)

    if resultspath is None:
        write_report(sys.stdout)
    else:
        # The report is written straight to the file. sys.stdout is no longer replaced, so later
        # prints keep working after the file has been closed.
        with open(resultspath, 'a') as log:
            write_report(log)

In [59]:
def train(model_specifier, train_ds, dirpath, hyperparams, midtrain_eval=False):
    '''
    Fine-tunes a pretrained model and saves a checkpoint (model, tokenizer, losses.json) after every epoch.

    Args:
        model_specifier: dict of huggingface classes for loading, see globals() -> model_selection
        train_ds: Dataset of class NLIData
        dirpath: path to where model will be saved; checkpoints go to <dirpath>/e<epoch number>
        hyperparams: dict with the keys 'lr', 'epochs' and 'batch_size'
        midtrain_eval: if truthy, should be NLIData dataset used for midtrain evaluation. Evaluation skipped if falsy
    Returns:
        dict mapping 'e<epoch number>' to the model. Every value is the same model object,
        so all entries reflect the state after the last epoch.
    '''

    model_class = model_specifier['sequence_classification']
    model = model_class.from_pretrained(model_specifier['model_name'], num_labels=3).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=hyperparams['lr'])

    models = {}
    for epoch_number in range(1, hyperparams['epochs'] + 1):
        epoch_tag = 'e' + str(epoch_number)
        print('Epoch', str(epoch_number)+':')
        print()

        loader = DataLoader(dataset=train_ds,
                            batch_size=hyperparams['batch_size'],
                            shuffle=True,
                            collate_fn=collate_fn)

        total_loss = 0
        # back to training mode, since a mid-training evaluation leaves the model in eval mode
        model.train()
        for batch in tqdm.tqdm(loader, desc="Batches"):
            outputs = model(batch['input_ids'].to(device),
                            attention_mask=batch['attention_masks'].to(device),
                            token_type_ids=batch['token_type_ids'].to(device),
                            labels=batch['label_tensors'].to(device))

            # the pretrained model returns the loss itself when labels are passed in,
            # so it does not have to be computed separately
            loss, logits = outputs[:2]

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            model.zero_grad()
            del outputs, loss, logits

        average_loss = total_loss/len(loader)
        loss_metrics = {'total_loss': total_loss, 'average_loss': average_loss}
        print('Average loss for epoch:', average_loss)

        print()

        checkpoint_path = os.path.join(dirpath, epoch_tag)
        model.save_pretrained(checkpoint_path)
        tokenizer.save_pretrained(checkpoint_path)
        with open(os.path.join(checkpoint_path, 'losses.json'), 'w') as f:
            json.dump(loss_metrics, f)
        models[epoch_tag] = model

        if midtrain_eval:
            print('Evaluating epoch {}.'.format(epoch_number))
            # just printing some results to see if the model seems to be improving
            test(eval_dataset=midtrain_eval, modelin=model, print_results=True)
    return models
        
    

In [60]:
def entropy_testing(dataset, modelpath):
    '''
    Diagnostic: prints the entropy of the first example of a dataset computed in several ways
    (numpy, pytorch Categorical, scipy), for the human label distribution and for the model output.

    Args:
        dataset: NLIData dataset whose unique data has 'entropy' and 'label_count' entries
        modelpath: path to a saved model, loaded with the globally selected model class
    '''
    # just some work I did for myself to ensure I got the entropy right, since different functions
    # work somewhat differently and accept different input forms.
    # end up using scipy entropy, base2

    loader = DataLoader(dataset, collate_fn=collate_fn, batch_size=1)
    model = model_specifier['sequence_classification'].from_pretrained(modelpath).to(device)
    model.eval()

    for batch in loader:
        # pure inference, so no gradients need to be tracked
        with torch.no_grad():
            outputs = model(batch['input_ids'].to(device),
                            attention_mask=batch['attention_masks'].to(device),
                            token_type_ids=batch['token_type_ids'].to(device),
                        )

        # cnli includes entropy with base2 logarithm:
        print()
        print(type(batch))
        cnli_ent = batch['unique_data'][0]['entropy']
        print('cnli entropy, base2:',cnli_ent)
        print()

        # label count of the same example:
        lc = batch['unique_data'][0]['label_count']

        #from count to label % distribution:
        # NOTE(review): dividing by 100 assumes 100 annotations per example - confirm for the dataset used
        dist = [count/100 for count in lc]

        # DIY numpy entropy from dist
        p = np.array(dist)
        logp = np.log2(p)
        numpy_ent1 = np.sum(-p*logp)

        # numpy entropy from lc (unnormalised counts, shown for comparison)
        p = np.array(lc)
        logp = np.log2(p)
        numpy_ent2 = np.sum(-p*logp)

        print('numpy:')
        print('np log2 entropy from dist:', numpy_ent1)
        print('np log2 entropy from count:', numpy_ent2)
        print()

        p_tensor_dist = torch.Tensor(dist)
        tensor_lc = torch.Tensor(lc)

        pt_ent1 = Categorical(probs = p_tensor_dist).entropy()
        pt_ent2 = Categorical(probs = tensor_lc).entropy()

        print('pt, default=natlog:')
        print('pt ent from dist:', pt_ent1)
        print('pt ent from count:', pt_ent2)
        print()

        sp_ent_dist = entropy(dist)
        sp_ent_lc = entropy(lc)
        sp_ent_dist_b2 = entropy(dist, base=2)

        print('scipy, default=natlog')
        print('scipy entropy from dist:', sp_ent_dist)
        print('scipy entropy from count:', sp_ent_lc)
        print('scipy entropy from dist base2:', sp_ent_dist_b2)
        print()

        logits = outputs[0][0]
        probs = torch.softmax(logits, -1).squeeze()

        print('logits',logits)
        print('probs',probs)
        # These two are built from the model output. They used to be built from the human
        # distribution p_tensor_dist, which did not match what the printed labels say.
        pt_ent_from_logits = Categorical(logits = logits).entropy()
        pt_ent_from_softmax = Categorical(probs = probs).entropy()
        np_probs = probs.cpu().detach().numpy()
        print('np_probs',np_probs)
        # scipy entropy is fed the raw logits here only to show how that differs from using probabilities
        scipy_ent_from_logits = entropy(logits.cpu().detach().numpy())
        scipy_ent_from_softmax = entropy(np_probs)
        scipy_ent_from_logits_b2 = entropy(logits.cpu().detach().numpy(), base=2)
        scipy_ent_from_softmax_b2 = entropy(np_probs, base=2)

        p = np.array(np_probs)
        log2p = np.log2(p)
        logp = np.log(p) #nat log
        model_numpy_entb2 = np.sum(-p*log2p)
        model_numpy_natent = np.sum(-p*logp)

        print('pt entropy from logits:', pt_ent_from_logits)
        print('pt entropy from softmax:', pt_ent_from_softmax)
        print('scipy entropy from logits:', scipy_ent_from_logits)
        print('scipy entropy from softmax:', scipy_ent_from_softmax)
        print('scipy entropy from logits, b2:', scipy_ent_from_logits_b2)
        print('scipy entropy from softmax, b2:', scipy_ent_from_softmax_b2)
        print()
        print('model numpy entropy, natlog', model_numpy_natent)
        print('model numpy entropy, b2', model_numpy_entb2)
        # only the first example is inspected
        break


In [61]:
def get_entropy(unique_data, logits):
    '''
    Computes the base-2 entropy of the human label distribution and of the model's output distribution.

    Args:
        unique_data: data from which to extract label distributions, unique structure per original set.
            Either has a 'label_count' entry (per-class counts) or a 'labels' entry (graded annotator scores)
        logits: model output logits
    Returns:
        hum_ent, model_ent
    '''
    if 'label_count' in unique_data:
        hum_dist = [count/100 for count in unique_data['label_count']]
    else:
        # Discretize the graded labels so that they interface with the model, by the same thresholds as
        # in the original paper (Pavlick, Kwiatkowski, 2019).
        # The grading scale itself is not used, but the annotator variation still is.
        # Also makes the NLI_val results more comparable to those from CNLI.
        votes = [0, 0, 0]
        for score in unique_data['labels']:
            if score > 16.7:
                slot = 0
            elif score < -16.7:
                slot = 2
            else:
                slot = 1
            votes[slot] += 1
        n_votes = sum(votes)
        hum_dist = [vote/n_votes for vote in votes]

    model_dist = torch.softmax(logits, -1).squeeze().cpu().detach().numpy()

    hum_ent = entropy(hum_dist, base=2)
    model_ent = entropy(model_dist, base=2)
    return hum_ent, model_ent

In [62]:
def test(eval_dataset, modelpath=False, modelin=False, print_results=False):
    '''
    Evaluates model on one eval set.

    Args:
        eval_dataset: NLIData type dataset
        modelpath: path to model, must be supplied if modelin is not. Will be used if both are provided
        modelin: model object, must be supplied if modelpath is not. Will not be used if both are provided
        print_results: prints accuracy and macro f-score if truthy
    Returns:
        lists of y_pred, y_test, human_entropies, model_entropies
        element i of each list corresponds to the same example, except if eval set is NLI_val,
        in which case y_test and y_pred are empty        
    '''
    # print_results is mostly for a mid-training overview. Full results come later

    if not modelpath and not modelin:
        print("Suppy either 'modelpath' or 'modelin' argument")
        return None    

    loader = DataLoader(dataset=eval_dataset, collate_fn=collate_fn, batch_size=1)
    if modelpath:
        model = model_specifier['sequence_classification'].from_pretrained(modelpath).to(device)
    else:
        model = modelin

    y_test = []
    y_pred = []
    human_entropies = []
    model_entropies = []

    model.eval()
    with torch.no_grad():
        if toy_run:
            print('Toy run: stopping early')
        for batch_id, batch in enumerate(tqdm.tqdm(loader)):
            if toy_run:
                if len(human_entropies) > 10:
                    break
            if 'label_tensors' in batch:
                label_input = batch['label_tensors'].to(device)
            else:
                label_input = None # no gold labels in NLI_var

            outputs = model(batch['input_ids'].to(device),
                                attention_mask=batch['attention_masks'].to(device),
                                token_type_ids=batch['token_type_ids'].to(device),
                                labels=label_input)
            
            
            if 'label_tensors' in batch:
                loss, logits = outputs[:2]
                pred = torch.argmax(logits)
                gold = batch['label_tensors'][0][0]
                y_test.append(gold.cpu())
                y_pred.append(pred.cpu())
            else:
                logits = outputs[0]


            hum_ent, model_ent = get_entropy(batch['unique_data'][0], logits)
            human_entropies.append(hum_ent)
            model_entropies.append(model_ent)

    if print_results:
        print_metrics(y_pred, y_test, classes)
    
    return y_pred, y_test, human_entropies, model_entropies

In [63]:
def plot_categorical(data_dict):
    '''
    Draws one marked line per data series on the current pyplot figure, saves it and closes it.

    Args:
        data_dict: dict describing the plot, with keys
            'x': x values shared by every line (e.g. epoch or round names)
            'y': list of series, each series being the list of values for one line
            'legend_lables': legend name per series, in the same order as 'y'
            'title': figure title
            'filepath': destination passed to plt.savefig
            'ylim' (optional): y-axis limits passed to plt.ylim
    '''
    xticks = data_dict['x']
    series_with_names = zip(data_dict['y'], data_dict['legend_lables'])

    # zip stops at the shorter of the two lists, so a series without a legend name is not drawn
    for series, legend_name in series_with_names:
        plt.plot(xticks, series, marker='o', label=legend_name)

    limits_given = 'ylim' in data_dict
    if limits_given:
        plt.ylim(data_dict['ylim'])

    plt.legend()
    plt.title(data_dict['title'], fontsize=16)
    plt.savefig(data_dict['filepath'], bbox_inches="tight")
    # close so that the next call starts from an empty figure
    plt.close()

In [64]:
def reorganise_for_crossround(round_set_epoch):
    '''
    Changes input nested list shape from [rounds[sets[epochs]]] to [epochs[sets[rounds]]]

    Args:
        round_set_epoch: nested list indexed as [round][set][epoch]. The number of sets and epochs
            is read from the first round; every round is expected to have at least that many.

    Returns:
        nested list indexed as [epoch][set][round]; an empty list if there are no rounds or no sets.

    Raises:
        IndexError: if a later round has fewer sets or epochs than the first one.
    '''
    # The loop unfurling order in full_eval is based on the needs of the intra-round plots
    # (epochs=x, sets=legend labels, rounds=new figure).
    # This causes a nesting issue for inter-round plots (epochs=new plot, sets=legend labels, rounds=x)
    # Hence, this:

    # Nothing to transpose: no rounds at all, or rounds without any evaluation set.
    if not round_set_epoch or not round_set_epoch[0]:
        return []

    n_sets = len(round_set_epoch[0])
    n_epochs = len(round_set_epoch[0][0])

    return [
        [[round_data[s][e] for round_data in round_set_epoch] for s in range(n_sets)]
        for e in range(n_epochs)
    ]
        

In [65]:
def _natural_key(name):
    '''
    Sort key that compares runs of digits numerically, so that 'e10' sorts after 'e9'
    (plain string order would put it right after 'e1').
    '''
    import re  # only needed by this helper
    return [(0, int(tok)) if tok.isdecimal() else (1, tok)
            for tok in re.split(r'(\d+)', name) if tok]


def _checkpoint_subdirs(parent, excluded_substrings):
    '''
    Returns the naturally sorted names of the subdirectories of parent whose names
    contain none of excluded_substrings.
    '''
    names = [name for name in os.listdir(parent)
             if not any(sub in name for sub in excluded_substrings)
             and os.path.isdir(os.path.join(parent, name))]
    return sorted(names, key=_natural_key)


def _ensure_dir(path):
    '''
    Creates the directory path unless it already exists (the parent must exist).
    '''
    if not os.path.exists(path):
        os.mkdir(path)


def full_eval(model_specifier, modeldir, eval_sets):
    '''
    Evaluates each checkpoint of a model on all evaluation sets for accuracy and macro f1 (if available),
    and entropy correlation with human distributions. Plots scatterplots per epoch per set for entropy correlation,
    and lineplots for development across epochs within-round and across rounds.

    Directory layout read and written:
        modeldir/<round>/<epoch>/           checkpoint loaded with from_pretrained
        modeldir/<round>/<epoch>/results/   entropy scatterplots per set
        modeldir/<round>/results/           within-round line plots
        modeldir/results/{accuracy,macrof1,entropy_correlation}/   cross-round line plots, one per epoch

    Rounds and epochs are processed in natural name order (r1, r2, ..., r10), because they are used as
    ordered x-ticks and os.listdir gives no ordering guarantee.

    Args:
        model_specifier: dict describing the huggingface model, see globals(); its
            'sequence_classification' entry is the class whose from_pretrained loads a checkpoint
        modeldir: path to model directory at the level of children to ./models
        eval_sets: dict of evaluation sets (name -> dataset). The set named 'NLIVariation' is treated
            as having no gold labels: only the entropy correlation is computed for it.

    Returns:
        None. Results are written to disk as figures.
    '''
    unlabelled_set_name = 'NLIVariation'
    set_names = list(eval_sets.keys())
    # accuracy / macro F1 lines exist only for sets with gold labels; select them by name rather than by position
    labelled_set_names = [name for name in set_names if name != unlabelled_set_name]

    overall_resultspath = os.path.join(modeldir, 'results')
    _ensure_dir(overall_resultspath)

    # everything in modeldir that is neither the hyperparameter json nor a results directory is a round
    rounds = _checkpoint_subdirs(modeldir, ('json', 'results'))
    if not rounds:
        print('No round directories found in {}. Nothing to evaluate.'.format(modeldir))
        return None

    plot_data_dicts = []
    cross_round_accuracies = []
    cross_round_mfscores = []
    cross_round_ent_corrs = []
    epoch_names = []
    print('Running evaluation for every epoch-checkpoint of every round on each set.')
    for roundname in rounds:
        rounddir = os.path.join(modeldir, roundname)
        round_resultspath = os.path.join(rounddir, 'results')
        _ensure_dir(round_resultspath)

        # every sibling of the round-level results directory is an epoch checkpoint
        epochs = _checkpoint_subdirs(rounddir, ('results',))

        accs_by_set = {name: [] for name in labelled_set_names}
        mfs_by_set = {name: [] for name in labelled_set_names}
        entcorrs_by_set = {name: [] for name in set_names}

        for epoch in epochs:
            epochdir = os.path.join(rounddir, epoch)
            epoch_resultspath = os.path.join(epochdir, 'results')  # epoch-level results, not the round-level one
            _ensure_dir(epoch_resultspath)

            # load each checkpoint once and reuse it for every evaluation set
            model = model_specifier['sequence_classification'].from_pretrained(epochdir).to(device)

            for eval_set_name, eval_set in eval_sets.items():
                print('Getting scores for {} {} on {}.'.format(roundname, epoch, eval_set_name))
                y_pred, y_test, human_entropies, model_entropies = test(modelin=model, eval_dataset=eval_set)
                human_entropies = np.array(human_entropies)
                model_entropies = np.array(model_entropies)
                # np.corrcoef rather than scipy's pearsonr as it behaved somewhat better on toy runs
                pearson_c = np.corrcoef(model_entropies, human_entropies)[0][1]
                entcorrs_by_set[eval_set_name].append(pearson_c)

                if eval_set_name != unlabelled_set_name:
                    accs_by_set[eval_set_name].append(accuracy_score(y_test, y_pred))
                    mfs_by_set[eval_set_name].append(
                        f1_score(y_test, y_pred, average="macro", zero_division=0.0))

                plt.scatter(human_entropies, model_entropies)

                # from https://pythonguides.com/matplotlib-best-fit-line/ and https://www.statology.org/line-of-best-fit-python/
                a, b = np.polyfit(human_entropies, model_entropies, 1)
                plt.plot(human_entropies, a*human_entropies+b, color='orange')
                plt.title(" ".join((eval_set_name, roundname, epoch, 'entropy with line of best fit')))
                plt.xlabel('Human entropy')
                plt.ylabel('Model entropy')
                plt.annotate('r = {:.2f}'.format(pearson_c), xy=(0.05, 0.95), xycoords='axes fraction')
                plt.savefig(os.path.join(epoch_resultspath, eval_set_name+'-entropies'), bbox_inches="tight")
                plt.close()

        # back to the [set][epoch] nesting that plot_categorical expects for within-round plots
        per_set_accs = [accs_by_set[name] for name in labelled_set_names]
        per_set_mfs = [mfs_by_set[name] for name in labelled_set_names]
        per_set_entcorrs = [entcorrs_by_set[name] for name in set_names]

        # per round results dicts to be plugged into plot_categorical:
        plot_input_accuracy = {'y': per_set_accs, 'x': epochs, 'legend_lables': labelled_set_names,
                        'title': roundname+' Accuracy', 'filepath': os.path.join(round_resultspath, 'accuracy'), 'ylim': (0,1)}
        plot_input_mf = {'y': per_set_mfs, 'x': epochs, 'legend_lables': labelled_set_names,
                        'title': roundname+' Macro F1', 'filepath': os.path.join(round_resultspath, 'macrof1'), 'ylim': (0,1)}
        plot_input_ent = {'y': per_set_entcorrs, 'x': epochs, 'legend_lables': set_names,
                        'title': roundname+' Model/human entropy correlation', 'filepath': os.path.join(round_resultspath, 'entropy_correlation'), 'ylim': (-1,1)}

        plot_data_dicts.extend([plot_input_accuracy, plot_input_mf, plot_input_ent])

        if not epoch_names: # just to have the number of epochs at the cross-round level I do this once:
            epoch_names.extend(epochs)

        cross_round_accuracies.append(per_set_accs)
        cross_round_mfscores.append(per_set_mfs)
        cross_round_ent_corrs.append(per_set_entcorrs)

    # The cross_round_* lists are nested [round][set][epoch]. For the cross-round plots the round is the x-axis
    # and plot_categorical expects the INNER list to correspond to the x-ticks, so re-nest to [epoch][set][round].
    reorganised_acc = reorganise_for_crossround(cross_round_accuracies)
    reorganised_mf = reorganise_for_crossround(cross_round_mfscores)
    reorganised_ent = reorganise_for_crossround(cross_round_ent_corrs)

    accpath = os.path.join(overall_resultspath, 'accuracy')
    mfpath = os.path.join(overall_resultspath, 'macrof1')
    entpath = os.path.join(overall_resultspath, 'entropy_correlation')

    for p in (accpath, mfpath, entpath):
        _ensure_dir(p)

    for i in range(len(epoch_names)):
        # plotdicts comparing epoch n across rounds
        plot_input_accuracy = {'y': reorganised_acc[i], 'x': rounds, 'legend_lables': labelled_set_names,
                            'title': 'Epoch {} accuracy across rounds'.format(i+1),
                            'filepath': os.path.join(accpath, epoch_names[i]), 'ylim': (0,1)}
        plot_input_mf = {'y': reorganised_mf[i], 'x': rounds, 'legend_lables': labelled_set_names,
                            'title': 'Epoch {} macro F1 across rounds'.format(i+1),
                            'filepath': os.path.join(mfpath, epoch_names[i]), 'ylim': (0,1)}
        plot_input_ent = {'y': reorganised_ent[i], 'x': rounds, 'legend_lables': set_names,
                          'title': 'Epoch {} model/human entropy correlation across rounds'.format(i+1),
                          'filepath': os.path.join(entpath, epoch_names[i]), 'ylim': (-1,1)}

        plot_data_dicts.extend([plot_input_accuracy, plot_input_mf, plot_input_ent])

    for plot_data in plot_data_dicts:
        plot_categorical(plot_data)


In [66]:
def main():
    '''
    Runs training and/or evaluation as selected by the global configuration.

    Reads the module-level names set from globals(): train_eval_mode, toy_run, manual_modelname,
    overwrite, hp and model_specifier.
        train_eval_mode == 'eval'  -> evaluate an existing model directory only
        train_eval_mode == 'train' -> train only
        any other value            -> train, then evaluate the just-trained model

    Training creates ./models/<dirname>/ with a hyperparameters.json, one 'r'+n directory per round
    (each with a 'results' directory) and lets train() write into each round directory.
    Training needs cuda unless toy_run is set; this is checked before anything is loaded or deleted.

    Returns:
        None
    '''
    training_requested = train_eval_mode != 'eval'
    evaluation_requested = train_eval_mode != 'train'

    # Fail fast: checking this only after the directory handling below would, with overwrite set,
    # delete an existing model directory and then refuse to train.
    if training_requested and not (torch.cuda.is_available() or toy_run):
        print('Cuda unavailable. Set toy_run to true or try again with cuda available.')
        return None

    if (not training_requested) or toy_run:
        cnli_eval_datasets, NLI_var_eval_dataset = get_datasets()
    else:
        snli_train_dataset, mnli_train_dataset, anli_training_datasets, cnli_eval_datasets, NLI_var_eval_dataset = get_datasets()

    models_root = os.path.join(os.getcwd(), 'models')
    toy_dirname = 'toy-models'
    dirpath = None  # stays None unless a model is trained in this call

    if training_requested:
        if toy_run:
            training_rounds = list(cnli_eval_datasets.values())
            print('Toy run:')
            print('Training on toy parameters')
            print(hp)
            print('Training on', list(cnli_eval_datasets.keys()))
        else:
            r1train, r2train, r3train, r4train = prepare_trainsets(snli_train_dataset, mnli_train_dataset, anli_training_datasets)
            training_rounds = (r1train, r2train, r3train, r4train)
            print('Running full training on all four rounds. Hyperparameters:')
            print(hp)

        if not os.path.exists(models_root):
            os.mkdir(models_root)

        # the modelname is the name of a subdirectory in ./models, which will have its own filesystem:
            # a directory 'r'+n per round and a sibling 'results' directory for cross-round results
            # each 'r' dir has subdirs 'e'+n per epoch with a sibling 'results' directory for in-round results
            # each 'e' dir has various files, mostly huggingface-style model files (safetensors removed for git submission due to size),
                # and a 'results' directory for per-epoch results

        if manual_modelname:
            dirname = manual_modelname
        elif toy_run:
            dirname = toy_dirname
        else:
            modelname = model_specifier['model_name']
            batch_size_str = 'bs'+str(hp['batch_size'])
            epochs_str = 'eps'+str(hp['epochs'])
            lr_str = 'lr'+str(hp['lr'])
            dirname = '-'.join((modelname, batch_size_str, epochs_str, lr_str))

        dirpath = os.path.join(models_root, dirname)

        if overwrite:
            if os.path.exists(dirpath):
                shutil.rmtree(dirpath)
        else:
            # keep earlier runs: append _2, _3, ... until the name is free
            num = 2
            dirpath_base = dirpath
            while os.path.exists(dirpath):
                dirpath = dirpath_base+'_'+str(num)
                num += 1

        os.mkdir(dirpath)

        hppath = os.path.join(dirpath, 'hyperparameters.json')
        with open(hppath, 'w') as f:
            json.dump(hp, f)

        for round_num, round_data in enumerate(training_rounds, start=1):
            roundnum_str = 'r'+str(round_num)
            roundpath = os.path.join(dirpath, roundnum_str)
            os.mkdir(roundpath)
            os.mkdir(os.path.join(roundpath, 'results'))
            print()
            print('-------------------------------')
            print('Training', roundnum_str)
            # train() writes its checkpoints into roundpath; its return value is not needed here
            train(model_specifier, round_data, roundpath, hp, midtrain_eval=cnli_eval_datasets['chaosNLI_snli'])
            print()

    if evaluation_requested:
        eval_sets = {'ChaosNLI-MNLI': cnli_eval_datasets['chaosNLI_mnli_m'], 'ChaosNLI-SNLI': cnli_eval_datasets['chaosNLI_snli'], 'NLIVariation': NLI_var_eval_dataset}

        if dirpath is not None:
            # a model has been trained in this call
            modeldir = dirpath
            print('Evaluating just-trained model, saving results to {} and its subdirectories'.format(modeldir))
        else:
            if manual_modelname:
                candidate_names = [manual_modelname]
            else:
                # default: the directory created by a toy training run ('toy-models');
                # 'toy_models' is the name this lookup used previously and is kept as a fallback
                candidate_names = [toy_dirname, 'toy_models']

            candidate_paths = [os.path.join(models_root, name) for name in candidate_names]
            existing_paths = [path for path in candidate_paths if os.path.exists(path)]

            if not existing_paths:
                print('Model directory not found.')
                print("Try training a new model or give the name of an existing model (see ./models/) to 'manual_modelname' in the config file")
                print('Note: the preexisting model directory does not have loadable models.')
                return None

            modeldir = existing_paths[0]
            print('Evaluating {}\nFind results in this directory and its subdirectories'.format(modeldir))
            print()

        full_eval(model_specifier, modeldir, eval_sets)
        

In [None]:
# Unpack the run configuration into module-level names. The functions defined above read these as
# globals (e.g. device in full_eval; toy_run, hp, overwrite in main), so this must run before main().
# Note: globals() here is this notebook's own function defined near the top, which shadows the
# Python builtin of the same name.
device, model_specifier, classes, alt_classnames, tokenizer, hp, toy_run, train_eval_mode, manual_modelname, overwrite = globals()
# Train and/or evaluate according to train_eval_mode.
main()

Getting CNLI dataframes...
Getting NLI variation dataframe...
Transforming CNLI data...
	 chaosNLI_mnli_m...
	 chaosNLI_snli...
Transforming NLI variation data...
Toy run:
Training on toy parameters
{'batch_size': 4, 'epochs': 2, 'lr': 0.01}
Training on ['chaosNLI_mnli_m', 'chaosNLI_snli']

-------------------------------
Training r1
Epoch 1:



Batches: 100%|██████████| 400/400 [00:25<00:00, 15.70it/s]


Average loss for epoch: 2.355997349500831

Evaluating epoch 1.
Toy run: stopping early


  1%|          | 11/1514 [00:00<00:22, 68.26it/s]



accuracy is 0.36363636363636365

f-score for label 'entailment' is 0.5333333333333333
f-score for label 'neutral' is 0.0
f-score for label 'contradiction' is 0.0
macro f-score is 0.17777777777777778
------------------------------------------------
Epoch 2:



Batches: 100%|██████████| 400/400 [00:25<00:00, 15.81it/s]


Average loss for epoch: 2.177538239993155

Evaluating epoch 2.
Toy run: stopping early


  1%|          | 11/1514 [00:00<00:22, 68.05it/s]


accuracy is 0.09090909090909091

f-score for label 'entailment' is 0.0
f-score for label 'neutral' is 0.0
f-score for label 'contradiction' is 0.16666666666666666
macro f-score is 0.05555555555555555
------------------------------------------------


-------------------------------
Training r2





Epoch 1:



Batches: 100%|██████████| 379/379 [00:23<00:00, 16.39it/s]


Average loss for epoch: 2.4032738186605767

Evaluating epoch 1.
Toy run: stopping early


  1%|          | 11/1514 [00:00<00:22, 66.46it/s]



accuracy is 0.09090909090909091

f-score for label 'entailment' is 0.0
f-score for label 'neutral' is 0.0
f-score for label 'contradiction' is 0.16666666666666666
macro f-score is 0.05555555555555555
------------------------------------------------
Epoch 2:



Batches:  69%|██████▊   | 260/379 [00:15<00:07, 16.48it/s]