In [2]:
import torch
from torch.utils.data import DataLoader, Dataset, ConcatDataset, random_split
from torch.distributions import Categorical
from torch.distributions.continuous_bernoulli import ContinuousBernoulli
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import requests
import pandas as pd
import json
from datasets import load_dataset
import os
import shutil
#from transformers import BertModel
from transformers import BertTokenizer
#from torch.utils.data import Dataset#, Dataloader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer, BertForSequenceClassification, AutoModel, logging
logging.set_verbosity_error()
import tqdm
import sys
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import random
import numpy as np
from scipy.stats import entropy, pearsonr
from scipy.special import kl_div
import matplotlib.pyplot as plt
import argparse


In [10]:
def set_seed(seed):
    '''
    Sets seed for random, np, torch
    '''
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [None]:
def globals():
    '''
    Returns the following parameters to be set globally:
        device, model_specifier, classes, alt_classnames, tokenizer, hyperparams, toy_run
    '''
    # To avoid having to pass the same variables over and over, I set some global ones here. Keeps them in the same place

    parser = argparse.ArgumentParser()
    #args = parser.parse_args(args=[])
    parser.add_argument("-c", "--config", help="configuration file", default="config.json")
    args = parser.parse_args(args=[])
    config = json.load(open(args.config))

    set_seed(config["seed"])

    if torch.cuda.is_available():
        device = torch.device(config["device"])
    else:
        device = torch.device('cpu')

    model_selection = {
    'bert_base': {
        "model_name": "bert-base-uncased",
        "tokenizer": BertTokenizer,
        "sequence_classification": BertForSequenceClassification,
    },
    
    'bert_large': {
        "model_name": "bert-large-uncased",
        "tokenizer": BertTokenizer,
        "sequence_classification": BertForSequenceClassification,
    },
    'roberta_large': {
        "model_name": "roberta-large",
        "tokenizer": RobertaTokenizer,
        "sequence_classification": RobertaForSequenceClassification,
    }}

    model_specifier = model_selection[config["model_id"]]
    classes = {'entailment': torch.as_tensor(0), 'neutral': torch.as_tensor(1), 'contradiction': torch.as_tensor(2)}
    alt_classnames = {'e': 'entailment', 'n': 'neutral', 'c': 'contradiction' }
    tokenizer = model_specifier['tokenizer'].from_pretrained(model_specifier['model_name'])
    toy_run = config['toy_run']
    if config['manual_hyperparams']:
        hyperparams = config['manual_hyperparams']
    elif toy_run:
        hyperparams = {
        'batch_size': 4,
        'epochs': 2,
        'lr' : 0.01
        }
    else:
        hyperparams = {
        'batch_size': 32,
        'epochs': 6,
        'lr' : 5e-5
        }
        

    return device, model_specifier, classes, alt_classnames, tokenizer, hyperparams, toy_run


In [9]:

device, model_specifier, classes, alt_classnames, tokenizer, hyperparams, toy_run = globals()

In [17]:
# code to get NLI variation:
# evaluation will assume this exists 

nli_var_url = "https://raw.githubusercontent.com/epavlick/NLI-variation-data/refs/heads/master/sentence-pair-analysis/preprocessed-data.jsonl"
nli_var_r = requests.get(nli_var_url)
if not os.path.exists("./data/NLI_variation"):
        os.mkdir("./data/NLI_variation")
if not os.path.exists("./data/NLI_variation/NLI_variation_data.jsonl"):
    with open("./data/NLI_variation/NLI_variation_data.jsonl", "wb") as f:
        f.write(nli_var_r.content)



In [None]:
def normalise_headers(df):
    '''
    Normalises the column names that the different datasets come with.

    Arg:
        pandas dataframe based on any dataset used in this project
    Returns:
        Column-normalised dataframe
    '''

    #https://stackoverflow.com/questions/55715572/renaming-column-in-pandas-dataframe-if-condition-is-met, modified
    
    # there are potential data variations for which the following would be semantically incorrect, but it works for my data
    df = df
    alt_column_names = {'premise': ['context', 'sentence1'], 'hypothesis': ['statement', 'sentence2'], 'label': ['gold_label', 'labels', 'majority_label']}
    df.columns = ['premise' if any(k == x for k in alt_column_names['premise']) else x for x in df]
    df.columns = ['hypothesis' if any(k == x for k in alt_column_names['hypothesis']) else x for x in df]
    df.columns = ['label' if any(k == x for k in alt_column_names['label']) else x for x in df]
    
    return df

In [21]:
def get_df_dict(setname):
    '''
    Converts raw dataset files into pandas dataframes
    Arg: simple setname string, e.g., 'anli' for Adversarial NLI
    Returns: dict of dataframes for the input dataset
    '''
    df_dict = {}
    dirpath = './data/' + setname
    walking = os.walk(dirpath)

    for root, dirs, files in walking:
        for file in files:
            full_path = os.path.join(root, file)
            name, extension = os.path.splitext(full_path)
            if extension == '.jsonl':
                df = pd.read_json(path_or_buf=full_path, lines=True)
                if setname == 'chaosNLI':
                    cnli_examples = pd.json_normalize(df['example']) # p, h, label are in a single column. This makes a df with that column unpacked
                    cnli_examples = cnli_examples.drop(['uid'], axis=1) # uid is in now redundantly in both the unpacked 'example' column and in the original df
                    df = df.drop(['example'], axis=1)
                    df = df.join(cnli_examples)
            elif extension == '.csv':
                df = pd.read_csv(filepath_or_buffer=full_path)
            else:
                continue # skips e.g. text files

            df = normalise_headers(df)
            
            #anli has a one-deeper filesystem (per-round datasets), so the returned dict is also deeper
            if setname == 'anli':
                round, subset = name.split('/')[-2:]
                if round not in df_dict:
                    df_dict[round] = {}
                if subset not in df_dict[round]:
                    df_dict[round][subset] = df
            else:
                subset = name.split('/')[-1]
                #print(subset)
                df_dict[subset] = df
    return df_dict


def drop_unlabeled(df):
    # snli has some unlabeled examples
    df.drop(df.loc[df['label']=='-'].index, inplace=True)

drop_unlabeled(snli['snli_train'])

#s_id_list = list(snli['snli_train']['pairID'])
#m_id_list = list(mnli['multinli_train']['pairID'])

def get_NLIvar_duplicate_indices(trainset_str='snli', comparative_print=False):
    # The NLI variation dataset partially uses inference pairs from SNLI and MNLI training data.
    # I remove duplicates from the training sets since the effect on the size is negligible (~100 pairs
    # per set, out of hundreds of thousands). Doing it the other way around (remove from NLIvar) would
    # remove a notable chunk out of this eval set

    # 

    # chaosNLI also draws from MNLI and SNLI, but from the dev sets, so there is no data contamination

    # call with 'snli' (default) or 'mnli'

    #
    if trainset_str == 'snli':
        trainset = snli['snli_train']
        trainset_id_list = list(snli['snli_train']['pairID'])
    elif trainset_str == 'mnli':
        trainset = mnli['multinli_train']
        trainset_id_list = list(mnli['multinli_train']['pairID'])
    else:
        return None
    #s_id_list = list(snli['snli_train']['pairID'])
    #m_id_list = list(mnli['multinli_train']['pairID'])

    nli_var_filtered = nli_var.loc[nli_var['task'] == trainset_str]
    nli_var_ids = list(nli_var_filtered['id'])
    #nli_var_mnli = nli_var.loc[nli_var['task'] == 'mnli']
    #nli_var_mnli_ids = list(nli_var_mnli['id'])
    count = 0
    duplicate_var_indices = []
    duplicate_train_indices = []
    for item in nli_var_ids:
        if item in trainset_id_list:
            count +=1
            varidx = nli_var_filtered.loc[nli_var_filtered['id'] == item].index[0]
            trainsetidx = trainset['pairID'].loc[trainset['pairID'] == item].index[0]
            duplicate_var_indices.append(varidx)
            duplicate_train_indices.append(trainsetidx)

            if comparative_print:
                # for checking that there are no discrepancies, e.g. that the given ids in NLI variation actually represents
                # the pairIDs in snli and mnli       
                print(nli_var_filtered.premise[varidx])
                print(nli_var_filtered.hypothesis[varidx])
                print(nli_var_filtered.label[varidx])
                print(nli_var_filtered.id[varidx])
                print()
                print(trainset.premise[trainsetidx])
                print(trainset.hypothesis[trainsetidx])
                print(trainset.label[trainsetidx])
                print(trainset.pairID[trainsetidx])
                print('--------------------------')
    return duplicate_train_indices

    


duplicate_snli_indices = get_NLIvar_duplicate_indices('snli')
duplicate_mnli_indices = get_NLIvar_duplicate_indices('mnli')

labels = []
for dataset in sets:
    for name, df in dataset.items():
        labelset = set(df['label'])
        labels.append((name, labelset))
labels

#self.class_to_idx = {c: idx for idx, c in enumerate(self.classes)}

classes = {'entailment': torch.as_tensor(0), 'neutral': torch.as_tensor(1), 'contradiction': torch.as_tensor(2)}
alt_classnames = {'e': 'entailment', 'n': 'neutral', 'c': 'contradiction' }

model_selection = {'bert_base': {
        "model_name": "bert-base-uncased",
        "tokenizer": BertTokenizer,
        "sequence_classification": BertForSequenceClassification,
        "padding_token_value": 0,
        "padding_segement_value": 0,
        "padding_att_value": 0,
        "do_lower_case": True,
        "internal_model_name": "bert",
        'insight_supported': True,
    },
    
    'bert_large': {
        "model_name": "bert-large-uncased",
        "tokenizer": BertTokenizer,
        "sequence_classification": BertForSequenceClassification,
        "padding_token_value": 0,
        "padding_segement_value": 0,
        "padding_att_value": 0,
        "do_lower_case": True,
        "internal_model_name": "bert",
        'insight_supported': True,
    },
'roberta_large': {
        "model_name": "roberta-large",
        "tokenizer": RobertaTokenizer,
        "sequence_classification": RobertaForSequenceClassification,
        "padding_segement_value": 0,
        "padding_att_value": 0,
        "internal_model_name": "roberta",
        'insight_supported': True,
    }}


#change model here
model_specifier = model_selection['bert_base']
tokenizer = model_specifier['tokenizer'].from_pretrained(model_specifier['model_name'])

In [22]:
def add_encoded_input(tokenizer, df, max_length=None):
    '''
    Tokenizes premise and hypothesis (including special tokens).
    Adds input ids, token type ids (token types mark belonging to premise or hypothesis), and attention mask to dataframe.
    Args:
        tokenizer
        df to modify
        max_length: max allowed sequence length
    '''
    # do not actually use max_length and there is no way to change this in the config, but it is something that could be wanted so I left it in

    if 'input_encoding' in df.columns:
        # not necessary if notebook is run like a script, but this safeguards against some notebook-type sequencing errors
        # where an already encoding-modified df is input.
        return df
    
    tokenizer = tokenizer
    new_df = df
    premises = list(df['premise'])
    hypotheses = list(df['hypothesis'])
    
    encoded_pairs = [tokenizer.encode_plus(p, h, max_length=max_length, return_token_type_ids=True, truncation=True)
                       for p, h in zip(premises, hypotheses)]
    tensor_pairs = [{'input_ids': torch.as_tensor(encoded_pair['input_ids']),
                      'token_type_ids': torch.as_tensor(encoded_pair['token_type_ids']),
                      'attention_mask': torch.as_tensor(encoded_pair['attention_mask'])} 
                      for encoded_pair in encoded_pairs]
    new_df.insert(0, 'input_encoding', tensor_pairs)
    
    return new_df

In [23]:
def add_encoded_labels(df):
    '''
    Adds tensor version of labels to dataframe
    '''    
    if 'label_tensor' in df.columns:
        return df
    new_df = df
    encoded_labels = [classes[lb] if lb in classes else classes[alt_classnames[lb]] for lb in list(df.label)]
    encoded_labels = torch.as_tensor(encoded_labels)
    
    new_df.insert(1, 'label_tensor', encoded_labels)
    return new_df


def get_encoded_df(tokenizer, df):
    updated_df = df
    updated_df = add_encoded_input(tokenizer, updated_df)
    updated_df = add_encoded_labels(updated_df)
    return updated_df

tokenizer = bert_large['tokenizer'].from_pretrained(bert_large['model_name'])
                                                              #cache_dir=str(config.PRO_ROOT / "trans_cache"),
                                                              #do_lower_case=do_lower_case)

#tokenizer = bert_large['tokenizer']
for pr, hy in zip(p,h):
    enc_p = tokenizer.encode_plus(pr, hy, max_length=None, return_token_type_ids=True, Truncation=True, return_tensors='pt') #, truncation=True
    #encoding = tokenizer.encode_plus(pr, hy, truncation=True, return_token_type_ids=True, padding=True, return_tensors='pt')
    print(pr)
    print(hy)
    print(enc_p)
    print()
    #print(encoding)
    break

In [33]:
class NLIData(Dataset):
    '''
    Subclass of pytorch Dataset
    '''
    def __init__(self, dataframe, require_label=True):

        '''
        args:
            dataframe: pandas dataframe in style of get_df_dict output
            require_label: bool indicating whether dataset has a discrete gold label

        '''
        # NLI variation does not have a discrete label, instead using a label distribution for each example

        df = add_encoded_input(tokenizer, dataframe)
        general_data = ['input_encoding', 'premise', 'hypothesis']
        if require_label:
            df = add_encoded_labels(df)
            general_data = ['input_encoding', 'label_tensor', 'premise', 'hypothesis', 'label']
        self.set_specific_data = df.drop(general_data, axis=1)
        # set_specific_data is a bit of a garbage dump category that serves the purpose of making this class able to handle the diverse
        # categories in the input sets

        self.df = df
        self.require_label = require_label


    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        out_dict = {'input_encoding': row['input_encoding'],
                    'premise': row['premise'],
                    'hypothesis': row['hypothesis'],
                    'input_length': len(row['input_encoding']['input_ids']),
                    'unique_data': self.set_specific_data.iloc[idx]
                    }
        if self.require_label:
            out_dict['label'] = row['label']
            out_dict['label_tensor'] = [row['label_tensor']]

        return out_dict

In [34]:
def collate_fn(batch_data):
    """
    Collate function for pytorch dataloader
    """
    # mainly used for padding

    input_ids = [example['input_encoding']['input_ids'] for example in batch_data]
    token_type_ids = [example['input_encoding']['token_type_ids'] for example in batch_data]
    attention_masks = [example['input_encoding']['attention_mask'] for example in batch_data]

    p_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    p_token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
    p_attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)
    unique_data = [example['unique_data'] for example in batch_data]
   
    batch = {'input_ids': p_input_ids, 'token_type_ids': p_token_type_ids, 'attention_masks': p_attention_masks,
             'unique_data': unique_data}
    
    if 'label_tensor' in batch_data[0]:
        label_tensors = [example['label_tensor'] for example in batch_data]
        label_tensors = torch.as_tensor(label_tensors)  #'label_tensors': label_tensors
        batch['label_tensors'] = label_tensors
    
    return batch

In [None]:
def train(model_specifier, train_ds, dirpath, hyperparams, midtrain_eval=False):
    '''
    
    '''

    model = model_specifier['sequence_classification'].from_pretrained(model_specifier['model_name'], num_labels=3).to(device)
    model = model.to(device)

    models = {}

    optimizer = torch.optim.AdamW(model.parameters(), lr=hyperparams['lr'])
    for epoch in range(hyperparams['epochs']):
        print('Epoch', str(epoch+1)+':')
        print()
                
        loader = DataLoader(dataset=train_ds,
                        batch_size=hyperparams['batch_size'],
                        shuffle=True,
                        collate_fn=collate_fn)
        loss_metrics = {}
        total_loss = 0
        for batch_id, batch in enumerate(tqdm.tqdm(loader, desc="Batches")):
            model.train()
    


            outputs = model(batch['input_ids'].to(device),
                            attention_mask=batch['attention_masks'].to(device),
                            token_type_ids=batch['token_type_ids'].to(device),
                            labels=batch['label_tensors'].to(device))
            
            loss, logits = outputs[:2] #pretrained bert includes the loss if labels are provided to the model,
                                        # so I don't need to do that separately

            loss.backward()
            optimizer.step()
  
            total_loss += loss.item()
            model.zero_grad()
            del outputs, loss, logits

        average_loss = total_loss/len(loader)
        loss_metrics['total_loss'] = total_loss
        loss_metrics['average_loss'] = average_loss           
        print('Average loss for epoch:', average_loss)

        print()

        checkpoint_path = os.path.join(dirpath, 'e'+str(epoch+1))
        model.save_pretrained(checkpoint_path)
        tokenizer.save_pretrained(checkpoint_path)
        loss_path = os.path.join(checkpoint_path, 'losses.json')
        with open(loss_path, 'w') as f:
            json.dump(loss_metrics, f)
        models['e'+str(epoch+1)] = model
        
        if midtrain_eval:
            print('Evaluating epoch {}. See associated results '.format(epoch+1))
            test(modelin=model, device=device, eval_dataset=cnli_eval_datasets['chaosNLI_mnli_m'])
    #return model, ipi, li, am, lbs
    return models
        
    
    

model, ipi, li, am, lbs = train(model_specifier, cnli_eval_datasets['chaosNLI_mnli_m'], 'lalala', hyperparams)

#model, ipi, li, am, lbs = train(model, cnli_eval_datasets['chaosNLI_mnli_m'], 'lalala', hyperparams)

loader = DataLoader(dataset=cnli_eval_datasets['chaosNLI_mnli_m'],
                        batch_size=1,
                        shuffle=True,
                        collate_fn=collate_fn)

model.eval()
outputs_list = []
with torch.no_grad():
    for batch_id, batch in enumerate(tqdm.tqdm(loader, desc="Batches")):   
                outputs = model(batch['input_ids'],
                                attention_mask=batch['attention_masks'],
                                token_type_ids=batch['token_type_ids'],
                                labels=batch['label_tensors'].to(device))
                outputs_list.append(outputs)
                #print(batch['premise'])

model.eval()
cnt = {}
for l in li:
    if l == 'BATCHBREAK':
        print('batchbreak')
    else:
        am = int(torch.argmax(l))
        if am not in cnt:
            cnt[am] = 0
        cnt[am] += 1
        print(torch.argmax(l))
cnt

In [None]:
def get_datasets():

    
    def get_NLIvar_duplicate_indices(trainset_str='snli', comparative_print=False):
        # The NLI variation dataset partially uses inference pairs from SNLI and MNLI training data.
        # I remove duplicates from the training sets since the effect on the size is negligible (~100 pairs
        # per set, out of hundreds of thousands). Doing it the other way around (remove from NLIvar) would
        # remove a notable chunk out of this eval set

        # chaosNLI also draws from MNLI and SNLI, but from the dev sets, so there is no data contamination

        # call with 'snli' (default) or 'mnli'
        if trainset_str == 'snli':
            trainset = snli['snli_train']
            trainset_id_list = list(snli['snli_train']['pairID'])
        elif trainset_str == 'mnli':
            trainset = mnli['multinli_train']
            trainset_id_list = list(mnli['multinli_train']['pairID'])
        else:
            return None
        #s_id_list = list(snli['snli_train']['pairID'])
        #m_id_list = list(mnli['multinli_train']['pairID'])

        nli_var_filtered = nli_var.loc[nli_var['task'] == trainset_str]
        nli_var_ids = list(nli_var_filtered['id'])
        #nli_var_mnli = nli_var.loc[nli_var['task'] == 'mnli']
        #nli_var_mnli_ids = list(nli_var_mnli['id'])
        count = 0
        duplicate_var_indices = []
        duplicate_train_indices = []
        for item in nli_var_ids:
            if item in trainset_id_list:
                count +=1
                varidx = nli_var_filtered.loc[nli_var_filtered['id'] == item].index[0]
                trainsetidx = trainset['pairID'].loc[trainset['pairID'] == item].index[0]
                duplicate_var_indices.append(varidx)
                duplicate_train_indices.append(trainsetidx)

                if comparative_print:
                    # for checking that there are no discrepancies, e.g. that the given ids in NLI variation actually represents
                    # the pairIDs in snli and mnli       
                    print(nli_var_filtered.premise[varidx])
                    print(nli_var_filtered.hypothesis[varidx])
                    print(nli_var_filtered.label[varidx])
                    print(nli_var_filtered.id[varidx])
                    print()
                    print(trainset.premise[trainsetidx])
                    print(trainset.hypothesis[trainsetidx])
                    print(trainset.label[trainsetidx])
                    print(trainset.pairID[trainsetidx])
                    print('--------------------------')
        return duplicate_train_indices
    
    def column_filter():
    
        snli['snli_train'].drop(['annotator_labels', 'captionID', 'pairID', 'sentence1_binary_parse', 'sentence1_parse',
                                'sentence2_binary_parse', 'sentence2_parse' ], axis=1, inplace=True),
        mnli['multinli_train'].drop(['annotator_labels', 'genre', 'pairID', 'promptID', 'sentence1_binary_parse', 
                                    'sentence1_parse', 'sentence2_binary_parse', 'sentence2_parse'], axis=1, inplace=True)
        
        for round_str, sets in anli1.items():
            anli1[round_str]['train'].drop(['uid', 'model_label', 'emturk', 'genre', 'reason', 'tag'], axis=1, inplace=True)


        cnli['chaosNLI_snli'].drop(['uid', 'label_dist', 'old_labels', 'old_label', 'source'], axis=1, inplace=True),

        cnli['chaosNLI_mnli_m'].drop(['uid', 'label_dist', 'old_labels', 'old_label', 'source'], axis=1, inplace=True),

        nli_var.drop(['task', 'original-dataset-label', 'id', 'num-NA'], axis=1, inplace=True)
    
    #anli2 = get_df_dict('anli_reanalyzed')

    print('Getting SNLI dataframe...')
    snli = get_df_dict('snli')
    snli['snli_train'].drop(snli['snli_train'].loc[snli['snli_train']['label']=='-'].index, inplace=True)

    print('Getting MNLI dataframe...')
    mnli = get_df_dict('multinli')

    print('Getting ANLI dataframes...')
    anli1 = get_df_dict('anli')

    
    print('Getting CNLI dataframes...')
    cnli = get_df_dict('chaosNLI')

    print('Getting NLI variation dataframe...')
    nli_var = pd.read_json(path_or_buf='./data/NLI_variation/NLI_variation_data.jsonl', lines=True)
    #nli_var = normalise_headers(nli_var)

    duplicate_snli_indices = get_NLIvar_duplicate_indices('snli')
    duplicate_mnli_indices = get_NLIvar_duplicate_indices('mnli')
    #print(len(snli['snli_train']))
    snli['snli_train'].drop(index=duplicate_snli_indices, inplace=True)
    snli['snli_train'].reset_index(drop=True, inplace=True)
    #print(len(snli['snli_train']))

    #print(len(mnli['multinli_train']))
    mnli['multinli_train'].drop(index=duplicate_mnli_indices, inplace=True)
    mnli['multinli_train'].reset_index(drop=True, inplace=True)
    #print(len(mnli['multinli_train']))
    
    column_filter()
    
    print('Transforming SNLI data...')
    snli_train_dataset = NLIData(snli['snli_train'])
    
    print('Transforming MNLI data...')
    mnli_train_dataset = NLIData(mnli['multinli_train'])


    print('Transforming ANLI data...')
    
    anli_training_datasets = {}
    for round_str in anli1.keys():
        print('\t', round_str+'...')
        anli_training_datasets[round_str] = NLIData(anli1[round_str]['train'])
    
    print('Transforming CNLI data...')

    cnli_eval_datasets = {}
    for name, df in cnli.items():
        print('\t', name+'...')
        cnli_eval_datasets[name] = NLIData(df)

    print('Transforming NLI variation data...')

    NLI_var_eval_dataset = NLIData(nli_var, require_label=False)

    return  snli_train_dataset, mnli_train_dataset, anli_training_datasets, cnli_eval_datasets, NLI_var_eval_dataset
    
    

    

In [129]:
cnli_eval_datasets['chaosNLI_mnli_m'][1]

{'input_encoding': {'input_ids': tensor([ 101, 2017, 2215, 2000, 8595, 1996, 6462, 1998, 2175,  102, 2017, 2123,
          1005, 1056, 2215, 2000, 5245, 1996, 6462, 8217, 1010, 2021, 2738, 8595,
          2009, 2524, 1012,  102]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1])},
 'premise': 'you want to punch the button and go',
 'hypothesis': "You don't want to push the button lightly, but rather punch it hard.",
 'input_length': 28,
 'unique_data': label_counter    {'e': 48, 'n': 45, 'c': 7}
 label_count                     [48, 45, 7]
 entropy                            1.295225
 Name: 1, dtype: object,
 'label': 'e',
 'label_tensor': [0]}

In [69]:
def get_evalsets():
    def column_filter():
        cnli['chaosNLI_snli'].drop(['uid', 'label_dist', 'old_labels', 'old_label', 'source'], axis=1, inplace=True),

        cnli['chaosNLI_mnli_m'].drop(['uid', 'label_dist', 'old_labels', 'old_label', 'source'], axis=1, inplace=True),

        nli_var.drop(['task', 'original-dataset-label', 'id', 'num-NA'], axis=1, inplace=True)
    
    print('Getting CNLI dataframes...')



    cnli = get_df_dict('chaosNLI')

    print('Getting NLI variation dataframe...')
    nli_var = pd.read_json(path_or_buf='./data/NLI_variation/NLI_variation_data.jsonl', lines=True)

    column_filter()

    print('Transforming CNLI data...')
    cnli_eval_datasets = {}
    for name, df in cnli.items():
        print('\t', name+'...')
        cnli_eval_datasets[name] = NLIData(df)
    
    print('Transforming NLI variation data...')
    
    NLI_var_eval_dataset = NLIData(nli_var, require_label=False)

    return cnli_eval_datasets, NLI_var_eval_dataset


In [None]:
def print_metrics(y_pred, y_test, classes, logfile = False):
    # vestigial function, used to check mid-training if the model seemed to make any progress
    # retired after I implemented mid-training plots for the same purpose
    
    """
    Prints accuracy and macro f-score based on inputs.
    Writes to file instead if logfile == truthy
    Args:
        y_pred: list of class predictions made by model
        y_test: list of gold labels of evaluation set where y_test[i] corresponds to y_pred[i]
    
    """

    if logfile:
        print(logfile)
        print('Logging results to file instead of printing: see model directory')
        resultspath = os.path.join(os.getcwd(), logfile)
        print(resultspath)
        log = open(resultspath, 'a')
        sys.stdout = log

    accuracy = accuracy_score(y_test, y_pred)
    #p_scores = precision_score(y_test, y_pred, average = None, zero_division=0.0)
    #r_scores = recall_score(y_test, y_pred, average = None, zero_division=0.0)
    f_scores = f1_score(y_test, y_pred, average = None, zero_division=0.0)
    #macro_p = precision_score(y_test, y_pred, average = "macro", zero_division=0.0)
    #macro_r = recall_score(y_test, y_pred, average = "macro", zero_division=0.0)
    macro_f = f1_score(y_test, y_pred, average = "macro", zero_division=0.0)

    print()
    print("accuracy is", accuracy)
    print()

    for label, f_score in zip(classes, f_scores):
        print("f-score for label '{}' is {}".format(label, f_score))
    print("macro f-score is", macro_f)
    print('------------------------------------------------')
    if logfile:
        log.close()

In [None]:
#ent = cnli_eval_datasets['chaosNLI_mnli_m'][0]['unique_data']['entropy']
#lc = cnli_eval_datasets['chaosNLI_mnli_m'][0]['unique_data']['label_count']

model = model_specifier['sequence_classification'].from_pretrained(model_specifier['model_name'], num_labels=3).to(device)
model(batch['input_ids'],
                                attention_mask=batch['attention_masks'],
                                token_type_ids=batch['token_type_ids'],
                                labels=batch['label_tensors'].to(device))

In [71]:
def entropy_testing(dataset, modelpath):
    # just some work I did for myself to ensure I got the entropy right, figured I'd leave it in

    loader = DataLoader(dataset, collate_fn=collate_fn, batch_size=1)
    #modelpath = './models/toy-models/r2/e2'
    model = model_specifier['sequence_classification'].from_pretrained(modelpath).to(device)
    tokenizer = model_specifier['tokenizer'].from_pretrained(modelpath)
    #model.load_state_dict(torch.load(modelpath, weights_only=True))
    model.eval()
    
    for batch in loader:
        outputs = model(batch['input_ids'].to(device),
                        attention_mask=batch['attention_masks'].to(device),
                        token_type_ids=batch['token_type_ids'].to(device),
                    )
        
        # cnli includes entropy with base2 logarithm
        # example:
        #cnli_ent = cnli_eval_datasets['chaosNLI_mnli_m'][0]['unique_data']['entropy']
        print()
        print(type(batch))
        #print(batch['unique_data'][0]['entropy'])
        cnli_ent = batch['unique_data'][0]['entropy']
        print('cnli entropy, base2:',cnli_ent)
        print()

        # label count of the same example:
        #lc = cnli_eval_datasets['chaosNLI_mnli_m'][0]['unique_data']['label_count']
        lc = batch['unique_data'][0]['label_count']

        #from count to label % distribution:
        dist = [count/100 for count in lc]

        # DIY numpy entropy from dist
        p = np.array(dist)
        logp = np.log2(p)
        numpy_ent1 = np.sum(-p*logp)

        # numpy entropy from lc
        p = np.array(lc)
        logp = np.log2(p)
        numpy_ent2 = np.sum(-p*logp)

        print('numpy:')
        print('np log2 entropy from dist:', numpy_ent1)
        print('np log2 entropy from count:', numpy_ent2)
        print()

        p_tensor_dist = torch.Tensor(dist)
        tensor_lc = torch.Tensor(lc)

        pt_ent1 = Categorical(probs = p_tensor_dist).entropy()
        pt_ent2 = Categorical(probs = tensor_lc).entropy()

        print('pt, default=natlog:')
        print('pt ent from dist:', pt_ent1)
        print('pt ent from count:', pt_ent2)
        print()

        sp_ent_dist = entropy(dist)
        sp_ent_lc = entropy(lc)
        sp_ent_dist_b2 = entropy(dist, base=2)

        print('scipy, default=natlog')
        print('scipy entropy from dist:', sp_ent_dist)
        print('scipy entropy from count:', sp_ent_lc)
        print('scipy entropy from dist base2:', sp_ent_dist_b2)
        print()

        #entropy = batch['unique_data']['entropy']
        # torch.softmax(scores, -1).squeeze()
        # probs = torch.softmax(scores, -1).squeeze()        

        logits = outputs[0][0]
        probs = torch.softmax(logits, -1).squeeze()
        #probs2 = torch.softmax(logits, -1).squeeze().cpu().detach().numpy()
        #probs2 = probs = torch.softmax(logits, -1)
        #print(logits)
        #print(probs)
        #print(probs2)
        #pt_ent_from_logits_CB = ContinuousBernoulli(logits = p_tensor_dist).entropy()
        #pt_ent_from_softmax_CB = ContinuousBernoulli(probs = p_tensor_dist).entropy()
        print('logits',logits)
        print('probs',probs)
        pt_ent_from_logits = Categorical(logits = p_tensor_dist).entropy()
        pt_ent_from_softmax = Categorical(probs = p_tensor_dist).entropy()
        np_probs = probs.cpu().detach().numpy()
        print('np_probs',np_probs)
        scipy_ent_from_logits = entropy(logits.cpu().detach().numpy())
        scipy_ent_from_softmax = entropy(np_probs)
        scipy_ent_from_logits_b2 = entropy(logits.cpu().detach().numpy(), base=2)
        scipy_ent_from_softmax_b2 = entropy(np_probs, base=2)
        #scipy_ent_from_softmax2_b2 = entropy(probs2, base=2)
    

        


        p = np.array(np_probs)
        log2p = np.log2(p)
        logp = np.log(p) #nat log
        model_numpy_entb2 = np.sum(-p*log2p)
        model_numpy_natent = np.sum(-p*logp)

        #print('pt entropy from logits, CB:', pt_ent_from_logits_CB)
        #print('pt entropy from softmax, CB:', pt_ent_from_softmax_CB)
        print('pt entropy from logits:', pt_ent_from_logits)
        print('pt entropy from softmax:', pt_ent_from_softmax)
        print('scipy entropy from logits:', scipy_ent_from_logits)
        print('scipy entropy from softmax:', scipy_ent_from_softmax)
        print('scipy entropy from logits, b2:', scipy_ent_from_logits_b2)
        print('scipy entropy from softmax, b2:', scipy_ent_from_softmax_b2)
        #print('scipy entropy from softmax2:', scipy_ent_from_softmax2_b2)
        print()
        print('model numpy entropy, natlog', model_numpy_natent)
        print('model numpy entropy, b2', model_numpy_entb2)
        break


In [None]:
#one_example, _ = random_split(cnli_eval_datasets['chaosNLI_mnli_m'], [1, len(cnli_eval_datasets['chaosNLI_mnli_m'])-1])

# just some work I did for myself to ensure I got the entropy right, figured I'd leave it in

# cnli includes entropy with base2 logarithm
# from example 0 of cnli_mnli:
cnli_ent = cnli_eval_datasets['chaosNLI_mnli_m'][0]['unique_data']['entropy']
print('cnli entropy, base2:',cnli_ent)
print()

# label count of the same:
lc = cnli_eval_datasets['chaosNLI_mnli_m'][0]['unique_data']['label_count']

#from count to label % distribution:
dist = [count/100 for count in lc]

# DIY numpy entropy from dist
p = np.array(dist)
logp = np.log2(p)
numpy_ent1 = np.sum(-p*logp)

# numpy entropy from lc
p = np.array(lc)
logp = np.log2(p)
numpy_ent2 = np.sum(-p*logp)

print('numpy:')
print('np log2 entropy from dist:', numpy_ent1)
print('np log2 entropy from count:', numpy_ent2)

p_tensor_dist = torch.Tensor(dist)
tensor_lc = torch.Tensor(lc)

pt_ent1 = Categorical(probs = p_tensor_dist).entropy()
pt_ent2 = Categorical(probs = tensor_lc).entropy()

print('pt, default=natlog:')
print('pt ent from dist:', pt_ent1)
print('pt ent from count:', pt_ent2)
print()

sp_ent_dist = entropy(dist)
sp_ent_lc = entropy(lc)
sp_ent_dist_b2 = entropy(dist, base=2)

print('scipy, default=natlog')
print('scipy entropy from dist:', sp_ent_dist)
print('scipy entropy from count:', sp_ent_lc)
print('scipy entropy from dist base2:', sp_ent_dist_b2)

#dist = torch.softmax(count, -1)
#dist = torch.softmax(dist, -1).cpu()
#print(dist)

#lc = torch.as_tensor(lc, dtype=float)
#lc = torch.softmax(lc, -1).cpu()

In [72]:
cnli_eval_datasets, NLI_var_eval_dataset = get_evalsets()

Getting CNLI dataframes...
Getting NLI variation dataframe...
Transforming CNLI data...
	 chaosNLI_snli...
	 chaosNLI_mnli_m...
Transforming NLI variation data...


In [22]:
cnli_eval_datasets['chaosNLI_mnli_m'][0]

{'input_encoding': {'input_ids': tensor([  101,  8529,  1011, 14910,  8529,  1011, 14910,  3398,  2092,  7910,
           1045,  2064,  2156,  2017,  2113,  2009,  1005,  1055,  2009,  1005,
           1055,  2009,  1005,  1055,  2009,  1005,  1055,  2785,  1997,  6057,
           2138,  2057,  2009,  3849,  2066,  2057,  5414,  2769,  2017,  2113,
           2057,  2769,  2007,  7817,  4987,  1998,  2065,  1996,  2231,  3431,
           1998,  1996,  2406,  2008,  2057,  5414,  1996,  2769,  2000,  8529,
           1045,  2064,  2156,  2339,  1996,  2453,  2031,  1037,  2367,  7729,
           2875,  7079,  2009,  2067,  2009,  1005,  1055,  1037,  2843,  2149,
           2008,  2017,  2113,  2057,  2123,  1005,  1056,  2428,  5414,  2769,
           2000,  2000,  3032,  2057,  5414,  2769,  2000,  6867,  1998,  2009,
           1005,  1055,  1996,   102,  2057,  2123,  1005,  1056,  5414,  1037,
           2843,  1997,  2769,  1012,   102]),
  'token_type_ids': tensor([0, 0, 0, 0, 0,

In [45]:
l = DataLoader(cnli_eval_datasets['chaosNLI_snli'], collate_fn=collate_fn, batch_size=1)

In [144]:
lst = [2,3,5,6]
arr = np.array(lst)
arr=arr/100
arr

array([0.02, 0.03, 0.05, 0.06])

In [None]:
def get_entropy(unique_data, logits):
    '''
    Extracts label distribution from evaluation set data and computes entropy

    Arg:
        unique_data: unbatched 
    '''
    if 'label_count' in unique_data:
        hum_dist = [c/100 for c in unique_data['label_count']]
    else:
        #print(unique_data)
        hum_dist = [0, 0, 0]
        lb_scales = unique_data['labels']

        # Discretize the labels to make them interface in the model, by the same thresholds as
        # in the original paper (Pavlick, Kwiatkowski, 2019).
        # This means that while I don't make use of the grading scale, I still utilise the annotator variation.
        # Also makes the NLI_val results more comparable to those from CNLI.
        for l in lb_scales:
            if l > 16.7:
                hum_dist[0] = hum_dist[0]+1
            elif l < -16.7:
                hum_dist[2] = hum_dist[2]+1
            else:
                hum_dist[1] = hum_dist[1]+1
        hum_dist = [c/sum(hum_dist) for c in hum_dist]
    
    hum_ent = entropy(hum_dist, base=2)

    model_dist = torch.softmax(logits, -1).squeeze().cpu().detach().numpy()
    model_ent = entropy(model_dist, base=2)  
    return hum_ent, model_ent

In [154]:
test(modelpath='./models/bert-base-uncased-bs32-eps6-lr5e-05/r4/e6', device=device, eval_dataset=NLI_var_eval_dataset) #eval_dataset=NLI_var_eval_dataset

  0%|          | 0/496 [00:00<?, ?it/s]

labels           [49.999903547520425, -49.99995111441214, -49.9...
normed-labels    [-1.049158911374859, -2.056001753990448, -1.38...
Name: 0, dtype: object
xnorm: 4.380989543882038
1.3704165065672542 0.648357739453004





([], [], [], [], [])

In [30]:
def test(eval_dataset, modelpath=False, modelin= False, device='cpu', toy_run=toy_run):
    #print(eval_dataset[0])
    '''

    '''

    if not modelpath and not modelin:
        print("Suppy either 'modelpath' or 'modelin' argument")
        return None    

    loader = DataLoader(dataset=eval_dataset, collate_fn=collate_fn, batch_size=1)
    if modelpath:
        model = model_specifier['sequence_classification'].from_pretrained(modelpath).to(device)
    if modelin:
        model = modelin


    y_test = []
    y_pred = []
    human_entropies = []
    model_entropies = []

    model.eval()
    with torch.no_grad():
        for batch_id, batch in enumerate(tqdm.tqdm(loader)):
            if toy_run:
                if len(human_entropies) > 10:
                    break
            if 'label_tensors' in batch:
                label_input = batch['label_tensors'].to(device)
            else:
                label_input = None # no gold labels in NLI_var

            outputs = model(batch['input_ids'].to(device),
                                attention_mask=batch['attention_masks'].to(device),
                                token_type_ids=batch['token_type_ids'].to(device),
                                labels=label_input)
            
            
            if 'label_tensors' in batch:
                loss, logits = outputs[:2]
                pred = torch.argmax(logits)
                gold = batch['label_tensors'][0][0]
                y_test.append(gold.cpu())
                y_pred.append(pred.cpu())
            else:
                logits = outputs[0]


            hum_ent, model_ent = get_entropy(batch['unique_data'][0], logits)
            human_entropies.append(hum_ent)
            model_entropies.append(model_ent)
    
        return y_pred, y_test, human_entropies, model_entropies

test_id,test_log, test_oe, test_e, test_out, y_test, y_pred, collected_logits, collected_entropies = test(modelpath=False, modelin= model, device=device, eval_dataset=cnli_eval_datasets['chaosNLI_mnli_m'])

In [31]:
NLI_var_eval_dataset[0]['unique_data']['normed-labels']

NameError: name 'NLI_var_eval_dataset' is not defined

In [75]:
modeldir = './models/bert-base-uncased-bs32-eps6-lr5e-05'

In [74]:
y_pred, y_test, collected_entropies, collected_prior_entropies, unique_data = test(modelpath=False, device=device, eval_dataset=cnli_eval_datasets['chaosNLI_mnli_m'])

Suppy either 'modelpath' or 'modelin' argument


TypeError: cannot unpack non-iterable NoneType object

In [24]:
os.listdir(modeldir)

['hyperparameters.json', 'r1', 'r2', 'r3', 'r4']

In [None]:
entropy(d, base=2)

In [34]:
ud = cnli_eval_datasets['chaosNLI_mnli_m'][0]['unique_data']

In [37]:
d = [c/100 for c in ud['label_count']]

In [39]:
ud['entropy']

1.209800338660482

In [44]:
entropy(d, base=2)

1.2098003386604825

In [None]:
sp_ent_dist = entropy(dist)
        sp_ent_lc = entropy(lc)
        sp_ent_dist_b2 = entropy(dist, base=2)

In [38]:
d

[0.12, 0.68, 0.2]

In [76]:
eval_sets = {'ChaosNLI-MNLI': cnli_eval_datasets['chaosNLI_mnli_m'], 'ChaosNLI-SNLI': cnli_eval_datasets['chaosNLI_snli'], 'NLIVariation': NLI_var_eval_dataset}

In [77]:
def plot_categorical(data_dict):
    x = data_dict['x']
    if 'type' in data_dict:
        plotfunc = data_dict['type']
    else:
        plotfunc = plt.plot
    for y, label in zip(data_dict['y'], data_dict['legend_lables']):
        print(y)
        print(x)
        #names = list(line_data.keys())
        #values = list(line_data.values())
        # y is list of numerical data
        # data_dict['y'] is a list of y, with each element = a graph line
        # x is list of epochs (xticks)
        # label is legend name
        #if plotfunc == plt.plot:
        #    plotfunc(x, y, label=label, marker = 'o')
        #else:
        #    plotfunc(x, y, label=label)
        plt.plot(x, y, label=label, marker = 'o')
    plt.legend()
    plt.title(data_dict['title'], fontsize=16)
    plt.savefig(data_dict['filepath'], bbox_inches="tight")
    plt.close()

In [78]:
def epoch_n_data(all_sets_data, n):
    epoch_n_data = [setdata[n] for setdata in all_sets_data]
    return epoch_n_data

In [79]:
def reorder_sublists_by_idx(list_of_lists):
    print(list_of_lists)
    
    reorganised = []
    for i in range(len(list_of_lists[0])):
        reorganised.append([item[i] for item in list_of_lists])
    print(reorganised)
    return reorganised

In [54]:
reorder_sublists_by_idx([[1,2,3], [4,5,6]])

[[1, 2, 3], [4, 5, 6]]
[[1, 4], [2, 5], [3, 6]]


[[1, 4], [2, 5], [3, 6]]

In [None]:
def full_eval(model_specifier, modeldir, eval_sets):
    
    plot_data_dicts = []
    overall_resultspath = os.path.join(modeldir, 'results')
    if not os.path.exists(overall_resultspath):
        os.mkdir(overall_resultspath)
    overall_results = {}
    #for eval_set_name, eval_set in list(eval_sets.items()):
    final_epoch_accuracies = []
    final_epoch_mfscores = []
    final_epoch_ent_corrs = []
    rounds = []
    testcount = 1
    print('Running evaluation for every epoch-checkpoint of every round on each set.')
    for roundname in os.listdir(modeldir):
        if 'json' in roundname or 'results' in roundname:
            continue
        rounds.append(roundname)
        rounddir = os.path.join(modeldir, roundname)
        round_resultspath = os.path.join(rounddir, 'results')
        if not os.path.exists(round_resultspath):
            os.mkdir(round_resultspath)
        #accuracy_per_epoch = []
        #mf_scores_per_epoch = []
        #entropy_correlation_per_epoch = []
        epochs = []
        per_set_accs = []
        per_set_mfs = []
        per_set_entcorrs = []
        for eval_set_name, eval_set in list(eval_sets.items()):
            #if 'results' in epoch:
            #    continue            
            per_epoch_accs =  []
            per_epoch_mfs = []
            per_epoch_entcorrs = []
            #setnames = []
            #print(len(os.listdir(rounddir)))
            #print(os.listdir(rounddir))
       
            for epoch in os.listdir(rounddir):
                epochdir = os.path.join(rounddir, epoch)
                if 'results' in epoch: # checks if the current iteration is over a results directory at round the round level (i.e sibling of 'e1' etc.)
                                        # not to be mistaken with the next few lines of code
                                        # which creates a results directory for epoch-level results.
                    continue

                epoch_resultspath = os.path.join(epochdir, 'results')
                if not os.path.exists(epoch_resultspath):
                    os.mkdir(epoch_resultspath)




                model =  model_specifier['sequence_classification'].from_pretrained(epochdir).to(device)
                if len(epochs) < len(os.listdir(rounddir))-1: # Number of checkpoints per round
                                                # could have just hardcoded 6, since that's the number of epochs I train with,
                                                # but I guess this is technically more flexible  
                    epochs.append(epoch)
                #setnames.append(eval_set_name)

                nrounds = len(os.listdir(modeldir))-2
                nepochs = len(rounddir)-1
                testruns = len(eval_sets)*nrounds*nepochs
                print('Evaluation {} out of {}'.format(testcount, testruns))
                print('Getting scores for {} {} on {}.'.format(roundname, epoch, eval_set_name))
                y_pred, y_test, human_entropies, model_entropies = test(modelin=model, device=device, eval_dataset=eval_set)
                human_entropies = np.array(human_entropies)
                model_entropies = np.array(model_entropies)
                pearson_c = np.corrcoef(model_entropies, human_entropies)[0][1]
                r, p = pearsonr(model_entropies, human_entropies)
                #print(r, pearson_c)
                #print(p)
                
                
                per_epoch_entcorrs.append(pearson_c)
                if not eval_set_name == 'NLIVariation':
                    accuracy = accuracy_score(y_test, y_pred)
                    #accuracy_per_epoch.append(accuracy)
                    macro_f = f1_score(y_test, y_pred, average = "macro", zero_division=0.0)
                    per_epoch_accs.append(accuracy)
                    per_epoch_mfs.append(macro_f)

                plt.scatter(human_entropies, model_entropies)

                #a, b = np.polyfit(x, y, 1)        
                # from https://pythonguides.com/matplotlib-best-fit-line/ and https://www.statology.org/line-of-best-fit-python/
                #y_line = theta[1] + theta[0] * np.array(human_entropies)
                #plt.plot(model_entropies, y_line, 'r')
                a, b = np.polyfit(human_entropies, model_entropies, 1)
                plt.plot(human_entropies, a*human_entropies+b, color='orange') 
                plt.title(" ".join((eval_set_name, roundname, epoch, 'entropy with line of best fit')))
                plt.xlabel('Human entropy')
                plt.ylabel('Model entropy')
                plt.annotate('r = {:.2f}, p = {:.2f}'.format(r, p), xy=(0.05, 0.95), xycoords='axes fraction')
                plt.savefig(os.path.join(epoch_resultspath, eval_set_name+'-entropies'), bbox_inches="tight")
                plt.close()
                #mf_scores_per_epoch.append(macro_f)

                '''if epoch == 'e6':
                    final_epoch_accuracies.append(accuracy)
                    final_epoch_mfscores.append(macro_f)
                    final_epoch_ent_corrs.append(pearson_c)'''
                
            if not eval_set_name == 'NLIVariation':               
                per_set_accs.append(per_epoch_accs)
                per_set_mfs.append(per_epoch_mfs)
            per_set_entcorrs.append(per_epoch_entcorrs)
    
        

        plot_input_accuracy = {'y': per_set_accs, 'x': epochs, 'legend_lables': list(eval_sets.keys())[:2],
                               'title': roundname+' Accuracy', 'filepath': os.path.join(round_resultspath, 'accuracy')}
        plot_input_mf = {'y': per_set_mfs, 'x': epochs, 'legend_lables': list(eval_sets.keys())[:2],
                               'title': roundname+' Macro F1', 'filepath': os.path.join(round_resultspath, 'macrof1')}
        plot_input_ent = {'y': per_set_entcorrs, 'x': epochs, 'legend_lables': list(eval_sets.keys()),
                               'title': roundname+' Model/human entropy correlation', 'filepath': os.path.join(round_resultspath, 'entropy_correlation')}

        
        # Get metrics from last epoch for each set:
        # I don't do this here, but this could instead fetch the result of a specified epoch,
        # e.g., if I wanted to get specifically the best epoch
        final_epoch_accs = epoch_n_data(per_set_accs, -1)
        final_epoch_mfs = epoch_n_data(per_set_mfs, -1)
        final_epoch_ent = epoch_n_data(per_set_entcorrs, -1)

        final_epoch_accuracies.append(final_epoch_accs)
        final_epoch_mfscores.append(final_epoch_mfs)
        final_epoch_ent_corrs.append(final_epoch_ent)
    
    # final_epoch_acc/mf/ents are lists of lists whose outer elements are per round data and subelements of those elements are per test set
    # data for the respective round. I want to plot metrics across rounds (so round number on X), and plot_categorical expects y to be
    # list of lists where the INNER list corresponds to the x-ticks. So I invert:
    # (could have avoided this by iterating over sets before over rounds, but my per-round plotting above has a different preference)

    reorganised_acc = reorder_sublists_by_idx(final_epoch_accuracies)
    reorganised_mf = reorder_sublists_by_idx(final_epoch_mfscores)
    reorganised_ent = reorder_sublists_by_idx(final_epoch_ent_corrs)


    
    plot_input_accuracy = {'y': reorganised_acc, 'x': rounds, 'legend_lables': list(eval_sets.keys())[:2],
                        'title': 'Final epoch accuracy across rounds', 'filepath': os.path.join(overall_resultspath, 'accuracy'), 'type': plt.bar}
    plot_input_mf = {'y': reorganised_mf, 'x': rounds, 'legend_lables': list(eval_sets.keys())[:2],
                        'title': 'Final epoch macro F1 across rounds', 'filepath': os.path.join(overall_resultspath, 'macrof1'), 'type': plt.bar}
    plot_input_ent = {'y': reorganised_ent, 'x': rounds, 'legend_lables': list(eval_sets.keys()),
                        'title': ' Final epoch model/human entropy correlation across rounds', 'filepath': os.path.join(overall_resultspath, 'entropy_correlation'),
                        'type': plt.bar}

    plot_data_dicts.extend([plot_input_accuracy, plot_input_mf, plot_input_ent])
    
    


    
    for plot_data in plot_data_dicts:
        print(plot_data)
        plot_categorical(plot_data)
        


                    

                        #p_scores = precision_score(y_test, y_pred, average = None, zero_division=0.0)
                        #r_scores = recall_score(y_test, y_pred, average = None, zero_division=0.0)
                        #f_scores = f1_score(y_test, y_pred, average = None, zero_division=0.0)
                        #macro_p = precision_score(y_test, y_pred, average = "macro", zero_division=0.0)
                        #macro_r = recall_score(y_test, y_pred, average = "macro", zero_division=0.0)
                    
                    


In [59]:
data1 = {'e1': 1, 'e2':2, 'e3': 2}
data2 = {'e1':2, 'e2':2, 'e3': 5}
datalist = [data1, data2]

In [76]:
data3 = {'1': [1,2,3,4,3], '2': data2}

In [85]:
datalist[:1]

[{'e1': 1, 'e2': 2, 'e3': 2}]

In [78]:
data3['1'].append(10)

In [79]:
data3

{'1': [1, 2, 3, 4, 3, 10], '2': {'e1': 2, 'e2': 2, 'e3': 5}}

In [None]:
d

In [113]:
plot_categorical(datalist, ['r1', 'r2'], 'testtitle',os.getcwd())

TypeError: plot_categorical() takes 1 positional argument but 4 were given

In [83]:
full_eval(model_specifier, modeldir, eval_sets)

Getting scores for r1 e1 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.86it/s]


Getting scores for r1 e2 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.88it/s]


Getting scores for r1 e3 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:18<00:00, 84.17it/s]


Getting scores for r1 e4 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.90it/s]


Getting scores for r1 e5 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 84.00it/s]


Getting scores for r1 e6 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 81.95it/s]


Getting scores for r1 e1 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.65it/s]


Getting scores for r1 e2 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.26it/s]


Getting scores for r1 e3 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.60it/s]


Getting scores for r1 e4 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.30it/s]


Getting scores for r1 e5 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.51it/s]


Getting scores for r1 e6 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.16it/s]


Getting scores for r1 e1 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 86.00it/s]


Getting scores for r1 e2 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.89it/s]


Getting scores for r1 e3 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.90it/s]


Getting scores for r1 e4 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.69it/s]


Getting scores for r1 e5 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.96it/s]


Getting scores for r1 e6 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.89it/s]


Getting scores for r2 e1 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 82.18it/s]


Getting scores for r2 e2 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.84it/s]


Getting scores for r2 e3 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.65it/s]


Getting scores for r2 e4 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.68it/s]


Getting scores for r2 e5 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.75it/s]


Getting scores for r2 e6 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.67it/s]


Getting scores for r2 e1 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.65it/s]


Getting scores for r2 e2 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.35it/s]


Getting scores for r2 e3 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.40it/s]


Getting scores for r2 e4 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.38it/s]


Getting scores for r2 e5 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.44it/s]


Getting scores for r2 e6 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.56it/s]


Getting scores for r2 e1 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.87it/s]


Getting scores for r2 e2 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.58it/s]


Getting scores for r2 e3 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.59it/s]


Getting scores for r2 e4 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.67it/s]


Getting scores for r2 e5 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.81it/s]


Getting scores for r2 e6 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.61it/s]


Getting scores for r3 e1 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.42it/s]


Getting scores for r3 e2 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.67it/s]


Getting scores for r3 e3 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.32it/s]


Getting scores for r3 e4 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.40it/s]


Getting scores for r3 e5 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.21it/s]


Getting scores for r3 e6 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.30it/s]


Getting scores for r3 e1 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 81.50it/s]


Getting scores for r3 e2 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.46it/s]


Getting scores for r3 e3 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.16it/s]


Getting scores for r3 e4 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.31it/s]


Getting scores for r3 e5 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.39it/s]


Getting scores for r3 e6 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.36it/s]


Getting scores for r3 e1 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 86.06it/s]


Getting scores for r3 e2 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.75it/s]


Getting scores for r3 e3 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.74it/s]


Getting scores for r3 e4 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.44it/s]


Getting scores for r3 e5 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.67it/s]


Getting scores for r3 e6 on NLIVariation.


100%|██████████| 496/496 [00:05<00:00, 85.52it/s]


Getting scores for r4 e1 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.32it/s]


Getting scores for r4 e2 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.63it/s]


Getting scores for r4 e3 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.59it/s]


Getting scores for r4 e4 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.66it/s]


Getting scores for r4 e5 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.35it/s]


Getting scores for r4 e6 on ChaosNLI-MNLI.


100%|██████████| 1599/1599 [00:19<00:00, 83.50it/s]


Getting scores for r4 e1 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.19it/s]


Getting scores for r4 e2 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.21it/s]


Getting scores for r4 e3 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:18<00:00, 83.44it/s]


Getting scores for r4 e4 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:20<00:00, 73.07it/s]


Getting scores for r4 e5 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:21<00:00, 69.91it/s]


Getting scores for r4 e6 on ChaosNLI-SNLI.


100%|██████████| 1514/1514 [00:21<00:00, 69.89it/s]


Getting scores for r4 e1 on NLIVariation.


100%|██████████| 496/496 [00:06<00:00, 72.00it/s]


Getting scores for r4 e2 on NLIVariation.


100%|██████████| 496/496 [00:06<00:00, 72.00it/s]


Getting scores for r4 e3 on NLIVariation.


100%|██████████| 496/496 [00:06<00:00, 71.99it/s]


Getting scores for r4 e4 on NLIVariation.


100%|██████████| 496/496 [00:06<00:00, 71.63it/s]


Getting scores for r4 e5 on NLIVariation.


100%|██████████| 496/496 [00:06<00:00, 71.80it/s]


Getting scores for r4 e6 on NLIVariation.


100%|██████████| 496/496 [00:06<00:00, 71.65it/s]


[[0.5272045028142589, 0.6941875825627477], [0.4965603502188868, 0.6994715984147952], [0.48530331457160725, 0.6955085865257595], [0.5240775484677924, 0.5495376486129459]]
[[0.5272045028142589, 0.4965603502188868, 0.48530331457160725, 0.5240775484677924], [0.6941875825627477, 0.6994715984147952, 0.6955085865257595, 0.5495376486129459]]
[[0.5141128564738868, 0.6784889375770216], [0.48903402547019564, 0.6834244914047201], [0.4794599767472101, 0.6734750207351858], [0.5096427333253989, 0.512771292645598]]
[[0.5141128564738868, 0.48903402547019564, 0.4794599767472101, 0.5096427333253989], [0.6784889375770216, 0.6834244914047201, 0.6734750207351858, 0.512771292645598]]
[[0.013409190772003471, 0.25620178453112236, 0.10670535153856693], [0.04230379712101827, 0.23248622458546903, 0.09171723522793551], [0.037526260861654184, 0.23770724063120177, 0.022042792682386927], [0.014017172268292301, 0.12545435679807823, 0.09835242863559304]]
[[0.013409190772003471, 0.04230379712101827, 0.037526260861654184

In [None]:
#test('./models/toy-models/r1/e2', device=device)

def test2(modelpath=None, modelinput=None, device='cpu', eval_dataset=cnli_eval_datasets['chaosNLI_mnli_m']):

    loader = DataLoader(dataset=eval_dataset, collate_fn=collate_fn, batch_size=1)
    if modelpath:
        #model = model_specifier['sequence_classification'].from_pretrained(modelpath).to(device)
        model = BertForSequenceClassification.from_pretrained(modelpath).to(device)
    elif modelinput:
        model = modelinput.to(device)
    else:
        print('Provide a path to the model or the model itself.')
        return None

    #model = AutoModel.from_pretrained(modelpath).to(device)
    

    #modelpath = os.path.join('./models', modelname)
    #model.load_state_dict(torch.load(modelpath, weights_only=True)) #weights_only=True
    #model.load_state_dict(torch.load(modelname, weights_only=True)) #weights_only=True
    #model = modelname.to(device)
    #print(model)

    y_test = []
    y_pred = []
    collected_logits = []
    collected_entropies = []

    #test_p = []
    #test_h = []
    test_id = []
    test_log = []
    test_oe = []
    test_e = []
    test_out =[]
    model.eval()
    with torch.no_grad():
        count = 0
        for batch_id, batch in enumerate(tqdm.tqdm(loader)):
            outputs = model(batch['input_ids'],
                                attention_mask=batch['attention_masks'],
                                token_type_ids=batch['token_type_ids'],
                                labels=batch['label_tensors'].to(device))
            
            #print(len(outputs))
            # 
            #print(batch.keys())
            #test_p.append(batch['premise'])
            #test_h.append(batch['hypothesis'])
            test_id.append(batch['input_ids'])
            
            #outputs = outputs#.cpu()                

            loss, logits = outputs[:2]
            test_out.append(outputs)

            test_log.append(logits)
            #print(logits)
            pred = torch.argmax(logits)
            gold = batch['label_tensors'][0][0]#.to(device)
            y_test.append(gold.cpu()) #cpu tensors seem to be required for some sklearn functions down the line
            y_pred.append(pred.cpu())
            collected_logits.append(logits)

            og_ent = batch['unique_data'][0]['entropy']
            collected_entropies.append(og_ent)

            dist = torch.softmax(logits, -1).squeeze()
            #logits.cpu().detach().numpy()
            ent = entropy(dist.cpu().detach().numpy(), base=2)
            #print(og_ent, ent)
            test_oe.append(og_ent)
            test_e.append(ent)
            #print()
            count+=1
            
            

        #print(y_test, 'y_test')
        return  test_id,test_log, test_oe, test_e, test_out #test_p , test_h
        return y_test, y_pred, collected_logits, collected_entropies

        print_metrics(y_pred, y_test, list(classes.keys()), model)

#test_id2,test_log2, test_oe2, test_e2, test_out2 = test2(test_model, device=device, eval_dataset=cnli_eval_datasets['chaosNLI_mnli_m'])
test_id2,test_log2, test_oe2, test_e2, test_out2 = test2(modelpath='./models2', device=device, eval_dataset=cnli_eval_datasets['chaosNLI_mnli_m'])

test_id,test_log, test_oe, test_e, test_out = test('test.pt', device=device, eval_dataset=cnli_eval_datasets['chaosNLI_mnli_m'])

y_test, y_pred, collected_logits, collected_entropies = test('bert-base-uncased-r3-bs16-eps3-lr0.0025-e3.pt', device=device, eval_dataset=cnli_eval_datasets['chaosNLI_mnli_m'])

In [None]:
def full_eval():
    namelist = os.listdir('./models')

    #sorts model names first by round, then by epoch:
    sorted_namelist = sorted(namelist, key=lambda x: (x.split('-')[3], x.split('-')[7]))
    #sorts model names first by round, then by epoch
    for modelname in sorted_namelist:
        
        #modelpath = os.path.join(os.getcwd(), 'models', modelname)
        print('Evaluating', modelname, 'on chaosNLI_mnli_m:')
        test(modelname, device=device, eval_dataset=cnli_eval_datasets['chaosNLI_mnli_m'])
        

sorted_list = sorted(list, key=lambda x: (x[0], -x[1]))

namelist = os.listdir('./models')
sorted_namelist = sorted(namelist, key=lambda x: (x.split('-')[3], x.split('-')[7])) #sorts model names first by round, then by epoch

In [72]:
#uncomment for train

# The transformations take a while, but I figure they're better here rather than having to do them once per epoch later
# It gets better after SNLI and MNLI since these are quite big
snli_train_dataset, mnli_train_dataset, anli_training_datasets, cnli_eval_datasets, NLI_var_eval_dataset = get_datasets()

Getting SNLI dataframe...
Getting MNLI dataframe...
Getting ANLI dataframes...
Getting CNLI dataframes...
Getting NLI variation dataframe...
Transforming SNLI data...
Transforming MNLI data...
Transforming ANLI data...
	 R1...
	 R2...
	 R3...
Transforming CNLI data...
	 chaosNLI_snli...
	 chaosNLI_mnli_m...
Transforming NLI variation data...


snli 549267
mnli 392603
R1 16946
R2 45460
R3 100459

In [20]:
def reduce_train(dataset, len_new_data):
    len_kept_data = len(dataset) - len_new_data
    new_train_set, _ = random_split(dataset, [len_kept_data, len(dataset)-len_kept_data])
    return new_train_set

In [None]:
#train_size = int(0.8 * len(full_dataset))

In [None]:
print()

In [74]:
def prepare_trainsets():
    #R1_train_data = ConcatDataset([snli_train_data, mnli_train_data])
    
    smnli = ConcatDataset([snli_train_dataset, mnli_train_dataset])

    len_anli = 0
    for round, dataset in anli_training_datasets.items():
    
        len_anli+=len(dataset)
   
    reduced_smnli1, _ = random_split(smnli, [len_anli, len(smnli)-len_anli])

    round1 = reduced_smnli1

    reduced_smnli2 = reduce_train(reduced_smnli1, len(anli_training_datasets['R1']))
    round2 = ConcatDataset([reduced_smnli2, anli_training_datasets['R1']])

    reduced_smnli3 = reduce_train(reduced_smnli2, len(anli_training_datasets['R2']))
    round3 = ConcatDataset([reduced_smnli3, anli_training_datasets['R1'], anli_training_datasets['R2']])
    
    #reduced_smnli4 = reduce_train(reduced_smnli3, len(anli_training_datasets['R3']))
    #print(len(reduced_smnli4))

    round4 = ConcatDataset([anli_training_datasets['R1'], anli_training_datasets['R2'], anli_training_datasets['R3'] ])

    print(len(round1), len(round2), len(round3), len(round4))
    
    #round2 = ConcatDataset([round1, anli_training_datasets['R1']])
    #round3 = ConcatDataset([round2, anli_training_datasets['R2']])
   #round4 = ConcatDataset([round3, anli_training_datasets['R3']])

    return round1, round2, round3, round4

In [75]:
#uncomment for train
r1train, r2train, r3train, r4train = prepare_trainsets()

162865 162865 162865 162865


In [None]:
def main(toy_run=True, overwrite=False, manual_modelname=False, log_to_file=False):

    modeldir = './models/bert-base-uncased-bs32-eps6-lr5e-05'
    
    
    #cnli_eval_datasets, NLI_var_eval_dataset = get_evalsets()
    
    if toy_run:
        hp = toy_hyperparams
        training_rounds = list(cnli_eval_datasets.values())
        print('Toy run:')
        print('Training on toy parameters')
        print(hp)
        print('Training on', list(cnli_eval_datasets.keys()))
    else:
        hp = hyperparams
        training_rounds = (r1train, r2train, r3train, r4train)
        print('Running full training on all four rounds. Hyperparameters:')
        print(hp)


    #print(hp)
    models = {}

    newpath = os.path.join(os.getcwd(), 'models')
    if not os.path.exists(newpath):
        os.mkdir(newpath)
    
    if manual_modelname:
        dirname = manual_modelname
    elif toy_run:
        dirname = 'toy-models'
    else:
        modelname = model_specifier['model_name']
        batch_size_str = 'bs'+str(hp['batch_size'])
        epochs_str = 'eps'+str(hp['epochs'])
        lr_str = 'lr'+str(hp['lr'])
        dirname = '-'.join((modelname, batch_size_str, epochs_str, lr_str))
    
    dirpath = os.path.join(newpath, dirname)

    if overwrite:
        if os.path.exists(dirpath):
            shutil.rmtree(dirpath)
    else:
        num = 2
        dirpath_base =dirpath
        while os.path.exists(dirpath):
            dirpath = dirpath_base+'_'+str(num)
            num+=1

    os.mkdir(dirpath)

    hppath = os.path.join(dirpath, 'hyperparameters.json')
    
    with open(hppath, 'w') as f:
            json.dump(hp, f)
    #os.mkdir(dirpath, 'hyperparams')
    

    
    if torch.cuda.is_available():          
        for round_num, round_data in enumerate(training_rounds):
            roundnum_str = 'r'+str(round_num+1)
            roundpath = os.path.join(dirpath, roundnum_str)
            os.mkdir(roundpath)
            resultspath = os.path.join(roundpath, 'results')
            os.mkdir(resultspath)
                #dirname = '-'.join((roundnum_str, modelname, batch_size_str, epochs_str, lr_str))

            #dirpath = os.path.join(newpath, dirname)
            #os.mkdir(dirpath)
            print()
            print('-------------------------------')
            print('Training', roundnum_str)
            round_models = train(model_specifier, round_data, roundpath, hp, logfile=log_to_file)
            print()
            models[roundnum_str] = round_models
    else:
        print('cuda unavailable')
        return None
        #sys.exit()
    
    return models
        
        #train(model_specifier, round, filename)

    #train(model_specifier, train_ds, model_filename):

In [121]:
modelname = model_specifier['model_name']
batch_size_str = 'bs'+str(hyperparams['batch_size'])
epochs_str = 'eps'+str(hyperparams['epochs'])
lr_str = 'lr'+str(hyperparams['lr'])
dirname = '-'.join((modelname, batch_size_str, epochs_str, lr_str))
dirname = dirname+'_2'
roundnum_str = 'r4'
roundpath = os.path.join(dirname, roundnum_str)

In [None]:
roundpath = os.path.join(os.getcwd(), 'models', roundpath)

'bert-base-uncased-bs32-eps6-lr5e-05_2/r4'

In [124]:
round4_models = train(model_specifier, r4train, roundpath, hyperparams)

Epoch 1:



Batches: 100%|██████████| 5090/5090 [44:02<00:00,  1.93it/s]


Average loss for epoch: 0.6154814206371139

Evaluating epoch 1


100%|██████████| 1599/1599 [00:16<00:00, 94.64it/s]



accuracy is 0.5222013758599124

f-score for label 'entailment' is 0.5456026058631922
f-score for label 'neutral' is 0.5634629493763756
f-score for label 'contradiction' is 0.38220757825370677
macro f-score is 0.49709104449775826

logits are:
tensor([[ 0.7211, -0.6332, -0.2054]], device='cuda:0')
tensor([[ 0.5344,  0.0083, -1.1166]], device='cuda:0')
tensor([[-0.6575,  0.9722, -0.9504]], device='cuda:0')
tensor([[ 0.0104,  0.5398, -0.8381]], device='cuda:0')
------------------------------------------------
Epoch 2:



Batches: 100%|██████████| 5090/5090 [44:35<00:00,  1.90it/s]


Average loss for epoch: 0.3896588913100993

Evaluating epoch 2


100%|██████████| 1599/1599 [00:16<00:00, 94.26it/s]



accuracy is 0.5422138836772983

f-score for label 'entailment' is 0.5980861244019139
f-score for label 'neutral' is 0.5529676934635612
f-score for label 'contradiction' is 0.40456769983686786
macro f-score is 0.5185405059007809

logits are:
tensor([[ 0.7210, -0.2840, -0.8494]], device='cuda:0')
tensor([[ 2.2026, -1.1779, -1.4370]], device='cuda:0')
tensor([[ 1.4133, -0.4116, -1.3232]], device='cuda:0')
tensor([[-0.8649,  1.9976, -1.7745]], device='cuda:0')
------------------------------------------------
Epoch 3:



Batches: 100%|██████████| 5090/5090 [44:28<00:00,  1.91it/s]


Average loss for epoch: 0.25439449695903216

Evaluating epoch 3


100%|██████████| 1599/1599 [00:16<00:00, 94.34it/s]



accuracy is 0.49280800500312694

f-score for label 'entailment' is 0.4737327188940092
f-score for label 'neutral' is 0.5572354211663066
f-score for label 'contradiction' is 0.39779005524861877
macro f-score is 0.47625273176964483

logits are:
tensor([[-1.5769, -0.0840,  1.8061]], device='cuda:0')
tensor([[ 0.5345, -0.9842,  0.3569]], device='cuda:0')
tensor([[-2.1313,  2.2738, -0.6427]], device='cuda:0')
tensor([[-1.9879,  2.5472, -1.1147]], device='cuda:0')
------------------------------------------------
Epoch 4:



Batches: 100%|██████████| 5090/5090 [44:38<00:00,  1.90it/s]


Average loss for epoch: 0.18102171369691056

Evaluating epoch 4


100%|██████████| 1599/1599 [00:16<00:00, 94.08it/s]



accuracy is 0.5303314571607255

f-score for label 'entailment' is 0.5941358024691358
f-score for label 'neutral' is 0.5343018563357547
f-score for label 'contradiction' is 0.39819004524886875
macro f-score is 0.508875901351253

logits are:
tensor([[ 0.1934, -0.0716, -0.6924]], device='cuda:0')
tensor([[ 0.5043, -1.0086,  0.1268]], device='cuda:0')
tensor([[-1.0822,  1.3658, -0.7335]], device='cuda:0')
tensor([[-0.6925,  1.8984, -1.9054]], device='cuda:0')
------------------------------------------------
Epoch 5:



Batches: 100%|██████████| 5090/5090 [44:36<00:00,  1.90it/s]  


Average loss for epoch: 0.1363214034188268

Evaluating epoch 5


100%|██████████| 1599/1599 [00:16<00:00, 94.30it/s]



accuracy is 0.4978111319574734

f-score for label 'entailment' is 0.5178268251273345
f-score for label 'neutral' is 0.5291338582677165
f-score for label 'contradiction' is 0.41333333333333333
macro f-score is 0.4867646722427948

logits are:
tensor([[-1.9468,  1.9830, -0.4993]], device='cuda:0')
tensor([[ 2.1056, -1.1216, -1.4655]], device='cuda:0')
tensor([[-2.2670,  0.9638,  1.1806]], device='cuda:0')
tensor([[-1.4583,  1.5647, -0.5525]], device='cuda:0')
------------------------------------------------
Epoch 6:



Batches: 100%|██████████| 5090/5090 [44:27<00:00,  1.91it/s]


Average loss for epoch: 0.11289937087048343

Evaluating epoch 6


100%|██████████| 1599/1599 [00:16<00:00, 94.48it/s]



accuracy is 0.5240775484677924

f-score for label 'entailment' is 0.5791190864600326
f-score for label 'neutral' is 0.5358306188925082
f-score for label 'contradiction' is 0.41397849462365593
macro f-score is 0.5096427333253989

logits are:
tensor([[-0.2502,  1.4599, -1.7514]], device='cuda:0')
tensor([[ 3.4691, -1.4240, -2.5073]], device='cuda:0')
tensor([[-2.5682,  1.1918,  1.1003]], device='cuda:0')
tensor([[-1.6757,  3.0545, -1.7923]], device='cuda:0')
------------------------------------------------


In [44]:
ent_test_model = os.path.join('./models/bert-base-uncased-bs32-eps6-lr5e-05', 'r4', 'e1')
#os.listdir(ent_test_model)
entropy_testing(cnli_eval_datasets['chaosNLI_mnli_m'], ent_test_model)


<class 'dict'>
cnli entropy, base2: 1.209800338660482

numpy:
np log2 entropy from dist: 1.2098003386604825
np log2 entropy from count: -543.4055851114242

pt, default=natlog:
pt ent from dist: tensor(0.8386)
pt ent from count: tensor(0.8386)

scipy, default=natlog
scipy entropy from dist: 0.8385696937829805
scipy entropy from count: 0.8385696937829805
scipy entropy from dist base2: 1.2098003386604825

logits tensor([ 0.7211, -0.6332, -0.2054], device='cuda:0', grad_fn=<SelectBackward0>)
probs tensor([0.6046, 0.1561, 0.2394], device='cuda:0', grad_fn=<SqueezeBackward0>)
np_probs [0.60456604 0.15605876 0.23937519]
pt entropy from logits: tensor(1.0656)
pt entropy from softmax: tensor(0.8386)
scipy entropy from logits: -inf
scipy entropy from softmax: 0.9363674
scipy entropy from logits, b2: -inf
scipy entropy from softmax, b2: 1.3508925936675558
scipy entropy from softmax2: 1.3508925936675558

model numpy entropy, natlog 0.9363674
model numpy entropy, b2 1.3508925


In [115]:
models = main(toy_run=False)

Running full training on all four rounds. Hyperparameters:
{'batch_size': 32, 'epochs': 6, 'lr': 5e-05}

-------------------------------
Training r1
Epoch 1:



Batches: 100%|██████████| 5090/5090 [21:38<00:00,  3.92it/s]


Average loss for epoch: 0.5560356694173016

Evaluating epoch 1


100%|██████████| 1599/1599 [00:17<00:00, 93.24it/s]



accuracy is 0.5003126954346466

f-score for label 'entailment' is 0.5718849840255591
f-score for label 'neutral' is 0.4664371772805508
f-score for label 'contradiction' is 0.4362244897959184
macro f-score is 0.49151555036734274

logits are:
tensor([[-0.2055, -1.6975,  1.3530]], device='cuda:2')
tensor([[ 0.9640,  0.6165, -1.3053]], device='cuda:2')
tensor([[-0.7779,  0.3319,  0.1786]], device='cuda:2')
tensor([[ 1.8448, -0.8016, -0.9436]], device='cuda:2')
------------------------------------------------
Epoch 2:



Batches: 100%|██████████| 5090/5090 [21:43<00:00,  3.90it/s]


Average loss for epoch: 0.3594666064934611

Evaluating epoch 2


100%|██████████| 1599/1599 [00:17<00:00, 92.94it/s]



accuracy is 0.5184490306441526

f-score for label 'entailment' is 0.5847389558232932
f-score for label 'neutral' is 0.4975206611570248
f-score for label 'contradiction' is 0.4414535666218035
macro f-score is 0.5079043945340406

logits are:
tensor([[ 1.3271, -0.6975, -0.0983]], device='cuda:2')
tensor([[ 0.0468,  1.0733, -0.7880]], device='cuda:2')
tensor([[-2.2526,  0.0821,  1.9443]], device='cuda:2')
tensor([[ 1.3821,  1.0656, -2.2963]], device='cuda:2')
------------------------------------------------
Epoch 3:



Batches: 100%|██████████| 5090/5090 [21:40<00:00,  3.91it/s]


Average loss for epoch: 0.23954731916280533

Evaluating epoch 3


100%|██████████| 1599/1599 [00:17<00:00, 93.46it/s]



accuracy is 0.5053158223889931

f-score for label 'entailment' is 0.5471698113207547
f-score for label 'neutral' is 0.4991869918699187
f-score for label 'contradiction' is 0.4538653366583541
macro f-score is 0.5000740466163425

logits are:
tensor([[ 0.3134, -1.8172,  1.6553]], device='cuda:2')
tensor([[ 3.0834,  0.4560, -2.8963]], device='cuda:2')
tensor([[-2.1469,  1.0498,  0.7062]], device='cuda:2')
tensor([[ 1.0628,  1.1924, -2.1592]], device='cuda:2')
------------------------------------------------
Epoch 4:



Batches: 100%|██████████| 5090/5090 [21:40<00:00,  3.91it/s]


Average loss for epoch: 0.1690204484490696

Evaluating epoch 4


100%|██████████| 1599/1599 [00:17<00:00, 93.27it/s]



accuracy is 0.5265791119449656

f-score for label 'entailment' is 0.5899053627760252
f-score for label 'neutral' is 0.5121759622937941
f-score for label 'contradiction' is 0.4322678843226788
macro f-score is 0.511449736464166

logits are:
tensor([[ 2.0973, -1.0321, -0.8201]], device='cuda:2')
tensor([[ 3.1704,  0.7837, -3.2546]], device='cuda:2')
tensor([[ 0.0269,  1.5999, -1.7531]], device='cuda:2')
tensor([[ 2.4923, -0.6417, -1.7425]], device='cuda:2')
------------------------------------------------
Epoch 5:



Batches: 100%|██████████| 5090/5090 [21:41<00:00,  3.91it/s]


Average loss for epoch: 0.1338824885995228

Evaluating epoch 5


100%|██████████| 1599/1599 [00:17<00:00, 93.43it/s]



accuracy is 0.5015634771732332

f-score for label 'entailment' is 0.5402201524132092
f-score for label 'neutral' is 0.5068078668683812
f-score for label 'contradiction' is 0.42589928057553955
macro f-score is 0.4909757666190433

logits are:
tensor([[ 2.6929, -0.3646, -1.9384]], device='cuda:2')
tensor([[ 2.0625,  2.7482, -4.3763]], device='cuda:2')
tensor([[ 1.5546,  0.8374, -2.1716]], device='cuda:2')
tensor([[ 3.8639, -0.1744, -3.2066]], device='cuda:2')
------------------------------------------------
Epoch 6:



Batches: 100%|██████████| 5090/5090 [21:44<00:00,  3.90it/s]


Average loss for epoch: 0.1090744101629942

Evaluating epoch 6


100%|██████████| 1599/1599 [00:17<00:00, 93.22it/s]



accuracy is 0.5272045028142589

f-score for label 'entailment' is 0.5660377358490566
f-score for label 'neutral' is 0.5351681957186545
f-score for label 'contradiction' is 0.44113263785394935
macro f-score is 0.5141128564738868

logits are:
tensor([[ 4.0918, -1.6033, -2.1241]], device='cuda:2')
tensor([[ 2.1181,  3.1343, -4.6836]], device='cuda:2')
tensor([[-2.1798,  4.1833, -1.9967]], device='cuda:2')
tensor([[ 4.8251, -0.2956, -3.8933]], device='cuda:2')
------------------------------------------------


-------------------------------
Training r2
Epoch 1:



Batches: 100%|██████████| 5090/5090 [28:07<00:00,  3.02it/s]


Average loss for epoch: 0.5576156075319278

Evaluating epoch 1


100%|██████████| 1599/1599 [00:17<00:00, 93.27it/s]



accuracy is 0.5215759849906192

f-score for label 'entailment' is 0.5686433793663688
f-score for label 'neutral' is 0.5235602094240838
f-score for label 'contradiction' is 0.4253968253968254
macro f-score is 0.5058668047290926

logits are:
tensor([[-1.6177, -1.0102,  1.7078]], device='cuda:2')
tensor([[ 0.4187,  1.2612, -1.5426]], device='cuda:2')
tensor([[-0.7209,  1.2764, -0.4694]], device='cuda:2')
tensor([[ 1.3959,  0.4911, -1.7349]], device='cuda:2')
------------------------------------------------
Epoch 2:



Batches: 100%|██████████| 5090/5090 [28:09<00:00,  3.01it/s]


Average loss for epoch: 0.362235447193527

Evaluating epoch 2


100%|██████████| 1599/1599 [00:17<00:00, 92.36it/s]



accuracy is 0.47842401500938087

f-score for label 'entailment' is 0.4708029197080292
f-score for label 'neutral' is 0.5077574047954866
f-score for label 'contradiction' is 0.4298245614035088
macro f-score is 0.4694616286356748

logits are:
tensor([[ 1.0562, -0.0833, -1.2052]], device='cuda:2')
tensor([[ 0.5872,  2.0281, -2.3055]], device='cuda:2')
tensor([[-0.7986,  1.7133, -0.7606]], device='cuda:2')
tensor([[ 1.7829,  0.8831, -2.5442]], device='cuda:2')
------------------------------------------------
Epoch 3:



Batches: 100%|██████████| 5090/5090 [28:09<00:00,  3.01it/s]


Average loss for epoch: 0.2407900101624151

Evaluating epoch 3


100%|██████████| 1599/1599 [00:17<00:00, 92.82it/s]



accuracy is 0.5434646654158849

f-score for label 'entailment' is 0.6227272727272727
f-score for label 'neutral' is 0.5088161209068011
f-score for label 'contradiction' is 0.4512372634643377
macro f-score is 0.5275935523661371

logits are:
tensor([[-0.9515, -1.7944,  1.9320]], device='cuda:2')
tensor([[ 0.5372,  1.3751, -1.7461]], device='cuda:2')
tensor([[-0.2931,  0.8351, -0.4507]], device='cuda:2')
tensor([[ 3.3980,  0.1738, -3.3402]], device='cuda:2')
------------------------------------------------
Epoch 4:



Batches: 100%|██████████| 5090/5090 [28:08<00:00,  3.02it/s]


Average loss for epoch: 0.17088851366944113

Evaluating epoch 4


100%|██████████| 1599/1599 [00:17<00:00, 93.47it/s]



accuracy is 0.5390869293308318

f-score for label 'entailment' is 0.583596214511041
f-score for label 'neutral' is 0.528152260111023
f-score for label 'contradiction' is 0.47533632286995514
macro f-score is 0.5290282658306731

logits are:
tensor([[ 0.2974, -1.4995,  0.7891]], device='cuda:2')
tensor([[-1.6089,  3.8365, -1.8890]], device='cuda:2')
tensor([[-2.2810,  1.6916,  1.0477]], device='cuda:2')
tensor([[ 2.9057,  0.5594, -3.0800]], device='cuda:2')
------------------------------------------------
Epoch 5:



Batches: 100%|██████████| 5090/5090 [28:09<00:00,  3.01it/s]


Average loss for epoch: 0.13319093215266473

Evaluating epoch 5


100%|██████████| 1599/1599 [00:17<00:00, 92.59it/s]



accuracy is 0.4878048780487805

f-score for label 'entailment' is 0.5069444444444444
f-score for label 'neutral' is 0.4954268292682927
f-score for label 'contradiction' is 0.444141689373297
macro f-score is 0.48217098769534467

logits are:
tensor([[ 1.2674, -2.2717,  0.5232]], device='cuda:2')
tensor([[-1.9146,  3.9203, -1.6866]], device='cuda:2')
tensor([[-1.3942,  2.6207, -0.9765]], device='cuda:2')
tensor([[ 3.3543,  0.4080, -3.6303]], device='cuda:2')
------------------------------------------------
Epoch 6:



Batches: 100%|██████████| 5090/5090 [28:06<00:00,  3.02it/s]


Average loss for epoch: 0.11006774310254051

Evaluating epoch 6


100%|██████████| 1599/1599 [00:17<00:00, 92.74it/s]



accuracy is 0.4965603502188868

f-score for label 'entailment' is 0.5276595744680851
f-score for label 'neutral' is 0.5019425019425019
f-score for label 'contradiction' is 0.4375
macro f-score is 0.48903402547019564

logits are:
tensor([[ 3.0981, -2.0594, -1.3462]], device='cuda:2')
tensor([[-1.1748,  4.8013, -3.1886]], device='cuda:2')
tensor([[ 0.6220,  2.3307, -2.5826]], device='cuda:2')
tensor([[ 2.9542,  2.2034, -4.5903]], device='cuda:2')
------------------------------------------------


-------------------------------
Training r3
Epoch 1:



Batches: 100%|██████████| 5090/5090 [30:57<00:00,  2.74it/s]


Average loss for epoch: 0.5780789584038534

Evaluating epoch 1


100%|██████████| 1599/1599 [00:17<00:00, 93.62it/s]



accuracy is 0.5303314571607255

f-score for label 'entailment' is 0.6012658227848101
f-score for label 'neutral' is 0.5067513899920572
f-score for label 'contradiction' is 0.4414814814814815
macro f-score is 0.5164995647527829

logits are:
tensor([[-1.5204, -0.9570,  2.5276]], device='cuda:2')
tensor([[ 1.1911,  0.8918, -1.5472]], device='cuda:2')
tensor([[-1.3153,  1.2601,  0.0151]], device='cuda:2')
tensor([[ 1.4158,  1.1957, -2.2139]], device='cuda:2')
------------------------------------------------
Epoch 2:



Batches: 100%|██████████| 5090/5090 [31:01<00:00,  2.73it/s]


Average loss for epoch: 0.3770621106211528

Evaluating epoch 2


100%|██████████| 1599/1599 [00:17<00:00, 92.84it/s]



accuracy is 0.5228267667292058

f-score for label 'entailment' is 0.5294635004397538
f-score for label 'neutral' is 0.5491803278688525
f-score for label 'contradiction' is 0.4455611390284757
macro f-score is 0.508068322445694

logits are:
tensor([[ 1.1522, -0.7137,  0.3013]], device='cuda:2')
tensor([[ 2.1274,  0.3179, -1.6634]], device='cuda:2')
tensor([[-1.8805,  1.8854, -0.3816]], device='cuda:2')
tensor([[ 1.3285,  0.6132, -1.4542]], device='cuda:2')
------------------------------------------------
Epoch 3:



Batches: 100%|██████████| 5090/5090 [31:01<00:00,  2.73it/s]


Average loss for epoch: 0.2536836412136968

Evaluating epoch 3


100%|██████████| 1599/1599 [00:17<00:00, 92.71it/s]



accuracy is 0.5121951219512195

f-score for label 'entailment' is 0.575682382133995
f-score for label 'neutral' is 0.4721984602224123
f-score for label 'contradiction' is 0.47560975609756095
macro f-score is 0.5078301994846561

logits are:
tensor([[ 2.4098, -0.4799, -0.9106]], device='cuda:2')
tensor([[ 2.2502,  0.6995, -2.1619]], device='cuda:2')
tensor([[-2.1144,  2.5064, -0.8519]], device='cuda:2')
tensor([[ 2.0422,  0.7991, -2.2621]], device='cuda:2')
------------------------------------------------
Epoch 4:



Batches: 100%|██████████| 5090/5090 [30:58<00:00,  2.74it/s]


Average loss for epoch: 0.17906102435919502

Evaluating epoch 4


100%|██████████| 1599/1599 [00:17<00:00, 92.78it/s]



accuracy is 0.5196998123827392

f-score for label 'entailment' is 0.5725677830940988
f-score for label 'neutral' is 0.49637389202256244
f-score for label 'contradiction' is 0.4665718349928876
macro f-score is 0.511837836703183

logits are:
tensor([[ 2.4189, -1.3606,  0.0259]], device='cuda:2')
tensor([[ 2.6009,  0.9109, -2.9557]], device='cuda:2')
tensor([[-3.1715,  2.6111,  0.1160]], device='cuda:2')
tensor([[ 1.5842,  0.9953, -2.3431]], device='cuda:2')
------------------------------------------------
Epoch 5:



Batches: 100%|██████████| 5090/5090 [30:58<00:00,  2.74it/s]


Average loss for epoch: 0.138967253081775

Evaluating epoch 5


100%|██████████| 1599/1599 [00:17<00:00, 92.74it/s]



accuracy is 0.5109443402126329

f-score for label 'entailment' is 0.5449915110356537
f-score for label 'neutral' is 0.4976
f-score for label 'contradiction' is 0.4805194805194805
macro f-score is 0.5077036638517114

logits are:
tensor([[ 0.7530, -1.3052,  1.4089]], device='cuda:2')
tensor([[ 2.3237,  0.2496, -2.0267]], device='cuda:2')
tensor([[-3.8631,  2.4831,  0.9547]], device='cuda:2')
tensor([[ 1.0666,  2.3845, -3.8690]], device='cuda:2')
------------------------------------------------
Epoch 6:



Batches: 100%|██████████| 5090/5090 [30:57<00:00,  2.74it/s]


Average loss for epoch: 0.11400099914413026

Evaluating epoch 6


100%|██████████| 1599/1599 [00:17<00:00, 92.94it/s]



accuracy is 0.48530331457160725

f-score for label 'entailment' is 0.4813126709206928
f-score for label 'neutral' is 0.5049928673323824
f-score for label 'contradiction' is 0.4520743919885551
macro f-score is 0.4794599767472101

logits are:
tensor([[ 1.1456, -1.1632,  0.6915]], device='cuda:2')
tensor([[ 3.1519,  0.0432, -2.4915]], device='cuda:2')
tensor([[-3.7929,  3.6805, -0.7279]], device='cuda:2')
tensor([[ 1.8064,  1.1069, -2.8002]], device='cuda:2')
------------------------------------------------


-------------------------------
Training r4
Epoch 1:



Batches:  90%|████████▉ | 4572/5090 [35:59<04:04,  2.12it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 130.00 MiB. GPU 2 has a total capacity of 10.90 GiB of which 57.31 MiB is free. Including non-PyTorch memory, this process has 10.85 GiB memory in use. Of the allocated memory 10.23 GiB is allocated by PyTorch, and 455.39 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [64]:

#stored = models['r1']['e3']
loaded = model_specifier['sequence_classification'].from_pretrained('./models/testing_model_4/r1/e1').to(device)

In [65]:
test(modelin=loaded, device=device, eval_dataset=cnli_eval_datasets['chaosNLI_mnli_m'], logfile=False)

100%|██████████| 1599/1599 [00:16<00:00, 94.76it/s]



accuracy is 0.36460287679799874

f-score for label 'entailment' is 0.0
f-score for label 'neutral' is 0.534372135655362
f-score for label 'contradiction' is 0.0
macro f-score is 0.178124045218454

logits are:
tensor([[-9.2412,  8.8403,  7.3992]], device='cuda:1')
tensor([[-9.2412,  8.8403,  7.3992]], device='cuda:1')
tensor([[-9.2412,  8.8403,  7.3992]], device='cuda:1')
tensor([[-9.2412,  8.8403,  7.3992]], device='cuda:1')
------------------------------------------------


In [None]:
l_test_id,l_test_log, l_test_oe, l_test_e, l_test_out, l_y_test, l_y_pred, l_collected_logits, l_collected_entropies =test(modelpath=False, modelin= loaded, device=device)
s_test_id,s_test_log, s_test_oe, s_test_e, s_test_out, s_y_test, s_y_pred, s_collected_logits, s_collected_entropies =test(modelpath=False, modelin= stored, device=device)

  0%|          | 1/1599 [00:00<13:08,  2.03it/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.209800338660482 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.295225451536183 1.3814822018719437

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.4041074513870861 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.11878903655513 1.3814821158806183



  1%|          | 9/1599 [00:00<01:32, 17.17it/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.048018094115863 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.217796811598595 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.071206339756579 1.3814822018719437

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.104746783742209 1.3814822018719437

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.054335564133742 1.3814822018719437

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.219240704636849 

  1%|          | 15/1599 [00:00<01:36, 16.49it/s]


dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.1456781689999 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.054335564133742 1.3814822018719437

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
0.746539872415499 1.3814822018719437

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.044667051917939 1.3814822018719437



  0%|          | 0/1599 [00:00<?, ?it/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.209800338660482 1.3814821158806183



  0%|          | 6/1599 [00:00<00:55, 28.48it/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.295225451536183 1.3814822018719437

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.4041074513870861 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.11878903655513 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.048018094115863 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.217796811598595 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.071206339756579 

  1%|          | 14/1599 [00:00<00:47, 33.40it/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.219240704636849 1.3814822018719437

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
0.795040279384522 1.3814822018719437

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
0.7579911871785621 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.1456781689999 1.3814821158806183

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.054335564133742 1.3814822018719437

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
0.746539872415499 1

  1%|          | 15/1599 [00:00<00:53, 29.43it/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_masks', 'unique_data', 'label_tensors'])
tensor([[ 0.2848, -0.1289, -1.2064]])
1.044667051917939 1.3814822018719437






In [None]:
for l, s in zip(l_collected_logits, s_collected_logits):
    print(l, s)

tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.2848, -0.1289, -1.2064]]) tensor([[ 0.2848, -0.1289, -1.2064]])
tensor([[ 0.

In [None]:
NLI_var_eval_dataset[0]

{'input_encoding': {'input_ids': tensor([  101,  2054,  2024,  2027,  2725,  2157,  2008,  1045,  2106,  3308,
           1029,  2672,  2026,  6707,  2428,  2001,  2074,  1037, 11576,  4926,
           1012,   102,  2009,  2003,  2825,  1045,  2001,  2074,  1037, 11576,
           4926,  1012,   102], device='cuda:0'),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')},
 'premise': 'What are they doing right that I did wrong? Maybe my mistake really was just a freak accident.',
 'hypothesis': 'It is possible I was just a freak accident.',
 'input_length': 33,
 'unique_data': labels           [49.999903547520425, -49.99995111441214, -49.9...
 normed-labels    [-1.049158911374859, -2.056001753990448, -1.38...
 Name: 0, dtype: objec

In [None]:
hyperparams

{'batch_size': 4, 'epochs': 2, 'lr': 0.1}

with open("NLI_variation_data.jsonl", "r") as f:
    data = f.read()