# Preliminary Training Script 

In [1]:
# imports
# %pip install bert_race/requirements.txt -qqq
%pip install transformers==4.11.3 -qqq
!pip install pytorch_pretrained_bert==0.4.0 -qqq
# %conda install -c conda-forge ipywidgets



%load_ext autoreload
%autoreload 2
# %pip install transformers --upgrade

!pip install wandb -qqq
import wandb

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import transformers 
print(transformers.__version__)

from loss import LossCriterion, LOSS_REGISTRY

from module.bert_optim import RAdam

import torch
# Ask PyTorch's caching allocator to release cached, currently unused CUDA memory.
torch.cuda.empty_cache()
import gc
# del variables
# Run a garbage-collection pass so unreferenced Python objects are reclaimed now.
gc.collect()

# GPU index. NOTE(review): not referenced anywhere else in the visible part of this notebook.
device_id = 0

4.11.3


In [3]:
# import torch
# import pickle
# %load_ext autoreload
# %autoreload 2
from utils.tokenization_utils import read_data

In [4]:
# Location of the train/dev/test splits consumed by `read_data`.
bucket = 'support-bert-data'
data_location = f's3://{bucket}'


# Weights & Biases hyper-parameter sweep definition.
sweep_config = {
    'method': 'random',  # grid, random
    'metric': {
        # Must be a key that the training loop actually passes to `wandb.log`;
        # the previous name ('regularized batch loss') was never logged, so the
        # sweep had no metric to rank runs by.
        'name': 'avg regularized loss',
        'goal': 'minimize'
    },
    'parameters': {
        'max_seq_length': {
            'values': [128]
        },
        'epochs': {
            'values': [2, 3]
        },
        'batch_size': {
            'values': [2]
        },
        # 0 disables and 1 enables the adversarial branch of the training loop.
        'adv_train': {
            'values': [0, 1]
        },
        # Number of adversarial refinement steps per batch.
        'adv_k': {
            'values': [1, 3, 5]
        },
        'learning_rate': {
            'values': [5e-4, 1e-5, 3e-5, 5e-5]
        },
        'adv_p_norm': {
            'values': ['inf']  # , 'l1', 'l2']
        },
        # Weight of the adversarial term added to the task loss.
        'adv_alpha': {
            'values': [0.1, 1, 3, 5]
        },
        'adv_loss': {
            'values': ['LossCriterion.SymKlCriterion', 'LossCriterion.KlCriterion']
        },
        'grad_accumulation_step': {
            'values': [1, 2]
        },
        'scheduler_type': {
            'values': ['ms', 'exp']
        },
        'optimizer': {
            'values': ['adam', 'radam']
        },
    }
}

# Registers the sweep with the W&B backend (network call) and returns its id.
sweep_id = wandb.sweep(sweep_config, project="master-thesis")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: hgwhyik9
Sweep URL: https://wandb.ai/wassimboubaker/master-thesis/sweeps/hgwhyik9


In [5]:
# %pip install -r bert_race/requirements.txt
# %cp bert_race/pytorch_pretrained_bert/tokenization.py utils/tokenization.py
# %cp bert_race/pytorch_pretrained_bert/file_utils.py utils/file_utils.py

from utils.tokenization import BertTokenizer 
from utils.tokenization_utils import convert_examples_to_features
from data_utils.utils import AverageMeter

In [6]:
# Name of the pretrained checkpoint; also used later when the model is loaded.
bert_model = "bert-base-uncased"
# Passed through to the tokenizer's `do_lower_case` argument.
do_lower_case = True
# Module-level tokenizer shared by the dataset-building code below.
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

In [7]:
import os

# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# %cp bert_race/pytorch_pretrained_bert/modeling.py utils/modeling.py
# %cp bert_race/pytorch_pretrained_bert/optimization.py utils/optimization.py

import torch
from utils.modeling import BertForMultipleChoice, BertConfig, BertEmbeddings
from utils.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from utils.optimization import BertAdam, WarmupLinearSchedule

In [8]:
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset, IterableDataset)
from tqdm import tqdm, trange

In [9]:
# from utils.tokenization_utils import build_tensor
def select_field(features, field):
    """Collect one field from every choice of every feature.

    Args:
        features: iterable of objects exposing ``choices_features``, an
            iterable of mappings.
        field: key to read from each choice mapping.

    Returns:
        A list with one entry per feature; each entry is the list of
        ``choice[field]`` values for that feature's choices, in order.
    """
    selected = []
    for feature in features:
        per_choice = [choice[field] for choice in feature.choices_features]
        selected.append(per_choice)
    return selected

def build_tensor(features):
    """Pack featurized examples into a ``TensorDataset``.

    Args:
        features: sequence of objects accepted by ``select_field`` that also
            expose a ``label`` attribute.

    Returns:
        ``TensorDataset`` of four ``torch.long`` tensors, in this order:
        input ids, input mask, segment ids, labels.
    """
    field_names = ('input_ids', 'input_mask', 'segment_ids')
    field_tensors = [
        torch.tensor(select_field(features, name), dtype=torch.long)
        for name in field_names
    ]
    labels = torch.tensor([feature.label for feature in features], dtype=torch.long)
    return TensorDataset(*field_tensors, labels)


In [10]:
# Run-wide arguments.
# `seed` is the value handed to the RNG seeding calls in the device-setup cell.
seed = 30
# NOTE(review): `do_lower_case` and `bert_model` repeat the values already set in the tokenizer cell.
do_lower_case = True
bert_model = "bert-base-uncased"

In [11]:
import random

import torch

# Use the GPU when one is visible, otherwise fall back to the CPU.
cuda = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG the run draws from. Previously only the CUDA generators were
# seeded (and only when a GPU was present), so data shuffling and noise drawn
# from the Python / NumPy / torch CPU generators differed from run to run.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

n_gpu = torch.cuda.device_count()
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)
print(n_gpu)

1


In [12]:
# # Prepare optimizer
# param_optimizer = list(model.named_parameters())
# param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

# no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
#         ]
    

# global_step = 0



In [13]:
# Read the three splits from `data_location` via the project helper `read_data`.
train_examples = read_data(data_location,'train')
dev_examples = read_data(data_location,'dev')
test_examples = read_data(data_location,'test')

# First 16 training examples, kept as a small separate subset.
dev_train_examples = train_examples[:16]

def build_dataset(config):
    """Featurize the module-level example splits and wrap them in dataloaders.

    Reads the globals ``train_examples``, ``dev_examples``, ``test_examples``,
    ``dev_train_examples`` and ``tokenizer``.

    Args:
        config: mapping with ``'max_seq_length'`` (passed to
            ``convert_examples_to_features``) and ``'batch_size'`` (batch size
            of the training dataloader).

    Returns:
        ``(train_dataloader, eval_dataloader, test_dataloader,
        dev_train_dataloader)``. Only the training loader is shuffled; the
        dev and dev-train loaders use batch size 2 and the test loader 32.
    """
    max_seq_length = config['max_seq_length']

    def _featurize(examples):
        # The trailing `True` is forwarded unchanged from the original call.
        # NOTE(review): its meaning is defined by convert_examples_to_features.
        return convert_examples_to_features(examples, tokenizer, max_seq_length, True)

    train_features = _featurize(train_examples)
    dev_features = _featurize(dev_examples)
    test_features = _featurize(test_examples)
    # Featurize the small training subset; this used to re-featurize the whole
    # training split, which made the "dev-train" set identical to the train set.
    dev_train_features = _featurize(dev_train_examples)

    train_data = build_tensor(train_features)
    dev_data = build_tensor(dev_features)
    test_data = build_tensor(test_features)
    dev_train_data = build_tensor(dev_train_features)

    train_dataloader = DataLoader(train_data,
                                  sampler=RandomSampler(train_data),
                                  batch_size=config['batch_size'])
    eval_dataloader = DataLoader(dev_data,
                                 sampler=SequentialSampler(dev_data),
                                 batch_size=2)
    test_dataloader = DataLoader(test_data,
                                 sampler=SequentialSampler(test_data),
                                 batch_size=32)
    # Dataset and sampler must refer to the same data: the loader used to wrap
    # `dev_data` with a sampler sized for `dev_train_data`, yielding wrong (or
    # out-of-range) indices.
    dev_train_dataloader = DataLoader(dev_train_data,
                                      sampler=SequentialSampler(dev_train_data),
                                      batch_size=2)

    return train_dataloader, eval_dataloader, test_dataloader, dev_train_dataloader
# Build the module-level dataloaders once with a fixed sequence length and batch size.
[train_dataloader,
 eval_dataloader,
 test_dataloader,
 dev_train_dataloader] = build_dataset({'max_seq_length': 128,
                                        'batch_size':2})

# Load the pretrained multiple-choice model with three answer choices per example.
# The cache directory is the library cache path joined with 'distributed_-1'.
model = BertForMultipleChoice.from_pretrained(bert_model,
                                              cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(-1)),
                                              num_choices=3)
    

def create_model(device):
    """Freeze the lower part of the global ``model`` and move it to ``device``.

    Parameters whose name starts with ``bert.embeddings`` are frozen, as are
    ``bert.encoder`` parameters whose layer index (the fourth dot-separated
    name component) is below 6. Every other parameter keeps its current
    ``requires_grad`` value.

    Args:
        device: target passed to ``model.to``.

    Returns:
        The same global ``model`` object.
    """
    first_trainable_layer = 6
    for param_name, weight in model.named_parameters():
        if param_name.startswith('bert.encoder'):
            layer_index = int(param_name.split('.')[3])
        else:
            # Sentinel above any frozen layer index for non-encoder parameters.
            layer_index = 24

        in_embeddings = param_name.startswith('bert.embeddings')
        if in_embeddings or layer_index < first_trainable_layer:
            weight.requires_grad = False

    model.to(device)

    return model

def _model_init(config, model, device, state_dict=None, num_train_step=-1):
        total_param = sum([p.nelement() for p in model.parameters() if p.requires_grad])
        if config['cuda']:
            if config['local_rank'] != -1:
                model = model.to(device)
            else:
                model = model.to(device)
        network = model
        if state_dict:
            missing_keys, unexpected_keys = network.load_state_dict(state_dict['state'], strict=False)

        optimizer_parameters = _get_param_groups(network)
#         try:
        optimizer, scheduler = _setup_optim(config,optimizer_parameters, state_dict, num_train_step, network)
#         except Exceptio#work: {network}')
#         optimizer.zero_grad()

        #if self.config["local_rank"] not in [-1, 0]:
        #    torch.distributed.barrier()

        if config['local_rank'] != -1:
            mnetwork = torch.nn.parallel.DistributedDataParallel(network, device_ids=[self.config["local_rank"]], output_device=self.config["local_rank"], find_unused_parameters=True)
        elif config['multi_gpu_on']:
            mnetwork = torch.nn.DataParallel(network, device_ids=[0, 1, 2])
        else:
            mnetwork = network
        task_loss_criterion = _setup_lossmap(config)
        adv_task_loss_criterion = _setup_adv_lossmap(config)
        adv_teacher = _setup_adv_training(config, adv_task_loss_criterion)
        
        return [mnetwork,
                task_loss_criterion,
                adv_task_loss_criterion,
                adv_teacher,
                optimizer_parameters,
                optimizer,
                scheduler]

After sorted: support-bert-data/train/0.txt
After sorted: support-bert-data/dev/0.txt
After sorted: support-bert-data/test/0.txt
3289 race_id: support-bert-data/train/0:0, context_sentence: when tech opens info session and when he tries to close it, it hangs and will not close. i have included a screen shot as to what process  hangs in the device manager. i have tried to uninstall and reinstall the pdf application but it did not help, start_ending: what kind of issue occurred?, ending_0: a performance issue., ending_1: [NOA], ending_2: a protocol issue., label: 0
0
557 race_id: support-bert-data/dev/0:0, context_sentence: [AN56] data is showing as queued, almost all to [AN16], [AN31] seems to transmit okay. please see screen shot and [AN25] requirements log. i also submitted error report from admin client on device., start_ending: how would you describe your issue?, ending_0: the {[AN56]} protocols were transferred to {[AN31]} but could not be saved on your device (pc/{[AN1]})., ending

In [28]:
from perturbation import SmartPerturbation

def _setup_adv_training(config, adv_task_loss_criterion):
        adv_teacher = None
        if config.get('adv_train', False):
            adv_teacher = SmartPerturbation(config['adv_epsilon'],
                    config['multi_gpu_on'],
                    config['adv_step_size'],
                    config['adv_noise_var'],
                    config['adv_p_norm'],
                    config['adv_k'],
                    config['fp16'],
                    config['encoder_type'],
                    loss_map=adv_task_loss_criterion,
                    norm_level=config['adv_norm_level'])
        return adv_teacher
            
def _setup_adv_lossmap(config):
        task_def_list: List[TaskDef] = config['task_def_list']
        adv_task_loss_criterion = []
        if config.get('adv_train', False):
            cs = config['adv_loss']
            assert cs in ['LossCriterion.SymKlCriterion', 'LossCriterion.KlCriterion']
            if cs == 'LossCriterion.SymKlCriterion':
                lc = LOSS_REGISTRY[LossCriterion.SymKlCriterion](name='Adv Loss func of task {}: {}'.format(0, cs))
                adv_task_loss_criterion.append(lc)
            else:
                lc = LOSS_REGISTRY[LossCriterion.KlCriterion](name='Adv Loss func of task {}: {}'.format(0, cs))
                adv_task_loss_criterion.append(lc)
            return adv_task_loss_criterion
            
def _setup_lossmap(config):
        task_def_list: List[TaskDef] = config['task_def_list']
        task_loss_criterion = []
        cs = config['loss'] # this loss has later to be passed through config file
        if cs=='LossCriterion.CeCriterion':
            lc = LOSS_REGISTRY[LossCriterion.CeCriterion](name='Loss func of task {}: {}'.format(0, cs)) 
            task_loss_criterion.append(lc)
        
        elif cs=='LossCriterion.KlCriterion':
            lc = LOSS_REGISTRY[LossCriterion.KlCriterion](name='Loss func of task {}: {}'.format(0, cs)) 
            task_loss_criterion.append(lc)
        elif cs=='LossCriterion.SymKlCriterion':
            lc = LOSS_REGISTRY[LossCriterion.KlCriterion](name='Loss func of task {}: {}'.format(0, cs)) 
            task_loss_criterion.append(lc)
        else:
            raise NotImplementedError
        return task_loss_criterion
            
def _get_param_groups(network):
        param_optimizer = [n for n in list(network.named_parameters()) if 'pooler' not in n[0]]
        no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        return optimizer_parameters
    
from pytorch_pretrained_bert import BertAdam as Adam

def _setup_optim(config, optimizer_parameters, state_dict=None, num_train_step=-1, network=None):
    if config['optimizer'] == 'sgd':
        optimizer = optim.SGD(optimizer_parameters, config['learning_rate'],
                                   weight_decay=config['weight_decay'])

    elif config['optimizer'] == 'adamax':
        optimizer = Adamax(optimizer_parameters,
                                config['learning_rate'],
                                warmup=config['warmup'],
                                t_total=num_train_step,
                                max_grad_norm=config['grad_clipping'],
                                schedule=config['warmup_schedule'],
                                weight_decay=config['weight_decay'])
        if config.get('have_lr_scheduler', False): config['have_lr_scheduler'] = False
    elif config['optimizer'] == 'radam':
        optimizer = RAdam(optimizer_parameters,
                                config['learning_rate'],
                                warmup=config['warmup'],
                                t_total=num_train_step,
                                max_grad_norm=config['grad_clipping'],
                                schedule=config['warmup_schedule'],
                                eps=config['adam_eps'],
                                weight_decay=config['weight_decay'])
        if config.get('have_lr_scheduler', False): config['have_lr_scheduler'] = False
        # The current radam does not support FP16.
        config['fp16'] = False
    elif config['optimizer'] == 'adam':
        optimizer = Adam(optimizer_parameters,
                              lr=config['learning_rate'],
                              warmup=config['warmup'],
                              t_total=num_train_step,
                              max_grad_norm=config['grad_clipping'],
                              schedule=config['warmup_schedule'],
                              weight_decay=config['weight_decay'])
        if config.get('have_lr_scheduler', False): config['have_lr_scheduler'] = False
    else:
        raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])

    if state_dict and 'optimizer' in state_dict:
        optimizer.load_state_dict(state_dict['optimizer'])

    if config['fp16']:
        try:
            from apex import amp
            global amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(network, optimizer, opt_level=config['fp16_opt_level'])
        network = model
        optimizer = optimizer

    if config.get('have_lr_scheduler', False):
        if config.get('scheduler_type', 'rop') == 'rop':
            scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=config['lr_gamma'], patience=3)
        elif config.get('scheduler_type', 'rop') == 'exp':
            scheduler = ExponentialLR(optimizer, gamma=config.get('lr_gamma', 0.95))
        else:
            milestones = [int(step) for step in config.get('multi_step_lr', '10,20,30').split(',')]
            scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=config.get('lr_gamma'))
    else:
        scheduler = None
        
    return optimizer, scheduler

def perturbated_loss():
    """Return a ``SmartPerturbation`` configured with fixed default settings.

    The settings are hard-coded below: inf-norm, a single step (``k=1``),
    no multi-GPU, no fp16, a BERT encoder type and ``SymKlCriterion`` as the
    only entry of the loss map.
    """
    return SmartPerturbation(
        epsilon=1e-6,
        multi_gpu_on=False,
        step_size=1e-3,
        noise_var=1e-5,
        norm_p='inf',
        k=1,
        fp16=False,
        encoder_type=EncoderModelType.BERT,
        loss_map=[SymKlCriterion],
        norm_level=0,
    )

def _norm_grad(grad, norm_p, epsilon, eff_grad=None, sentence_level=False):
        eff_direction = 0
        if norm_p == 'l2':
            if sentence_level:
                direction = grad / (torch.norm(grad, dim=(-2, -1), keepdim=True) + epsilon)
            else:
                direction = grad / (torch.norm(grad, dim=-1, keepdim=True) + epsilon)
        elif norm_p == 'l1':
            direction = grad.sign()
        else:
            if sentence_level:
                direction = grad / (grad.abs().max((-2, -1), keepdim=True)[0] + epsilon)
            else:
                direction = grad / (grad.abs().max(-1, keepdim=True)[0] + epsilon)
                eff_direction = eff_grad / (grad.abs().max(-1, keepdim=True)[0] + epsilon)
        return direction, eff_direction
    
def accuracy(out, labels):
    """Count how many row-wise argmax predictions of ``out`` equal ``labels``.

    Returns the number of matches (a count, not a ratio).
    """
    predicted = np.argmax(out, axis=1)
    matches = predicted == labels
    return np.sum(matches)

from loss import stable_kl

def generate_noise(embed, mask, epsilon=1e-5):
    """Draw Gaussian noise shaped like ``embed`` and scaled by ``epsilon``.

    Args:
        embed: tensor whose size, dtype and device the noise copies.
        mask: accepted for interface compatibility; not used.
        epsilon: scale applied to the standard-normal samples.

    Returns:
        A new tensor with ``requires_grad`` enabled.
    """
    samples = embed.data.new(embed.size()).normal_(0, 1)
    perturbation = samples * epsilon
    perturbation.requires_grad_()
    return perturbation

In [29]:
def check_save_ckp(metric, best_metric, model, optimizer, save_path, epoch, metric_name='train_loss'):
    """Checkpoint the run when ``metric`` is lower than ``best_metric``.

    On improvement the new value is recorded in ``wandb.summary`` under
    ``best_<metric_name>`` and the model state, optimizer state and epoch are
    saved to ``save_path``.

    Returns:
        The better (lower) of the two values: ``metric`` when it improved,
        otherwise ``best_metric`` unchanged.
    """
    improved = metric < best_metric
    if not improved:
        return best_metric

    wandb.summary[f"best_{metric_name}"] = metric
    snapshot = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch
    }
    torch.save(snapshot, save_path)
    return metric
    
def load_checkpoint(model, optimizer, load_path):
    """Restore model and optimizer state from a checkpoint file.

    The file is expected to hold the keys ``'model_state_dict'``,
    ``'optimizer_state_dict'`` and ``'epoch'``.

    Returns:
        ``(model, optimizer, epoch)`` with both objects updated in place.
    """
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state = torch.load(load_path)
    for target, key in ((model, 'model_state_dict'), (optimizer, 'optimizer_state_dict')):
        target.load_state_dict(state[key])

    return model, optimizer, state['epoch']

In [30]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def training_loop(config,
                  train_dataloader,
                  eval_dataloader,
                  model,
                  task_loss_criterion,
                  adv_task_loss_criterion,
                  adv_teacher,
                  optimizer_parameters,
                  optimizer,
                  scheduler,
                  run_id,
                  device,
                  save_path=''):
    """Train ``model`` with optional adversarial regularization and W&B logging.

    For each batch the task loss is computed and, when ``config['adv_train']``
    is truthy, an adversarial term weighted by ``config['adv_alpha']`` is
    added. Gradients are accumulated over ``config['grad_accumulation_step']``
    batches. After the first optimizer update and then every
    ``config['log_per_updates']`` updates, the model is checkpointed, evaluated
    on the train / eval / (global) test loaders, and the early-stopping
    criterion is checked.

    Args:
        config: mapping of hyper-parameters. Mutated: entries are popped from
            ``config['adv_alpha_schedule']`` and written to ``config['adv_alpha']``.
        train_dataloader: batches of ``(input_ids, input_mask, segment_ids, label_ids)``.
        eval_dataloader: validation batches in the same layout.
        model: network to train.
        task_loss_criterion: list whose first element is the task loss.
        adv_task_loss_criterion: list whose last element is the adversarial loss.
        adv_teacher: accepted for interface compatibility; not used.
        optimizer_parameters: accepted for interface compatibility; not used.
        optimizer: optimizer stepped after each accumulation window.
        scheduler: accepted for interface compatibility; not used.
        run_id: embedded in the checkpoint file name ``model_ckp<run_id>.pt``.
        device: device every batch tensor is moved to.
        save_path: accepted for interface compatibility; not used.

    Returns:
        The trained ``model`` (returned early when early stopping triggers).
    """
    # --- preliminaries --------------------------------------------------
    output_dir = "large_models"
    # A plain os call replaces the `%mkdir` magic, which is not valid Python
    # outside IPython and complained when the directory already existed.
    os.makedirs(output_dir, exist_ok=True)
    # The two result files were opened for writing but never written to or
    # closed. Keep the on-disk effect (files created/truncated) without
    # leaking the handles.
    for results_name in ("train_results.txt", "smart_train_results.txt"):
        with open(os.path.join(output_dir, results_name), "w"):
            pass

    train_loss_meter = AverageMeter()
    adv_loss_meter = AverageMeter()
    emb_val_meter = AverageMeter()
    eff_perturb_meter = AverageMeter()
    val_loss_meter = AverageMeter()
    extracted_std_loss_meter = AverageMeter()

    grad_accumulation_step = config.get('grad_accumulation_step', 1)
    local_updates = 0
    updates = 0
    best_loss = np.inf
    last_es_criterion = np.inf
    trigger_times = 0
    stopping = False
    best_metrics = {}
    # Defined up front so the end-of-epoch histogram code works even when no
    # evaluation ran during the epoch.
    test_correct_confidence_scores = []
    test_wrong_confidence_scores = []

    try:
        # At least 1: a schedule longer than the dataloader used to yield 0 and
        # a modulo-by-zero inside the batch loop.
        adv_schedule_step = max(1, len(train_dataloader) // len(config['adv_alpha_schedule']))
    except ZeroDivisionError:
        adv_schedule_step = 1

    def _perturbed_logits(input_ids, input_mask, segment_ids, perturbed_embed):
        """Per-example forward pass on perturbed embeddings, stacked into a batch."""
        # token_type_ids receives the segment ids and attention_mask the input
        # mask, matching the positional clean forward pass
        # `model(input_ids, segment_ids, input_mask)`; the two were swapped.
        return torch.stack([
            model(input_ids=ids,
                  token_type_ids=segment,
                  attention_mask=mask,
                  inputs_embeds=in_embed)[-1]
            for ids, mask, segment, in_embed in zip(input_ids,
                                                    input_mask,
                                                    segment_ids,
                                                    perturbed_embed)
        ])

    def _log_split_metrics(split, metrics):
        """Log each metric for ``split`` and track its best value so far."""
        for key, val in metrics.items():
            wandb.log({f'{split} {key}': val})
            best_key = f'best {split} {key}'
            if val > best_metrics.get(best_key, 0):
                wandb.summary[best_key] = val
                best_metrics[best_key] = val

    for epoch in trange(int(config['epochs']), desc="Epoch"):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            if len(config['adv_alpha_schedule']) > 0 and (step % adv_schedule_step == 0):
                config['adv_alpha'] = config['adv_alpha_schedule'].pop()
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            logits = model(input_ids, segment_ids, input_mask)

            # --- task loss ----------------------------------------------
            # Value comparison; the original used `is not` against a string
            # literal, which tests object identity.
            # NOTE(review): because of that identity test the `else` branch
            # was most likely never exercised before; verify it for the
            # KlCriterion configuration.
            if config['loss'] != 'LossCriterion.KlCriterion':
                label_logits = create_true_logits(logits, label_ids)
                loss = task_loss_criterion[0](logits, label_logits, config['weight'], ignore_index=-1)
            else:
                loss = task_loss_criterion[0](logits, label_ids, config['weight'], ignore_index=-1)

            # --- adversarial regularization -----------------------------
            if config['adv_train']:
                bert = model.module.bert if config['multi_gpu_on'] else model.bert
                # NOTE(review): the embeddings layer receives `input_mask` as its
                # second argument; confirm it should not be `segment_ids`.
                embed = torch.stack([bert.embeddings(ids, mask)
                                     for ids, mask in zip(input_ids, input_mask)])  # recover batch-embeddings
                noise = generate_noise(embed, input_mask, config['adv_noise_var'])
                eff_noise = 0  # stays 0 unless a refinement step produces a tensor
                # `_` instead of `step`: the inner loop used to overwrite the
                # batch index of the enclosing loop.
                for _ in range(config['adv_k']):
                    adv_logits = _perturbed_logits(input_ids, input_mask, segment_ids, noise + embed)
                    adv_loss = stable_kl(adv_logits, logits.detach(), reduce=False)
                    delta_grad, = torch.autograd.grad(adv_loss, noise, only_inputs=True, retain_graph=False)
                    norm = delta_grad.norm()
                    if torch.isnan(norm) or torch.isinf(norm):
                        # Keep the last valid noise and stop refining it.
                        break
                    eff_delta_grad = delta_grad * config['adv_step_size']
                    delta_grad = noise + delta_grad * config['adv_step_size']
                    noise, eff_noise = _norm_grad(delta_grad,
                                                  norm_p=config['adv_p_norm'],
                                                  epsilon=config['adv_epsilon'],
                                                  eff_grad=eff_delta_grad,
                                                  sentence_level=config['adv_norm_level'])
                    noise = noise.detach()
                    noise.requires_grad_()

                adv_logits = _perturbed_logits(input_ids, input_mask, segment_ids, noise + embed)
                if config['klcriterion'] == 'flip':
                    adv_loss = adv_task_loss_criterion[-1](adv_logits, logits, ignore_index=-1)
                else:
                    adv_loss = adv_task_loss_criterion[-1](logits, adv_logits, ignore_index=-1)
                emb_val = embed.detach().abs().mean()
                # `_norm_grad` returns the int 0 when it has no effective
                # direction to report.
                if torch.is_tensor(eff_noise):
                    eff_perturb = eff_noise.detach().abs().mean().item()
                else:
                    eff_perturb = 0
                eff_perturb_meter.update(eff_perturb, config['batch_size'])

                loss += config['adv_alpha'] * adv_loss
                adv_loss_meter.update(adv_loss.item(), config['batch_size'])
                emb_val_meter.update(emb_val.item(), config['batch_size'])
            else:
                adv_loss_meter.update(0, config['batch_size'])
                emb_val_meter.update(0, config['batch_size'])
            train_loss_meter.update(loss.item(), config['batch_size'])

            # --- backward / optimizer step ------------------------------
            loss = loss / grad_accumulation_step
            if config['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                # Without this branch no gradients were ever computed in
                # full-precision runs, so `optimizer.step()` had nothing to apply.
                loss.backward()

            local_updates += 1
            did_update = local_updates % grad_accumulation_step == 0
            if did_update:
                if config['global_grad_clipping'] > 0:
                    if config['fp16']:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                       config['global_grad_clipping'])
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       config['global_grad_clipping'])
                updates += 1
                # reset number of the grad accumulation
                optimizer.step()
                optimizer.zero_grad()

            # --- periodic evaluation, checkpointing, early stopping -----
            # Gated on `did_update` so the block runs once per qualifying
            # update instead of again on every accumulation-only step.
            if did_update and (updates % config['log_per_updates'] == 0 or updates == 1):
                print('evaluating model......')
                wandb.log({"avg standard loss": train_loss_meter.avg - (config['adv_alpha'] * adv_loss_meter.avg)})
                wandb.log({"avg regularized loss": train_loss_meter.avg})

                # NOTE(review): the checkpoint file name ignores `save_path`.
                best_loss = check_save_ckp(train_loss_meter.val,
                                           best_loss,
                                           model,
                                           optimizer,
                                           f'model_ckp{run_id}.pt',
                                           epoch,
                                           'train_loss')

                # NOTE(review): this evaluates on the full training loader at
                # every logging point.
                extracted_std_tr_loss, _, metrics, scores = evaluation(model,
                                                                       train_dataloader,
                                                                       device,
                                                                       mode='train')
                extracted_std_loss_meter.update(extracted_std_tr_loss,
                                                config['batch_size'])
                wandb.log({'extracted train std loss': extracted_std_loss_meter.avg})
                _log_split_metrics('train', metrics)

                avg_val_loss, _, metrics, scores = evaluation(model,
                                                              eval_dataloader,
                                                              device,
                                                              mode='val')
                val_loss_meter.update(avg_val_loss,
                                      config['batch_size'])
                wandb.log({"avg eval loss": val_loss_meter.avg})
                _log_split_metrics('val', metrics)

                if val_loss_meter.avg < best_metrics.get('best_avg_val_loss', np.inf):
                    wandb.summary[f"best eval loss"] = val_loss_meter.avg
                    best_metrics['best_avg_val_loss'] = val_loss_meter.avg

                # test (uses the module-level `test_dataloader`)
                _, _, metrics, scores = evaluation(model,
                                                   test_dataloader,
                                                   device,
                                                   mode='test')
                _log_split_metrics('test', metrics)

                # Early stopping
                if config['early_stopping_loss'] == 'train_loss':
                    es_criterion = train_loss_meter.avg
                else:
                    es_criterion = val_loss_meter.avg

                if es_criterion > last_es_criterion:
                    trigger_times += 1
                    if trigger_times >= config['patience']:
                        stopping = True
                        break
                else:
                    trigger_times = 0

                last_es_criterion = es_criterion

                test_correct_confidence_scores = scores['correct_confidence']
                test_wrong_confidence_scores = scores['wrong_confidence']

            # NOTE(review): 'batch adv_loss' logs the running regularized loss,
            # not the adversarial term; the keys are kept so existing
            # dashboards keep working.
            wandb.log({'batch adv_loss': train_loss_meter.avg})
            wandb.log({'batch std_loss': train_loss_meter.avg - adv_loss_meter.avg})

        # --- end of epoch: confidence histograms from the latest test evaluation
        try:
            # Separate names for the table rows, so the raw scores are not
            # transformed a second time in an epoch without a new evaluation.
            correct_rows = [[sigmoid(s)] for s in test_correct_confidence_scores]
            wrong_rows = [[sigmoid(s)] for s in test_wrong_confidence_scores]
            correct_table = wandb.Table(data=correct_rows, columns=["confidence scores"])
            wrong_table = wandb.Table(data=wrong_rows, columns=["confidence scores"])

            wandb.log({f'correct_pred_scores epoch {epoch}': wandb.plot.histogram(correct_table,
                                                            "confidence scores",
                                                            title="(Correct) Prediction Score Distribution")})
            wandb.log({f'wrong_pred_scores epoch {epoch}': wandb.plot.histogram(wrong_table,
                                                        "confidence scores",
                                                        title="(Wrong) Prediction Score Distribution")})
        except Exception as e:
            # Best-effort logging: report the problem and keep training.
            print(f'EXCEPTION {e}')
            print(test_correct_confidence_scores)
            print(test_wrong_confidence_scores)

        if stopping:
            print(f'STOPPING at epoch {epoch}.')
            return model

    return model


In [17]:
from sklearn.metrics import accuracy_score, f1_score
from scipy.special import softmax

def calc_metrics(predictions, labels):
    """Compute accuracy and weighted F1 as percentages.

    Args:
        predictions: predicted class ids, one per example.
        labels: gold class ids aligned with ``predictions``.

    Returns:
        dict with keys ``'acc'`` and ``'f1'``, each multiplied by 100 and
        rounded to two decimals.
    """
    # Both metrics use the same precision; rounding accuracy to a whole
    # percent hid small differences between consecutive evaluations.
    return {'acc': np.round(100*accuracy_score(labels, predictions), 2),
            'f1': np.round(100*f1_score(labels, predictions, average="weighted"), 2)}
     

def evaluation(model, dataloader, device, mode='val'):
    """Run ``eval_model`` with gradients disabled and repackage its results.

    Args:
        model: network passed through to ``eval_model``.
        dataloader: batches passed through to ``eval_model``.
        device: device passed through to ``eval_model``.
        mode: split label; it is only validated here and not otherwise used.

    Returns:
        Tuple ``(loss_value, batch_size, metrics, scores)`` where
        ``loss_value`` is ``loss.item()`` of the loss returned by ``eval_model``.
    """
    assert mode in ['test', 'val', 'dev', 'train']
    with torch.no_grad():
        results = eval_model(model, dataloader, device)
    # The per-batch predictions are not forwarded to the caller.
    metrics, _predictions, loss, batch_size, scores = results
    return loss.item(), batch_size, metrics, scores
                         
def eval_model(model, dataloader, device):
    """Evaluate ``model`` on every batch of ``dataloader``.

    Args:
        model: callable invoked as
            ``model(input_ids, segment_ids, input_mask, label_ids, return_logits=True)``
            and expected to return ``(loss, logits)``.
        dataloader: iterable of ``(input_ids, input_mask, segment_ids, label_ids)``
            tensor batches.
        device: device every batch tensor is moved to.

    Returns:
        Tuple ``(metrics, predictions, loss, batch_size, prediction_confidence_summary)``:
        * ``metrics``: result of ``calc_metrics`` over all batches.
        * ``predictions``, ``loss``, ``batch_size``: values of the LAST batch
          only (``batch_size`` is ``len(predictions)``).
        * ``prediction_confidence_summary``: dict with the per-example
          confidence of the predicted class, split into
          ``'correct_confidence'`` and ``'wrong_confidence'`` lists.
    """
    overall_predictions = []
    overall_labels = []
    prediction_confidence_summary = {'correct_confidence': [],
                                     'wrong_confidence': []}

    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        loss, logits = model(input_ids,
                             segment_ids,
                             input_mask,
                             label_ids,
                             return_logits=True)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

        predictions = np.argmax(logits, axis=1)

        # Normalize each example separately. Without an explicit axis scipy's
        # softmax normalizes over the whole (batch, classes) array, which
        # makes the confidences depend on the batch size.
        normalized_confidence = softmax(logits, axis=1)
        prediction_confidence = np.max(normalized_confidence, axis=1)

        correct_label_prediction = prediction_confidence[predictions == label_ids]
        wrong_label_prediction = prediction_confidence[predictions != label_ids]

        prediction_confidence_summary['correct_confidence'].extend(correct_label_prediction)
        prediction_confidence_summary['wrong_confidence'].extend(wrong_label_prediction)
        overall_predictions.extend(predictions)
        overall_labels.extend(label_ids)

    metrics = calc_metrics(overall_predictions, list(overall_labels))
    # The collected confidences are returned as-is; they used to be reset to
    # empty lists right here, so callers always received no scores.
    return metrics, predictions, loss, len(predictions), prediction_confidence_summary

In [32]:
# build_dataset
# train_dataloader, eval_dataloader = build_dataset(wandb.config)

def train():
    """Run one wandb-tracked training run (also used as the sweep entry point).

    Steps: start a wandb run with ``default_config`` as defaults, build the
    model with ``create_model``, initialize network/losses/optimizer/scheduler
    with ``_model_init``, then hand everything to ``training_loop`` and close
    the wandb run.

    ``train_dataloader`` and ``eval_dataloader`` are not created here; they are
    read from the notebook's global namespace and must exist before the call.
    """
    default_config = {
             'multi_gpu_on':False,
             'max_seq_length': 128,
             'adam_eps': 6, 
             'adv_epsilon': 1e-5, # 1e-6
             'adv_train': 1,
             'adv_noise_var': 1e-5,
             'adv_norm_level': 0,
             'adv_step_size': 1e-3, #e-5 was bad
             'bin_on': False,
             'cuda': 1,
             'encoder_type': None,
             'fp16': True,
             'fp16_opt_level': 'O1',
             'global_grad_clipping': 1.0,
             'grad_accumulation_step': 1,
             'grad_clipping': 1,
             'local_rank': -1,
             'mkd_opt': 0,
             'scheduler_type': 'exp',
             'task_def_list': None,
             'warmup': 0.1,
             'warmup_schedule': 'warmup_linear',
             'weight_decay': 0,
             'weighted_on': False,
             'state_dict': None,
             'loss': 'LossCriterion.SymKlCriterion',
             'epochs': 5,
             'batch_size': 2,
             'adv_k': 1,
             'learning_rate':2e-5,
             'adv_p_norm': 'inf',
             'adv_alpha': 1,
             'optimizer': 'adam',
             'adv_loss': 'LossCriterion.SymKlCriterion',
             'klcriterion': '-',
             'weight': 0,
             'log_per_updates': 128,
             'have_lr_scheduler': True,
             'patience': 3,
             'early_stopping_loss': 'train_loss',
             'adv_alpha_schedule': []#[5, 3, 1, 0.1]#[0.1, 1, 3, 5] #[5, 3, 1, 0.1]
    }
    
    wandb.init(project="master-thesis", config=default_config)
    
    # Always bind `device`. It used to be assigned only when `cuda` was truthy,
    # so setting 'cuda' to 0 raised UnboundLocalError at create_model(device).
    cuda = default_config['cuda']
    device = 'cuda' if cuda else 'cpu'
    
    # wandb.config holds the defaults above merged with any sweep overrides.
    config = wandb.config
    run_id = wandb.run.id
    
#     # build_dataset
#     train_dataloader, eval_dataloader = build_dataset(config)
    
    # create model
    model = create_model(device)
    
    # initialize model and losses
    [mnetwork,
     task_loss_criterion,
     adv_task_loss_criterion,
     adv_teacher,
     optimizer_parameters,
     optimizer,
     scheduler] = _model_init(config=config, model=model, num_train_step=len(train_dataloader), device=device)
    
    # training mode ON
    mnetwork.train()
    
    # wandb watch
    wandb.watch(model)
    
    # train/eval
    training_loop(config,
                  train_dataloader,
                  eval_dataloader,
                  mnetwork,
                  task_loss_criterion,
                  adv_task_loss_criterion,
                  adv_teacher,
                  optimizer_parameters,
                  optimizer,
                  scheduler,
                  run_id,
                  device=device)
    
    # stop wandb
    wandb.finish()

In [None]:
train()

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
mkdir: cannot create directory ‘large_models’: File exists


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
Iteration:   0%|          | 0/1645 [00:00<?, ?it/s][A

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
evaluating model......



Iteration:   0%|          | 1/1645 [04:09<113:54:32, 249.44s/it][A
Iteration:   0%|          | 2/1645 [04:10<79:49:03, 174.89s/it] [A

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0



Iteration:   0%|          | 3/1645 [04:11<55:58:04, 122.71s/it][A

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0



Iteration:   0%|          | 4/1645 [04:12<39:16:55, 86.18s/it] [A

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0



Iteration:   0%|          | 5/1645 [04:13<27:35:37, 60.57s/it][A

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0



Iteration:   0%|          | 6/1645 [04:14<19:26:21, 42.70s/it][A
Iteration:   0%|          | 7/1645 [04:14<13:42:49, 30.14s/it][A

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1024.0



Iteration:   0%|          | 8/1645 [04:16<9:44:57, 21.44s/it] [A
Iteration:   1%|          | 9/1645 [04:17<6:58:36, 15.35s/it][A
Iteration:   1%|          | 10/1645 [04:18<5:01:13, 11.05s/it][A
Iteration:   1%|          | 11/1645 [04:19<3:39:21,  8.05s/it][A
Iteration:   1%|          | 12/1645 [04:20<2:42:43,  5.98s/it][A
Iteration:   1%|          | 13/1645 [04:21<2:02:54,  4.52s/it][A
Iteration:   1%|          | 14/1645 [04:22<1:34:42,  3.48s/it][A
Iteration:   1%|          | 15/1645 [04:23<1:14:13,  2.73s/it][A
Iteration:   1%|          | 16/1645 [04:24<1:01:04,  2.25s/it][A
Iteration:   1%|          | 17/1645 [04:25<51:52,  1.91s/it]  [A
Iteration:   1%|          | 18/1645 [04:26<45:22,  1.67s/it][A
Iteration:   1%|          | 19/1645 [04:28<40:55,  1.51s/it][A
Iteration:   1%|          | 20/1645 [04:29<37:40,  1.39s/it][A
Iteration:   1%|▏         | 21/1645 [04:30<35:22,  1.31s/it][A
Iteration:   1%|▏         | 22/1645 [04:31<33:56,  1.25s/it][A
Iteration:   1%|▏   

evaluating model......



Iteration:   8%|▊         | 128/1645 [10:43<33:07:05, 78.59s/it][A
Iteration:   8%|▊         | 129/1645 [10:44<23:17:30, 55.31s/it][A
Iteration:   8%|▊         | 130/1645 [10:45<16:25:19, 39.02s/it][A
Iteration:   8%|▊         | 131/1645 [10:46<11:36:37, 27.61s/it][A
Iteration:   8%|▊         | 132/1645 [10:47<8:14:39, 19.62s/it] [A
Iteration:   8%|▊         | 133/1645 [10:48<5:53:17, 14.02s/it][A
Iteration:   8%|▊         | 134/1645 [10:49<4:14:25, 10.10s/it][A
Iteration:   8%|▊         | 135/1645 [10:50<3:05:13,  7.36s/it][A
Iteration:   8%|▊         | 136/1645 [10:51<2:16:55,  5.44s/it][A
Iteration:   8%|▊         | 137/1645 [10:52<1:43:10,  4.11s/it][A
Iteration:   8%|▊         | 138/1645 [10:53<1:19:42,  3.17s/it][A
Iteration:   8%|▊         | 139/1645 [10:54<1:03:25,  2.53s/it][A
Iteration:   9%|▊         | 140/1645 [10:55<53:12,  2.12s/it]  [A
Iteration:   9%|▊         | 141/1645 [10:56<45:21,  1.81s/it][A
Iteration:   9%|▊         | 142/1645 [10:57<40:36,  1.62s/

evaluating model......



Iteration:  16%|█▌        | 256/1645 [16:45<26:17:03, 68.12s/it][A
Iteration:  16%|█▌        | 257/1645 [16:46<18:31:06, 48.03s/it][A
Iteration:  16%|█▌        | 258/1645 [16:47<13:05:09, 33.97s/it][A
Iteration:  16%|█▌        | 259/1645 [16:48<9:17:10, 24.12s/it] [A
Iteration:  16%|█▌        | 260/1645 [16:49<6:37:38, 17.23s/it][A
Iteration:  16%|█▌        | 261/1645 [16:50<4:45:19, 12.37s/it][A
Iteration:  16%|█▌        | 262/1645 [16:51<3:26:46,  8.97s/it][A
Iteration:  16%|█▌        | 263/1645 [16:52<2:31:47,  6.59s/it][A
Iteration:  16%|█▌        | 264/1645 [16:53<1:53:17,  4.92s/it][A
Iteration:  16%|█▌        | 265/1645 [16:54<1:26:30,  3.76s/it][A
Iteration:  16%|█▌        | 266/1645 [16:56<1:08:38,  2.99s/it][A
Iteration:  16%|█▌        | 267/1645 [16:57<54:58,  2.39s/it]  [A
Iteration:  16%|█▋        | 268/1645 [16:58<45:27,  1.98s/it][A
Iteration:  16%|█▋        | 269/1645 [16:59<38:51,  1.69s/it][A
Iteration:  16%|█▋        | 270/1645 [17:00<34:18,  1.50s/it]

evaluating model......



Iteration:  23%|██▎       | 384/1645 [23:18<27:10:46, 77.59s/it][A
Iteration:  23%|██▎       | 385/1645 [23:19<19:07:08, 54.63s/it][A
Iteration:  23%|██▎       | 386/1645 [23:20<13:28:50, 38.55s/it][A
Iteration:  24%|██▎       | 387/1645 [23:21<9:32:11, 27.29s/it] [A
Iteration:  24%|██▎       | 388/1645 [23:22<6:47:18, 19.44s/it][A
Iteration:  24%|██▎       | 389/1645 [23:23<4:51:01, 13.90s/it][A
Iteration:  24%|██▎       | 390/1645 [23:24<3:30:01, 10.04s/it][A
Iteration:  24%|██▍       | 391/1645 [23:25<2:33:20,  7.34s/it][A
Iteration:  24%|██▍       | 392/1645 [23:26<1:53:38,  5.44s/it][A
Iteration:  24%|██▍       | 393/1645 [23:27<1:25:51,  4.11s/it][A
Iteration:  24%|██▍       | 394/1645 [23:28<1:07:06,  3.22s/it][A
Iteration:  24%|██▍       | 395/1645 [23:29<53:36,  2.57s/it]  [A
Iteration:  24%|██▍       | 396/1645 [23:30<43:53,  2.11s/it][A
Iteration:  24%|██▍       | 397/1645 [23:31<37:03,  1.78s/it][A
Iteration:  24%|██▍       | 398/1645 [23:32<32:18,  1.55s/it]

evaluating model......



Iteration:  31%|███       | 512/1645 [29:21<21:38:15, 68.75s/it][A
Iteration:  31%|███       | 513/1645 [29:22<15:14:21, 48.46s/it][A
Iteration:  31%|███       | 514/1645 [29:23<10:45:46, 34.26s/it][A
Iteration:  31%|███▏      | 515/1645 [29:24<7:37:38, 24.30s/it] [A
Iteration:  31%|███▏      | 516/1645 [29:25<5:26:43, 17.36s/it][A
Iteration:  31%|███▏      | 517/1645 [29:26<3:54:15, 12.46s/it][A
Iteration:  31%|███▏      | 518/1645 [29:27<2:49:37,  9.03s/it][A
Iteration:  32%|███▏      | 519/1645 [29:28<2:05:09,  6.67s/it][A
Iteration:  32%|███▏      | 520/1645 [29:29<1:32:57,  4.96s/it][A
Iteration:  32%|███▏      | 521/1645 [29:30<1:10:52,  3.78s/it][A
Iteration:  32%|███▏      | 522/1645 [29:31<55:22,  2.96s/it]  [A
Iteration:  32%|███▏      | 523/1645 [29:33<45:05,  2.41s/it][A
Iteration:  32%|███▏      | 524/1645 [29:34<37:47,  2.02s/it][A
Iteration:  32%|███▏      | 525/1645 [29:35<32:45,  1.75s/it][A
Iteration:  32%|███▏      | 526/1645 [29:36<28:15,  1.52s/it][

evaluating model......



Iteration:  39%|███▉      | 640/1645 [35:31<19:45:35, 70.78s/it][A
Iteration:  39%|███▉      | 641/1645 [35:32<13:53:48, 49.83s/it][A
Iteration:  39%|███▉      | 642/1645 [35:33<9:47:46, 35.16s/it] [A
Iteration:  39%|███▉      | 643/1645 [35:34<6:56:31, 24.94s/it][A
Iteration:  39%|███▉      | 644/1645 [35:35<4:56:14, 17.76s/it][A
Iteration:  39%|███▉      | 645/1645 [35:36<3:32:13, 12.73s/it][A
Iteration:  39%|███▉      | 646/1645 [35:37<2:33:48,  9.24s/it][A
Iteration:  39%|███▉      | 647/1645 [35:38<1:52:58,  6.79s/it][A
Iteration:  39%|███▉      | 648/1645 [35:39<1:24:24,  5.08s/it][A
Iteration:  39%|███▉      | 649/1645 [35:40<1:04:25,  3.88s/it][A
Iteration:  40%|███▉      | 650/1645 [35:41<50:26,  3.04s/it]  [A
Iteration:  40%|███▉      | 651/1645 [35:42<40:41,  2.46s/it][A
Iteration:  40%|███▉      | 652/1645 [35:43<33:52,  2.05s/it][A
Iteration:  40%|███▉      | 653/1645 [35:44<28:40,  1.73s/it][A
Iteration:  40%|███▉      | 654/1645 [35:45<25:04,  1.52s/it][A

evaluating model......



Iteration:  47%|████▋     | 768/1645 [41:36<17:16:33, 70.92s/it][A
Iteration:  47%|████▋     | 769/1645 [41:37<12:09:08, 49.94s/it][A
Iteration:  47%|████▋     | 770/1645 [41:38<8:34:10, 35.26s/it] [A
Iteration:  47%|████▋     | 771/1645 [41:39<6:04:19, 25.01s/it][A
Iteration:  47%|████▋     | 772/1645 [41:41<4:19:29, 17.83s/it][A
Iteration:  47%|████▋     | 773/1645 [41:42<3:06:11, 12.81s/it][A
Iteration:  47%|████▋     | 774/1645 [41:43<2:14:33,  9.27s/it][A
Iteration:  47%|████▋     | 775/1645 [41:44<1:38:24,  6.79s/it][A
Iteration:  47%|████▋     | 776/1645 [41:45<1:13:14,  5.06s/it][A
Iteration:  47%|████▋     | 777/1645 [41:46<56:10,  3.88s/it]  [A
Iteration:  47%|████▋     | 778/1645 [41:47<44:06,  3.05s/it][A
Iteration:  47%|████▋     | 779/1645 [41:48<35:11,  2.44s/it][A
Iteration:  47%|████▋     | 780/1645 [41:49<29:32,  2.05s/it][A
Iteration:  47%|████▋     | 781/1645 [41:50<25:34,  1.78s/it][A
Iteration:  48%|████▊     | 782/1645 [41:51<22:13,  1.55s/it][A
I

evaluating model......



Iteration:  54%|█████▍    | 896/1645 [47:56<15:35:45, 74.96s/it][A
Iteration:  55%|█████▍    | 897/1645 [47:57<10:58:16, 52.80s/it][A
Iteration:  55%|█████▍    | 898/1645 [47:58<7:44:15, 37.29s/it] [A
Iteration:  55%|█████▍    | 899/1645 [47:59<5:28:38, 26.43s/it][A
Iteration:  55%|█████▍    | 900/1645 [48:00<3:53:47, 18.83s/it][A
Iteration:  55%|█████▍    | 901/1645 [48:01<2:47:28, 13.51s/it][A
Iteration:  55%|█████▍    | 902/1645 [48:03<2:01:06,  9.78s/it][A
Iteration:  55%|█████▍    | 903/1645 [48:03<1:28:09,  7.13s/it][A
Iteration:  55%|█████▍    | 904/1645 [48:04<1:05:05,  5.27s/it][A
Iteration:  55%|█████▌    | 905/1645 [48:05<48:57,  3.97s/it]  [A
Iteration:  55%|█████▌    | 906/1645 [48:06<37:40,  3.06s/it][A
Iteration:  55%|█████▌    | 907/1645 [48:07<30:03,  2.44s/it][A
Iteration:  55%|█████▌    | 908/1645 [48:08<25:01,  2.04s/it][A
Iteration:  55%|█████▌    | 909/1645 [48:09<21:11,  1.73s/it][A
Iteration:  55%|█████▌    | 910/1645 [48:10<18:30,  1.51s/it][A
I

evaluating model......



Iteration:  62%|██████▏   | 1024/1645 [53:51<11:35:27, 67.19s/it][A
Iteration:  62%|██████▏   | 1025/1645 [53:52<8:09:18, 47.35s/it] [A
Iteration:  62%|██████▏   | 1026/1645 [53:53<5:45:13, 33.46s/it][A
Iteration:  62%|██████▏   | 1027/1645 [53:54<4:04:25, 23.73s/it][A
Iteration:  62%|██████▏   | 1028/1645 [53:55<2:53:56, 16.92s/it][A
Iteration:  63%|██████▎   | 1029/1645 [53:56<2:04:40, 12.14s/it][A
Iteration:  63%|██████▎   | 1030/1645 [53:57<1:30:14,  8.80s/it][A
Iteration:  63%|██████▎   | 1031/1645 [53:58<1:06:28,  6.50s/it][A
Iteration:  63%|██████▎   | 1032/1645 [53:59<49:25,  4.84s/it]  [A
Iteration:  63%|██████▎   | 1033/1645 [54:00<37:30,  3.68s/it][A
Iteration:  63%|██████▎   | 1034/1645 [54:01<29:09,  2.86s/it][A
Iteration:  63%|██████▎   | 1035/1645 [54:02<23:19,  2.30s/it][A
Iteration:  63%|██████▎   | 1036/1645 [54:03<19:23,  1.91s/it][A
Iteration:  63%|██████▎   | 1037/1645 [54:04<16:36,  1.64s/it][A
Iteration:  63%|██████▎   | 1038/1645 [54:05<14:49,  1.

evaluating model......



Iteration:  70%|███████   | 1152/1645 [59:56<9:41:03, 70.72s/it][A
Iteration:  70%|███████   | 1153/1645 [59:57<6:48:28, 49.81s/it][A
Iteration:  70%|███████   | 1154/1645 [59:58<4:47:55, 35.18s/it][A
Iteration:  70%|███████   | 1155/1645 [59:59<3:23:59, 24.98s/it][A
Iteration:  70%|███████   | 1156/1645 [1:00:00<2:25:04, 17.80s/it][A
Iteration:  70%|███████   | 1157/1645 [1:00:01<1:43:48, 12.76s/it][A
Iteration:  70%|███████   | 1158/1645 [1:00:02<1:15:01,  9.24s/it][A
Iteration:  70%|███████   | 1159/1645 [1:00:03<54:57,  6.78s/it]  [A
Iteration:  71%|███████   | 1160/1645 [1:00:04<41:11,  5.10s/it][A
Iteration:  71%|███████   | 1161/1645 [1:00:05<31:38,  3.92s/it][A
Iteration:  71%|███████   | 1162/1645 [1:00:06<24:50,  3.09s/it][A
Iteration:  71%|███████   | 1163/1645 [1:00:08<19:59,  2.49s/it][A
Iteration:  71%|███████   | 1164/1645 [1:00:09<16:35,  2.07s/it][A
Iteration:  71%|███████   | 1165/1645 [1:00:10<14:14,  1.78s/it][A
Iteration:  71%|███████   | 1166/1645 [

evaluating model......



Iteration:  78%|███████▊  | 1280/1645 [1:05:57<6:56:07, 68.40s/it][A
Iteration:  78%|███████▊  | 1281/1645 [1:05:59<4:52:38, 48.24s/it][A
Iteration:  78%|███████▊  | 1282/1645 [1:06:00<3:26:25, 34.12s/it][A
Iteration:  78%|███████▊  | 1283/1645 [1:06:01<2:25:57, 24.19s/it][A
Iteration:  78%|███████▊  | 1284/1645 [1:06:02<1:43:43, 17.24s/it][A
Iteration:  78%|███████▊  | 1285/1645 [1:06:03<1:14:25, 12.40s/it][A
Iteration:  78%|███████▊  | 1286/1645 [1:06:04<53:56,  9.01s/it]  [A
Iteration:  78%|███████▊  | 1287/1645 [1:06:05<39:36,  6.64s/it][A
Iteration:  78%|███████▊  | 1288/1645 [1:06:06<29:37,  4.98s/it][A
Iteration:  78%|███████▊  | 1289/1645 [1:06:07<22:37,  3.81s/it][A
Iteration:  78%|███████▊  | 1290/1645 [1:06:08<17:30,  2.96s/it][A
Iteration:  78%|███████▊  | 1291/1645 [1:06:09<14:10,  2.40s/it][A
Iteration:  79%|███████▊  | 1292/1645 [1:06:11<11:50,  2.01s/it][A
Iteration:  79%|███████▊  | 1293/1645 [1:06:12<10:00,  1.71s/it][A
Iteration:  79%|███████▊  | 1294/

evaluating model......


In [19]:
def create_true_logits(logits, labels):
    """Build a one-hot tensor shaped like ``logits``.

    Row ``i`` of the result is all zeros except for a 1 at column
    ``labels[i]``. The result inherits dtype and device from ``logits``.
    """
    one_hot = torch.zeros_like(logits)
    for row in range(len(labels)):
        one_hot[row, labels[row]] = 1
    return one_hot

create_true_logits(logits, labels)

NameError: name 'logits' is not defined

In [None]:
# Count the CUDA devices torch can see.
n_gpu = torch.cuda.device_count()
if n_gpu > 0:
        # NOTE(review): `seed` is not assigned in this cell; it must already
        # exist in the kernel namespace — confirm it is defined earlier.
        torch.cuda.manual_seed_all(seed)
# Bare last expression so the notebook displays the device count.
n_gpu

In [None]:
wandb.agent(sweep_id, train)

# Custom metrics

In [None]:
# 0,1,2 classes (irrelevant - misleading)
def accuracy(out, labels):
    """Count the rows of ``out`` whose arg-max equals the matching label.

    Note that this returns the NUMBER of correct predictions, not a ratio;
    callers divide by the example count themselves.
    """
    predicted_classes = np.argmax(out, axis=1)
    hits = predicted_classes == labels
    return np.sum(hits)

# group data by question -> we want to look at the accuracy and f1 score for each question 

In [None]:
from transformers import DistilBertTokenizer, DistilBertForMultipleChoice


In [None]:
# !jupyter nbextension enable --py widgetsnbextension

In [None]:
from transformers import DistilBertTokenizer, DistilBertForMultipleChoice, DistilBertModel
import torch

# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
# model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')

In [None]:
def create_distil_model():
    """Freeze part of the global ``distil_model`` and move it to ``device``.

    Parameters whose name starts with ``'distilbert.embeddings'`` are frozen,
    as are parameters under ``'distilbert.encoder'`` whose fourth dot-separated
    name component (parsed as an int) is below 6. All other parameters keep
    their current ``requires_grad`` setting.

    Both ``distil_model`` and ``device`` are read from the notebook's global
    namespace.

    Returns:
        The same ``distil_model`` object.
    """
    # NOTE(review): confirm that the parameter names of this model really use
    # the 'distilbert.encoder' prefix; otherwise only the embeddings are frozen.
    for param_name, parameter in distil_model.named_parameters():
        if param_name.startswith('distilbert.encoder'):
            layer_index = int(param_name.split('.')[3])
        else:
            # Sentinel above the threshold: never frozen by the layer rule.
            layer_index = 24

        if param_name.startswith('distilbert.embeddings') or layer_index < 6:
            parameter.requires_grad = False

    distil_model.to(device)

    return distil_model

# from utils.tokenization import BertTokenizer tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

In [None]:
from utils.modeling import BertEmbeddings

class RaceDistilBert(DistilBertModel):
    """DistilBertModel variant that uses the project's ``BertEmbeddings``.

    NOTE: ``forward`` is currently in a debugging state: it prints the
    embedding output and returns before the encoder is ever called.
    """

    def __init__(self, config):
        """Build the base model, then swap the embeddings and alias the encoder."""
        super(RaceDistilBert, self).__init__(config)
        # Replace the embedding module created by the base class.
        self.embeddings = BertEmbeddings(config)
        # Alias so the BERT-style `self.encoder` name refers to `self.transformer`.
        self.encoder = self.transformer
    
    def forward(self, input_ids,
                token_type_ids=None,
                attention_mask=None,
                output_all_encoded_layers=True,
                inputs_embeds=None):
        '''Override of the base ``forward``.

        Args:
            input_ids: token id tensor; also used as the shape template for
                all masks built here.
            token_type_ids: segment ids with values 0/1/2; defaults to zeros.
            attention_mask: 1 for positions to attend to, 0 for masked
                positions; defaults to ones.
            output_all_encoded_layers: returned unchanged by the early return.
            inputs_embeds: if given, used instead of ``self.embeddings(...)``.

        Returns:
            As written, always the tuple ``(embedding_output,
            extended_attention_mask, output_all_encoded_layers)``. Everything
            after that ``return`` statement is unreachable.
        '''
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # One 0/1 mask per segment id value.
        # NOTE(review): names suggest passage / question / answer — confirm.
        P_att = torch.zeros_like(input_ids)
        Q_att = torch.zeros_like(input_ids)
        A_att = torch.zeros_like(input_ids)
        token_ids = torch.zeros_like(input_ids)

        P_att[token_type_ids==0] = 1
        Q_att[token_type_ids==1] = 1
        A_att[token_type_ids==2] = 1

        # Collapse segment ids 1 and 2 into a single id 1 for the embedding layer.
        token_ids[token_type_ids > 0] = 1

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # Same additive-mask transformation for each per-segment mask.
        P_att = P_att.unsqueeze(1).unsqueeze(2)
        P_att = P_att.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
        P_att = (1.0 - P_att) * -10000.0

        Q_att = Q_att.unsqueeze(1).unsqueeze(2)
        Q_att = Q_att.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
        Q_att = (1.0 - Q_att) * -10000.0

        A_att = A_att.unsqueeze(1).unsqueeze(2)
        A_att = A_att.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
        A_att = (1.0 - A_att) * -10000.0

        if inputs_embeds is None:
            embedding_output = self.embeddings(input_ids, token_ids)
        else:
            embedding_output = inputs_embeds
#             print(f'shape inputs_embeds: {len(inputs_embeds)}, and shape embeddings: {len(self.embeddings(input_ids, token_ids))}')

#         self.noise = self.generate_noise(embedding_output, attention_mask[0], epsilon=1e-5)
        
        # DEBUG: print the embeddings and return immediately.
        print(f'embedding output: {embedding_output}')
        return embedding_output, extended_attention_mask,output_all_encoded_layers
        # ---- Unreachable below this point (kept from the pre-debug version) ----
        try:
            encoded_layers = self.encoder(embedding_output,
                                          extended_attention_mask,
                                          output_all_encoded_layers=output_all_encoded_layers)
        except TypeError as e:
            # Retry with the `output_hidden_states` keyword if the first keyword is rejected.
            encoded_layers = self.encoder(embedding_output,
                                          extended_attention_mask,
                                          output_hidden_states=output_all_encoded_layers)
        
#         self.adv_encoded_layers = self.encoder(embedding_output+self.noise,
#                                       extended_attention_mask,
#                                       output_all_encoded_layers=output_all_encoded_layers)
        
        
        sequence_output = encoded_layers[-1]
        # NOTE(review): `self.pooler` is not assigned anywhere in this class —
        # verify the base class provides it before re-enabling this path.
        pooled_output = self.pooler(sequence_output)
        if not output_all_encoded_layers:
            encoded_layers = encoded_layers[-1]

        return extended_attention_mask,P_att,Q_att,A_att,sequence_output
    
class RaceDistilBertMultipleChoice(DistilBertForMultipleChoice):
    """Multiple-choice model whose ``distilbert`` backbone is the global ``dmodel``."""

    def __init__(self, config):
        """Build the base model, then replace its ``distilbert`` submodule."""
        super().__init__(config)
        # `dmodel` is looked up in the notebook's global namespace at
        # construction time, so it must be defined before instantiation.
        self.distilbert = dmodel

    def get_config(self):
        """Return this model's ``config`` attribute."""
        return self.config
        

In [None]:
dmodel = RaceDistilBert.from_pretrained('distilbert-base-uncased')
dmodel_mc = RaceDistilBertMultipleChoice.from_pretrained('distilbert-base-uncased')

In [None]:
dmodel_mc.distilbert.encoder

In [None]:
model.config

In [None]:
dbert = DistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')

In [None]:
from transformers import BertTokenizer, BertForMultipleChoice, BertModel
from utils.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME


In [None]:
import os
distil_model = RaceDistilBertMultipleChoice.from_pretrained('distilbert-base-uncased',
                                              cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(-1)))

In [None]:
def dtrain():
    """Set up a DistilBERT training run and return the prepared objects.

    Starts a wandb run, builds the dataloaders with ``build_dataset``, creates
    the model with ``create_distil_model`` and initializes the training
    components with ``_model_init``.

    Returns:
        list: ``[config, train_dataloader, eval_dataloader, mnetwork,
        task_loss_criterion, adv_task_loss_criterion, adv_teacher,
        optimizer_parameters, optimizer, scheduler]``.

    NOTE: the function returns before the training code at its end, so
    everything after the ``return`` statement is unreachable.
    """
    default_config = {
             'multi_gpu_on':False,
             'max_seq_length': 64,
             'adam_eps': 6, 
             'adv_epsilon': 1e-6,
             'adv_train': 1,
             'adv_noise_var': 1e-5,
             'adv_norm_level': 0,
             'adv_step_size': 1e-3,
             'bin_on': False,
             'cuda': 1,
             'encoder_type': None,
             'fp16': True,
             'fp16_opt_level': 'O1',
             'global_grad_clipping': 1.0,
             'grad_accumulation_step': 1,
             'grad_clipping': 0,
             'local_rank': -1,
             'mkd_opt': 0,
             'scheduler_type': 'ms',
             'task_def_list': None,
             'warmup': 0.1,
             'warmup_schedule': 'warmup_linear',
             'weight_decay': 0,
             'weighted_on': False,
             'state_dict': None,
             'loss': 'LossCriterion.CeCriterion',
             'epochs': 3,
             'batch_size': 4,
             'adv_k': 1,
             'learning_rate':5e-5,
             'adv_p_norm': 'inf',
             'adv_alpha': 1,
             'optimizer': 'radam',
             'adv_loss': 'LossCriterion.SymKlCriterion',
             'weight': 0
    }
    
    wandb.init(project="master-thesis", config=default_config)
    
    # Binds the raw 'cuda' flag (1), not a device string; this local is not
    # used again inside the function.
    device = default_config['cuda']
    
    # wandb.config holds the defaults above merged with any sweep overrides.
    config = wandb.config
    
    # build_dataset
    train_dataloader, eval_dataloader = build_dataset(config)
    
    # create model
    model = create_distil_model()
    
    # initialize model and losses
    # NOTE(review): no `device` argument is passed here, unlike the call in
    # train() — confirm _model_init has a suitable default.
    [mnetwork,
     task_loss_criterion,
     adv_task_loss_criterion,
     adv_teacher,
     optimizer_parameters,
     optimizer,
     scheduler] = _model_init(config=config, model=model, num_train_step=len(train_dataloader))
    
    # Early return: hands the prepared objects back to the caller.
    return [config,
                  train_dataloader,
                  eval_dataloader,
                  mnetwork,
                  task_loss_criterion,
                  adv_task_loss_criterion,
                  adv_teacher,
                  optimizer_parameters,
                  optimizer,
                  scheduler]

    # ---- Unreachable below this point ----
    # training mode ON
    model.train()
    
    # wandb watch
    wandb.watch(model)
    
    # train/eval
    training_loop(config,
                  train_dataloader,
                  eval_dataloader,
                  mnetwork,
                  task_loss_criterion,
                  adv_task_loss_criterion,
                  adv_teacher,
                  optimizer_parameters,
                  optimizer,
                  scheduler)

In [None]:
params = dtrain()

In [None]:
input_ids=training_loop(*params)

In [None]:
network = params[3]

In [None]:
network.distilbert.embeddings(input_ids[0])

In [None]:
net = DistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')
net.to(device)

In [None]:
net.distilbert.embeddings(input_ids[0]).shape

In [None]:
net_untrained = DistilBertForMultipleChoice(net.config).to(device)

In [None]:
net_untrained.distilbert.embeddings(input_ids[0])

In [None]:
help(net.distilbert.embeddings.forward), help(network.distilbert.embeddings.forward)

In [None]:
np.linalg.norm(distil_model.to(device).distilbert.embeddings(input_ids[0]).cpu().detach().numpy() - net_untrained.distilbert.embeddings(input_ids[0]).cpu().detach().numpy())

In [None]:
net.distilbert.embeddings

In [None]:
output_dir = 'large_models'
# Create the target directory up front so the writes below do not fail when
# this cell runs before anything else has created it.
os.makedirs(output_dir, exist_ok=True)

# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)

In [None]:
# Evaluate the model on the training set, collecting loss, correct-prediction
# count, logits and labels.
model.eval()

tr_loss = 0
# Initialize the accumulators under the names the loop updates and the
# following results cell reads; they were previously never initialized.
test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0
test_train_total_logits = []
test_train_total_labels = []
for input_ids, input_mask, segment_ids, label_ids in tqdm(train_dataloader, desc="Testing"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        # First call (with labels) is used for the loss, second for the logits.
        tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
        logits = model(input_ids, segment_ids, input_mask)

    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    # `accuracy` returns the number of correct predictions in the batch.
    tmp_test_accuracy = accuracy(logits, label_ids)

    test_loss += tmp_test_loss.mean().item()
    test_accuracy += tmp_test_accuracy

    nb_test_examples += input_ids.size(0)
    nb_test_steps += 1

    test_train_total_logits.append(logits)
    test_train_total_labels.append(label_ids)

# Concatenate the lists filled above (not the `total_*` lists that belong to
# the dev-set evaluation cell).
test_train_total_logits = np.concatenate(test_train_total_logits)
test_train_total_labels = np.concatenate(test_train_total_labels)

In [None]:
# np.save(output_dir+"/test_logits.npy",test_total_logits)
# np.save(output_dir+"/test_labels.npy",test_total_labels)
# Turn the accumulated sums into a per-step loss and a per-example accuracy.
# Not idempotent: re-running this cell divides the values again.
test_loss = test_loss / nb_test_steps
test_accuracy = test_accuracy / nb_test_examples

result = {'test_loss': test_loss,
          'test_accuracy': test_accuracy}

# Write one "key = value" line per metric, keys in sorted order.
output_test_file = os.path.join(output_dir, "train_acc_results.txt")
with open(output_test_file, "w") as writer:
    writer.writelines("%s = %s\n" % (key, str(result[key])) for key in sorted(result))

In [None]:
# eval_examples = read_race("./RACE/dev")
# eval_features = convert_examples_to_features(
#             eval_examples, tokenizer, args.max_seq_length, True)
#         logger.info("***** Running evaluation *****")
#         logger.info("  Num examples = %d", len(eval_examples))
#         logger.info("  Batch size = %d", args.eval_batch_size)
#         all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
#         all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
#         all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)

#         all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
#         eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # Run prediction for full data

# Load the raw dev and test examples.
dev_examples = read_data(data_location, 'dev')
test_examples = read_data(data_location, 'test')

# Convert the examples to model input features.
test_features = convert_examples_to_features(test_examples, tokenizer, max_seq_length, True)
dev_features = convert_examples_to_features(dev_examples, tokenizer, max_seq_length, True)

# Pack the features into datasets.
test_data = build_tensor(test_features)
dev_data = build_tensor(dev_features)

# Sequential dev-set loader; this rebinds the global `eval_dataloader`.
eval_sampler = SequentialSampler(dev_data)
eval_dataloader = DataLoader(dev_data, sampler=eval_sampler, batch_size=8)

In [None]:
# Evaluate the model on the dev loader, then persist logits, labels and metrics.
model.eval()

tr_loss = 0  # kept from the original cell; it is not read anywhere in this cell
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
total_logits = []
total_labels = []
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        # Fix: the loss forward pass was missing, so `tmp_eval_loss` below was
        # either undefined (NameError on a fresh kernel) or a stale value from
        # another cell. Called with labels the model yields the loss, exactly
        # as the test cells in this notebook do.
        tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
        logits = model(input_ids, segment_ids, input_mask)

    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    # NOTE(review): presumably accuracy() returns a per-batch count of correct
    # predictions, since the sum is divided by the example count - confirm.
    tmp_eval_accuracy = accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1

    total_logits.append(logits)
    total_labels.append(label_ids)

# Stack the per-batch arrays; note the names are rebound from lists to arrays.
total_logits = np.concatenate(total_logits)
total_labels = np.concatenate(total_labels)

np.save(output_dir+"/logits.npy",total_logits)
np.save(output_dir+"/labels.npy",total_labels)

# Average the loss per step and the accuracy per example.
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples

result = {'eval_loss': eval_loss,
          'eval_accuracy': eval_accuracy}

# Write one "name = value" line per metric, ordered by metric name.
output_eval_file = os.path.join(output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    for key in sorted(result.keys()):
        writer.write("%s = %s\n" % (key, str(result[key])))

In [None]:


# Evaluate the model on the test split, then persist logits, labels and metrics.

# Fix: build the test loader here. The cell used `test_dataloader` without
# creating it, so it only worked if a later cell had been run first.
# Same settings as the other test cell in this notebook (sequential order, batches of 32).
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32)

model.eval()

tr_loss = 0  # kept from the original cell; it is not read anywhere in this cell
test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0
total_logits = []
total_labels = []
for input_ids, input_mask, segment_ids, label_ids in tqdm(test_dataloader, desc="Testing"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        # Two forward passes: with labels for the loss, without labels for the logits.
        tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
        logits = model(input_ids, segment_ids, input_mask)

    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    tmp_test_accuracy = accuracy(logits, label_ids)

    test_loss += tmp_test_loss.mean().item()
    test_accuracy += tmp_test_accuracy

    nb_test_examples += input_ids.size(0)
    nb_test_steps += 1

    total_logits.append(logits)
    total_labels.append(label_ids)

# `total_logits` / `total_labels` stay as per-batch lists; the stacked arrays
# get their own names.
test_total_logits = np.concatenate(total_logits)
test_total_labels = np.concatenate(total_labels)

np.save(output_dir+"/test_logits.npy",test_total_logits)
np.save(output_dir+"/test_labels.npy",test_total_labels)

# Average the loss per step and the accuracy per example.
test_loss = test_loss / nb_test_steps
test_accuracy = test_accuracy / nb_test_examples

result = {'test_loss': test_loss,
          'test_accuracy': test_accuracy}

# Write one "name = value" line per metric, ordered by metric name.
output_test_file = os.path.join(output_dir, "test_results.txt")
with open(output_test_file, "w") as writer:
    for key in sorted(result.keys()):
        writer.write("%s = %s\n" % (key, str(result[key])))

In [None]:
# Flatten the per-batch collections into one array each by walking every
# batch and collecting its items in order.
assembled_total_logits = []
for batch_logits in total_logits:
    assembled_total_logits.extend(batch_logits)
assembled_total_logits = np.array(assembled_total_logits)

assembled_total_labels = []
for batch_labels in total_labels:
    assembled_total_labels.extend(batch_labels)
assembled_total_labels = np.array(assembled_total_labels)

# Show both arrays as the cell output.
assembled_total_labels, assembled_total_logits

In [None]:
from sklearn.metrics import classification_report as cls_report
# For each label/logit pair, take the arg-max over axis 1 as the predicted
# class and print the resulting classification report.
train_split_predictions = np.argmax(test_train_total_logits, axis=1)
print(cls_report(test_train_total_labels, train_split_predictions))

assembled_predictions = np.argmax(assembled_total_logits, axis=1)
print(cls_report(assembled_total_labels, assembled_predictions))

test_split_predictions = np.argmax(test_total_logits, axis=1)
print(cls_report(test_total_labels, test_split_predictions))

In [None]:
# Iterate the test set in its stored order, 32 examples per batch.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32)

model.eval()

tr_loss = 0
test_loss = test_accuracy = 0
nb_test_steps = nb_test_examples = 0
total_logits, total_labels = [], []
for batch in tqdm(test_dataloader, desc="Testing"):
    # Move the four batch tensors onto the target device.
    input_ids, input_mask, segment_ids, label_ids = (tensor.to(device) for tensor in batch)

    with torch.no_grad():
        # With labels the model call yields the loss; without them, the logits.
        tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
        logits = model(input_ids, segment_ids, input_mask)

    # Switch to NumPy on the CPU for metric computation and storage.
    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()

    test_loss += tmp_test_loss.mean().item()
    test_accuracy += accuracy(logits, label_ids)
    nb_test_examples += input_ids.size(0)
    nb_test_steps += 1

    total_logits.append(logits)
    total_labels.append(label_ids)

# Stack the per-batch arrays and store them next to the other run outputs.
test_total_logits = np.concatenate(total_logits)
test_total_labels = np.concatenate(total_labels)
np.save(output_dir + "/test_logits.npy", test_total_logits)
np.save(output_dir + "/test_labels.npy", test_total_labels)

# Loss is averaged per step, accuracy per example.
test_loss = test_loss / nb_test_steps
test_accuracy = test_accuracy / nb_test_examples
result = dict(test_loss=test_loss, test_accuracy=test_accuracy)

# Write one "name = value" line per metric, ordered by metric name.
output_test_file = os.path.join(output_dir, "test_results.txt")
with open(output_test_file, "w") as writer:
    for key in sorted(result):
        writer.write("%s = %s\n" % (key, result[key]))

In [None]:
# Keep the test examples whose labelled ending starts with the phrase of interest.
chosen_examples = [
    example
    for example in tqdm(test_examples, desc="Testing")
    if example.endings[example.label].startswith('identification fails')
]

# Featurise the subset and wrap it in a sequential loader (batches of 32).
chosen_features = convert_examples_to_features(
    chosen_examples,
    tokenizer,
    max_seq_length,
    is_training=False,
    debug=False,
)
chosen_data = build_tensor(chosen_features)
chosen_sampler = SequentialSampler(chosen_data)
chosen_dataloader = DataLoader(chosen_data, sampler=chosen_sampler, batch_size=32)

# Cell output: number of selected examples.
len(chosen_examples)

In [None]:
# Evaluate the model on the currently selected subset (`chosen_dataloader`).
model.eval()

tr_loss = 0  # kept from the original cell; it is not read anywhere in this cell
test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0
total_logits = []
total_labels = []
for input_ids, input_mask, segment_ids, label_ids in tqdm(chosen_dataloader, desc="Testing"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        # Two forward passes: with labels for the loss, without labels for the logits.
        tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
        logits = model(input_ids, segment_ids, input_mask)

    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    tmp_test_accuracy = accuracy(logits, label_ids)

    test_loss += tmp_test_loss.mean().item()
    test_accuracy += tmp_test_accuracy

    nb_test_examples += input_ids.size(0)
    nb_test_steps += 1

    total_logits.append(logits)
    total_labels.append(label_ids)

# The in-memory names are kept because the following cells read them.
test_total_logits = np.concatenate(total_logits)
test_total_labels = np.concatenate(total_labels)

# Fix: the subset outputs used to be saved as test_logits.npy / test_labels.npy,
# silently overwriting the arrays saved for the full test set. They now go to
# their own files.
np.save(output_dir+"/chosen_test_logits.npy",test_total_logits)
np.save(output_dir+"/chosen_test_labels.npy",test_total_labels)

# Average the loss per step and the accuracy per example.
test_loss = test_loss / nb_test_steps
test_accuracy = test_accuracy / nb_test_examples

# Metrics are kept in memory only; test_results.txt is deliberately not
# rewritten for a subset run.
result = {'test_loss': test_loss,
          'test_accuracy': test_accuracy}

In [None]:
# Cell output: the current contents of `test_total_labels`.
test_total_labels

In [None]:
# Cell output: index of the largest logit along axis 1, i.e. the predicted class per row.
np.argmax(test_total_logits, axis=1)

In [None]:
# Print the classification report comparing the stored labels against the arg-max predictions.
print(cls_report(test_total_labels, np.argmax(test_total_logits, axis=1)))

In [None]:
# Cell output: the full list of currently selected examples.
chosen_examples

In [None]:
# Collect the training examples whose labelled ending starts with the phrase
# of interest, echoing each matching ending as it is found.
chosen_examples = []
for example in tqdm(train_examples, desc="Testing"):
    if not example.endings[example.label].startswith('i cannot find my ope'):
        continue
    chosen_examples.append(example)
    print(example.endings[example.label])

# Cell output: number of matches.
len(chosen_examples)

In [None]:
# Keep the test examples whose labelled ending starts with the phrase of interest.
chosen_examples = [
    example
    for example in tqdm(test_examples, desc="Testing")
    if example.endings[example.label].startswith('i cannot find my ope')
]

# Cell output: number of matches.
len(chosen_examples)

In [None]:
# Featurise the selected examples.
chosen_features = convert_examples_to_features(
    chosen_examples,
    tokenizer,
    max_seq_length,
    is_training=False,
    debug=False,
)

# Pack the features into a dataset and iterate it in order, 32 examples per batch.
chosen_data = build_tensor(chosen_features)
chosen_sampler = SequentialSampler(chosen_data)
chosen_dataloader = DataLoader(
    chosen_data,
    sampler=chosen_sampler,
    batch_size=32,
)

# Cell output: number of selected examples.
len(chosen_examples)

In [None]:
# Evaluate the model on the currently selected subset (`chosen_dataloader`).
model.eval()

tr_loss = 0  # kept from the original cell; it is not read anywhere in this cell
test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0
total_logits = []
total_labels = []
for input_ids, input_mask, segment_ids, label_ids in tqdm(chosen_dataloader, desc="Testing"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        # Two forward passes: with labels for the loss, without labels for the logits.
        tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
        logits = model(input_ids, segment_ids, input_mask)

    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    tmp_test_accuracy = accuracy(logits, label_ids)

    test_loss += tmp_test_loss.mean().item()
    test_accuracy += tmp_test_accuracy

    nb_test_examples += input_ids.size(0)
    nb_test_steps += 1

    total_logits.append(logits)
    total_labels.append(label_ids)

# The in-memory names are kept because the following cells read them.
test_total_logits = np.concatenate(total_logits)
test_total_labels = np.concatenate(total_labels)

# Fix: the subset outputs used to be saved as test_logits.npy / test_labels.npy,
# silently overwriting the arrays saved for the full test set. They now go to
# their own files.
np.save(output_dir+"/chosen_test_logits.npy",test_total_logits)
np.save(output_dir+"/chosen_test_labels.npy",test_total_labels)

# Average the loss per step and the accuracy per example.
test_loss = test_loss / nb_test_steps
test_accuracy = test_accuracy / nb_test_examples

# Metrics are kept in memory only; test_results.txt is deliberately not
# rewritten for a subset run.
result = {'test_loss': test_loss,
          'test_accuracy': test_accuracy}

In [None]:
# Print the classification report comparing the stored labels against the arg-max predictions.
print(cls_report(test_total_labels, np.argmax(test_total_logits, axis=1)))

In [None]:
# Cell output: the current contents of `test_total_labels`.
test_total_labels

In [None]:
# Cell output: index of the largest logit along axis 1, i.e. the predicted class per row.
np.argmax(test_total_logits, axis=1)

In [None]:
# Keep the test examples whose labelled ending starts with the literal text 'The {FASTA}'.
chosen_examples = [
    example
    for example in tqdm(test_examples, desc="Testing")
    if example.endings[example.label].startswith('The {FASTA}')
]

# Cell output: number of matches.
len(chosen_examples)

In [None]:
# Pull, in matching order, each selected example's context sentence and its label.
contexts = list(map(lambda selected: selected.context_sentence, chosen_examples))
distribution = list(map(lambda selected: selected.label, chosen_examples))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of the labels of the selected examples.
# Improvement: the figure had no title or axis labels, so it could not be read on its own.
plt.hist(distribution)
plt.title("Label distribution of the selected examples")
plt.xlabel("Label")
plt.ylabel("Number of examples")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Histogram of how often each context sentence occurs among the selected examples.
# Improvement: the figure had no title or axis labels, so it could not be read on its own.
plt.hist(contexts)
plt.title("Context sentences of the selected examples")
plt.xlabel("Context sentence")
plt.ylabel("Number of examples")
plt.xticks(rotation=90)  # vertical tick labels
plt.show()