In [1]:
import copy
import csv
import datetime
import json
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset  # used to load data
from tqdm.notebook import tqdm, trange
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import (CamembertConfig,
                          CamembertForSequenceClassification,
                          CamembertTokenizer)
from transformers.data.processors.utils import InputExample, InputFeatures


In [2]:
# Relative path of the data directory.
# NOTE(review): the cells visible here pass the literal "data/" instead of this
# variable - confirm whether it is still needed.
path_data = "data/"

In [49]:
# modified from glue.py and utils.py

class InputExample(object):
    """A single classification example: an id, one or two texts and a label.

    This local definition replaces the ``InputExample`` name imported from
    ``transformers`` at the top of the notebook.

    Args:
        guid: Identifier of the example.
        text_a: First (or only) text.
        text_b: Optional second text; ``None`` when there is a single text.
        label: Optional label of the example.
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

    def __repr__(self):
        # The printable form is the JSON serialisation (trailing newline included).
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        # Deep copy so that callers cannot mutate the instance through the result.
        # Relies on the module-level ``import copy``.
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        # Relies on the module-level ``import json``; every attribute must be
        # JSON-serialisable for this to succeed.
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

    
class DataProcessor():
    """Loads the train/dev/test TSV files and turns their rows into ``InputExample`` objects."""

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file.

        Args:
            input_file: Path of the file to read.
            quotechar: Quote character forwarded to ``csv.reader``.

        Returns:
            A list of rows, each row being a list of cell strings.
        """
        # "utf-8-sig" drops a leading byte-order mark when the file has one.
        with open(input_file, "r", encoding="utf-8-sig") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))

    def get_example_from_tensor_dict(self, tensor_dict):
        """Builds an ``InputExample`` from a mapping with 'ID', 'question' and 'intention' entries.

        Each entry must expose ``.numpy()``; the question is decoded from UTF-8 bytes.
        """
        # The label is passed by keyword: as a third positional argument it
        # would land in ``text_b`` instead of ``label``.
        return InputExample(guid=tensor_dict['ID'].numpy(),
                            text_a=tensor_dict['question'].numpy().decode('utf-8'),
                            text_b=None,
                            label=str(tensor_dict['intention'].numpy()))

    def get_train_examples(self, data_dir):
        """Returns the examples read from ``<data_dir>/train.tsv``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_test_examples(self, data_dir):
        """Returns the examples read from ``<data_dir>/test.tsv``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_dev_examples(self, data_dir):
        """Returns the examples read from ``<data_dir>/dev.tsv``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """Returns the label vocabulary: the strings "0" to "50"."""
        return [str(j) for j in range(51)]

    def _create_examples(self, lines, set_type):
        """Creates examples from the rows of a TSV file.

        Args:
            lines: Rows as returned by ``_read_tsv``; the first row is treated
                as a header, printed and skipped.
            set_type: Split name used as prefix of each example id.

        Returns:
            A list of ``InputExample``. Rows with exactly three cells provide
            their own label (third cell); any other row gets the placeholder
            label "0".
        """
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                print(line)
                continue
            if not line:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue
            guid = "%s-%s" % (set_type, line[0])
            label = line[2] if len(line) == 3 else "0"
            examples.append(InputExample(guid=guid, text_a=line[1], text_b=None, label=label))
        return examples


def convert_examples_to_features(examples,
                                 tokenizer,
                                 max_length=512,
                                 label_list=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """
    Encode a sequence of examples into fixed-length ``InputFeatures``.

    Args:
        examples: Iterable of objects exposing ``text_a``, ``text_b`` and ``label``.
        tokenizer: Tokenizer whose ``encode_plus`` returns "input_ids" and "token_type_ids".
        max_length: Length every encoded sequence is padded to.
        label_list: Labels, in id order. Defaults to ``DataProcessor().get_labels()``.
        pad_on_left: Pad before the tokens instead of after them.
        pad_token: Id used to pad ``input_ids``.
        pad_token_segment_id: Id used to pad ``token_type_ids``.
        mask_padding_with_zero: When ``True`` the attention mask is ``1`` on real tokens and
            ``0`` on padding; when ``False`` the two values are swapped.

    Returns:
        A list with one ``InputFeatures`` per example, in input order.

    Raises:
        AssertionError: if an encoded sequence does not end up exactly ``max_length`` long.
        KeyError: if an example label is not in ``label_list``.
    """

    processor = DataProcessor()
    if label_list is None:
        label_list = processor.get_labels()

    # Label string -> integer id, following the order of label_list.
    label_map = dict((label, idx) for idx, label in enumerate(label_list))
    print(label_map)

    # Mask values for real tokens and for padding positions.
    real_flag = 1 if mask_padding_with_zero else 0
    pad_flag = 0 if mask_padding_with_zero else 1

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            print("Writing example %d" % (ex_index))

        encoded = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids = encoded["input_ids"]
        token_type_ids = encoded["token_type_ids"]
        attention_mask = [real_flag] * len(input_ids)

        # Build the three padding runs once, then attach them on the requested side.
        padding_length = max_length - len(input_ids)
        ids_padding = [pad_token] * padding_length
        mask_padding = [pad_flag] * padding_length
        segment_padding = [pad_token_segment_id] * padding_length
        if pad_on_left:
            input_ids = ids_padding + input_ids
            attention_mask = mask_padding + attention_mask
            token_type_ids = segment_padding + token_type_ids
        else:
            input_ids = input_ids + ids_padding
            attention_mask = attention_mask + mask_padding
            token_type_ids = token_type_ids + segment_padding

        for sequence in (input_ids, attention_mask, token_type_ids):
            assert len(sequence) == max_length, "Error with input length {} vs {}".format(len(sequence), max_length)

        label = label_map[example.label]

        features.append(InputFeatures(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      token_type_ids=token_type_ids,
                                      label=label))

    return features

In [21]:
def load_and_cache_examples(tokenizer,
                            max_seq_length,
                            data_dir,
                            which_data="train",
                            overwrite_cache = False,
                            verbose=1):
    """Return one data split as a ``TensorDataset``, caching the encoded features on disk.

    Args:
        tokenizer: Tokenizer used to encode the examples.
        max_seq_length: Length every sequence is padded to; also part of the cache file name.
        data_dir: Directory holding the TSV files; the cache file is written there too.
        which_data: Split to load: "train", "test" or "dev".
        overwrite_cache: Re-create the features even if a cache file exists.
        verbose: Progress messages are printed when greater than 0.

    Returns:
        ``TensorDataset`` of (input_ids, attention_mask, token_type_ids, labels), all ``torch.long``.

    Raises:
        ValueError: if ``which_data`` is not one of the supported splits.
    """
    # Fail early with a clear message; otherwise an unknown split would only
    # surface later as an unbound ``examples`` variable.
    known_splits = ("train", "test", "dev")
    if which_data not in known_splits:
        raise ValueError("which_data must be one of {}, got {!r}".format(known_splits, which_data))

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(data_dir, 'cached_{}_{}'.format(which_data,
                                                                        str(max_seq_length)))
    if os.path.exists(cached_features_file) and not overwrite_cache:
        if verbose > 0:
            print("Loading features from cached file %s" % cached_features_file)
        # NOTE: torch.load unpickles the file; only load cache files written by this notebook.
        features = torch.load(cached_features_file)
    else:
        if verbose > 0:
            print("Creating features from dataset file at %s" % data_dir)
        processor = DataProcessor()
        label_list = processor.get_labels()

        split_readers = {"train": processor.get_train_examples,
                         "test": processor.get_test_examples,
                         "dev": processor.get_dev_examples}
        examples = split_readers[which_data](data_dir)

        features = convert_examples_to_features(examples,
                                                tokenizer,
                                                label_list=label_list,
                                                max_length=max_seq_length,
                                                pad_on_left=False,
                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                                pad_token_segment_id=0,
        )
        if verbose > 0:
            print("Saving features into cached file %s" % cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset


def _seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def train(model, train_dataset, tokenizer, num_train_epochs, train_batch_size, learning_rate, adam_epsilon=1e-8,
          logging_steps=None, gradient_accumulation_steps=1, max_grad_norm=1.0, weight_decay=0.0,
          warmup_steps=0, save_steps=-1, output_dir=None, evaluate_during_training=False,
          seed=None, max_steps=-1):
    """Fine-tune ``model`` on ``train_dataset`` with AdamW and a linear warmup/decay schedule.

    Reads the module-level ``device`` to place each batch.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., token_type_ids=...,
            labels=...)`` whose first output is the loss.
        train_dataset: Dataset yielding (input_ids, attention_mask, token_type_ids, labels).
        tokenizer: Forwarded to ``evaluate`` when ``evaluate_during_training`` is set.
        num_train_epochs: Number of passes over the data (recomputed when ``max_steps`` > 0).
        train_batch_size: Batch size of the training ``DataLoader``.
        learning_rate: Peak learning rate.
        adam_epsilon: ``eps`` of AdamW.
        logging_steps: Log every this many optimizer updates; ``None`` means about five
            times per epoch, a value <= 0 disables logging.
        gradient_accumulation_steps: Number of batches whose gradients are summed before
            each optimizer update.
        max_grad_norm: Gradient-norm clipping threshold.
        weight_decay: Weight decay applied to every parameter except biases and
            ``LayerNorm.weight``.
        warmup_steps: Number of warmup updates of the schedule.
        save_steps: Save a checkpoint every this many updates; <= 0 disables saving.
        output_dir: Directory for checkpoints and evaluation results; defaults to a
            timestamped ``model_<date>_<time>/`` directory.
        evaluate_during_training: Run ``evaluate`` at each logging step.
        seed: When not ``None``, seeds the random generators before the first epoch.
        max_steps: When > 0, total number of optimizer updates to perform.

    Returns:
        Tuple ``(global_step, mean_loss)``: the number of optimizer updates done and the
        summed training loss divided by that number.
    """
    if output_dir is None:
        output_dir = "model_" + str(datetime.datetime.now()).split(".")[0].replace(" ","_") + "/"

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)

    if logging_steps is None:
        logging_steps = len(train_dataloader) // (gradient_accumulation_steps * 5)

    updates_per_epoch = len(train_dataloader) // gradient_accumulation_steps
    if max_steps > 0:
        t_total = max_steps
        # max(..., 1) avoids a ZeroDivisionError when an epoch holds fewer batches
        # than one accumulation cycle.
        num_train_epochs = max_steps // max(updates_per_epoch, 1) + 1
    else:
        t_total = updates_per_epoch * num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay; with the default
    # weight_decay=0.0 both groups behave exactly like a single undecayed group.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

    # Train!
    print("***** Running training *****")
    print("  Num examples = %d" % len(train_dataset))
    print("  Num Epochs = %d" % num_train_epochs)
    print(" Batch size = %d" % train_batch_size)
    # Parenthesised product: `"%d" % a * b` would format `a` and then repeat the string b times.
    print("  Total train batch size (w. parallel, distributed & accumulation) = %d" %
                   (train_batch_size * gradient_accumulation_steps))
    print("  Gradient Accumulation steps = %d" % gradient_accumulation_steps)
    print("  Total optimization steps = %d" % t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(num_train_epochs), desc="Epoch")
    if seed is not None:
        _seed_everything(seed)

    for _ in train_iterator:
        for step, batch in enumerate(train_dataloader):
            # evaluate() switches the model to eval mode, so re-enable training mode each step.
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':         batch[3]}

            # Gradients are NOT reset here: they must survive across the batches of one
            # accumulation cycle and are cleared right after each optimizer update.
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if gradient_accumulation_steps > 1:
                # Scale so that the accumulated gradient is the mean over the cycle.
                loss = loss / gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            print(loss.item())
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if logging_steps > 0 and global_step % logging_steps == 0:
                    print("\n\nEval")
                    # Log metrics
                    dict_print = {'step':global_step,
                                  'lr': scheduler.get_lr()[0],
                                  'tr_loss': (tr_loss - logging_loss)/logging_steps}
                    if evaluate_during_training:
                        results = evaluate(model, tokenizer, eval_output_dir=output_dir, verbose=-1)
                        for key, value in results.items():
                            dict_print['eval_{}'.format(key)] = value
                    print(dict_print)
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint; exist_ok keeps a re-run with the same
                    # output_dir from crashing on an existing directory.
                    save_model_dir = os.path.join(output_dir, 'checkpoint-{}'.format(global_step))
                    os.makedirs(save_model_dir, exist_ok=True)
                    model.save_pretrained(save_model_dir)
                    print("Saving model checkpoint to %s" % save_model_dir)

            # Stop as soon as the requested number of updates is reached (the schedule
            # was built for exactly max_steps updates).
            if max_steps > 0 and global_step >= max_steps:
                break

        if max_steps > 0 and global_step >= max_steps:
            train_iterator.close()
            break

    if global_step == 0:
        # No optimizer update happened; avoid dividing by zero below.
        global_step = 1

    return global_step, tr_loss / global_step


def evaluate(model, tokenizer, eval_batch_size=8, prefix="", eval_output_dir=None,
             verbose=1, data_dir="data/", max_seq_length=128, which_data="test"):
    """Compute the mean loss and the accuracy of ``model`` on one data split.

    Reads the module-level ``device`` to place each batch and leaves the model in eval mode.

    Args:
        model: Model whose first two outputs are (loss, logits) when called with labels.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        eval_batch_size: Batch size of the evaluation ``DataLoader``.
        prefix: Label printed in the banner and sub-directory of ``eval_output_dir``
            in which the results file is written.
        eval_output_dir: When not ``None``, results are appended to
            ``<eval_output_dir>/<prefix>/eval_results.txt``.
        verbose: Banner lines are printed when greater than 0.
        data_dir: Directory of the data files.
        max_seq_length: Sequence length used to encode the examples.
        which_data: Split to evaluate on.

    Returns:
        Dict with "val_loss" (mean of the per-batch losses) and "val_acc" (accuracy).

    Raises:
        ValueError: if the evaluation split contains no example.
    """
    eval_dataset = load_and_cache_examples(tokenizer=tokenizer, max_seq_length=max_seq_length,
                                           data_dir=data_dir, which_data=which_data, verbose=verbose)

    # Sequential order keeps predictions aligned with the dataset order.
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    # Eval!
    if verbose > 0:
        print("***** Running evaluation {} *****".format(prefix))
        print("  Num examples = %d" % len(eval_dataset))
        print("  Batch size = %d" % eval_batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    # Per-batch arrays are collected and concatenated once after the loop.
    logits_batches = []
    label_batches = []
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':         batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        logits_batches.append(logits.detach().cpu().numpy())
        label_batches.append(inputs['labels'].detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("Cannot evaluate: the '{}' split in {} is empty".format(which_data, data_dir))

    preds = np.concatenate(logits_batches, axis=0)
    out_label_ids = np.concatenate(label_batches, axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds_class = np.argmax(preds, axis=1)
    acc = accuracy_score(out_label_ids, preds_class)

    result = {"val_loss": eval_loss, "val_acc" : acc}

    if eval_output_dir is not None:
        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        # Create the prefix sub-directory as well, otherwise open() fails for a non-empty prefix.
        os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
        # Append mode: one line of results per evaluation run.
        with open(output_eval_file, "a") as writer:
            writer.write("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                writer.write("  %s = %s" % (key, str(result[key])))
            writer.write("\n")

    return result



In [7]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [8]:
# Pretrained checkpoint and size of the classification head.
model_name = "camembert-base"
num_labels = 51  # 1081

# Model configuration with a classification head of `num_labels` outputs.
config = CamembertConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    finetuning_task="classification",
)

In [9]:
# Tokenizer of the same pretrained checkpoint as the model.
tokenizer = CamembertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [26]:
# Instantiate the classifier from the pretrained weights, then move it to `device`.
model = CamembertForSequenceClassification.from_pretrained(model_name, config=config).to(device)


CamembertForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [10]:
# Remark: every encoded token sequence starts with id 5 and ends with id 6.
train_dataset = load_and_cache_examples(
    tokenizer=tokenizer,
    max_seq_length=128,
    data_dir="data/",
)

Loading features from cached file data/cached_train_128


In [29]:
# Fine-tune for 6 epochs at lr 3e-5, logging/evaluating every 200 updates and
# checkpointing every 400 (add max_steps=3 to cap the run for a quick check).
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    num_train_epochs=6,
    train_batch_size=32,
    learning_rate=3e-5,
    max_grad_norm=1.0,
    logging_steps=200,
    save_steps=400,
    evaluate_during_training=True,
)


***** Running training *****
  Num examples = 6422
  Num Epochs = 6
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1206


HBox(children=(IntProgress(value=0, description='Epoch', max=6, style=ProgressStyle(description_width='initial…

3.954603433609009
3.919445753097534
3.9322924613952637
3.8946921825408936
3.910825252532959
3.8993918895721436
3.863528251647949
3.7767529487609863
3.8153207302093506
3.8207786083221436
3.850013494491577
3.8282217979431152
3.7205209732055664
3.8224904537200928
3.671534776687622
3.7041499614715576
3.779334783554077
3.729846954345703
3.744424819946289
3.7205638885498047
3.655184030532837
3.7368013858795166
3.6692581176757812
3.6277525424957275
3.6878747940063477
3.7105698585510254
3.7141177654266357
3.6039962768554688
3.6324462890625
3.701836347579956
3.6019062995910645
3.719346046447754
3.627075433731079
3.555041790008545
3.6106109619140625
3.6970605850219727
3.50252366065979
3.533339738845825
3.5799365043640137
3.489165782928467
3.6448733806610107
3.5750460624694824
3.746976852416992
3.5653815269470215
3.404083251953125
3.47594952583313
3.589625835418701
3.4910640716552734
3.6208016872406006
3.4820127487182617
3.4821834564208984
3.636002540588379
3.4179749488830566
3.4103522300720215
3

2.3777406215667725
2.4666073322296143
2.4074866771698
1.8573601245880127
2.2648375034332275
1.9055452346801758
2.365907669067383
2.59787654876709
1.774544596672058
2.5134644508361816
2.082928419113159
1.5987881422042847
2.1852478981018066
2.151304244995117
2.359860420227051
2.231125831604004
1.9675766229629517
2.02148699760437
1.9921717643737793
1.9689817428588867
2.416802406311035
1.8812065124511719
1.8529179096221924
1.9608919620513916
2.246354579925537
2.1809918880462646
2.1118862628936768
2.418818950653076
1.995296597480774
2.6412107944488525
2.3734264373779297
2.497556686401367
1.8912078142166138
1.8806664943695068
2.1332311630249023
1.7059204578399658
1.9621014595031738
2.343445301055908
1.4720436334609985
2.1501176357269287
1.8762855529785156
2.0764176845550537
1.8204073905944824
2.250467300415039
1.519646406173706
1.9727267026901245
2.2060749530792236
2.1010782718658447
2.2535340785980225
1.5646275281906128
2.1344330310821533
1.5989606380462646
2.4452390670776367
1.969888806343

1.387610912322998
1.6897379159927368
1.7436795234680176
1.5588253736495972
1.4303996562957764
1.6970914602279663
1.687085747718811
1.6163684129714966
1.3932645320892334
1.6825517416000366
1.3027198314666748
1.2796508073806763
2.0354061126708984
1.7055226564407349
2.162390947341919
1.479465365409851
1.243815302848816
1.5422340631484985
1.6482863426208496
1.412458896636963
1.4626909494400024
1.56338369846344
1.4403122663497925
1.6146053075790405
1.6690177917480469
1.7449405193328857
1.5703368186950684
1.3643383979797363
1.3102856874465942
1.2196203470230103
1.3443859815597534
1.642361044883728
1.603108286857605
1.949116826057434
1.32575261592865
1.2480512857437134
1.4743558168411255
1.0056265592575073
1.7110340595245361
1.547686219215393
1.7824727296829224
1.7960782051086426
1.3086618185043335
1.4965877532958984
1.4019116163253784
1.3121554851531982
1.5954238176345825
1.587134838104248
1.5431674718856812
2.0163254737854004
1.3665111064910889
1.266579270362854
1.2261898517608643
1.4483683

(1206, 2.032258079716222)

In [27]:
# Fine-tune for 2 epochs at lr 1e-5, logging/evaluating and checkpointing every
# 100 updates (add max_steps=3 to cap the run for a quick check).
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    num_train_epochs=2,
    train_batch_size=32,
    learning_rate=1e-5,
    max_grad_norm=1.0,
    logging_steps=100,
    save_steps=100,
    evaluate_during_training=True,
)


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

1.5291413068771362
1.3760437965393066
1.271986722946167
1.2752169370651245
1.423956036567688
1.489784598350525
1.257717251777649
1.7602481842041016
1.3991947174072266
1.3344826698303223
1.1536766290664673
1.3781737089157104
1.5173650979995728
1.2899410724639893
1.28874933719635
1.5700956583023071
1.499956488609314
1.3890092372894287
1.699430227279663
1.1812721490859985
1.4770766496658325
1.291730523109436
1.4695543050765991
1.42909574508667
1.6804746389389038
0.9253361225128174
1.6083953380584717
1.3092269897460938
1.398566484451294
1.0211924314498901
1.4162847995758057
1.484044075012207
1.374495506286621
1.1443562507629395
1.6540266275405884
1.413906455039978
0.9574322700500488
1.3369084596633911
1.6368544101715088
1.2152783870697021
1.3103543519973755
1.5962672233581543
1.0649263858795166
1.4034196138381958
1.132301688194275
0.9981662034988403
1.485278606414795
1.1404260396957397
1.3492188453674316
1.3749096393585205
0.8825526833534241
1.3868935108184814
1.0810794830322266
1.73945367

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 7.512437810945274e-06, 'tr_loss': 1.3423914289474488, 'eval_val_loss': 1.4864923181521952, 'eval_val_acc': 0.6930261519302615}
Saving model checkpoint to model_2019-12-08_16:42:05/checkpoint-100
1.4397130012512207
1.0743200778961182
1.9257392883300781
1.49910306930542
1.421026587486267
1.5582592487335205
1.4413692951202393
1.2272461652755737
1.3910601139068604
1.2512813806533813
1.1328109502792358
1.274594783782959
1.554145097732544
1.7303335666656494
1.103973150253296
1.3483951091766357
1.668999195098877
1.387466311454773
1.729688048362732
1.3657249212265015
1.1877058744430542
1.559802532196045
1.094299554824829
1.1550698280334473
0.9296040534973145
1.3432326316833496
1.3198602199554443
1.3242332935333252
1.4953231811523438
1.4222211837768555
1.651379942893982
1.4413721561431885
1.4494760036468506
1.6019452810287476
1.5938012599945068
1.4754812717437744
1.356894612312317
1.1175353527069092
1.490912675857544
1.5059711933135986
1.6439845561981201
1.627938985824585
1

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 5.024875621890548e-06, 'tr_loss': 1.3083149284124374, 'eval_val_loss': 1.4557405588638723, 'eval_val_acc': 0.6948941469489415}
Saving model checkpoint to model_2019-12-08_16:42:05/checkpoint-200
1.4467135667800903
1.1997665166854858
1.499519944190979
1.4842236042022705
0.938872754573822
1.165278673171997
1.1548994779586792
1.555411458015442
1.3297077417373657
1.6220840215682983
1.814465880393982
1.0481539964675903
1.1095833778381348
1.3901793956756592
1.1731042861938477
1.0144355297088623
1.2803597450256348
1.4280000925064087
1.075878620147705
1.3442405462265015
1.3417088985443115
1.1948506832122803
1.5536450147628784
1.68588125705719
1.6708399057388306
1.4539375305175781
1.157477855682373
1.2163519859313965
1.5277204513549805
1.2543871402740479
1.2155860662460327
1.1338876485824585
1.236772060394287
1.1670016050338745
1.361514925956726
1.4334379434585571
1.4625957012176514
1.2008278369903564
1.4869279861450195
1.1066888570785522
1.1611301898956299
1.12077736854553

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 2.537313432835821e-06, 'tr_loss': 1.2490742474794387, 'eval_val_loss': 1.4330960710843403, 'eval_val_acc': 0.6961394769613948}
Saving model checkpoint to model_2019-12-08_16:42:05/checkpoint-300
1.0728530883789062
1.3729627132415771
1.247690200805664
1.006927490234375
1.2159310579299927
1.099407434463501
1.241063117980957
1.0981887578964233
0.935329020023346
1.14821457862854
1.0685583353042603
2.311615467071533
0.8720554709434509
1.2676385641098022
1.3259085416793823
1.357863187789917
0.8670370578765869
1.2742196321487427
1.18605637550354
1.4959057569503784
1.0342189073562622
1.1689378023147583
1.3381742238998413
1.2568353414535522
1.2011667490005493
1.182530403137207
0.8993980288505554
1.5340664386749268
0.8627333641052246
1.4123079776763916
1.583191156387329
1.1602030992507935
1.5810211896896362
0.847210168838501
1.4602994918823242
1.1463831663131714
0.9310146570205688
1.1775867938995361
0.8789504766464233
1.3885189294815063
1.292022705078125
1.2349308729171753
1

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 4.975124378109453e-08, 'tr_loss': 1.2117482835054398, 'eval_val_loss': 1.428192032213828, 'eval_val_acc': 0.6986301369863014}
Saving model checkpoint to model_2019-12-08_16:42:05/checkpoint-400
1.5094091892242432
1.1343151330947876



(402, 1.2781010277532225)

In [28]:
# Two more epochs at lr 1e-5 on the same `model` object, logging/evaluating and
# checkpointing every 100 updates (add max_steps=3 to cap the run for a quick check).
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    num_train_epochs=2,
    train_batch_size=32,
    learning_rate=1e-5,
    max_grad_norm=1.0,
    logging_steps=100,
    save_steps=100,
    evaluate_during_training=True,
)


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

1.2926764488220215
1.205703854560852
1.2494045495986938
0.7891292572021484
1.3660897016525269
1.3403221368789673
1.5453382730484009
1.373969316482544
1.0697957277297974
1.1001777648925781
1.1897401809692383
1.1086492538452148
0.9734945297241211
1.4101479053497314
1.2624454498291016
0.9592949748039246
1.2468078136444092
1.0635957717895508
1.0990490913391113
0.9746720790863037
1.6746771335601807
1.4903929233551025
1.0239332914352417
1.1964316368103027
0.814210832118988
1.4332537651062012
1.2852836847305298
1.3009144067764282
0.9727036356925964
1.0650200843811035
1.2283895015716553
1.1107040643692017
1.2731916904449463
1.050178050994873
1.4980792999267578
1.1767499446868896
0.9726148843765259
1.7135181427001953
1.7980890274047852
0.9585139155387878
1.2098236083984375
1.1225398778915405
1.2107517719268799
1.1616615056991577
1.0890860557556152
1.3734716176986694
1.0180606842041016
1.1856310367584229
0.9483907222747803
1.2664817571640015
1.1461856365203857
1.61179780960083
1.0500191450119019

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 7.512437810945274e-06, 'tr_loss': 1.1955170494318008, 'eval_val_loss': 1.3990696486252456, 'eval_val_acc': 0.7067247820672479}
Saving model checkpoint to model_2019-12-08_16:58:45/checkpoint-100
1.4285149574279785
0.9533700942993164
1.2742977142333984
1.1931297779083252
1.1537102460861206
0.9982535243034363
1.450754165649414
1.1052987575531006
1.267979383468628
1.3092106580734253
0.9941190481185913
1.4167113304138184
1.2682169675827026
1.3658263683319092
1.3286105394363403
1.0674643516540527
1.3107589483261108
1.373126745223999
1.476521372795105
0.9340001344680786
0.9873515367507935
1.4004487991333008
1.5188748836517334
0.8685749769210815
1.3119795322418213
1.0322304964065552
0.9281020164489746
1.4018492698669434
0.9535475373268127
0.7626832127571106
1.2930245399475098
0.9912359118461609
0.7266678214073181
1.1534703969955444
1.7676337957382202
1.0587838888168335
1.269888997077942
1.5595409870147705
1.358846664428711
1.2256909608840942
0.847856879234314
1.4703872203

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 5.024875621890548e-06, 'tr_loss': 1.1594227766990661, 'eval_val_loss': 1.3843739181015622, 'eval_val_acc': 0.7092154420921544}
Saving model checkpoint to model_2019-12-08_16:58:45/checkpoint-200
1.3229502439498901
1.180606722831726
0.9512245655059814
0.767478883266449
1.3163586854934692
1.0449525117874146
1.1524583101272583
1.3586854934692383
1.1731059551239014
1.17861807346344
1.0343276262283325
1.1256895065307617
1.2712280750274658
1.2807605266571045
1.2122539281845093
1.3527944087982178
1.167405128479004
0.9054272770881653
1.5193367004394531
1.0073606967926025
0.8639300465583801
1.0515003204345703
0.9281935691833496
1.3505922555923462
0.7995290756225586
1.1154625415802002
0.8723435401916504
1.256285309791565
0.852118730545044
0.7721924185752869
1.251552700996399
1.5881752967834473
1.1011179685592651
1.288124680519104
0.7551161646842957
1.0105736255645752
1.2970422506332397
1.3197321891784668
1.0813552141189575
1.3316774368286133
1.396016001701355
1.2442312240600

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 2.537313432835821e-06, 'tr_loss': 1.1109099382162093, 'eval_val_loss': 1.360169146043151, 'eval_val_acc': 0.7185554171855542}
Saving model checkpoint to model_2019-12-08_16:58:45/checkpoint-300
0.9449523687362671
1.2324509620666504
1.0795823335647583
1.4507834911346436
1.1276551485061646
1.2034868001937866
1.3109344244003296
0.7649195194244385
1.0284192562103271
0.894287645816803
1.159991979598999
1.3553235530853271
0.8119962215423584
1.4318526983261108
1.3282030820846558
1.1777844429016113
1.0274657011032104
0.835600733757019
1.2413063049316406
1.0732508897781372
0.8712643384933472
0.9717802405357361
0.8621628880500793
1.0540229082107544
1.4134761095046997
1.2432392835617065
0.7782090902328491
0.9315853714942932
0.8314707279205322
1.2866781949996948
1.5986026525497437
0.7515439391136169
1.1726484298706055
1.1227097511291504
1.448766827583313
1.1019176244735718
0.8118078112602234
0.7616421580314636
1.0096495151519775
0.8925965428352356
1.0815200805664062
1.14849245

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 4.975124378109453e-08, 'tr_loss': 1.0813915705680848, 'eval_val_loss': 1.3527800427443946, 'eval_val_acc': 0.7179327521793275}
Saving model checkpoint to model_2019-12-08_16:58:45/checkpoint-400
1.0298988819122314
1.277644157409668



(402, 1.136894717738403)

In [29]:
# Run another 2-epoch training pass on `train_dataset` with the `model` object
# currently held in the kernel (batch size 32, learning rate 1e-5).
# With logging_steps=100 and save_steps=100 the recorded run prints an
# evaluation summary and writes a checkpoint every 100 steps.
# The call is the last expression of the cell, so its return value is displayed
# (a 2-tuple in the recorded output -- presumably (global step, mean loss)).
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    train_batch_size=32,
    learning_rate=1e-5,
    num_train_epochs=2,
    evaluate_during_training=True,
    logging_steps=100,
    max_grad_norm=1.0,
    save_steps=100,
)  # author's note: `max_steps=3` could be appended here (presumably for a short trial run)


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

1.0478357076644897
0.9674321413040161
1.3237180709838867
0.7782022953033447
0.8142008185386658
1.0412660837173462
1.084148645401001
1.309620976448059
0.7301192879676819
1.0520362854003906
1.1408154964447021
1.6579899787902832
1.070573091506958
0.9094818234443665
0.7035747766494751
0.7360749840736389
1.269192099571228
1.6736085414886475
1.111518383026123
1.0096732378005981
1.0711514949798584
1.0777510404586792
0.9288709163665771
1.1376121044158936
1.3790130615234375
1.003525972366333
1.029054045677185
1.288703203201294
1.0027823448181152
1.0033659934997559
1.112678050994873
0.9815242290496826
1.2107261419296265
0.7972946166992188
1.136106014251709
0.945760190486908
1.2779288291931152
1.2883026599884033
1.383240818977356
0.9894634485244751
1.0447194576263428
1.2072023153305054
1.1119787693023682
1.456343412399292
0.7907453775405884
0.8077322840690613
1.1752145290374756
0.9197046160697937
1.2476252317428589
1.0101791620254517
1.0557496547698975
1.1516753435134888
0.82579505443573
1.340111

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 7.512437810945274e-06, 'tr_loss': 1.0696197521686555, 'eval_val_loss': 1.3435630122227455, 'eval_val_acc': 0.7160647571606475}
Saving model checkpoint to model_2019-12-08_17:15:53/checkpoint-100
1.2199398279190063
1.115576148033142
0.9271459579467773
1.0549319982528687
1.1092610359191895
0.8145690560340881
1.3217670917510986
0.6121776103973389
0.7946576476097107
0.9131174087524414
0.9486178755760193
1.0248092412948608
0.9654115438461304
1.1609313488006592
0.6445119380950928
0.7355023622512817
1.074260950088501
1.3547309637069702
1.0848641395568848
0.8523067831993103
1.033695101737976
0.8523840308189392
0.9517049789428711
1.155859112739563
1.181068778038025
0.8685779571533203
1.0467803478240967
1.2170536518096924
0.6219874620437622
0.8155491352081299
1.2923099994659424
1.0289267301559448
1.1733591556549072
0.56656414270401
0.9449780583381653
0.9614055156707764
0.9801222085952759
1.042612910270691
1.1919702291488647
0.7356554269790649
1.3198670148849487
1.07032346725

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 5.024875621890548e-06, 'tr_loss': 1.0254551780223846, 'eval_val_loss': 1.3134246865908306, 'eval_val_acc': 0.7229140722291407}
Saving model checkpoint to model_2019-12-08_17:15:53/checkpoint-200
0.8186774849891663
0.7609761357307434
1.153896689414978
1.4141769409179688
1.3532202243804932
0.8013406991958618
1.2053855657577515
0.6319950222969055
1.3167567253112793
1.1549943685531616
1.0426440238952637
0.7696332335472107
1.0631171464920044
0.8149642944335938
0.935106635093689
0.698583722114563
1.295013666152954
1.0268244743347168
0.6246161460876465
0.8833678960800171
1.16328763961792
0.8940533995628357
0.891200840473175
0.9751816987991333
0.7702134847640991
0.5344716310501099
1.0368844270706177
1.3868017196655273
0.7501935958862305
1.042136788368225
1.0707663297653198
1.0284472703933716
1.074666976928711
0.9536914229393005
1.1881054639816284
1.3338607549667358
1.0612831115722656
1.7444316148757935
1.0717419385910034
0.7073346376419067
0.7073949575424194
0.693108975887

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 2.537313432835821e-06, 'tr_loss': 0.9803498816490174, 'eval_val_loss': 1.3008361602600533, 'eval_val_acc': 0.724159402241594}
Saving model checkpoint to model_2019-12-08_17:15:53/checkpoint-300
1.025551199913025
1.0024468898773193
0.7840942740440369
0.9417954683303833
1.2223701477050781
0.8337961435317993
1.6116669178009033
1.0274925231933594
1.0235536098480225
1.183261752128601
0.8087616562843323
0.7172253131866455
1.0573499202728271
0.9033139944076538
0.7980811595916748
0.5805630087852478
0.749959409236908
0.7877103090286255
0.768212080001831
0.7952524423599243
0.9708897471427917
1.0509753227233887
0.7258901596069336
0.9264586567878723
1.0558722019195557
0.7573355436325073
1.0417824983596802
0.865876317024231
1.1899298429489136
0.6399694681167603
1.0149022340774536
0.7003359198570251
0.9970847368240356
0.8784528970718384
0.8391036987304688
1.1557376384735107
0.6395261287689209
1.0344500541687012
1.0313994884490967
1.1637247800827026
1.0489524602890015
1.525839805

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 4.975124378109453e-08, 'tr_loss': 0.9750852239131927, 'eval_val_loss': 1.2949534687236768, 'eval_val_acc': 0.7247820672478207}
Saving model checkpoint to model_2019-12-08_17:15:53/checkpoint-400
0.7839016914367676
1.23289954662323



(402, 1.012606479635286)

In [30]:
# Same configuration as the preceding training cell: 2 epochs over
# `train_dataset`, batch size 32, learning rate 1e-5, with evaluation logging
# and checkpointing every 100 steps.  `model` is passed as-is from the kernel
# namespace (NOTE(review): presumably it carries the weights left by the
# previous run -- confirm `train` updates it in place).
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    train_batch_size=32,
    learning_rate=1e-5,
    num_train_epochs=2,
    evaluate_during_training=True,
    logging_steps=100,
    max_grad_norm=1.0,
    save_steps=100,
)  # author's note: `max_steps=3` could be appended here (presumably for a short trial run)


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

0.761407196521759
0.927995502948761
1.1218477487564087
1.0910820960998535
1.0254580974578857
0.7351518869400024
1.0826733112335205
0.7810645699501038
0.7432273626327515
0.8345043063163757
1.0071285963058472
1.2719242572784424
1.1837204694747925
0.6667242050170898
0.9282634854316711
1.2545182704925537
1.022404670715332
1.1827224493026733
0.7315373420715332
1.1821491718292236
1.2056535482406616
1.3130919933319092
1.0387088060379028
0.5785367488861084
0.6941615343093872
0.9657138586044312
1.1190922260284424
0.8521401882171631
1.3540945053100586
0.9827744364738464
1.162158489227295
1.190825343132019
1.1730254888534546
0.6780453324317932
0.773576021194458
0.7816717624664307
1.311280369758606
1.1483969688415527
1.18199622631073
1.209343671798706
0.8860100507736206
0.8572548627853394
0.7134131789207458
1.0973085165023804
1.4822558164596558
0.9425055384635925
0.9580510854721069
1.373572826385498
0.9552561044692993
1.052029013633728
0.9770078659057617
1.4207968711853027
0.7584392428398132
0.998

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 7.512437810945274e-06, 'tr_loss': 0.9508729267120362, 'eval_val_loss': 1.2708618108609422, 'eval_val_acc': 0.7254047322540473}
Saving model checkpoint to model_2019-12-08_17:29:37/checkpoint-100
1.1621477603912354
1.2358688116073608
1.0195919275283813
0.8020496368408203
0.8615522384643555
0.8044514060020447
1.2633869647979736
0.9472253322601318
0.7708802223205566
0.9887212514877319
0.9744982123374939
0.790160596370697
0.8389945030212402
0.9213735461235046
1.0322339534759521
0.9953817129135132
0.587010383605957
1.1177021265029907
1.261479139328003
0.8902019262313843
0.8549672365188599
1.110206127166748
1.0213701725006104
0.7375018000602722
0.6056309938430786
0.6968594789505005
0.8388675451278687
1.0414785146713257
0.9560311436653137
1.1596312522888184
0.9000895023345947
1.1050128936767578
0.7572568655014038
1.215576171875
0.6310856342315674
0.9624018669128418
0.7475655674934387
0.985760509967804
0.819367527961731
0.9547679424285889
1.0371172428131104
1.1602843999862

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 5.024875621890548e-06, 'tr_loss': 0.9294627469778061, 'eval_val_loss': 1.2641369697466418, 'eval_val_acc': 0.726027397260274}
Saving model checkpoint to model_2019-12-08_17:29:37/checkpoint-200
1.2415300607681274
1.086782693862915
0.6770467162132263
0.7811872363090515
0.6345096230506897
0.8697549700737
1.6095200777053833
0.4851369857788086
0.9149253964424133
0.8859503865242004
1.182004690170288
0.6391107439994812
1.4757195711135864
0.7569196224212646
0.8826242089271545
0.7868229150772095
0.8476423025131226
0.5703203678131104
0.7289307117462158
0.5963057279586792
0.9682862758636475
0.9917241930961609
0.7426474690437317
1.0122712850570679
0.945374608039856
0.8036654591560364
0.5530684590339661
0.6902880668640137
0.780523419380188
0.8622916340827942
1.1527124643325806
0.9968894720077515
0.705843448638916
0.7113032341003418
0.8519794344902039
0.6738866567611694
1.2469189167022705
1.033087968826294
0.7041376829147339
1.0023002624511719
0.7854434847831726
0.8525940775871

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 2.537313432835821e-06, 'tr_loss': 0.8981926128268242, 'eval_val_loss': 1.2475524187384552, 'eval_val_acc': 0.7297633872976339}
Saving model checkpoint to model_2019-12-08_17:29:37/checkpoint-300
0.6847700476646423
0.7501887679100037
0.9758374094963074
0.9079113006591797
0.8994448184967041
0.5696734189987183
1.0245028734207153
0.9469355940818787
0.7557604908943176
0.8395162224769592
1.125745177268982
1.0087133646011353
0.4923021197319031
0.6981740593910217
0.5721426010131836
0.9789587259292603
0.5495090484619141
0.503866970539093
0.677960991859436
0.9360248446464539
1.0621598958969116
0.7986932396888733
0.8844207525253296
0.7624691128730774
0.6075289845466614
0.8526597023010254
0.9608372449874878
0.9813931584358215
0.7515769600868225
0.8201626539230347
0.9434528350830078
1.0451163053512573
0.6656144857406616
0.7817803621292114
0.6500221490859985
0.5400607585906982
0.9119657874107361
0.9356212019920349
0.8955280184745789
0.8232804536819458
0.9200849533081055
0.718128

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 4.975124378109453e-08, 'tr_loss': 0.8515936052799224, 'eval_val_loss': 1.2377146989551944, 'eval_val_acc': 0.7316313823163139}
Saving model checkpoint to model_2019-12-08_17:29:37/checkpoint-400
0.9178459644317627
1.619006633758545



(402, 0.9093259745717641)

In [31]:
# Training pass with the learning rate set to 5e-6 (the two cells before this
# one used 1e-5); every other argument is unchanged: 2 epochs, batch size 32,
# evaluation logging and checkpointing every 100 steps.
# The cell ends on the call, so the value returned by `train` is displayed.
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    train_batch_size=32,
    learning_rate=5e-6,
    num_train_epochs=2,
    evaluate_during_training=True,
    logging_steps=100,
    max_grad_norm=1.0,
    save_steps=100,
)  # author's note: `max_steps=3` could be appended here (presumably for a short trial run)


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

0.6846247315406799
0.4440154433250427
0.9250346422195435
0.8490475416183472
0.7260299324989319
0.9199112057685852
0.7575364112854004
1.1149746179580688
0.9524659514427185
0.7318416237831116
0.9178478717803955
0.8710548281669617
0.7482191920280457
0.7189096808433533
0.7286731600761414
0.7875323295593262
0.8820873498916626
0.715248167514801
0.7448263168334961
0.6143029928207397
1.199554681777954
0.7873803377151489
0.7587928771972656
0.938199520111084
0.9610286951065063
0.856558084487915
0.5225668549537659
0.8068720102310181
0.49251678586006165
0.974794864654541
0.7866537570953369
0.5217487812042236
0.7449788451194763
0.7421456575393677
0.8023957014083862
0.8003404140472412
0.43653130531311035
1.1521172523498535
0.5876345038414001
1.0904920101165771
0.7426061034202576
0.4159899353981018
0.8205618858337402
0.6209635734558105
0.9077222347259521
0.6004633903503418
1.1808452606201172
1.0180401802062988
1.1260123252868652
0.7708035707473755
0.9400575757026672
0.7357417941093445
0.7090139389038

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 3.756218905472637e-06, 'tr_loss': 0.8060041210055351, 'eval_val_loss': 1.2355094197674177, 'eval_val_acc': 0.7353673723536738}
Saving model checkpoint to model_2019-12-08_17:44:02/checkpoint-100
0.6988605260848999
0.7793618440628052
0.6593629717826843
0.5982028245925903
0.7946747541427612
0.8408485054969788
0.8539568185806274
0.8589860796928406
0.7073273062705994
0.6026161909103394
0.8124390840530396
1.1289552450180054
1.117707371711731
1.095635175704956
0.8735032677650452
0.8328152894973755
0.9719730615615845
0.9456930160522461
1.011049747467041
0.5850551128387451
0.6689412593841553
1.206134557723999
0.8383857011795044
0.7908294796943665
1.1204849481582642
0.8970261812210083
0.9590781927108765
1.046330451965332
0.6957674026489258
0.9370667934417725
0.8555658459663391
0.9725143313407898
0.8698849678039551
0.753929615020752
0.7570788264274597
1.0257861614227295
0.7211611270904541
0.6433303356170654
0.808917224407196
1.2997058629989624
0.8375900387763977
1.0190544128

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 2.512437810945274e-06, 'tr_loss': 0.8756511634588242, 'eval_val_loss': 1.2333973982737432, 'eval_val_acc': 0.7359900373599004}
Saving model checkpoint to model_2019-12-08_17:44:02/checkpoint-200
1.3251707553863525
0.8177995085716248
0.7817438244819641
0.8043184280395508
1.3640594482421875
0.89423006772995
0.5573244094848633
0.787554919719696
0.7716227173805237
0.8851171135902405
0.9240785241127014
1.1043789386749268
0.9643944501876831
0.7772319316864014
0.8404637575149536
0.8053708672523499
0.8578894138336182
1.0164148807525635
0.75246262550354
0.6891204714775085
0.8781780004501343
0.602749228477478
0.703570544719696
0.8869470357894897
0.7330896258354187
0.46334806084632874
0.4673071801662445
0.9351159334182739
0.7437984347343445
0.8217582106590271
0.7438912987709045
0.8121095299720764
1.0760841369628906
0.5683046579360962
0.8942214846611023
1.0753750801086426
0.6415919661521912
0.6531302332878113
0.7268456816673279
0.7820830345153809
0.79153972864151
0.82829469442

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 1.2686567164179105e-06, 'tr_loss': 0.7979286986589432, 'eval_val_loss': 1.2229618880879227, 'eval_val_acc': 0.7372353673723536}
Saving model checkpoint to model_2019-12-08_17:44:02/checkpoint-300
0.45796459913253784
0.7631716132164001
0.9452722072601318
0.9037373065948486
0.9119203090667725
0.6309323906898499
0.7569580078125
0.8598560094833374
0.8798892498016357
1.1834315061569214
0.7465862035751343
0.7169187664985657
0.8352838754653931
0.7168225049972534
1.1449074745178223
0.9816457033157349
0.6144207715988159
0.7129124402999878
1.092392921447754
1.19917893409729
0.5671300292015076
0.970512866973877
0.7887963652610779
0.5776698589324951
0.5305454730987549
0.8721446394920349
0.8416536450386047
0.7064170837402344
1.1192128658294678
0.8865160942077637
0.7564719915390015
0.768803060054779
1.1351776123046875
0.8994206786155701
0.704772412776947
0.7491394877433777
0.7484678030014038
0.7755534052848816
0.7529096603393555
0.9777173399925232
0.6188794374465942
1.3607296943

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 2.4875621890547265e-08, 'tr_loss': 0.8319821032881737, 'eval_val_loss': 1.2219879481922928, 'eval_val_acc': 0.7397260273972602}
Saving model checkpoint to model_2019-12-08_17:44:02/checkpoint-400
0.8215657472610474
0.8672279715538025



(402, 0.8279736377113495)

In [32]:
# Repeat of the 5e-6 learning-rate configuration: 2 epochs over `train_dataset`
# at batch size 32, evaluating/logging and saving a checkpoint every 100 steps.
# `model`, `train_dataset` and `tokenizer` come from earlier cells.
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    train_batch_size=32,
    learning_rate=5e-6,
    num_train_epochs=2,
    evaluate_during_training=True,
    logging_steps=100,
    max_grad_norm=1.0,
    save_steps=100,
)  # author's note: `max_steps=3` could be appended here (presumably for a short trial run)


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

0.6703553199768066
0.6576956510543823
0.7496973276138306
0.7890505790710449
0.5756669044494629
0.7371364235877991
0.5539400577545166
0.9930027723312378
1.0736827850341797
0.7594326734542847
1.1254003047943115
0.9291683435440063
0.791792631149292
0.8827391862869263
0.864612340927124
0.8232423067092896
0.9592387676239014
0.9828252196311951
0.6857706308364868
0.3047081530094147
0.7410679459571838
0.5866535305976868
0.7251980900764465
0.6359480619430542
1.267288088798523
0.6895477175712585
0.5617735981941223
0.851077675819397
0.6798716187477112
0.8825365304946899
1.0099440813064575
0.7462301254272461
0.6872737407684326
0.9829829931259155
0.6155173182487488
1.0996984243392944
0.8844208121299744
0.9953994154930115
0.9678763151168823
0.8014237880706787
0.7651979923248291
1.1381192207336426
0.8306530117988586
0.6774357557296753
0.7144116759300232
0.5879731178283691
0.5929692983627319
0.6662738919258118
0.48006004095077515
0.6777690649032593
0.879012405872345
0.8276195526123047
0.98466074466705

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 3.756218905472637e-06, 'tr_loss': 0.8048486924171447, 'eval_val_loss': 1.2110820156128252, 'eval_val_acc': 0.7440846824408468}
Saving model checkpoint to model_2019-12-08_17:59:10/checkpoint-100
1.0830225944519043
0.7646229267120361
0.7768069505691528
0.7149930000305176
0.8090452551841736
0.610966145992279
0.751543402671814
1.0769751071929932
0.7005274891853333
0.6332557797431946
0.5224201083183289
0.9690343141555786
0.8323901891708374
0.7217484712600708
1.2197978496551514
0.9027243852615356
0.6056621074676514
0.7211900949478149
0.9741875529289246
0.6640019416809082
0.9632684588432312
0.7335566282272339
0.9882599115371704
0.5895983576774597
0.8661456108093262
0.8728947639465332
0.8246482014656067
0.6628856658935547
0.39287427067756653
0.6400687098503113
0.9720865488052368
0.9897544384002686
0.3464885950088501
1.146335244178772
0.880609393119812
1.2184746265411377
0.7217410802841187
0.8254932761192322
1.1666032075881958
0.48686423897743225
0.7406827211380005
1.16672

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 2.512437810945274e-06, 'tr_loss': 0.7998514491319656, 'eval_val_loss': 1.2049577036010686, 'eval_val_acc': 0.7453300124533001}
Saving model checkpoint to model_2019-12-08_17:59:10/checkpoint-200
1.2644822597503662
0.5466439723968506
0.6027975082397461
0.7159350514411926
0.9509021639823914
0.9221739768981934
0.9886593222618103
0.785276472568512
1.0930829048156738
0.5219550132751465
0.46104055643081665
0.8644481301307678
0.8456065654754639
0.7060730457305908
0.8032370805740356
0.6932871341705322
0.7437085509300232
0.913036048412323
0.9608573317527771
0.7743244171142578
1.0167205333709717
0.8417587280273438
0.520045816898346
1.0306668281555176
0.5376139283180237
1.1778013706207275
0.8784940838813782
0.6490451097488403
0.8705318570137024
0.9148269891738892
0.41713833808898926
0.3089371919631958
0.6022785305976868
0.6102802753448486
0.9782452583312988
0.6129980683326721
0.8541675806045532
0.6672137379646301
0.8522464036941528
0.4461497664451599
0.6270009875297546
0.6821

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 1.2686567164179105e-06, 'tr_loss': 0.7631040853261948, 'eval_val_loss': 1.203842871669513, 'eval_val_acc': 0.7447073474470735}
Saving model checkpoint to model_2019-12-08_17:59:10/checkpoint-300
0.5137107372283936
0.7915772795677185
0.7245648503303528
1.208752989768982
0.8734756708145142
0.9828492403030396
0.8287994861602783
0.6657083630561829
0.9324822425842285
0.5307278633117676
0.27581846714019775
0.7512463331222534
0.607843279838562
0.9986605048179626
0.980507493019104
0.7848926782608032
0.6165057420730591
0.8680965304374695
0.7298227548599243
0.8725417852401733
1.075828194618225
0.44846275448799133
0.9320333003997803
0.5192525386810303
0.9216208457946777
0.9988819360733032
0.7253715991973877
0.7830570936203003
1.036686897277832
0.6647378206253052
0.9993907809257507
0.6298010349273682
0.7929564714431763
0.37700143456459045
0.9218209385871887
0.6739146709442139
1.1795836687088013
0.6497690677642822
0.618401825428009
0.8099417090415955
0.6144744753837585
0.798953

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 2.4875621890547265e-08, 'tr_loss': 0.7828557938337326, 'eval_val_loss': 1.2002893874597786, 'eval_val_acc': 0.7453300124533001}
Saving model checkpoint to model_2019-12-08_17:59:10/checkpoint-400
0.9388267397880554
0.9499779343605042



(402, 0.7884447928981402)

In [33]:
# Further 2-epoch pass at learning rate 5e-6 with batch size 32.
# logging_steps and save_steps are both 100, matching the cadence of the
# evaluation summaries and checkpoint messages in the recorded output.
# The returned value is shown as the cell result because the call comes last.
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    train_batch_size=32,
    learning_rate=5e-6,
    num_train_epochs=2,
    evaluate_during_training=True,
    logging_steps=100,
    max_grad_norm=1.0,
    save_steps=100,
)  # author's note: `max_steps=3` could be appended here (presumably for a short trial run)


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

0.9326176047325134
0.7404563426971436
0.7995263934135437
0.7517089247703552
0.7713527083396912
0.6889866590499878
0.7860600352287292
1.0268280506134033
1.3393670320510864
1.1007616519927979
1.2138895988464355
0.6804721355438232
0.5115942358970642
0.9221804738044739
0.723328173160553
0.5791844129562378
0.6406175494194031
0.5475871562957764
0.6534086465835571
0.8532133102416992
0.6555705070495605
0.6409006714820862
0.6480165719985962
0.7092991471290588
0.9623221158981323
0.9187583923339844
0.8966326713562012
0.44291409850120544
1.1303716897964478
0.7013081908226013
0.8215889930725098
0.70667564868927
0.6709491014480591
0.6130229830741882
0.7166080474853516
0.5976791381835938
0.9631994962692261
0.7352467775344849
0.7428648471832275
1.0166873931884766
0.6058624386787415
0.65419602394104
0.7452690601348877
0.6427350640296936
0.46945077180862427
0.6241875290870667
0.9701001644134521
0.6011103987693787
0.785102367401123
0.8105803728103638
0.6338809728622437
0.848543643951416
0.799976050853729

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 3.756218905472637e-06, 'tr_loss': 0.7609207257628441, 'eval_val_loss': 1.1915514203742963, 'eval_val_acc': 0.7471980074719801}
Saving model checkpoint to model_2019-12-08_18:12:01/checkpoint-100
1.1277893781661987
0.6853523254394531
0.7951122522354126
0.7198686599731445
0.8392888307571411
0.5661986470222473
0.673470139503479
0.5494980216026306
0.9555766582489014
0.6178069710731506
0.8633847236633301
0.7734060287475586
0.5634321570396423
0.583034873008728
0.8255380392074585
0.6829172372817993
0.7722642421722412
0.9496570825576782
0.4892115592956543
0.6755353808403015
0.9361472725868225
0.5355656147003174
0.8501962423324585
1.1091458797454834
0.8535484671592712
0.7571272850036621
0.8308889269828796
0.7682245969772339
0.8278366923332214
0.6818198561668396
0.5767881274223328
0.5367063879966736
0.6821590662002563
0.5919350981712341
0.90798020362854
0.5470659136772156
0.8946465253829956
0.6737302541732788
0.7450231909751892
0.8451193571090698
0.9349474906921387
0.7745625

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 2.512437810945274e-06, 'tr_loss': 0.7543816557526588, 'eval_val_loss': 1.1854524519016494, 'eval_val_acc': 0.7459526774595268}
Saving model checkpoint to model_2019-12-08_18:12:01/checkpoint-200
0.43458497524261475
1.0687016248703003
0.9772599935531616
0.45778506994247437
0.5621548891067505
0.8081046342849731
0.6317898035049438
0.7722927331924438
0.5314937829971313
0.7790589332580566
0.5973915457725525
0.4577210545539856
0.8163045644760132
0.6194348335266113
0.4324503540992737
0.5975956320762634
0.5582819581031799
0.570436418056488
0.5122864246368408
0.7395884394645691
1.0800559520721436
0.6097837686538696
0.8817977905273438
0.6408122777938843
0.6606724262237549
0.7171148657798767
1.1222947835922241
0.7215532660484314
0.67925626039505
0.9827665090560913
0.7561191916465759
0.5310468077659607
0.7853561639785767
0.7789801955223083
0.7588968276977539
1.3327902555465698
0.7798058986663818
0.6192984580993652
1.1334408521652222
0.5822276473045349
0.85753333568573
0.824867

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 1.2686567164179105e-06, 'tr_loss': 0.7279745838046074, 'eval_val_loss': 1.1865284745372944, 'eval_val_acc': 0.7484433374844334}
Saving model checkpoint to model_2019-12-08_18:12:01/checkpoint-300
0.7112087607383728
0.6324676871299744
0.8684608340263367
0.7901788353919983
0.45325833559036255
0.6436129808425903
0.5384208559989929
0.3463262915611267
0.35875290632247925
0.8693838715553284
0.8105355501174927
0.6659069657325745
0.7434106469154358
0.5832278728485107
0.8567532300949097
0.5758948922157288
0.7384520769119263
0.8280860781669617
0.864910364151001
0.6020148992538452
0.8848065733909607
0.9864746332168579
0.833411693572998
0.7916070818901062
0.7436698079109192
0.8868311047554016
0.8142000436782837
1.098319172859192
0.6576744318008423
0.8411765694618225
0.7680532932281494
0.9429420232772827
1.2184242010116577
0.3922785520553589
0.5078606009483337
0.7424298524856567
0.6961667537689209
1.0828300714492798
0.643310546875
0.8612486720085144
0.9131461381912231
0.8984563

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 2.4875621890547265e-08, 'tr_loss': 0.7322924946248531, 'eval_val_loss': 1.1843525153487475, 'eval_val_acc': 0.7496886674968867}
Saving model checkpoint to model_2019-12-08_18:12:01/checkpoint-400
0.6051297187805176
0.4408401548862457



(402, 0.7427933230551321)

In [34]:
# One more 2-epoch pass over `train_dataset` with unchanged settings:
# batch size 32, learning rate 5e-6, max_grad_norm 1.0, and evaluation
# logging plus checkpointing every 100 steps.
# NOTE(review): this cell is a verbatim copy of the ones before it; the
# outcome depends on whatever state `model` holds when it is executed.
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    train_batch_size=32,
    learning_rate=5e-6,
    num_train_epochs=2,
    evaluate_during_training=True,
    logging_steps=100,
    max_grad_norm=1.0,
    save_steps=100,
)  # author's note: `max_steps=3` could be appended here (presumably for a short trial run)


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

0.8767092227935791
0.9581749439239502
0.7750853300094604
1.0596277713775635
0.8584838509559631
0.5144208073616028
0.6259433627128601
1.0654127597808838
0.6079736351966858
1.1855857372283936
0.8450862169265747
0.9921684861183167
1.1623334884643555
0.7719600200653076
0.5441461801528931
0.7936689257621765
0.8104708194732666
0.5962953567504883
1.0040392875671387
0.6298707723617554
0.6046987175941467
0.724297046661377
0.9906914830207825
0.47616010904312134
0.6843993067741394
1.0438406467437744
0.7521935105323792
0.6582052707672119
0.7840901613235474
0.7277917861938477
0.7433528900146484
1.2930806875228882
0.8426961302757263
0.6711289882659912
0.7730394005775452
0.6096275448799133
0.5775250792503357
0.6664680242538452
0.6176942586898804
0.3392392098903656
0.5649620890617371
0.5817233324050903
0.6306421756744385
0.7196863889694214
0.4227276146411896
0.6272540092468262
0.6899303793907166
0.68519526720047
0.41693931818008423
0.41482478380203247
1.008230447769165
0.8731156587600708
0.50193077325

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 3.756218905472637e-06, 'tr_loss': 0.7080432569980621, 'eval_val_loss': 1.1807825442273818, 'eval_val_acc': 0.7503113325031133}
Saving model checkpoint to model_2019-12-08_18:25:32/checkpoint-100
0.41080179810523987
0.8625433444976807
0.5652791261672974
0.4892922043800354
0.6204036474227905
1.2219598293304443
0.9314162135124207
0.5350706577301025
0.48837384581565857
0.6862987875938416
0.8455680012702942
0.7316678762435913
0.9764372706413269
0.661334216594696
1.2798352241516113
0.9399585723876953
0.5413967370986938
1.010228157043457
0.9709081649780273
0.7205229997634888
0.6153926849365234
0.6607926487922668
0.7215282917022705
0.810319185256958
0.6092151999473572
0.852403998374939
1.073324203491211
0.5420789122581482
0.6374262571334839
0.689331591129303
0.6077033281326294
0.809916079044342
0.9143799543380737
0.8177672028541565
0.5313681364059448
0.943925678730011
0.6105579733848572
0.37734338641166687
0.8298780918121338
0.5855039358139038
0.7159491181373596
1.00333356

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 2.512437810945274e-06, 'tr_loss': 0.7223765024542809, 'eval_val_loss': 1.170042872873705, 'eval_val_acc': 0.75093399750934}
Saving model checkpoint to model_2019-12-08_18:25:32/checkpoint-200
0.7832090854644775
0.6278984546661377
0.9477198719978333
0.5757507681846619
0.5982702970504761
0.6391976475715637
0.5734015703201294
0.7798718810081482
0.7647820115089417
0.6604800820350647
1.0463758707046509
0.776109516620636
0.6529717445373535
0.8460841774940491
0.9161677360534668
0.37476909160614014
0.8329552412033081
0.8799142241477966
0.6403217315673828
0.6169546246528625
0.7472572922706604
0.7377431988716125
0.42071080207824707
0.7097985744476318
0.4861810505390167
0.9213047027587891
0.7813307046890259
1.0841114521026611
0.4184708893299103
0.5352482795715332
0.5782853364944458
0.4392037093639374
0.8661001920700073
0.6142445206642151
0.4354488253593445
0.7696018815040588
1.1482311487197876
0.7232334613800049
0.38491588830947876
0.6591179370880127
0.41478702425956726
1.111

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 1.2686567164179105e-06, 'tr_loss': 0.696763456761837, 'eval_val_loss': 1.1699880450519162, 'eval_val_acc': 0.7521793275217933}
Saving model checkpoint to model_2019-12-08_18:25:32/checkpoint-300
0.7487297654151917
0.549796462059021
0.4374921917915344
0.4554632306098938
0.35822007060050964
0.47929245233535767
0.6342255473136902
0.5693087577819824
0.7341794967651367
0.6737769842147827
0.883044958114624
0.5065958499908447
0.5576862096786499
0.671309232711792
0.5927583575248718
0.7057480812072754
0.8219992518424988
0.8383493423461914
0.8797590732574463
0.4342336356639862
0.5309181213378906
0.41957128047943115
0.7989094257354736
0.8685758113861084
0.5037670135498047
0.8883470296859741
0.629568874835968
0.39859238266944885
0.400875985622406
1.172121524810791
0.6687493920326233
0.7632136344909668
0.5692516565322876
0.7153305411338806
0.639782190322876
1.0748594999313354
0.7585827112197876
0.4262821674346924
0.6268687844276428
1.03775954246521
0.5665563941001892
0.69811028

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 2.4875621890547265e-08, 'tr_loss': 0.683668232858181, 'eval_val_loss': 1.1692896890106486, 'eval_val_acc': 0.7540473225404732}
Saving model checkpoint to model_2019-12-08_18:25:32/checkpoint-400
0.890155017375946
0.683367908000946



(402, 0.7031310145089875)

In [35]:
# Another 2-epoch fine-tuning pass over train_dataset on the in-memory model,
# with learning rate 5e-6. Per the recorded output below, validation metrics
# are logged and a checkpoint is written every 100 steps.
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    train_batch_size=32,
    num_train_epochs=2,
    learning_rate=5e-6,
    max_grad_norm=1.0,
    evaluate_during_training=True,
    logging_steps=100,
    save_steps=100,
)  # optional extra argument left disabled by the author: max_steps=3


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

0.6022676229476929
0.5073798894882202
0.6040853261947632
0.5024563074111938
0.7395118474960327
0.8193670511245728
0.6705150008201599
0.6913290619850159
0.490023672580719
0.5754872560501099
0.7903575301170349
0.7002363204956055
0.9561936855316162
0.9113644361495972
0.5875877141952515
0.6259462237358093
0.3238617777824402
0.3994397222995758
0.5183411240577698
0.8085016012191772
0.5488588213920593
0.7452020645141602
0.4373835623264313
0.751312792301178
0.6672600507736206
0.6909914612770081
0.8959885835647583
0.5039502382278442
0.708421528339386
0.6730737090110779
0.5449054837226868
0.7982456684112549
0.3519246578216553
0.8083717226982117
0.875636875629425
0.8178991675376892
1.1140780448913574
0.9647709131240845
0.6770021915435791
0.6603312492370605
0.8785777688026428
0.8815959095954895
0.7959219217300415
1.1868007183074951
0.4926017224788666
0.572573721408844
0.415716290473938
0.630107581615448
0.8753882050514221
0.8528650999069214
0.40865057706832886
0.7944437265396118
0.5579315423965454

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 3.756218905472637e-06, 'tr_loss': 0.6612459108233452, 'eval_val_loss': 1.168830418023304, 'eval_val_acc': 0.7503113325031133}
Saving model checkpoint to model_2019-12-08_18:45:51/checkpoint-100
0.7013289928436279
0.5646301507949829
0.57281494140625
0.7730282545089722
0.8818609714508057
0.5437389016151428
0.9160635471343994
0.5448720455169678
0.8120311498641968
0.39027827978134155
0.7396818399429321
0.6805025339126587
0.43335282802581787
0.7098075151443481
0.5929254293441772
0.899922788143158
0.9367557764053345
0.8118132948875427
0.7170715928077698
0.4926680624485016
0.8212723731994629
0.5570698380470276
0.8735101222991943
0.7733021974563599
0.6439420580863953
0.6405879259109497
0.7593066692352295
0.7996865510940552
1.025133728981018
0.5626339912414551
0.6689896583557129
0.6264081001281738
0.7913925647735596
0.684499204158783
0.8463377356529236
0.7155464291572571
0.7268548011779785
0.6959961652755737
0.4832438826560974
0.9242681264877319
0.41388165950775146
0.952728

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 2.512437810945274e-06, 'tr_loss': 0.6831934800744057, 'eval_val_loss': 1.170351390370089, 'eval_val_acc': 0.74906600249066}
Saving model checkpoint to model_2019-12-08_18:45:51/checkpoint-200
0.6746805310249329
0.6684576869010925
0.7388811111450195
0.5640815496444702
0.7109058499336243
0.5223262310028076
0.701219379901886
0.5253475308418274
0.8491418361663818
0.5499308705329895
0.5292744636535645
0.44176119565963745
0.7883495688438416
1.038120985031128
0.5375012159347534
0.7660424709320068
0.7896320223808289
0.5264256000518799
0.7453758120536804
0.6472437977790833
0.637015700340271
0.48713019490242004
0.3948226273059845
0.723495602607727
0.7594467401504517
0.5798995494842529
0.37657517194747925
0.5752784013748169
0.7547163367271423
0.8742102384567261
0.5794240236282349
0.3963620662689209
0.7898435592651367
0.6590536832809448
0.4945380389690399
0.7809206247329712
0.6564754843711853
0.5693616271018982
0.6875337362289429
0.7102994918823242
0.4877076745033264
0.6321696

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 1.2686567164179105e-06, 'tr_loss': 0.6767927324771881, 'eval_val_loss': 1.1578751176743958, 'eval_val_acc': 0.7540473225404732}
Saving model checkpoint to model_2019-12-08_18:45:51/checkpoint-300
0.6555989980697632
0.500582754611969
0.6944797039031982
0.7987204790115356
0.6672453880310059
0.6481849551200867
0.5222460031509399
0.4849556088447571
0.8084656000137329
0.5946096181869507
0.6355496048927307
0.7742204666137695
0.6199742555618286
0.6312724351882935
0.579660177230835
0.6207770109176636
0.40009114146232605
0.6607109904289246
0.5698256492614746
0.921358048915863
0.6079521179199219
0.29257655143737793
0.665278971195221
0.682808518409729
0.625403642654419
0.7269933223724365
0.29352614283561707
0.5982027053833008
0.7762971520423889
0.7290500402450562
0.6498697996139526
0.5580374002456665
0.7112936973571777
0.654147744178772
0.41017672419548035
0.8430595397949219
0.489795982837677
0.8668186664581299
0.6501657962799072
0.9768617153167725
0.9851390719413757
0.955160

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 2.4875621890547265e-08, 'tr_loss': 0.6258571381866932, 'eval_val_loss': 1.15795534923302, 'eval_val_acc': 0.7565379825653799}
Saving model checkpoint to model_2019-12-08_18:45:51/checkpoint-400
0.5572386980056763
0.8780757784843445



(402, 0.6620503498324707)

In [36]:
# Further 2-epoch fine-tuning pass on the same in-memory model, this time with
# the learning rate lowered to 5e-7. Per the recorded output below, validation
# metrics are logged and a checkpoint is written every 100 steps.
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    train_batch_size=32,
    num_train_epochs=2,
    learning_rate=5e-7,
    max_grad_norm=1.0,
    evaluate_during_training=True,
    logging_steps=100,
    save_steps=100,
)  # optional extra argument left disabled by the author: max_steps=3


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

0.47426676750183105
0.733772337436676
0.5962886810302734
0.8204739689826965
0.744265079498291
1.0292290449142456
0.7017791271209717
0.5671645402908325
0.7663383483886719
0.707411527633667
0.4594423472881317
0.7825263142585754
0.7266461253166199
0.8465022444725037
0.9596213102340698
0.8873273134231567
0.5377678871154785
0.467706561088562
0.8040708303451538
0.6483874320983887
0.4962446391582489
0.5562210083007812
0.7437949180603027
0.6703795790672302
0.5457642674446106
0.7776371240615845
0.7644872665405273
0.7841882705688477
0.2923938035964966
0.42067238688468933
0.4162808358669281
0.7871259450912476
0.7415921688079834
0.7128235101699829
0.7504164576530457
0.5764873027801514
0.9266740679740906
0.37378963828086853
0.596489429473877
0.8492785096168518
0.7258742451667786
0.6116454005241394
0.3683498501777649
0.8066926002502441
0.3897758424282074
0.503635823726654
0.6871684789657593
0.49074313044548035
0.39482054114341736
0.6775978803634644
0.858648955821991
0.7808157205581665
0.710148751735

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 3.756218905472637e-07, 'tr_loss': 0.636774240732193, 'eval_val_loss': 1.1583468602664435, 'eval_val_acc': 0.7534246575342466}
Saving model checkpoint to model_2019-12-08_18:59:42/checkpoint-100
0.6465752124786377
0.8007612824440002
0.6122683882713318
0.4839579463005066
0.4636536240577698
0.8032873272895813
0.8185293674468994
0.6337109208106995
0.9824199676513672
0.6147928237915039
0.5314657092094421
0.7068657875061035
0.5792171359062195
0.4427424669265747
0.4176105856895447
0.4378034174442291
0.41831010580062866
0.7422701716423035
0.6130102872848511
0.8010938763618469
0.8054477572441101
0.6465485692024231
0.8913582563400269
0.6574344635009766
0.8785531520843506
0.6326351761817932
0.45971545577049255
0.5392355918884277
0.7088095545768738
0.7816169261932373
0.43228867650032043
0.6145102977752686
0.6332839131355286
0.7095060348510742
0.47315043210983276
1.0241868495941162
0.6399968862533569
1.3099017143249512
0.7038575410842896
0.7216082811355591
0.4192762076854706
0.

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 2.5124378109452736e-07, 'tr_loss': 0.649299835562706, 'eval_val_loss': 1.1577869358643964, 'eval_val_acc': 0.7534246575342466}
Saving model checkpoint to model_2019-12-08_18:59:42/checkpoint-200
0.1612231582403183
0.47628453373908997
0.4738261103630066
0.5305418372154236
0.4802483320236206
0.4676308035850525
0.4347594082355499
0.6739484071731567
0.7277740836143494
0.4816324710845947
0.8188096880912781
0.5487377643585205
0.3576511740684509
0.7456238865852356
0.4949363172054291
0.7181270122528076
0.6858359575271606
0.7695472240447998
0.6131823658943176
0.467244029045105
0.5823192596435547
0.4619450569152832
0.5326296091079712
0.6106265187263489
0.61687833070755
0.9741437435150146
0.820043683052063
0.8547782301902771
0.7990767955780029
0.6884324550628662
0.6423919200897217
0.34536808729171753
0.6294569969177246
0.6303496956825256
0.9683232307434082
0.5575888156890869
0.5843691825866699
0.5231508016586304
0.9221721887588501
0.8828161954879761
0.34177491068840027
0.7129

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 1.2686567164179106e-07, 'tr_loss': 0.6262011761963368, 'eval_val_loss': 1.1576437491979172, 'eval_val_acc': 0.7552926525529265}
Saving model checkpoint to model_2019-12-08_18:59:42/checkpoint-300
0.669922947883606
0.6789073348045349
0.4823794960975647
0.6355155110359192
0.5404163599014282
0.5633834004402161
0.843842625617981
0.3625451326370239
0.7880809307098389
0.9607582092285156
0.4429342746734619
0.9512324929237366
0.6986635327339172
0.6514921188354492
0.6792269945144653
0.8828417658805847
0.5394847393035889
0.8839915990829468
0.6335245966911316
0.3858000338077545
0.5584709644317627
0.6043920516967773
0.6279592514038086
0.28809231519699097
0.5465155839920044
0.8655281066894531
0.5324428081512451
0.6695370078086853
0.4291156828403473
0.6820511817932129
0.668169379234314
0.7567324042320251
0.45521485805511475
0.6248955726623535
0.4333324134349823
0.36267945170402527
0.785209596157074
0.4580151438713074
0.9207304120063782
0.9084218144416809
0.3791504204273224
0.854

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 2.487562189054726e-09, 'tr_loss': 0.6474633318185806, 'eval_val_loss': 1.1573753634198982, 'eval_val_acc': 0.7540473225404732}
Saving model checkpoint to model_2019-12-08_18:59:42/checkpoint-400
0.6687226295471191
0.5746030211448669



(402, 0.6398437414967005)

In [37]:
# Repeat of the preceding cell's settings (2 epochs, learning rate 5e-7) applied
# once more to the same in-memory model. Per the recorded output below,
# validation metrics are logged and a checkpoint is written every 100 steps.
train(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    train_batch_size=32,
    num_train_epochs=2,
    learning_rate=5e-7,
    max_grad_norm=1.0,
    evaluate_during_training=True,
    logging_steps=100,
    save_steps=100,
)  # optional extra argument left disabled by the author: max_steps=3


***** Running training *****
  Num examples = 6422
  Num Epochs = 2
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 402


HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

0.7383387684822083
0.8248773813247681
0.6558001041412354
0.8449190258979797
0.31201547384262085
0.5968238115310669
0.8043745756149292
0.6917213797569275
1.0762884616851807
0.5186601877212524
0.6904730200767517
0.6425389647483826
0.7283509373664856
0.5141333937644958
0.8060901165008545
0.9243096113204956
0.7283825278282166
0.722327470779419
0.39358270168304443
0.5236388444900513
0.47710150480270386
0.6401848793029785
0.4055819511413574
0.520879328250885
0.7576154470443726
0.6461966037750244
0.7137479186058044
0.6331877708435059
0.43643689155578613
0.7895541191101074
0.5999490022659302
0.38027963042259216
0.43711042404174805
0.4385201334953308
0.757766604423523
0.7397112250328064
0.5623487830162048
0.6573457717895508
0.6010711789131165
0.4794796109199524
0.7073172926902771
1.023752212524414
0.8720166087150574
0.4165593981742859
0.6382279396057129
0.6236130595207214
0.9050284624099731
0.9529151916503906
0.5645617246627808
0.6966869235038757
0.33784356713294983
0.7392958402633667
0.4113383

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 100, 'lr': 3.756218905472637e-07, 'tr_loss': 0.6463379660248756, 'eval_val_loss': 1.1566974141111421, 'eval_val_acc': 0.7552926525529265}
Saving model checkpoint to model_2019-12-08_19:15:18/checkpoint-100
0.555331289768219
0.6676710844039917
0.33925479650497437
0.6885546445846558
0.8295334577560425
0.49166518449783325
0.5799886584281921
0.512721061706543
0.5951560735702515
0.5458337068557739
0.41821467876434326
0.7549512386322021
0.8254916667938232
0.8718047142028809
0.8621187806129456
0.37524890899658203
0.8562800884246826
0.493075966835022
0.5976651906967163
0.6098016500473022
0.8148226141929626
0.6073451638221741
0.4410873353481293
0.9514695405960083
0.45082032680511475
0.4659914970397949
0.4001072645187378
0.8569481372833252
0.6755304336547852
0.4602808654308319
0.9295154809951782
0.6810876131057739
0.8692979216575623
0.6013669967651367
0.5918809175491333
0.7628464102745056
0.9131968021392822
0.8384005427360535
0.805741548538208
0.4688335061073303
0.6813596487045288
0.55

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 200, 'lr': 2.5124378109452736e-07, 'tr_loss': 0.6314811538159847, 'eval_val_loss': 1.1569484012636972, 'eval_val_acc': 0.7546699875466999}
Saving model checkpoint to model_2019-12-08_19:15:18/checkpoint-200
0.546549379825592
0.3037872910499573
0.6966895461082458
0.5798909664154053
0.5466554760932922
0.34821391105651855
0.6840956807136536
0.3608510196208954
0.8233697414398193
0.5325448513031006
0.7542339563369751
0.6057077050209045
0.6516522169113159
0.4490785598754883
0.5165235996246338
0.46838870644569397
0.7447226643562317
0.7090803384780884
0.47847235202789307
0.6731550097465515
0.3933517634868622
0.7745859026908875
0.6363286375999451
0.4988705515861511
0.4096873998641968
0.8439350128173828
0.9513567090034485
0.39468032121658325
0.7065523862838745
0.6110616326332092
0.4432528018951416
0.7750552296638489
0.41276228427886963
0.7901366949081421
0.7531296014785767
0.6378301382064819
0.342142790555954
0.6816529631614685
0.7776808738708496
1.0282304286956787
0.7061550617218018
0

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 300, 'lr': 1.2686567164179106e-07, 'tr_loss': 0.6159438055753708, 'eval_val_loss': 1.1567556675690323, 'eval_val_acc': 0.7540473225404732}
Saving model checkpoint to model_2019-12-08_19:15:18/checkpoint-300
0.3902236819267273
0.9538466930389404
0.5762240886688232
0.6782328486442566
0.6838855147361755
0.6473745703697205
0.7763615846633911
0.6158285737037659
0.9061225652694702
0.44118303060531616
0.5728477239608765
0.6195244789123535
0.5152880549430847
0.43206435441970825
0.6196173429489136
0.5246708989143372
0.9159119129180908
0.6968867778778076
0.42235225439071655
0.9742652773857117
0.6823099851608276
0.5643921494483948
0.7189637422561646
1.3245126008987427
0.6457765102386475
0.47930189967155457
0.3383340835571289
0.8616924285888672
0.8055487871170044
0.61390221118927
0.6954436898231506
0.8793591260910034
0.5150595307350159
0.9981389045715332
0.4120658040046692
0.7218239903450012
0.6027472615242004
0.5445824265480042
0.4698200821876526
1.0142834186553955
0.4116056561470032
1.

HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…


{'step': 400, 'lr': 2.487562189054726e-09, 'tr_loss': 0.658843001127243, 'eval_val_loss': 1.1565091515061867, 'eval_val_acc': 0.7546699875466999}
Saving model checkpoint to model_2019-12-08_19:15:18/checkpoint-400
0.5974645018577576
0.8229619860649109



(402, 0.6385099978663435)

# Evaluation on the test set

In [26]:
# Reload a previously saved fine-tuned checkpoint and move it onto `device`.
# NOTE(review): the nested checkpoint-400/checkpoint-800/checkpoint-1200 path is
# kept exactly as written -- confirm this is the checkpoint that was intended.
model = CamembertForSequenceClassification.from_pretrained(
    "model_2019-12-08_14:59:37/checkpoint-400/checkpoint-800/checkpoint-1200/"
).to(device)

In [12]:
# Score the current `model` with the notebook's evaluate() helper. In the
# recorded run below it loaded the cached test features (data/cached_test_128)
# and returned a dict with 'val_loss' and 'val_acc'.
# NOTE(review): the execution counts (In [26] for the reload above, In [12]
# here) suggest this output may predate that reload -- confirm before citing it.
evaluate(model, tokenizer)

Loading features from cached file data/cached_test_128
***** Running evaluation  *****
  Num examples = %d 1606
  Batch size = %d 8


HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…




{'val_loss': 1.5273962446409672, 'val_acc': 0.6749688667496887}

# Predictions on the dev set

In [51]:
# Batch size used when scoring the dev split in the evaluation cell below.
eval_batch_size = 8

# Build the dev features from the files under data/; overwrite_cache=True asks
# the loader to regenerate them instead of reusing an existing cache file.
eval_dataset = load_and_cache_examples(
    tokenizer=tokenizer,
    data_dir="data/",
    which_data="dev",
    max_seq_length=128,
    overwrite_cache=True,
)

Creating features from dataset file at data/
['ID', 'question']
{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, '10': 10, '11': 11, '12': 12, '13': 13, '14': 14, '15': 15, '16': 16, '17': 17, '18': 18, '19': 19, '20': 20, '21': 21, '22': 22, '23': 23, '24': 24, '25': 25, '26': 26, '27': 27, '28': 28, '29': 29, '30': 30, '31': 31, '32': 32, '33': 33, '34': 34, '35': 35, '36': 36, '37': 37, '38': 38, '39': 39, '40': 40, '41': 41, '42': 42, '43': 43, '44': 44, '45': 45, '46': 46, '47': 47, '48': 48, '49': 49, '50': 50}
Writing example 0
Saving features into cached file data/cached_dev_128


In [54]:
# Score the dev split batch by batch and keep the raw logits so that class
# predictions can be written to the submission file.
# Relies on `model`, `device`, `eval_dataset` and `eval_batch_size` from earlier cells.

# Sequential order matters: predictions are later paired row-by-row with the
# IDs read from dev.tsv.
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)

# Eval!
print("***** Running evaluation *****")
# Apply the format string with `%`; passing the value as a second print()
# argument printed the literal "%d" followed by the number.
print("  Num examples = %d" % len(eval_dataset))
print("  Batch size = %d" % eval_batch_size)

eval_loss = 0.0
nb_eval_steps = 0
logit_chunks = []  # per-batch logits, concatenated once after the loop
label_chunks = []  # per-batch label ids, concatenated once after the loop

# Switching to inference mode is loop-invariant, so do it once up front.
model.eval()
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        # Each batch is (input_ids, attention_mask, token_type_ids, labels).
        # NOTE(review): the dev file appears to have no label column (the loader
        # printed ['ID', 'question']), so the loss accumulated from batch[3] is
        # presumably not meaningful -- confirm before reporting eval_loss.
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2],
                  'labels':         batch[3]}
        outputs = model(**inputs)
        tmp_eval_loss, logits = outputs[:2]
        eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1
    logit_chunks.append(logits.detach().cpu().numpy())
    label_chunks.append(inputs['labels'].detach().cpu().numpy())

eval_loss = eval_loss / nb_eval_steps
# A single concatenation replaces repeated np.append calls, which re-copied the
# whole accumulated array on every batch.
preds = np.concatenate(logit_chunks, axis=0)
out_label_ids = np.concatenate(label_chunks, axis=0)
# Predicted class = index of the largest logit in each row.
preds_class = np.argmax(preds, axis=1)


***** Running evaluation *****
  Num examples = %d 2035
  Batch size = %d 8


HBox(children=(IntProgress(value=0, description='Evaluating', max=255, style=ProgressStyle(description_width='…




# Build submission file

In [59]:
# Read the raw dev split again; its "ID" column is needed for the submission.
dev_tsv_path = path_data + "dev.tsv"
dev_data = pd.read_csv(dev_tsv_path, sep="\t")

In [61]:
# Submission table: one row per dev example, pairing its ID with the predicted
# class index (both are in dev-file order).
sub = pd.DataFrame(
    {
        "ID": dev_data["ID"].values,
        "intention": preds_class,
    }
)

In [62]:
# Preview the first rows of the submission table as a sanity check.
sub.head()

Unnamed: 0,ID,intention
0,8028,32
1,8029,32
2,8030,32
3,8031,31
4,8032,44


In [63]:
# Write the submission as CSV without the DataFrame index column.
# to_csv does not create missing parent folders, so make sure "sub/" exists
# first; otherwise a fresh checkout fails at this final step.
os.makedirs("sub", exist_ok=True)
sub.to_csv("sub/sub0.csv", index=False)