In [1]:
import pandas as pd
import numpy as np
import os
import csv
import datetime
from tqdm.notebook import tqdm, trange

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset # used load data
import torch.optim as optim

from transformers.data.processors.utils import InputExample, InputFeatures
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import (CamembertConfig,
                          CamembertForSequenceClassification,
                          CamembertTokenizer)


In [2]:
path_data="data/"

In [3]:
#data = pd.read_csv(path_data + "knowledge_20191112.tsv", sep="\t")

In [4]:
#data.head()

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
# modified from glue.py and utils.py

class InputExample(object):

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

    
class DataProcessor():
    
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                line = list(cell for cell in line)
                lines.append(line)
            return lines

        
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['ID'].numpy(),
                            tensor_dict['question'].numpy().decode('utf-8'),
                            str(tensor_dict['intention'].numpy()))

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
    
    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")


    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return [str(j) for j in range(51)]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                print(line)
                continue
            guid = "%s-%s" % (set_type, line[0])
            try:
                utterance = line[1]
                label = line[2]
            except IndexError:
                continue
            examples.append(
                InputExample(guid=guid, text_a=utterance, text_b=None, label=label))
        return examples


def convert_examples_to_features(examples,
                                 tokenizer,
                                 max_length=512,
                                 label_list=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """

    processor = DataProcessor()
    if label_list is None:
        label_list = processor.get_labels()

    label_map = {label: i for i, label in enumerate(label_list)}
    print(label_map)

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            print("Writing example %d" % (ex_index))

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        label = label_map[example.label]

        #if ex_index < 5:
            #print("*** Example ***")
            #print("guid: %s" % (example.guid))
            #print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            #print("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            #print("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            #print("label: %s (id = %d)" % (example.label, label))

        features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              label=label))

    return features

In [25]:
num_labels=51 # 1081
model_name="camembert-base"

config = CamembertConfig.from_pretrained(model_name,
                                         num_labels=num_labels,
                                         finetuning_task="classification")

In [26]:
tokenizer = CamembertTokenizer.from_pretrained(model_name,
                                               do_lower_case=True)

model = CamembertForSequenceClassification.from_pretrained(model_name, config=config)
model = model.to(device)
print(model)

CamembertForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [21]:
def simple_accuracy(preds, labels):
    return (preds == labels).mean()

def compute_metrics(preds, labels):
    return {"acc": simple_accuracy(preds, labels)}


In [27]:
def load_and_cache_examples(tokenizer, 
                            max_seq_length,
                            data_dir,
                            evaluate=False,
                            overwrite_cache = False,
                            verbose=1):

    processor = DataProcessor()
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(data_dir, 'cached_{}_{}'.format('test' if evaluate else 'train',
                                                                        str(max_seq_length)))
    if os.path.exists(cached_features_file) and not overwrite_cache:
        if verbose > 0:
            print("Loading features from cached file %s" % cached_features_file)
        features = torch.load(cached_features_file)
    else:
        if verbose > 0:
            print("Creating features from dataset file at %s" % data_dir)
        label_list = processor.get_labels()
        examples = processor.get_test_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
        features = convert_examples_to_features(examples,
                                                tokenizer,
                                                label_list=label_list,
                                                max_length=max_seq_length,
                                                pad_on_left=False,
                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                                pad_token_segment_id=0,
        )
        if verbose > 0:
            print("Saving features into cached file %s" % cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset


def train(model, train_dataset, tokenizer, num_train_epochs, train_batch_size, learning_rate, adam_epsilon=1e-8,
          logging_steps=None, gradient_accumulation_steps=1, max_grad_norm=1.0, weight_decay=0.0,
          warmup_steps=0, save_steps=-1, output_dir=None, evaluate_during_training=False,
          seed=None, max_steps=-1):
    
    """ Train the model """
    if output_dir is None:
        output_dir = "model_" + str(datetime.datetime.now()).split(".")[0].replace(" ","_") + "/"
        
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
    
    if logging_steps is None:
        logging_steps = len(train_dataloader) // (gradient_accumulation_steps * 5)
        
    if max_steps > 0:
        t_total = max_steps
        num_train_epochs = max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    
    # pas besoin de la partie custom ci-dessous à priori
    
    #no_decay = ['bias', 'LayerNorm.weight']
    #optimizer_grouped_parameters = [
    #    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    #    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    #    ]
    
    # change l'optimizer pour voir
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon) # optimizer_grouped_parameters
    #optimizer = optim.SGD(model.parameters(), lr=learning_rate) #eps=adam_epsilon , momentum=0.9
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
    
    #for i, tensor in enumerate(model.parameters()):
    #    if i > 1:
    #        tensor.requires_grad = False

    # Train!
    print("***** Running training *****")
    print("  Num examples = %d" % len(train_dataset))
    print("  Num Epochs = %d" % num_train_epochs)
    print(" Batch size = %d" % train_batch_size)
    print("  Total train batch size (w. parallel, distributed & accumulation) = %d" %
                   train_batch_size * gradient_accumulation_steps)
    print("  Gradient Accumulation steps = %d" % gradient_accumulation_steps)
    print("  Total optimization steps = %d" % t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(num_train_epochs), desc="Epoch")
    if seed is not None:
        set_seed(seed)
        
    for epoch, _ in enumerate(train_iterator):
        # print("Epoch %d / %d" % (epoch, num_train_epochs))
        epoch_iterator = train_dataloader
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[3]}
            inputs['token_type_ids'] = batch[2] #or None
            optimizer.zero_grad()

            outputs = model(**inputs)
            #print(outputs)
            #print(outputs[0].size())
            #print(outputs[1].size())
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            #print(loss)
            
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            print(loss.item())
            if (step + 1) % gradient_accumulation_steps == 0:
                #params = list(model.parameters())[:1]
                #print(params)
                #print([torch.sum(par, 0) for par in params])
                #print([par.size() for par in params])
                #print([par.grad for par in params])
                #print([torch.sum(par.grad, 0) for par in params])
                #print([torch.sum(par.grad, 1) for par in params])
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if logging_steps > 0 and global_step % logging_steps == 0:
                    print("\n\nEval")
                    # Log metrics
                    dict_print = {'step':global_step,
                                  'lr': scheduler.get_lr()[0],
                                  'loss': (tr_loss - logging_loss)/logging_steps}
                    if evaluate_during_training:
                        results = evaluate(model, tokenizer, eval_output_dir=output_dir, verbose=-1)
                        for key, value in results.items():
                            dict_print['eval_{}'.format(key)] = value
                    print(dict_print)
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model.save_pretrained(output_dir)
                    #torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    print("Saving model checkpoint to %s" % output_dir)

            if max_steps > 0 and global_step > max_steps:
                #epoch_iterator.close() #deleted since no tqdm anymore
                break
                
        if max_steps > 0 and global_step > max_steps:
            train_iterator.close()
            break

    if global_step == 0:
        global_step= 1

    return global_step, tr_loss / global_step


def evaluate(model, tokenizer, eval_batch_size=8, prefix="", eval_output_dir=None,
             verbose=1):
    
    results = {}
    eval_dataset = load_and_cache_examples(tokenizer=tokenizer, max_seq_length=128, data_dir="data/",
                                           evaluate=True, verbose=verbose)
    
    eval_batch_size = eval_batch_size
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)
    
    # Eval!
    if verbose > 0:
        print("***** Running evaluation {} *****".format(prefix))
        print("  Num examples = %d", len(eval_dataset))
        print("  Batch size = %d", eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         batch[3]}
                inputs['token_type_ids'] = batch[2] #or None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
    else:
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in eval_dataloader:
            model.eval()
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         batch[3]}
                inputs['token_type_ids'] = None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(preds, axis=1)
    result = compute_metrics(preds, out_label_ids)
    results.update(result)

    if eval_output_dir is not None:
        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)
            
        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            writer.write("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                writer.write("  %s = %s" % (key, str(result[key])))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results



In [11]:
# rmk : we can see all sequences of tokens begin with 5 and end with 6
train_dataset = load_and_cache_examples(tokenizer=tokenizer, max_seq_length=128, data_dir="data/")

Loading features from cached file data/cached_train_128


In [None]:
eval_batch_size=8
    
eval_dataset = load_and_cache_examples(tokenizer=tokenizer, max_seq_length=128, data_dir="data/",
                                       evaluate=True, verbose=verbose)

eval_batch_size = eval_batch_size
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)

# Eval!

print("***** Running evaluation {} *****".format(prefix))
print("  Num examples = %d", len(eval_dataset))
print("  Batch size = %d", eval_batch_size)
eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    model.eval()
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[3]}
        inputs['token_type_ids'] = batch[2] #or None
        outputs = model(**inputs)
        tmp_eval_loss, logits = outputs[:2]
        eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1
    if preds is None:
        preds = logits.detach().cpu().numpy()
        out_label_ids = inputs['labels'].detach().cpu().numpy()
    else:
        preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
        out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

eval_loss = eval_loss / nb_eval_steps
preds = np.argmax(preds, axis=1)
result = compute_metrics(preds, out_label_ids)
results.update(result)


In [29]:
train(model=model, train_dataset=train_dataset, tokenizer=tokenizer, train_batch_size=32,
      learning_rate=3e-4, num_train_epochs=6, evaluate_during_training=True, logging_steps=200,
      max_grad_norm=1.0) # save_steps=500, , max_steps=3


***** Running training *****
  Num examples = 6422
  Num Epochs = 6
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1206


HBox(children=(IntProgress(value=0, description='Epoch', max=6, style=ProgressStyle(description_width='initial…

2.884810209274292
3.0879733562469482
3.350253105163574
2.818715810775757
2.9689314365386963
2.567230701446533
2.798086166381836
2.6314258575439453
2.9995920658111572
2.8127663135528564
2.8527374267578125
3.3339149951934814
2.9414429664611816
3.220916986465454
2.784862518310547
3.384927749633789
2.973037004470825
2.6827237606048584
3.254485607147217
2.678860902786255
2.5067012310028076
2.8412930965423584
2.7037107944488525
2.8836231231689453
2.7936646938323975
2.6234822273254395
2.6642377376556396
2.5152699947357178
2.5603389739990234
2.4513602256774902
2.923617124557495
2.3742082118988037
2.5751242637634277
2.7275633811950684
2.2201406955718994
2.483858585357666
2.2708017826080322
2.670881986618042
2.87984561920166
1.953345775604248
2.1703102588653564
2.328474760055542
2.370340585708618
2.657716989517212
2.267050266265869
2.649440050125122
2.0546109676361084
2.332259178161621
2.1084725856781006
2.161945104598999
2.282254695892334
2.3306093215942383
2.354071617126465
2.9823410511016846


0.7069466710090637
1.1240100860595703
1.1286624670028687
1.296004295349121
1.3610950708389282
1.4578629732131958
1.2929978370666504
0.8333848118782043
1.0942572355270386
0.8579293489456177
1.1245982646942139
0.9410082101821899
1.0176806449890137
1.0327541828155518
1.3544573783874512
0.805938720703125
0.9775002598762512
1.4016577005386353
0.9416476488113403
1.245983600616455
0.8281059265136719
0.9942266941070557
1.0064728260040283
1.0215492248535156
0.7538716793060303
1.3501200675964355
1.2648755311965942
1.1244181394577026
1.4138927459716797
1.120267629623413
1.2658792734146118
1.1193331480026245
1.225368857383728
0.9352459907531738
1.344355821609497
1.3682149648666382
0.8929122686386108
1.2891981601715088
1.0813945531845093
0.7071747183799744
0.6153106689453125
1.9496939182281494
0.7802537679672241
1.2509560585021973
0.9369244575500488
0.9423636198043823
1.4796420335769653
1.149080514907837
0.7265802621841431
1.3039642572402954
0.7999440431594849
1.118280053138733
0.8287081718444824
0

0.5316310524940491
0.5844495296478271
0.39467987418174744
0.3952609896659851
0.5198977589607239
0.6502028703689575
0.9184147715568542
0.4250558614730835
0.270436555147171
0.6357858777046204
0.6086827516555786
0.35648033022880554
0.3174479901790619
0.6489050388336182
0.35355278849601746
0.6070705056190491
0.42419958114624023
0.6033890247344971
0.39801421761512756
0.4398368000984192
0.47152596712112427
0.30323582887649536
0.5952702164649963
0.5096852779388428
1.1189260482788086
0.6208342909812927
0.724920392036438
0.9527333974838257
0.3875076472759247
0.13163405656814575
0.46822822093963623
0.41942328214645386
0.4265613257884979
0.7753093242645264
0.5066714286804199
0.5372862219810486
0.2620290219783783
0.47119730710983276
0.6540384888648987
0.6034553050994873
0.632429301738739
0.5472471714019775
0.5116559267044067
0.3924092650413513
0.6240493059158325
0.4681586027145386
0.40451860427856445
0.4927898943424225
0.7266154289245605
0.3539683222770691
0.33151084184646606
0.6529977917671204
0.

(1206, 1.0429923138511714)

In [None]:
# evaluate the mode ...

In [None]:
import json
import copy

In [10]:
processor = DataProcessor()
examples = processor.get_train_examples(path_data)

['idx', 'utterance', 'label']


In [17]:
for (ex_index, example) in enumerate(examples):
    print(example)

{
  "guid": "train-10064",
  "label": "successions_884_process",
  "text_a": "Proc\u00e9dure Instruction D\u00e9c\u00e8s - Constitution du dossier b\u00e9n\u00e9ficiaire - Vie - Pr\u00e9voyance - CONTRATS VIE - Constitution du dossier administratif",
  "text_b": null
}

{
  "guid": "train-1524",
  "label": "com.annuelle_302_supports",
  "text_a": "A quoi correspondent les revenus de l'OPCI ?",
  "text_b": null
}

{
  "guid": "train-3876",
  "label": "personnes protegees_156_habilitations familiales",
  "text_a": "Quels sont les actes accept\u00e9s par l'habiliation familiale g\u00e9n\u00e9rale ?",
  "text_b": null
}

{
  "guid": "train-3477",
  "label": "non-resident_233_adhesion",
  "text_a": "Y'a t'il une blacklist de certains pays qui ne sont pas autoris\u00e9s \u00e0 souscrire ?",
  "text_b": null
}

{
  "guid": "train-5447",
  "label": "personnes protegees_209_signature_tutelle",
  "text_a": "signature mandat seoa",
  "text_b": null
}

{
  "guid": "train-10472",
  "label": "Unites

{
  "guid": "train-423",
  "label": "clause beneficiaire_389_procedure",
  "text_a": "modifier clause benef",
  "text_b": null
}

{
  "guid": "train-2044",
  "label": "com.annuelle_334_fondseuros",
  "text_a": "Que signifie le TMG ?",
  "text_b": null
}

{
  "guid": "train-432",
  "label": "clause beneficiaire_390_procedure",
  "text_a": "Est-il possible pour le b\u00e9n\u00e9ficiaire de renoncer \u00e0 la succession ?",
  "text_b": null
}

{
  "guid": "train-7605",
  "label": "rachat_081_processus-outils",
  "text_a": "De quelles pieces ai-je besoin pour saisir un rachat ?",
  "text_b": null
}

{
  "guid": "train-4011",
  "label": "personnes protegees_163_lexique",
  "text_a": "A quoi correspond un acte de disposition ?",
  "text_b": null
}

{
  "guid": "train-2236",
  "label": "com.annuelle_358_arbitrage",
  "text_a": "changer la r\u00e9partition du contrat",
  "text_b": null
}

{
  "guid": "train-3750",
  "label": "non-resident_274_fiscalite-deces",
  "text_a": "Explique moi la fisc

{
  "guid": "train-3562",
  "label": "non-resident_235_adhesion",
  "text_a": "Puis-je accepter des coordonn\u00e9es bancaires \u00e9trangeres ?",
  "text_b": null
}

{
  "guid": "train-7128",
  "label": "rachat_042_info-generale",
  "text_a": "Comment racheter un contrat pendant la periode de renonciation ?",
  "text_b": null
}

{
  "guid": "train-8438",
  "label": "successions_1010_process",
  "text_a": "Comment constituer le dossier de r\u00e8glement?",
  "text_b": null
}

{
  "guid": "train-9324",
  "label": "successions_290_infos",
  "text_a": "Aide moi sur les quotit\u00e9s",
  "text_b": null
}

{
  "guid": "train-3563",
  "label": "non-resident_235_adhesion",
  "text_a": "Est-ce que les coordonn\u00e9es bancaires \u00e9trangeres sont \u00e9ligibles ?",
  "text_b": null
}

{
  "guid": "train-589",
  "label": "clause beneficiaire_401_general",
  "text_a": "Que sais tu sur les clauses b\u00e9n\u00e9ficiaires ?",
  "text_b": null
}

{
  "guid": "train-9354",
  "label": "successions_

{
  "guid": "train-269",
  "label": "Capitalisation_437_procedure",
  "text_a": "Comment contacter la CDC au sujet des bons de capitalisation anonyme ?",
  "text_b": null
}

{
  "guid": "train-2592",
  "label": "demembrement_001_piecesjustificatives",
  "text_a": "pieces justificatives clause demembr\u00e9e",
  "text_b": null
}

{
  "guid": "train-6824",
  "label": "prevoyance_583_lexique",
  "text_a": "Quelle est la franchise en cas de maladie dans CERAMIK ?",
  "text_b": null
}

{
  "guid": "train-1581",
  "label": "com.annuelle_304_supports",
  "text_a": "D\u00e9finie moi le nombre d'unit\u00e9s de compte",
  "text_b": null
}

{
  "guid": "train-6441",
  "label": "prevoyance_238_process",
  "text_a": "Comment traiter un rejet de pr\u00e9l\u00e8vement dans le 3270 Suspens comptable ?",
  "text_b": null
}

{
  "guid": "train-2953",
  "label": "demembrement_019_eligibilite",
  "text_a": "points attention clause b\u00e9n\u00e9ficiaire",
  "text_b": null
}

{
  "guid": "train-169",
  "la

In [15]:
examples[0]

{
  "guid": "train-10064",
  "label": "successions_884_process",
  "text_a": "Proc\u00e9dure Instruction D\u00e9c\u00e8s - Constitution du dossier b\u00e9n\u00e9ficiaire - Vie - Pr\u00e9voyance - CONTRATS VIE - Constitution du dossier administratif",
  "text_b": null
}

In [19]:
inputs = tokenizer.encode_plus(
    examples[0].text_a,
    examples[0].text_b,
    add_special_tokens=True,
    max_length=128,
)
inputs

{'special_tokens_mask': [1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 'input_ids': [5,
  28306,
  21,
  28579,
  8750,
  2257,
  67,
  7181,
  25,
  1421,
  16267,
  67,
  4171,
  67,
  2852,
  4629,
  1269,
  67,
  28078,
  26155,
  24426,
  67,
  7181,
  25,
  1421,
  6823,
  6],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [26]:
ut = examples[0].text_a
print(ut)
print(tokenizer.encode(ut))
print([tokenizer._convert_id_to_token(idx) for idx in tokenizer.encode(ut)])

Procédure Instruction Décès - Constitution du dossier bénéficiaire - Vie - Prévoyance - CONTRATS VIE - Constitution du dossier administratif
[5, 28306, 21, 28579, 8750, 2257, 67, 7181, 25, 1421, 16267, 67, 4171, 67, 2852, 4629, 1269, 67, 28078, 26155, 24426, 67, 7181, 25, 1421, 6823, 6]
['<s>', '▁Procédure', '▁', 'Instruction', '▁Déc', 'ès', '▁-', '▁Constitution', '▁du', '▁dossier', '▁bénéficiaire', '▁-', '▁Vie', '▁-', '▁Pré', 'voy', 'ance', '▁-', '▁CONTR', 'ATS', '▁VIE', '▁-', '▁Constitution', '▁du', '▁dossier', '▁administratif', '</s>']


In [27]:
train_dataset[0]

(tensor([    5, 28306,    21, 28579,  8750,  2257,    67,  7181,    25,  1421,
         16267,    67,  4171,    67,  2852,  4629,  1269,    67, 28078, 26155,
         24426,    67,  7181,    25,  1421,  6823,     6,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

In [30]:
outputs = model(**{'input_ids': train_dataset[0][0].to(device),
                 'attention_mask': train_dataset[0][1].to(device),
                 'labels': train_dataset[0][3].to(device),
                 'token_type_ids': None})


UnboundLocalError: local variable 'extended_attention_mask' referenced before assignment

In [33]:
train_dataset[0][3]

tensor(945)

In [None]:
eval_dataset = load_and_cache_examples(tokenizer=tokenizer, max_seq_length=128, data_dir="data/",
                                       evaluate=True, verbose=-1)

eval_batch_size = 1
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)

# Eval!
eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    model.eval()
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[3]}
        inputs['token_type_ids'] = None
        outputs = model(**inputs)
        print(outputs)

HBox(children=(IntProgress(value=0, description='Evaluating', max=947, style=ProgressStyle(description_width='…

(tensor(7.0042, device='cuda:0'), tensor([[-0.0251, -0.0207, -0.1045,  ..., -0.0234, -0.0853,  0.0026]],
       device='cuda:0'))
(tensor(6.9261, device='cuda:0'), tensor([[-0.0595,  0.0143, -0.0857,  ..., -0.0534, -0.0819,  0.0025]],
       device='cuda:0'))
(tensor(6.8802, device='cuda:0'), tensor([[-0.0484,  0.0155, -0.1004,  ..., -0.0111, -0.0926,  0.0050]],
       device='cuda:0'))
(tensor(7.0199, device='cuda:0'), tensor([[-0.0203,  0.0121, -0.1016,  ..., -0.0463, -0.0923,  0.0075]],
       device='cuda:0'))
(tensor(6.9273, device='cuda:0'), tensor([[-0.0175, -0.0030, -0.0995,  ..., -0.0335, -0.0834,  0.0084]],
       device='cuda:0'))
(tensor(6.9906, device='cuda:0'), tensor([[-0.0507,  0.0202, -0.1175,  ..., -0.0633, -0.0394, -0.0101]],
       device='cuda:0'))
(tensor(7.0462, device='cuda:0'), tensor([[-0.0262,  0.0217, -0.0977,  ..., -0.0276, -0.0840, -0.0137]],
       device='cuda:0'))
(tensor(7.0374, device='cuda:0'), tensor([[-0.0361,  0.0148, -0.1168,  ..., -0.0351, -0.06

(tensor(6.8549, device='cuda:0'), tensor([[-0.0573,  0.0013, -0.0903,  ..., -0.0351, -0.0739, -0.0045]],
       device='cuda:0'))
(tensor(6.9094, device='cuda:0'), tensor([[-0.0278,  0.0164, -0.0964,  ..., -0.0580, -0.0775, -0.0005]],
       device='cuda:0'))
(tensor(6.8955, device='cuda:0'), tensor([[-0.0474,  0.0135, -0.0984,  ..., -0.0198, -0.0834, -0.0158]],
       device='cuda:0'))
(tensor(6.8893, device='cuda:0'), tensor([[-0.0771, -0.0118, -0.0984,  ...,  0.0065, -0.0291, -0.0049]],
       device='cuda:0'))
(tensor(6.8936, device='cuda:0'), tensor([[-0.0397,  0.0336, -0.0937,  ..., -0.0299, -0.0818, -0.0063]],
       device='cuda:0'))
(tensor(6.9010, device='cuda:0'), tensor([[-0.0166,  0.0128, -0.0980,  ..., -0.0277, -0.0965,  0.0020]],
       device='cuda:0'))
(tensor(6.9071, device='cuda:0'), tensor([[-0.0457,  0.0343, -0.0988,  ..., -0.0374, -0.0857,  0.0026]],
       device='cuda:0'))
(tensor(6.9577, device='cuda:0'), tensor([[-0.0444, -0.0026, -0.0994,  ..., -0.0231, -0.11

(tensor(6.9724, device='cuda:0'), tensor([[-0.0097,  0.0153, -0.0888,  ..., -0.0528, -0.0717, -0.0047]],
       device='cuda:0'))
(tensor(6.9166, device='cuda:0'), tensor([[-0.0527,  0.0132, -0.0833,  ..., -0.0290, -0.0732,  0.0022]],
       device='cuda:0'))
(tensor(6.9011, device='cuda:0'), tensor([[-0.0350, -0.0058, -0.0872,  ..., -0.0346, -0.0696, -0.0048]],
       device='cuda:0'))
(tensor(7.0450, device='cuda:0'), tensor([[-0.0667,  0.0062, -0.0731,  ..., -0.0170, -0.0824, -0.0028]],
       device='cuda:0'))
(tensor(7.0438, device='cuda:0'), tensor([[-0.0452,  0.0262, -0.0984,  ..., -0.0274, -0.0739, -0.0007]],
       device='cuda:0'))
(tensor(6.9950, device='cuda:0'), tensor([[-0.0263, -0.0080, -0.0900,  ..., -0.0436, -0.0881, -0.0024]],
       device='cuda:0'))
(tensor(7.0492, device='cuda:0'), tensor([[-0.0172,  0.0174, -0.0943,  ..., -0.0261, -0.0919,  0.0068]],
       device='cuda:0'))
(tensor(6.9074, device='cuda:0'), tensor([[-0.0387, -0.0175, -0.0884,  ..., -0.0340, -0.12

(tensor(6.9811, device='cuda:0'), tensor([[-0.0360, -0.0073, -0.0967,  ..., -0.0401, -0.0809,  0.0093]],
       device='cuda:0'))
(tensor(6.9576, device='cuda:0'), tensor([[-0.0181,  0.0030, -0.1042,  ..., -0.0344, -0.0866,  0.0124]],
       device='cuda:0'))
(tensor(6.9772, device='cuda:0'), tensor([[-0.0406,  0.0284, -0.0922,  ..., -0.0344, -0.0929,  0.0176]],
       device='cuda:0'))
(tensor(6.9762, device='cuda:0'), tensor([[-0.0449,  0.0247, -0.0901,  ..., -0.0334, -0.0925,  0.0182]],
       device='cuda:0'))
(tensor(7.0032, device='cuda:0'), tensor([[-0.0215,  0.0186, -0.1059,  ..., -0.0322, -0.0673, -0.0049]],
       device='cuda:0'))
(tensor(7.0133, device='cuda:0'), tensor([[-0.0337,  0.0236, -0.0924,  ..., -0.0304, -0.0718, -0.0076]],
       device='cuda:0'))
(tensor(7.0502, device='cuda:0'), tensor([[-0.0303,  0.0129, -0.0972,  ..., -0.0394, -0.1010,  0.0018]],
       device='cuda:0'))
(tensor(6.9427, device='cuda:0'), tensor([[-0.0035,  0.0101, -0.1100,  ..., -0.0306, -0.07

(tensor(7.0103, device='cuda:0'), tensor([[-0.0226,  0.0089, -0.0963,  ..., -0.0485, -0.1062, -0.0016]],
       device='cuda:0'))
(tensor(6.8717, device='cuda:0'), tensor([[-0.0115,  0.0041, -0.1077,  ..., -0.0434, -0.0968,  0.0104]],
       device='cuda:0'))
(tensor(6.8878, device='cuda:0'), tensor([[-0.0299,  0.0053, -0.0816,  ..., -0.0565, -0.0838,  0.0050]],
       device='cuda:0'))
(tensor(6.9492, device='cuda:0'), tensor([[-0.0272, -0.0149, -0.0816,  ..., -0.0460, -0.0838,  0.0045]],
       device='cuda:0'))
(tensor(6.9964, device='cuda:0'), tensor([[-0.0191, -0.0031, -0.0963,  ..., -0.0684, -0.0848, -0.0037]],
       device='cuda:0'))
(tensor(6.9442, device='cuda:0'), tensor([[-0.0247,  0.0274, -0.0899,  ..., -0.0676, -0.0833,  0.0124]],
       device='cuda:0'))
(tensor(6.8646, device='cuda:0'), tensor([[-0.0225, -0.0133, -0.1030,  ..., -0.0424, -0.0954,  0.0100]],
       device='cuda:0'))
(tensor(7.0115, device='cuda:0'), tensor([[-0.0519, -0.0033, -0.0977,  ..., -0.0173, -0.04

(tensor(7.0062, device='cuda:0'), tensor([[-0.0565, -0.0036, -0.0729,  ..., -0.0335, -0.0544, -0.0095]],
       device='cuda:0'))
(tensor(6.9715, device='cuda:0'), tensor([[-0.0319,  0.0105, -0.0852,  ..., -0.0747, -0.0988,  0.0089]],
       device='cuda:0'))
(tensor(6.9866, device='cuda:0'), tensor([[-2.1073e-02,  2.9512e-02, -8.5130e-02,  ..., -3.5076e-02,
         -9.5950e-02, -2.0463e-05]], device='cuda:0'))
(tensor(6.9891, device='cuda:0'), tensor([[-0.0196,  0.0428, -0.0891,  ..., -0.0380, -0.0861,  0.0080]],
       device='cuda:0'))
(tensor(6.9737, device='cuda:0'), tensor([[-0.0346,  0.0221, -0.0865,  ..., -0.0338, -0.0782,  0.0109]],
       device='cuda:0'))
(tensor(7.0251, device='cuda:0'), tensor([[-0.0321,  0.0307, -0.1015,  ..., -0.0341, -0.0792, -0.0136]],
       device='cuda:0'))
(tensor(7.0227, device='cuda:0'), tensor([[-0.0514,  0.0087, -0.1079,  ..., -0.0590, -0.0609,  0.0029]],
       device='cuda:0'))
(tensor(7.0128, device='cuda:0'), tensor([[-0.0600,  0.0156, -0.

In [39]:
eval_dataset = load_and_cache_examples(tokenizer=tokenizer, max_seq_length=128, data_dir="data/",
                                       evaluate=True, verbose=-1)
