In [7]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

import numpy as np
import os
import csv
import datetime
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset # used load data
import torch.optim as optim

from transformers.data.processors.utils import InputExample, InputFeatures
from transformers import (AdamW,
                          get_linear_schedule_with_warmup,
                          get_cosine_with_hard_restarts_schedule_with_warmup)

from transformers import (CamembertConfig,
                          CamembertForSequenceClassification,
                          CamembertTokenizer)


In [8]:
path_data="data/"

In [9]:
df_train = pd.read_csv(path_data + "input_train.csv", sep=",")

In [10]:
df_labels = pd.read_csv(path_data + "output_train.csv", sep=",")

In [11]:
df_test = pd.read_csv(path_data + "input_test_b1Yip6O.csv", sep=",")

In [33]:
class InputExample(object):

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

    
class DataProcessor():
    
    def get_data_examples(self, sentences, labels=None):
        examples = []
        guid = "data"
        if labels is None:
            for i in range(len(sentences)):
                examples.append(InputExample(guid=guid, text_a=sentences[i], text_b=None, label="0"))
        else:
            for i in range(len(sentences)):
                examples.append(InputExample(guid=guid, text_a=sentences[i], text_b=None, label=str(labels[i])))
            
        return examples

    def get_labels(self):
        """See base class."""
        return [str(j) for j in range(51)]

    
def convert_examples_to_features(examples,
                                 tokenizer,
                                 max_length=512,
                                 label_list=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):

    processor = DataProcessor()

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
        
        label = label_map[example.label]
        
        features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              label=label))

    return features

def load_examples(sentences,
                            labels,
                            tokenizer, 
                            max_seq_length,
                            label_list):
 
    processor = DataProcessor()
    examples = processor.get_data_examples(sentences, labels)

    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            label_list=label_list,
                                            max_length=max_seq_length,
                                            pad_on_left=False,
                                            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                            pad_token_segment_id=0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset



In [35]:
def train(model, train_dataset, tokenizer, num_train_epochs, train_batch_size, learning_rate, adam_epsilon=1e-8,
          logging_steps=None, gradient_accumulation_steps=1, max_grad_norm=1.0, weight_decay=0.0,
          warmup_steps=0, save_steps=-1, output_dir=None, evaluate_during_training=False,
          seed=None, max_steps=-1, num_cycles=1.0, eval_dataset = None, verbose=0):
    
    """ Train the model """
    
    assert not(logging_steps > 0 and eval_dataset is None), "logging_steps > 0 but no eval_dataset provided"
    
    if output_dir is None and save_steps > 0:
        output_dir = "model_" + str(datetime.datetime.now()).split(".")[0].replace(" ","_") + "/"
        
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
    
    if logging_steps is None:
        logging_steps = len(train_dataloader) // (gradient_accumulation_steps * 5)
        
    if max_steps > 0:
        t_total = max_steps
        num_train_epochs = max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    
    # pas besoin de la partie custom ci-dessous à priori
    
    #no_decay = ['bias', 'LayerNorm.weight']
    #optimizer_grouped_parameters = [
    #    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    #    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    #    ]
    
    # change l'optimizer pour voir
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon) # optimizer_grouped_parameters
    #optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9) #eps=adam_epsilon , momentum=0.9
    #scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
    scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer,
                                                                   num_warmup_steps=warmup_steps,
                                                                   num_training_steps=t_total,
                                                                   num_cycles=num_cycles)
    #for i, tensor in enumerate(model.parameters()):
    #    if i > 1:
    #        tensor.requires_grad = False

    # Train!
    print("***** Running training *****")
    print("  Num examples = %d" % len(train_dataset))
    print("  Num Epochs = %d" % num_train_epochs)
    print(" Batch size = %d" % train_batch_size)
    print("  Total train batch size (w. parallel, distributed & accumulation) = %d" %
                   train_batch_size * gradient_accumulation_steps)
    print("  Gradient Accumulation steps = %d" % gradient_accumulation_steps)
    print("  Total optimization steps = %d" % t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(num_train_epochs), desc="Epoch")
    if seed is not None:
        set_seed(seed)
        
    for epoch, _ in enumerate(train_iterator):
        # print("Epoch %d / %d" % (epoch, num_train_epochs))
        epoch_iterator = train_dataloader
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[3]}
            inputs['token_type_ids'] = batch[2] #or None
            optimizer.zero_grad()

            outputs = model(**inputs)
            #print(outputs)
            #print(outputs[0].size())
            #print(outputs[1].size())
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            #print(loss)
            
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if verbose > 0:
                print("lr:",scheduler.get_lr()[0], "loss:", loss.item())
                
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if logging_steps > 0 and global_step % logging_steps == 0:
                    if verbose > 0:
                        print("\nEval")
                    # Log metrics
                    dict_print = {'step':global_step,
                                  'lr': scheduler.get_lr()[0],
                                  'tr_loss': (tr_loss - logging_loss)/logging_steps}
                    if evaluate_during_training:
                        results = evaluate(model=model, eval_dataset=eval_dataset,
                                           tokenizer=tokenizer, eval_output_dir=output_dir,
                                           verbose=verbose)
                        for key, value in results.items():
                            dict_print['eval_{}'.format(key)] = value
                    print(dict_print)
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    save_model_dir = os.path.join(output_dir, 'checkpoint-{}'.format(global_step))
                    os.makedirs(save_model_dir)
                    model.save_pretrained(save_model_dir)
                    #torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    print("Saving model checkpoint to %s" % save_model_dir)

            if max_steps > 0 and global_step > max_steps:
                #epoch_iterator.close() #deleted since no tqdm anymore
                break
                
        if max_steps > 0 and global_step > max_steps:
            train_iterator.close()
            break

    if global_step == 0:
        global_step= 1

    return global_step, tr_loss / global_step

In [36]:
def evaluate(model, eval_dataset, tokenizer, eval_batch_size=8, prefix="", eval_output_dir=None,
             verbose=1):
        
    eval_batch_size = eval_batch_size
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)
    
    # Eval!
    if verbose > 0:
        print("***** Running evaluation {} *****".format(prefix))
        print("  Num examples = %d", len(eval_dataset))
        print("  Batch size = %d", eval_batch_size)
        
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    iterator = tqdm(eval_dataloader, desc="Evaluating") if verbose > 0 else eval_dataloader
    
    for batch in iterator:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[3]}
            inputs['token_type_ids'] = batch[2] #or None
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds_class = np.argmax(preds, axis=1)
    acc = accuracy_score(out_label_ids, preds_class)
    
    result = {"val_loss": eval_loss, "val_acc" : acc}
    #results.update(result)

    if eval_output_dir is not None:
        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)
        
        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            writer.write("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                writer.write("  %s = %s" % (key, str(result[key])))
            writer.write("\n")

    return result

In [37]:
def predict(model, dataset, batch_size, verbose=1):
    
    # Note that DistributedSampler samples randomly
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

    # Eval!
    if verbose > 0:
        print("***** Running prediction *****")
        print("  Num examples = %d", len(dataset))
        print("  Batch size = %d", batch_size)

    loss = 0.0
    nb_steps = 0
    preds = None
    out_label_ids = None
    iterator = tqdm(dataloader, desc="Predict") if verbose > 0 else dataloader
    
    for batch in iterator:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[3]}
            inputs['token_type_ids'] = batch[2] #or None
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            loss += tmp_eval_loss.mean().item()
        nb_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    loss = loss / nb_steps
    return preds

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
num_labels=51
model_name="camembert-base"

config = CamembertConfig.from_pretrained(model_name,
                                         num_labels=num_labels,
                                         finetuning_task="classification")

In [19]:
tokenizer = CamembertTokenizer.from_pretrained(model_name,
                                               do_lower_case=True)

In [20]:
model = CamembertForSequenceClassification.from_pretrained(model_name, config=config)
model = model.to(device)


In [21]:
sentences_train, sentences_val, labels_train, labels_val = train_test_split(df_train["question"].values,
                                                                            df_labels["intention"].values,
                                                                            test_size=0.2,
                                                                            random_state=2019)

In [22]:
train_dataset = load_examples(sentences=sentences_train,
                                  labels=labels_train,
                                  tokenizer=tokenizer,
                                  max_seq_length = 128,
                                  label_list = [str(j) for j in range(num_labels)])

Writing example 0


In [23]:
eval_dataset = load_examples(sentences=sentences_val,
                                  labels=labels_val,
                                  tokenizer=tokenizer,
                                  max_seq_length = 128,
                                  label_list = [str(j) for j in range(num_labels)])

Writing example 0


In [24]:
train(model=model, train_dataset=train_dataset, tokenizer=tokenizer, train_batch_size=32,
      learning_rate=3e-5, num_train_epochs=10, evaluate_during_training=True, logging_steps=2,
      max_grad_norm=1.0, save_steps=-1, num_cycles=5.0, max_steps=3, eval_dataset=eval_dataset)


***** Running training *****
  Num examples = 6422
  Num Epochs = 1
 Batch size = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3


HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…



Eval
{'step': 2, 'lr': 2.2500000000000015e-05, 'tr_loss': 3.9270644187927246, 'eval_val_loss': 3.9023621568632363, 'eval_val_acc': 0.02054794520547945}


Eval
{'step': 4, 'lr': 0.0, 'tr_loss': 3.9058990478515625, 'eval_val_loss': 3.8899942262848812, 'eval_val_acc': 0.08468244084682441}



(4, 3.9164817333221436)

# Eval on validation dataset

In [26]:
model = CamembertForSequenceClassification.from_pretrained("model_2019-12-08_14:59:37/checkpoint-400/checkpoint-800/checkpoint-1200/")
model = model.to(device)

In [12]:
evaluate(model, eval_dataset, tokenizer)

Loading features from cached file data/cached_test_128
***** Running evaluation  *****
  Num examples = %d 1606
  Batch size = %d 8


HBox(children=(IntProgress(value=0, description='Evaluating', max=201, style=ProgressStyle(description_width='…




{'val_loss': 1.5273962446409672, 'val_acc': 0.6749688667496887}

# Predict on test set and build submission file

In [22]:
test_dataset = load_examples(sentences=df_test["question"],
                                  labels=None,
                                  tokenizer=tokenizer,
                                  max_seq_length = 128,
                                  label_list = [str(j) for j in range(51)])

Writing example 0


In [24]:
preds_test = predict(model=model,
                     dataset=test_dataset,
                     batch_size = 8)
preds_class_test = np.argmax(preds_test, axis=1)

***** Running evaluation *****
  Num examples = %d 2035
  Batch size = %d 8


HBox(children=(IntProgress(value=0, description='Predict', max=255, style=ProgressStyle(description_width='ini…




In [61]:
sub = pd.DataFrame({"ID": df_test["ID"].values,  "intention": preds_class_test})

In [62]:
sub.head()

Unnamed: 0,ID,intention
0,8028,32
1,8029,32
2,8030,32
3,8031,31
4,8032,44


In [63]:
sub.to_csv("sub/sub0.csv", index=False)