<a href="https://colab.research.google.com/github/cerezamo/NLP_brouillon/blob/master/Camembert_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CamemBERT classification model 


In [1]:
import spacy 
import numpy as np 
import pandas as pd 
import os 
os.getcwd()
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Set up Colab GPU 

In [2]:
%tensorflow_version 1.x
# First you should go in 'Edit' -> 'Notebook settings' -> Add device GPU
import tensorflow as tf


# GPU device name.
device_name = tf.test.gpu_device_name()
device_name

TensorFlow 1.x selected.


'/device:GPU:0'

Let's now tell torch that one GPU is available 

In [3]:
import torch

if torch.cuda.is_available():  
        
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


Let's install the Hugging Face Library transformer package 

In [4]:
! pip install transformers 



### Loading our corpus and preprocessing 

In [0]:
import pandas as pd
# Import medium_df_desq in "files"
# Load the dataset into a pandas dataframe.
#df=pd.read_csv('medium_df_deseq.csv',encoding='utf-8')
df=pd.read_csv('medium_df_deseq.csv',encoding='utf-8')

# We replace the labels in a more normalized way : 0=men, 1=women 
df.sexe=df.sexe.replace(1,0)
df.sexe=df.sexe.replace(2,1)


In [0]:
# Make results reproducible 
seed_val = 2020

In [0]:
def unbalanced_preprocess(df,seed_val,frac_val):

  from sklearn.model_selection import train_test_split

  #Shuffle the data 
  df_unbalanced=df.sample(frac=frac_val).reset_index()

  # Reduce to the variables we are interested in 
  df_unbalanced=df_unbalanced[['Texte','sexe']]

  # Report the number of speeches in the corpus.
  print('Number of text in the unbalanced corpus : {:,}\n'.format(df_unbalanced.shape[0]))
  prop = (len(df_unbalanced[df_unbalanced.sexe==1])/len(df_unbalanced))*100
  print('Proportions of women in the unbalanced corpus : {}\n'.format(prop))

  # We keep one little sample for evaluation 
  model_unbalanced, dev_unbalanced = train_test_split(df_unbalanced, test_size=0.02,random_state=seed_val)

  return model_unbalanced, dev_unbalanced

In [9]:
df_unbalanced = unbalanced_preprocess(df,seed_val,frac_val=0.5)[0]

Number of text in the unbalanced corpus : 2,500

Proportions of women in the unbalanced corpus : 25.8



In [0]:
def balanced_preprocess(df,seed_val,frac_val):
  from sklearn.model_selection import train_test_split

  # Let's take a balanced sample 
  df_m = df.loc[df['sexe'] == 0]
  df_f = df.loc[df['sexe'] == 1] 
  df_m = df_m[0:len(df_f)]
  df = df_f.append(df_m)

  #Shuffle the data and taking half of the sample in order not to have to many data compared to the other samples 
  df_balanced=df.sample(frac=1).reset_index()

  # Reduce to the variables we are interested in 
  df_balanced=df_balanced[['Texte','sexe']]

  # Report the number of speeches in the corpus.
  print('Number of text in this corpus : {:,}\n'.format(df_balanced.shape[0]))
  prop = (len(df_balanced[df_balanced.sexe==1])/len(df_balanced))*100
  print('Proportions of women in the balanced corpus : {}\n'.format(prop))

  # Keep one little sample for evaluation 
  model_balanced, dev_balanced = train_test_split(df_balanced, test_size=0.02,random_state=seed_val)

  return model_balanced, dev_balanced



In [11]:
df_balanced = balanced_preprocess(df,seed_val,frac_val=1)[0]

Number of text in this corpus : 2,500

Proportions of women in the balanced corpus : 50.0



In [0]:
def balanced_splitted(df,seed_val,frac_val):
  from sklearn.model_selection import train_test_split
    # Reduce to the variables we are interested in 
  df=df[['Texte','sexe']]

    # Let's take a balanced sample 
  df_m = df.loc[df['sexe'] == 0]
  df_f = df.loc[df['sexe'] == 1] 
  df_m = df_m[0:len(df_f)]
  df = df_f.append(df_m)

   #Shuffle the data
  df=df.sample(frac=frac_val).reset_index()

  from itertools import repeat

  n=2500
  chunks, label_split=[],[]
  j=0
  for text in df.Texte :
      txt=[text[i:i+n] for i in range(0, len(text), n)]
      chunks.append(txt)
      label_split.extend(repeat(df.sexe[j], len(txt)))
      j+=1

  chunks = [item for sublist in chunks for item in sublist]
  df=pd.DataFrame([chunks,label_split]).transpose()
  df.columns=['Texte','sexe']

  # Put as integer 
  df['sexe'] = df['sexe'].astype(int)

  df_balanced_split= df

  # Report the number of speeches in the corpus.
  print('Number of text in this balanced splitted corpus : {:,}\n'.format(df_balanced_split.shape[0]))
  prop = (len(df_balanced_split[df_balanced_split.sexe==1])/len(df_balanced_split))*100
  print('Proportions of women in the balanced splitted corpus : {}\n'.format(prop))

  # Keep one little sample for evaluation 
  model_balanced_split, dev_balance_split = train_test_split(df_balanced_split, test_size=0.02,random_state=seed_val)
  
  return model_balanced_split, dev_balance_split

In [13]:
df_balanced_split = balanced_splitted(df,seed_val,0.2)[0]

Number of text in this balanced splitted corpus : 2,642

Proportions of women in the balanced splitted corpus : 54.769114307342925



**We propose 3 samples to train our model :**


1.   **Unbalanced sample**

We take the raw data without any further treatment.

2.   **Balanced sample**

The second option consist in deleting randomly part of male speeches in order to get a balanced sample. Indeed, in the case of unbalanced sample our model could decide to classify all speakers in the male category which would lead to a 0.75 accuracy in our case study. In order to avoid this we feed the model with the same proportions of male and female speakers. Other kind of treatments exist to deal with unbalanced sample. This one is the simpliest one and we could argue that there is a possibility that the deleted sample contains important information that we therefore miss. However we believe that in our case this is not a big issue. Our unbalanced sample is quite large for both female and male.

3. **Balanced and splitted sample**

The third option is a response to the max length constraint of BERT models. Our text samples are big and contain much more tokens than the 512 limit. In the first two options we decide to just feed the model with the 512 first tokens and thus delete the rest of them. In this third option we cut the text into x parts containing 500 tokens each. All parts of the speech will serve to feed the model. By this technique we do not loose potential important informations at the end of the text. A lot of other techniques have been employed (see ref !!! PUT). We decide to stick to this one in this project. 



#### Tokenization of our text and preparing to feed CamemBERT

#### Loading the Camembert Tokenizer

In [17]:
# Import Camembert tokenizer
from transformers import CamembertTokenizer
# We choose a right padding side for the moment and we will test for a left padding side on a second stage
tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=False,padding_side='right') #left

HBox(children=(IntProgress(value=0, description='Downloading', max=810912, style=ProgressStyle(description_wid…




In [18]:
# Print the original text.
print(' Original: ', df.Texte[0])

# Print the text split into tokens.
print('Tokenized: ', tokenizer.tokenize(df.Texte[0]))

# Print the text mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df.Texte[0])))

 Original:  Messieurs,Je suis heureux de vous saluer. Quand je dis que je suis heureux de vous saluer, ce n'est pas une simple affirmation de politesse. Je le disais à l'instant à Monsieur le Ministre de la Défense, M. Richard, c'est pour moi un instant où il y a un peu d'émotion; je vais vous dire pourquoi.Vous êtes la première classe d'âge qui ne fera pas de service militaire. C'est une décision que j'ai prise, il y a deux ans, après une vraie réflexion et un vrai débat. Après tout, le service militaire c'est une vieille tradition nationale, il était plus que centenaire. Il y avait toutes sortes de raisons à cela, notamment la nécessité d'avoir une armée nombreuse et donc d'avoir des jeunes formés aux combats, à l'utilisation des armes de l'époque.On pouvait s'interroger sur la nécessité de poursuivre dans cette voie. Il y avait naturellement des critiques, il y avait beaucoup de jeunes qui se disaient qu'ils perdaient un peu leur temps, d'autres qui étaient satisfaits. Mais il y ava

#### Adding special tokens to the start and end of the text


Preprocessing steps : 


1.   **Add special tokens [CLS] [SEP]** 

According to the documentation we need to add special tokens to the start and end of the text Moreover, for camembert we should add a space between CLS and the first token (not sure here, we have to ask benjamin). 

2.   **Pad and truncate all texts to a single number**

Pretrained transformes like Camembert only accept input of the same length. Our corpus contains large texts and we have to pad them in order to be able to feed Camembert. We will set the max length to a large number in order to get all information possible in the text. We choose a max length of 500 which is almost the maximum (512) "sentence" length  accepted. We are aware that this choice will impact a lot training speed.

3.   **Construct an attention mask**

Attention masks are just set to 1 when the token have to be analyzed and 0 otherwise (padded tokens). All our attention mask should be 1 with this corpus. 



For sake of simplicity and to avoid errors we will use the function encode_plus of the library which is really convenient. 



#### Length and attention mask 

In [0]:
def prepare_to_feed(df,length,batch_size_value):
  from torch.utils.data import TensorDataset, random_split
  from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

  texts = df.Texte.values
  labels = df.sexe.values

  # Tokenize all of the sentences and map the tokens to thier word IDs.
  input_ids = []
  attention_masks = []
  num_truncated_tokens =[]
  # Apply function to our corpus
  for text in texts:
      encoded_dict = tokenizer.encode_plus(
                          text,                      # text
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = length,           # We choose for now a max length of 500.
                          pad_to_max_length = True,    # Pad text to max (marche pas en pad left ?)
                          return_attention_mask = True,   # Construct attention masks
                          return_tensors = 'pt',     # Return pytorch tensors.
                          return_overflowing_tokens =True, # return overflowing token information
                    )
      
      # Map tokens to their id in the dictionnary 
      # We add this to our list    
      input_ids.append(encoded_dict['input_ids'])
  
      #num_truncated_tokens.append(encoded_dict['num_truncated_tokens'])
      
      # 3. Attention masks
      attention_masks.append(encoded_dict['attention_mask'])

  # We convert all this into tensors in order to be able to make it work on GPU 
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  labels = torch.tensor(labels)

  # Original text and transformed tensor print 
  print("Let's check for the first text indexes, attention masks and labels")
  print(" ")
  print('Original: ', texts[0][0:100])
  print('IDs:', input_ids[0][0:100])
  print('Attention masks:', attention_masks[0][0:100])
  print('labels',labels[0])


  # Combine all above
  dataset = TensorDataset(input_ids, attention_masks, labels)

  # Let's create a 80-20 train / validation dataset 
  train_size = int(0.8 * len(dataset))
  val_size = len(dataset) - train_size

  train_set, val_set = random_split(dataset, [train_size, val_size])


  print("-------------------------------------------------")
  print(" ")
  print("How many texts do we have in the train and validation sample ? ")
  print(" ")
  print('We have {} training texts'.format(train_size))
  print('We have {} validation texts'.format(val_size))
  print(" ")
  print("-------------------------------------------------")

  # We set the size of the batch ( usually set around 16 or 32), we will take the lower bound because of the large text length
  batch_size = batch_size_value

  # We create data loaders for the train and validation dataset. 
  train_dataloader = DataLoader(
              train_set,  # The training samples.
              sampler = RandomSampler(train_set), # Select batches randomly
              batch_size = batch_size # Trains with this batch size.
          )

  val_dataloader = DataLoader(
              val_set, # The validation samples.
              sampler = SequentialSampler(val_set), # Pull out batches sequentially.
              batch_size = batch_size # Evaluate with this batch size.
          )
  
  print('Data loaders created for train [0] and val [1]')

  return train_dataloader, val_dataloader 

In [0]:
print('############### Unbalanced sample ################')
train_loader_unbalanced, val_loader_unbalanced = prepare_to_feed(df_unbalanced,length=500,batch_size_value=16)

############### Unbalanced sample ################
Let's check for the first text indexes, attention masks and labels
 
Original:  Monsieur le Président,Mesdames et Messieurs les députés.A ce stade de la procédure parlementaire, la
IDs: tensor([    5,  2445,    16,  1850,     7,  6684,    10,  9402,    80,    14,
        19923,    19,  6664,     9,   243,    44,  2623,     8,    13,  2365,
        10023,     7,    13,  3787,   945,    30,  1413, 25454,     8,    17,
           11,   649,    20,  2756,     8,    44,   930,     7,     8,    17,
           11,  1159,    31,    33, 21585,    15,    58,    21,  5686,     7,
           14,   743,     8,    13,  3624,    31,    17,    11,  9486,     9,
          137,    40,     7,    19,  5743,    27,    63,   296,   331,    66,
           15,    17,    11,  1130,    32,    16,  2031,     8,    17,    11,
        13211,    26,  1496,     7,  2031,    27,    39,   349,  4925,    15,
           28,    95,  1071,  1651,     7,    39,  2890,   20

In [0]:
print('############### Balanced sample ################')
train_loader_balanced, val_loader_balanced = prepare_to_feed(df_balanced,length=500,batch_size_value=16)

############### Balanced sample ################
Let's check for the first text indexes, attention masks and labels
 
Original:  Mesdames et Messieurs,Je vous remercie pour cette invitation à ouvrir la 7ème édition de vos Assises
IDs: tensor([    5, 23605,    14, 19923,     7,  1684,    39,  5291,    24,    78,
         7030,    15,  2913,    13,   333,   544,  1533,     8,   140,    21,
        22978,    10,     7,  4630,    10,    15,    17,    11,   761,    36,
         1711,     9,   228,    80,    21, 22978,    10,    56,    22,   340,
           23,   322,     8,  1174,    26,   315,  4970,  2762,     7,    31,
         8919,    19,  2348,  1602,    25,  1711,     9,   100, 21968,   200,
           15, 14778,  3460, 25020,  7405,     7,    31,    33,  4402,  6197,
           15,    13,    21, 31756,     8,    17,    11,  7639,    25,   926,
            9,   228,    80,    21, 22978,    10,    56,    99,    24,   202,
           17,    11,   690,     8,    39,  2091,    20,  2552,

In [0]:
print('############### Balanced sample split ################')
train_loader_balanced_split, val_loader_balanced_split = prepare_to_feed(df_balanced_split,length=500,batch_size_value=16)

############### Balanced sample split ################
Let's check for the first text indexes, attention masks and labels
 
Original:  t théorique que les maires s'appliqueraient à eux-mêmes les 35 heures et non les deux fois 35 heures
IDs: tensor([    5,   271,  8777,    27,    19,  2627,    10,    52,    11,  7375,
          488,    15,   474,    26,  2835,    19,  1740,   511,    14,   165,
           19,   116,   151,  1740,   511,   106,    61,  3543,    33,    23,
         1984,     7,    15,   175,    46,    11,    88,  6797,   112,    16,
          248,    27,    19,  2627,    10,    45,    48,  9106,   113,  1085,
           29,    13,  1519,    20,  4563,  9154,     7,    20, 25339,    10,
           47,    20,   126, 19531,     9,    54,   327,     8,   589,  4925,
           22,   118,   544,  1147,    37,    17,    11,  3610,   945,    30,
           28,  9371,     7,    65,    51,   102,    33,   143,    20,  4067,
           15, 19362,     9,   442,    53,   153,  2443, 

5 and 6 seem to be the [CLS] and [SEP] special tokens 


### CamemBERT Sequence Classification model tuning


#### Loading the model


We will finally build up our model. We will use the  CamemBERT model for sequence classification which includes a special top layer designed for this task. 

In [0]:
# Importing from transformers
from transformers import CamembertForSequenceClassification, CamembertConfig

In [24]:
# Loading the model
gender_model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base", 
    num_labels = 2, # We have two different labels Women = 1 and Men =0   
    output_attentions = False, 
    output_hidden_states = False, 
)

HBox(children=(IntProgress(value=0, description='Downloading', max=637, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=445032417, style=ProgressStyle(description_…




In [25]:
# We run the model on the colab GPU 
gender_model.cuda()

CamembertForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

Optimizers and Loss

#### Constructing the training and validation loop 


In [0]:
import numpy as np

from sklearn.metrics import f1_score 

from sklearn.metrics import roc_auc_score 

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def flat_f1_score(labels,preds):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat,pred_flat,zero_division=1)

def flat_roc_auc(labels,preds):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return roc_auc_score(labels_flat,pred_flat,zero_division=0.5)


def table_scores(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return classification_report(labels_flat,pred_flat)


In [0]:

gender_model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base", 
    num_labels = 2, # We have two different labels Women = 1 and Men =0   
    output_attentions = False, 
    output_hidden_states = False, 
)

In [0]:
def train_val_gendermodel(train_loader, val_loader, epochs_val,seed_val,device,lr_value):

  ############################  IMPORT MODEL ################################################
  gender_model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base", 
    num_labels = 2, # We have two different labels Women = 1 and Men =0   
    output_attentions = False, 
    output_hidden_states = False, )

  model = gender_model
  model.cuda()
  
  ############################## RANDOM SEED ##################################################

  import random
 # Let's put a seed to make this result reproducible 
  seed=seed_val
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  ############################### LEARNING RATE SCHEDULER #######################################

  # https://huggingface.co/transformers/migration.html 
  # https://pytorch.org/docs/stable/optim.html (default values)

  import torch.nn as nn
  import torch.optim as optim
  from transformers import AdamW
  from transformers import get_linear_schedule_with_warmup

  epochs = epochs_val # In order to fine tune our model we will first set the number of epochs to 4.

  # We choose Binary cross enthropy with logits loss for the loss computation. It seems to be the most adapted loss to our problem. 
  criterion = nn.BCEWithLogitsLoss()
  #Implements Adam algorithm with weight decay fix.
  opti = AdamW(model.parameters(),
                    lr =lr_value, # learning rate (default = 1e-3)
                    eps = 1e-8 # prevents division by 0 (default = 1e-8)
                  )

  num_training_steps = len(train_loader) * epochs
  # Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period (0 here)
  scheduler = get_linear_schedule_with_warmup(opti, 
                                              num_warmup_steps = 0,
                                              num_training_steps = num_training_steps)
    

  # We want to evaluate the training phase 
  training_stats = []

  for ep in range(0, epochs):
    print('===========Starting Epoch {} / {} =============='.format(ep+1,epochs))
    print('Training starts')

    ################################### TRAINING ################################

    #Put the model in training mode
    model.train()

    # Set the train loss for the epoch to 0 
    total_train_loss = 0

    for step, batch in enumerate(train_loader):
      # Clear gradients 
      model.zero_grad() # (opti.zerograd ? )

      # Cpy the 3 batch to GPU 
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)
      
      #return loss and logits
      loss, logits = model(b_input_ids, 
                          token_type_ids=None, 
                          attention_mask=b_input_mask, 
                          labels=b_labels) 
      
      # Accumulate training loss for all batches 
      total_train_loss += loss.item()

      #Backpropagating the gradients 
      loss.backward()

      # Prevent exploding gradients problem  (forcing the gradients to be small, the parameter updates will not push the parameters too far from their previous values)
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Update parameters 
      opti.step()

      # Update learning rate schedule
      scheduler.step()

    #Calculate the average training loss over all batches  
    avg_train_loss = total_train_loss / len(train_loader)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print('')
    print('And now, validation STARTS')

    ###################### VALIDATION #############################

    # Put model in evaluation mode 
    model.eval()

    # Set statistics to 0
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    total_eval_f1=0
    total_roc_auc = 0 

    # Confusion matrix ?
    predictions, true_labels = [], []

    for batch in val_loader:
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)
      
      # We don't care about gradients for eval

      with torch.no_grad(): 
        (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
      total_eval_loss += loss.item()

        # Move logits and labels to CPU 
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()

      pred_flat = np.argmax(logits, axis=1).flatten()
      labels_flat = label_ids.flatten()

      # Confusion matrix ?
      #val_batch_preds = np.argmax(logits, axis=1)
      #val_batch_labels = label_ids
      #predictions.extend(val_batch_preds)
      #true_labels.extend(val_batch_labels)

      # Accumulation accuracy for all batch
      total_eval_accuracy += flat_accuracy(logits, label_ids)

      # Accumulation f1 for all batch
      total_eval_f1 += f1_score(labels_flat,pred_flat,zero_division=1)

      # Accumulation roc_auc for all batch
      #total_roc_auc += flat_roc_auc(label_ids,logits)
      
      #Final accuracy on all batch
    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

      #Final f1 on all batch
    avg_val_f1 = total_eval_f1 / len(val_loader)
    print("  Accuracy: {0:.2f}".format(avg_val_f1))

     #Final roc_auc over all batch
    #avg_val_roc_auc = total_roc_auc / len(val_loader)
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))

      #Final loss over all batch
    avg_val_loss = total_eval_loss / len(val_loader)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))

    # confusion matrix ? 
    pred_tags = [i for i in predictions]
    valid_tags = [i for i in true_labels]



    training_stats.append(
          {
              'epoch': ep + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Valid. Accur.': avg_val_accuracy,
              'Valid F1' : avg_val_f1,
             # 'Valid ROC AUC' : avg_val_roc_auc,
          }
      )

  print("")
  print("Done !")

  return  training_stats

In [0]:
results_unbalanced = train_val_gendermodel(train_loader=train_loader_unbalanced, val_loader=val_loader_unbalanced, epochs_val=4,seed_val=2020,device=device,lr_value=5e-5)

Training starts

  Average training loss: 0.49

And now, validation STARTS
  Accuracy: 0.76
  Accuracy: 0.54
  Validation Loss: 0.53
Training starts

  Average training loss: 0.33

And now, validation STARTS
  Accuracy: 0.83
  Accuracy: 0.59
  Validation Loss: 0.39
Training starts

  Average training loss: 0.24

And now, validation STARTS
  Accuracy: 0.86
  Accuracy: 0.62
  Validation Loss: 0.41
Training starts

  Average training loss: 0.17

And now, validation STARTS
  Accuracy: 0.87
  Accuracy: 0.67
  Validation Loss: 0.41

Done !


In [0]:
import pandas as pd

In [0]:
df_stats = pd.DataFrame(data=results_unbalanced)
df_stats = df_stats.set_index('epoch')
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Valid F1 avg
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.491583,0.528068,0.764919,0.540591
2,0.325746,0.390983,0.829435,0.591389
3,0.241597,0.411054,0.8625,0.618186
4,0.166739,0.409628,0.865726,0.667426


In [0]:
results_balanced = train_val_gendermodel(train_loader=train_loader_balanced, val_loader=val_loader_balanced, epochs_val=4,seed_val=2020,device=device,lr_value=5e-5)

Training starts

  Average training loss: 0.59

And now, validation STARTS
  Accuracy: 0.79
  Accuracy: 0.73
  Validation Loss: 0.44
Training starts

  Average training loss: 0.39

And now, validation STARTS
  Accuracy: 0.80
  Accuracy: 0.78
  Validation Loss: 0.40
Training starts

  Average training loss: 0.26

And now, validation STARTS
  Accuracy: 0.80
  Accuracy: 0.77
  Validation Loss: 0.46
Training starts

  Average training loss: 0.18

And now, validation STARTS
  Accuracy: 0.80
  Accuracy: 0.76
  Validation Loss: 0.55

Done !


In [0]:
df_stats = pd.DataFrame(data=results_balanced)
df_stats = df_stats.set_index('epoch')
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Valid F1 avg
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.59206,0.442618,0.787903,0.727892
2,0.390691,0.400924,0.802823,0.778951
3,0.26444,0.462377,0.802016,0.774889
4,0.182332,0.549111,0.797984,0.760625


In [0]:
results_balanced_split = train_val_gendermodel(train_loader=train_loader_balanced_split, val_loader=val_loader_balanced_split, epochs_val=4,seed_val=2020,device=device,lr_value=5e-5)

Training starts

  Average training loss: 0.60

And now, validation STARTS
  Accuracy: 0.77
  Accuracy: 0.78
  Validation Loss: 0.49
Training starts

  Average training loss: 0.37

And now, validation STARTS
  Accuracy: 0.81
  Accuracy: 0.80
  Validation Loss: 0.45
Training starts

  Average training loss: 0.24

And now, validation STARTS
  Accuracy: 0.84
  Accuracy: 0.83
  Validation Loss: 0.44
Training starts

  Average training loss: 0.14

And now, validation STARTS
  Accuracy: 0.85
  Accuracy: 0.83
  Validation Loss: 0.48

Done !


In [0]:
df_stats = pd.DataFrame(data=results_balanced_split)
df_stats = df_stats.set_index('epoch')
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Valid F1 avg
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.59651,0.490744,0.769531,0.778711
2,0.366554,0.448279,0.810547,0.797332
3,0.235123,0.435214,0.841797,0.830953
4,0.13806,0.476634,0.847656,0.826127


Analyse

#### Saving the model ?

## Evaluation and qualitative analysis 

We train the model just as before with the attributes and samples we have chosen. In order to increase as much as possible the performance of the model we decide to train on both train and test data in this case. We will keep our development sample in order to make our qualitative analysis. 

In [19]:
df_balanced_split, dev_balanced_split = balanced_splitted(df,seed_val,frac_val=1)
print('Size of the development sample is {}'.format(len(dev_balanced_split)))

Number of text in this balanced splitted corpus : 12,641

Proportions of women in the balanced splitted corpus : 51.41998259631359

Size of the development sample is 253


In [0]:
def prepare_to_feed_train(df,length,batch_size_value):
  from torch.utils.data import TensorDataset, random_split
  from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

  texts = df.Texte.values
  labels = df.sexe.values

  # Tokenize all of the sentences and map the tokens to thier word IDs.
  input_ids = []
  attention_masks = []
  num_truncated_tokens =[]
  # Apply function to our corpus
  for text in texts:
      encoded_dict = tokenizer.encode_plus(
                          text,                      # text
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = length,           # We choose for now a max length of 500.
                          pad_to_max_length = True,    # Pad text to max (marche pas en pad left ?)
                          return_attention_mask = True,   # Construct attention masks
                          return_tensors = 'pt',     # Return pytorch tensors.
                          return_overflowing_tokens =True, # return overflowing token information
                    )
      
      # Map tokens to their id in the dictionnary 
      # We add this to our list    
      input_ids.append(encoded_dict['input_ids'])
  
      #num_truncated_tokens.append(encoded_dict['num_truncated_tokens'])
      
      # 3. Attention masks
      attention_masks.append(encoded_dict['attention_mask'])

  # We convert all this into tensors in order to be able to make it work on GPU 
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  labels = torch.tensor(labels)

  # Original text and transformed tensor print 
  print("Let's check for the first text indexes, attention masks and labels")
  print(" ")
  print('Original: ', texts[0][0:100])
  print('IDs:', input_ids[0][0:100])
  print('Attention masks:', attention_masks[0][0:100])
  print('labels',labels[0])


  # Combine all above
  dataset = TensorDataset(input_ids, attention_masks, labels)

  # We set the size of the batch ( usually set around 16 or 32), we will take the lower bound because of the large text length
  batch_size = batch_size_value

  # We create data loaders for the train and validation dataset. 
  train_dataloader = DataLoader(
              dataset,  # The training samples.
              sampler = SequentialSampler(dataset), 
              batch_size = batch_size # Trains with this batch size.
          )
  
  print('Data loaders created for train [0]')

  return train_dataloader

In [21]:
train_loader_balanced_split= prepare_to_feed_train(df_balanced_split,length=500,batch_size_value=16)

Let's check for the first text indexes, attention masks and labels
 
Original:   par le haut. Nous nous appuyons sur ceux qui « font », sur le terrain, c'est-à-dire vous : les acte
IDs: tensor([    5,    37,    16,   540,     9,   170,    63,  5954,  7263,   273,
           32,   320,    31,    64,   504,   311,    32,    16,   992,     7,
           60,    11,    41,    26,   169,    26,  1755,    39,    43,    19,
         1602,  7786,     7,    19,  2490,     7,    19,  3820,    18,    11,
         1894,     9,   158,   495,   490,     9, 21189,  1717,    43,   334,
         3398,     7, 10886,     7, 23564,     9,   139, 11828,     9,  3975,
          296,  7871,    42,   470,    26,  7859, 21217,  4472,     8,    85,
           13,    64,   345,  1394,    22,   680,   311,    22,   652,   348,
           43,    17,    11, 12935,   547,    36,  1174,    26,   315,     7,
         1093,    19,  5379,     8, 21691,     9,   170,  2545,   145, 11176])
Attention masks: tensor([1, 1, 1,

In [26]:
############################  IMPORT MODEL ################################################
gender_model = CamembertForSequenceClassification.from_pretrained(
"camembert-base", 
num_labels = 2, # We have two different labels Women = 1 and Men =0   
output_attentions = False, 
output_hidden_states = False, )

gender_model.cuda()

############################## RANDOM SEED ##################################################

import random
# Let's put a seed to make this result reproducible 
seed=2020
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

############################### LEARNING RATE SCHEDULER #######################################

# https://huggingface.co/transformers/migration.html 
# https://pytorch.org/docs/stable/optim.html (default values)

import torch.nn as nn
import torch.optim as optim
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

epochs = 1  # In order to fine tune our model we will first set the number of epochs to 4.

# We choose Binary cross enthropy with logits loss for the loss computation. It seems to be the most adapted loss to our problem. 
criterion = nn.BCEWithLogitsLoss()
#Implements Adam algorithm with weight decay fix.
opti = AdamW(gender_model.parameters(),
                lr =5e-5, # learning rate (default = 1e-3)
                eps = 1e-8 # prevents division by 0 (default = 1e-8)
              )

num_training_steps = len(train_loader_balanced_split) * epochs
# Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period (0 here)
scheduler = get_linear_schedule_with_warmup(opti, 
                                          num_warmup_steps = 0,
                                          num_training_steps = num_training_steps)


# We want to evaluate the training phase 
training_stats = []

for ep in range(0, epochs):
  print('===========Starting Epoch {} / {} =============='.format(ep+1,epochs))
  print('Training starts')

  ################################### TRAINING ################################

  #Put the model in training mode
  gender_model.train()

  # Set the train loss for the epoch to 0 
  total_train_loss = 0

  for step, batch in enumerate(train_loader_balanced_split):
    # Clear gradients 
    gender_model.zero_grad() # (opti.zerograd ? )

    # Cpy the 3 batch to GPU 
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    
    #return loss and logits
    loss, logits = gender_model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels) 
    
    # Accumulate training loss for all batches 
    total_train_loss += loss.item()

    #Backpropagating the gradients 
    loss.backward()

    # Prevent exploding gradients problem  (forcing the gradients to be small, the parameter updates will not push the parameters too far from their previous values)
    torch.nn.utils.clip_grad_norm_(gender_model.parameters(), 1.0)

    # Update parameters 
    opti.step()

    # Update learning rate schedule
    scheduler.step()

#Calculate the average training loss over all batches  
avg_train_loss = total_train_loss / len(train_loader_balanced_split)
print("")
print("  Average training loss: {0:.2f}".format(avg_train_loss))
print('')

training_stats.append(
      {
          'epoch': ep + 1,
          'Training Loss': avg_train_loss,
      }
  )

  

Training starts

  Average training loss: 0.48



Preparing development sequence 

In [228]:
dev_balanced_split

Unnamed: 0,Texte,sexe
8785,"collectivités, acteurs économiques, usagers, a...",1
11372,"dèle libéral-financier qui s'est installé, qui...",0
1736,é des concertations menées par ma prédécesseur...,1
4056,s me dire comment il serait éventuellement pos...,1
3415,Je suis très heureuse et honorée de vous retro...,1
...,...,...
5708,"de leçons ni apporter nos recettes, mais en p...",0
6905,vités n'ont pas qu'une fonction économique mai...,0
8259,lzheimer les 30 et 31 octobre 2008 à Paris.Ce ...,1
5687,e. C'est Daech qui a déclaré la guerre contre ...,0


In [0]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


texts = dev_balanced_split.Texte.values
labels = dev_balanced_split.sexe.values

In [0]:

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
num_truncated_tokens =[]
original_text =[]
tokenized_text=[]

for text in texts:
  original_text.append(text)
  tokenized_text.append(tokenizer.tokenize(text))
  encoded_dict = tokenizer.encode_plus(
                        text,                      # text
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 500,           # We choose for now a max length of 500.
                        pad_to_max_length = True,    # Pad text to max (marche pas en pad left ?)
                        return_attention_mask = True,   # Construct attention masks
                        return_tensors = 'pt',     # Return pytorch tensors.
                        return_overflowing_tokens =True, # return overflowing token information
                  )
    
    # Map tokens to their id in the dictionnary 
    # We add this to our list    
  input_ids.append(encoded_dict['input_ids'])

    #num_truncated_tokens.append(encoded_dict['num_truncated_tokens'])
    
    # 3. Attention masks
  attention_masks.append(encoded_dict['attention_mask'])

In [0]:

# We convert all this into tensors in order to be able to make it work on GPU 
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [110]:
# Original text and transformed tensor print 
print("Let's check for the first text indexes, attention masks and labels")
print(" ")
print('Original: ', texts[0][0:100])
print('Tokenized: ', tokenized_text[0])
print('IDs:', input_ids[0][0:100])
print('Attention masks:', attention_masks[0][0:100])
print('labels',labels[0])

Let's check for the first text indexes, attention masks and labels
 
Original:  collectivités, acteurs économiques, usagers, associations de protection de la nature et l'Etat const
Tokenized:  ['▁collectivités', ',', '▁acteurs', '▁économiques', ',', '▁usagers', ',', '▁associations', '▁de', '▁protection', '▁de', '▁la', '▁nature', '▁et', '▁l', "'", 'Etat', '▁constituent', '▁une', '▁formidable', '▁réussite', '▁française', '.', '▁La', '▁preuve', '▁que', '▁l', "'", 'on', '▁peut', '▁concilier', '▁attention', '▁à', '▁la', '▁nature', '▁et', '▁développement', '▁économique', '.', '▁Les', '▁Parc', 's', '▁constituent', '▁à', '▁la', '▁fois', '▁des', '▁espaces', '▁remarquables', ',', '▁riches', '▁de', '▁leurs', '▁patrimoine', 's', '▁naturel', ',', '▁paysager', '▁et', '▁culturel', '▁mais', '▁également', '▁des', '▁lieux', '▁de', '▁vie', '▁puisqu', "'", 'ils', '▁réunissent', '▁plus', '▁de', '▁4', '▁millions', '▁d', "'", 'habitants', '.', '▁Parmi', '▁eux', ',', '▁700', '▁000', '▁personnes', '▁travaillen

In [0]:
# Combine all above
dataset = TensorDataset(input_ids, attention_masks, labels)
batch_size = 1
# We create data loaders for the train and validation dataset. 
dev_dataloader = DataLoader(
            dataset,  # The training samples.
            sampler = SequentialSampler(dataset), # We set to sequential and we keep track 
            batch_size = batch_size # Trains with this batch size.
        )

In [0]:
# Put model in evaluation mode 
gender_model.eval()

total_pred,total_labels, total_eval_loss,scores_max,scores_min=[],[], [],[],[]
for batch in dev_dataloader:
  b_input_ids = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_labels = batch[2].to(device)
  
  # We don't care about gradients for eval

  with torch.no_grad(): 
    (loss, logits) = gender_model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask,
                                labels=b_labels)
  total_eval_loss.append(loss.item())

    # Move logits and labels to CPU 
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  score_max = np.amax(logits, axis=1).flatten().item()
  score_min = np.amin(logits, axis=1).flatten().item()
  pred_flat = np.argmax(logits, axis=1).flatten().item()
  labels_flat = label_ids.flatten().item()

  total_pred.append(pred_flat)
  total_labels.append(labels_flat)
  scores_max.append(score_max)
  scores_min.append(score_min)


In [0]:
results_dev=pd.DataFrame([total_labels,total_pred,scores_max,scores_min]).transpose()
results_dev.columns=['returned_labels','model_pred','scores_max','score_min']

In [0]:
frames = [dev_balanced_split.reset_index(), results_dev]
result = pd.concat(frames,axis=1)

In [230]:
result.head(2)

Unnamed: 0,index,Texte,sexe,returned_labels,model_pred,scores_max,score_min
0,8785,"collectivités, acteurs économiques, usagers, a...",1,1.0,1.0,1.45766,-1.341431
1,11372,"dèle libéral-financier qui s'est installé, qui...",0,0.0,0.0,1.560678,-1.65303


In [229]:
result[result.model_pred!=result.sexe].head(2)

Unnamed: 0,index,Texte,sexe,returned_labels,model_pred,scores_max,score_min
7,981,"Monsieur le Président,Mesdames et Messieurs le...",0,0.0,1.0,0.388484,-0.33079
13,1998,le lien social. Nous généralisons la circonsta...,0,0.0,1.0,0.643722,-0.624046


We want to dive a bit into the model and see how it makes a choice and why it fails on thos 38 sentences. Let's take one of them. We will redo point 4 of TD4 to see the score reached by each word. 

In [0]:
max_score_fail = result[result.model_pred!=result.sexe].scores_max.max()
sentence_to_analyse = result[result.scores_max==max_score_fail]
sentence_to_analyse = sentence_to_analyse[['Texte','sexe']]

In [214]:
sentence_to_analyse['Texte']

252    Monsieur le Ministre, Mesdames, Messieurs, Jea...
Name: Texte, dtype: object

In [0]:
for text in sentence_to_analyse['Texte']:
  tokens = text.split('.')

In [0]:
sentences_to_analyse =pd.DataFrame(tokens)
sentences_to_analyse=sentences_to_analyse.assign(sexe=1)
sentences_to_analyse.columns=['Texte','sexe']


In [231]:
texts

array(['Monsieur le Ministre, Mesdames, Messieurs, Jean GLAVANY vient rappeler le contexte économique, technique dans lequel votre accord a été préparé et négocié',
       ' Vous nous exposerez les uns et les autres les enjeux et les objectifs de cet accord, pour les employeurs et les salariés des entreprises que vous représentez',
       " Je voudrais quant à moi souligner l'intérêt de cet accord qui s'inscrit entièrement dans les problématiques de deux chantiers importants du Ministère de l'Emploi et de la Solidarité, la formation professionnelle et la démocratie sociale et qui par les résultats que nous observons aujourd'hui augurent bien de la suite des travaux engagés",
       " L'Etat a apporté sa contribution dans ce dossier mais il était nécessaire que les partenaires sociaux eux-mêmes s'impliquent pour définir les mesures d'accompagnement les mieux adaptées",
       " C'est aujourd'hui le fruit de votre investissement que nous trouvons dans cet accord national tripartite que j

In [0]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


texts = sentences_to_analyse.Texte.values
labels = sentences_to_analyse.sexe.values

In [0]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
num_truncated_tokens =[]
original_text =[]
tokenized_text=[]

for text in texts:
  original_text.append(text)
  tokenized_text.append(tokenizer.tokenize(text))
  encoded_dict = tokenizer.encode_plus(
                        text,                      # text
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 50,           # We choose for now a max length of 500.
                        pad_to_max_length = True,    # Pad text to max (marche pas en pad left ?)
                        return_attention_mask = True,   # Construct attention masks
                        return_tensors = 'pt',     # Return pytorch tensors.
                        return_overflowing_tokens =True, # return overflowing token information
                  )
    
    # Map tokens to their id in the dictionnary 
    # We add this to our list    
  input_ids.append(encoded_dict['input_ids'])

    #num_truncated_tokens.append(encoded_dict['num_truncated_tokens'])
    
    # 3. Attention masks
  attention_masks.append(encoded_dict['attention_mask'])

In [0]:
# We convert all this into tensors in order to be able to make it work on GPU 
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [0]:
# Combine all above
batch_size = 1
dataset = TensorDataset(input_ids, attention_masks, labels)
# We create data loaders for the train and validation dataset. 
dev_dataloader = DataLoader(
            dataset,  # The training samples.
            sampler = SequentialSampler(dataset), # We set to sequential and we keep track 
            batch_size = batch_size # Trains with this batch size.
        )

In [0]:
# Put model in evaluation mode 
gender_model.eval()

total_pred,total_labels, total_eval_loss,scores_max,scores_min=[],[], [],[],[]
for batch in dev_dataloader:
  b_input_ids = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_labels = batch[2].to(device)
  
  # We don't care about gradients for eval

  with torch.no_grad(): 
    (loss, logits) = gender_model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask,
                                labels=b_labels)
  total_eval_loss.append(loss.item())

    # Move logits and labels to CPU 
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  score_max = np.amax(logits, axis=1).flatten().item()
  score_min = np.amin(logits, axis=1).flatten().item()
  pred_flat = np.argmax(logits, axis=1).flatten().item()
  labels_flat = label_ids.flatten().item()

  total_pred.append(pred_flat)
  total_labels.append(labels_flat)
  scores_max.append(score_max)
  scores_min.append(score_min)


In [0]:
results_dev=pd.DataFrame([total_labels,total_pred,scores_max,scores_min]).transpose()
results_dev.columns=['returned_labels','model_pred','scores_max','score_min']

In [0]:
frames = [sentences_to_analyse.reset_index(), results_dev]
result = pd.concat(frames,axis=1)

In [264]:
result[result.scores_max>=1].Texte

0    Monsieur le Ministre, Mesdames, Messieurs, Jea...
2     Je voudrais quant à moi souligner l'intérêt d...
8     Six mois de négociations ont été nécessaires ...
Name: Texte, dtype: object

KeyError: ignored