1. [Configuration](#configuration)
2. [Model & Tokenizer](#model)
3. [Datasets](#datasets)
4. [Training](#training)
5. [Inference](#inference)
6. [Embeddings](#embeddings)

In [1]:
%pip install plotly-express
%pip install --upgrade nbformat

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys, os
from tqdm import tqdm

import plotly.express as px

import pickle
import numpy as np
import pandas as pd
import json

import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

from datasets import load_metric


import torch
from torch.utils.data import DataLoader

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BertForPreTraining

import umap

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

  from .autonotebook import tqdm as notebook_tqdm


## 1. Configuration <a class="anchor" id="configuration"></a>

In [3]:
SEED = 42
VERSION = 1
val_split = 0.1

n_epochs = 5
resume_from_checkpoint = True

MODEL_NAME = "dccuchile/bert-base-spanish-wwm-uncased"


BATCH_SIZE = 14

# All classes
ALL_CLASSES_V = ['negativo', 'positivo']
N_CLASSES = len(ALL_CLASSES_V)

## 2. Model & Tokenizer <a class="anchor" id="model"></a>

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [5]:
model = AutoModelForSequenceClassification.from_pretrained('/home/ec2-user/SageMaker/mlm_training/BETO_Conversaciones/checkpoint-34170',
                                                          num_labels=N_CLASSES)

Some weights of the model checkpoint at /home/ec2-user/SageMaker/mlm_training/BETO_Conversaciones/checkpoint-34170 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some w

## 3. Datasets <a class="anchor" id="datasets"></a>

In [6]:
class DS:
    def __init__(
        self,
        dataset_df,
        tokenizer,
        training=False,
        split_seed=42,
        val_split=0.15,
        n_pad=512,
    ):
        
        self.training = training
        self.dataset_df = dataset_df.set_index('num_id')
        self.tokenizer = tokenizer
        self.n_pad = n_pad
        
        self.all_conv_v = dataset_df.num_id.unique()
        
        n_trn_samples = int( self.all_conv_v.shape[0] * (1 - val_split) )
        
        np.random.seed(split_seed)
        idx_v = np.random.permutation(self.all_conv_v.shape[0])
        np.random.seed(None)
        
        if self.training:
            self.all_conv_v = self.all_conv_v[:n_trn_samples]
        else:
            self.all_conv_v = self.all_conv_v[n_trn_samples:]
        
        return None
        
    def __getitem__(self, idx):
        sample_id = self.all_conv_v[idx]
        
        conv_df = self.dataset_df.loc[sample_id]
        
        
        lines_v = conv_df.des_message.values
        max_len = 3 * self.n_pad // len(lines_v)
        
        text = '[SEP]'.join( [' '.join( l.split(' ')[:max_len] ) for l in lines_v] )
        
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.n_pad,
            return_tensors='pt',
        )
        
        
        data = {
            'sample_id':sample_id,
            'conv_df':conv_df,
            
            'text': text,
            'label': torch.tensor( conv_df['cls'].iloc[0], dtype=torch.int64),
        }
        
        for k in inputs.keys():
            data[k] = inputs[k][0]
        
        return data
        
        
    def __len__(self):
        return len(self.all_conv_v)
    
    
    def collate_fn(self, data_v):
        keys_v = tuple( data_v[0].keys() )
        ret_d = {k: list() for k in keys_v}
        
        for k in keys_v:
            for data in data_v:
                ret_d[k].append( data[k] )
        
        for k in keys_v:
            if type( ret_d[k][0] ) is torch.Tensor:
                ret_d[k] = torch.stack( ret_d[k] )
            
            elif type( ret_d[k][0] ) is transformers.tokenization_utils_base.BatchEncoding:
                inputs = self.collate_fn(ret_d[k])                
                ret_d[k] = transformers.tokenization_utils_base.BatchEncoding(inputs)
            
            else:
                ret_d[k] = np.array( ret_d[k], dtype=np.object_ )
                
        return ret_d

In [7]:
dataset_df = pd.read_csv('data/dataset_df_inactivation.csv')

In [8]:
ds_trn = DS(
    dataset_df,
    tokenizer,
    training=True,
    val_split=val_split,
    split_seed=SEED,
)

ds_val = DS(
    dataset_df,
    tokenizer,
    training=False,
    val_split=val_split,
    split_seed=SEED,
)

## 4. Training <a class="anchor" id="training"></a>

In [9]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [10]:
training_args = TrainingArguments(
    output_dir=f"inactivation_models_v{VERSION}",
    num_train_epochs=float(n_epochs),
    evaluation_strategy="epoch",
    
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=2*BATCH_SIZE,
    gradient_accumulation_steps=1,
    
    resume_from_checkpoint=resume_from_checkpoint,
    save_strategy='epoch',
    
    metric_for_best_model='accuracy',
    load_best_model_at_end=True,
    greater_is_better=True,
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    #data_collator=ds_trn.collate_fn,
    train_dataset=ds_trn,
    eval_dataset=ds_val,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 4797
  Num Epochs = 5
  Instantaneous batch size per device = 14
  Total train batch size (w. parallel, distributed & accumulation) = 14
  Gradient Accumulation steps = 1
  Total optimization steps = 1715
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: conv_df, sample_id, text. If conv_df, sample_id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.128085,0.962477
2,0.150200,0.161989,0.956848
3,0.070200,0.192015,0.954972
4,0.070200,0.217709,0.956848
5,0.020400,0.262788,0.953096


***** Running Evaluation *****
  Num examples = 533
  Batch size = 28
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: conv_df, sample_id, text. If conv_df, sample_id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to inactivation_models_v1/checkpoint-343
Configuration saved in inactivation_models_v1/checkpoint-343/config.json
Model weights saved in inactivation_models_v1/checkpoint-343/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 533
  Batch size = 28
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: conv_df, sample_id, text. If conv_df, sample_id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to inactivation_models_

TrainOutput(global_step=1715, training_loss=0.07114963211749108, metrics={'train_runtime': 4588.7254, 'train_samples_per_second': 5.227, 'train_steps_per_second': 0.374, 'total_flos': 6310718662809600.0, 'train_loss': 0.07114963211749108, 'epoch': 5.0})

In [None]:
sys.exit(0)

## Load model

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(
    #'inactivation_models_v99/checkpoint-1372'
    'inactivation_models_v1/checkpoint-343'
)

loading configuration file inactivation_models_v1/checkpoint-343/config.json
Model config BertConfig {
  "_name_or_path": "inactivation_models_v1/checkpoint-343",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.21.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31002
}

loading weights file inactivation_models_v1/checkpoint-343/pytorch_model.bin
All model checkpoint weights were used when initial

## 5. Inference <a class="anchor" id="inference"></a>

In [14]:
def eval_model(model, dl, return_targets=True, device='cpu'):
    preds_v  = []
    label_v = []
    probs_v  = []
    
    
    if model.training:
        model.eval()
    
    model.to(device)
    
    with torch.no_grad():
        for data in tqdm(dl):

            logits = model(
                input_ids=data['input_ids'].to(device),
                token_type_ids=data['token_type_ids'].to(device),
                attention_mask=data['attention_mask'].to(device),
            )['logits']


            probs = logits.softmax(axis=-1).detach().cpu().numpy()
            preds = probs.argmax(axis=-1)

            probs_v.append(probs)
            preds_v.append(preds)

            if return_targets:
                label = data['label'].detach().cpu().numpy()
                label_v.append(label)

    probs_v = np.concatenate(probs_v)
    preds_v = np.concatenate(preds_v)
    
    ret_d = {
        'probs':probs_v,
        'preds':preds_v,
    }    
    
    if return_targets:
        label_v = np.concatenate(label_v)
        ret_d['label'] = label_v
    
    return ret_d

In [15]:
dl_trn = DataLoader(ds_trn, batch_size=2*BATCH_SIZE, collate_fn=ds_trn.collate_fn, shuffle=True)
dl_val = DataLoader(ds_val, batch_size=2*BATCH_SIZE, collate_fn=ds_val.collate_fn, shuffle=False)

### Train data

In [16]:
trn_preds_d = eval_model(model, dl_trn, device='cuda:0')

100%|██████████| 172/172 [05:20<00:00,  1.86s/it]


In [17]:
print('Classification Report: ds_trn')
report_str = classification_report(
    y_true=trn_preds_d['label'],
    y_pred=trn_preds_d['preds'],
    target_names=ALL_CLASSES_V,
    digits=2,
    zero_division=1,
)

print( report_str )

Classification Report: ds_trn
              precision    recall  f1-score   support

    negativo       0.99      0.99      0.99      3515
    positivo       0.96      0.97      0.97      1282

    accuracy                           0.98      4797
   macro avg       0.98      0.98      0.98      4797
weighted avg       0.98      0.98      0.98      4797



### Validation data

In [18]:
val_preds_d = eval_model(model, dl_val, device='cuda:0')

100%|██████████| 20/20 [00:35<00:00,  1.79s/it]


In [19]:
print('Classification Report: ds_val')
report_str = classification_report(
    y_true=val_preds_d['label'],
    y_pred=val_preds_d['preds'],
    target_names=ALL_CLASSES_V,
    digits=2,
    zero_division=1,
)

print( report_str )

Classification Report: ds_val
              precision    recall  f1-score   support

    negativo       0.98      0.97      0.97       384
    positivo       0.93      0.94      0.93       149

    accuracy                           0.96       533
   macro avg       0.95      0.96      0.95       533
weighted avg       0.96      0.96      0.96       533



## Plot embeddings

In [12]:
def eval_model_interaction(model, dl, return_targets=True, add_int_idx=True, device='cpu'):
    embed_lhs_v  = []
    embed_po_v = []
    label_v = []
    sample_id_v = []
    int_idx_v = []
    
    embed_model = model.bert
    
    if embed_model.training:
        embed_model.eval()
        
    embed_model.to(device)
    
    with torch.no_grad():
        for data in tqdm(dl):

            embeddings = embed_model(
                input_ids=data['input_ids'].to(device),
                token_type_ids=data['token_type_ids'].to(device),
                attention_mask=data['attention_mask'].to(device),
            )

            embed_lhs = embeddings['last_hidden_state'][:,0,:].detach().cpu().numpy()
            embed_po = embeddings['pooler_output'].detach().cpu().numpy()

            embed_lhs_v.append(embed_lhs)
            embed_po_v.append(embed_po)

            sample_id_v.append( data['sample_id'] )
            
            if add_int_idx:
                int_idx_v.append( data['int_idx'] )

            if return_targets:
                label = data['label'].detach().cpu().numpy()
                label_v.append(label)

    embed_lhs_v = np.concatenate(embed_lhs_v)
    embed_po_v = np.concatenate(embed_po_v)
    sample_id_v = np.concatenate(sample_id_v)
    
    
    ret_d = {
        'embed_lhs' :embed_lhs_v,
        'embed_po': embed_po_v,
        'sample_id': sample_id_v,
        'int_idx': int_idx_v,
    }    
    
    if return_targets:
        label_v = np.concatenate(label_v)
        ret_d['label'] = label_v
    
    if add_int_idx:
        int_idx_v = np.concatenate(int_idx_v)
        ret_d['int_idx'] = int_idx_v
        
    return ret_d

In [13]:
val_embeddings_d = eval_model_interaction(model, dl_val, return_targets=True, add_int_idx=False, device='cuda:0')

100%|██████████| 20/20 [00:32<00:00,  1.63s/it]


In [14]:
features = val_embeddings_d['embed_po']

umap_2d_po = umap.UMAP(n_components=2, init='random', random_state=0, n_neighbors=20)
proj_2d_po = umap_2d_po.fit_transform(features)

In [15]:
df_po = pd.DataFrame(data=proj_2d_po, index=None, columns=['x', 'y'])
df_po['cls'] = val_embeddings_d['label']
df_po['label'] = df_po['cls'].replace({0: 'negativo', 1: 'positivo'})

In [1]:
#fig = px.scatter(df_po, x='x', y='y', color='label', title='Por conversación - Pooler output')
#fig.show()

In [17]:
dataset_df.outcome.value_counts()

Promesa de pago                                         35182
Renuencia                                               17973
No lo conocen                                            5839
Con intención de pago, falta concretar negociación       5048
Acepto recado                                            1549
Confirmo Pago completo en tiempo y forma                 1476
Ya no vive/trabaja ahí                                   1262
Confirma próximo pago en fecha                            722
Informa pago realizado                                    509
Ratifica promesa de pago                                  482
Producto de apoyo                                         477
No Promesa / No compromete Pago                           465
No contestan /No hay nadie                                426
Ya no tiene contacto con titular                          267
Interacción incompleta                                    229
Otra razón                                                207
Pago rea

In [18]:
dataset_df.loc[dataset_df['cls']==0].outcome.value_counts()

Promesa de pago                                         35182
Renuencia                                               17973
Con intención de pago, falta concretar negociación       5048
Acepto recado                                            1549
Confirmo Pago completo en tiempo y forma                 1476
Confirma próximo pago en fecha                            722
Informa pago realizado                                    509
Ratifica promesa de pago                                  482
Producto de apoyo                                         477
No Promesa / No compromete Pago                           465
Interacción incompleta                                    229
Otra razón                                                207
Pago realizado                                            165
No acepto recado                                          133
No confirma próximo pago                                  102
Recado en Buzón o con tercero                             100
Promesa 

In [19]:
dataset_df.loc[dataset_df['cls']==1].outcome.value_counts()

No lo conocen                                       5839
Ya no vive/trabaja ahí                              1262
No contestan /No hay nadie                           426
Ya no tiene contacto con titular                     267
Teléfono Invalido / Domicilio no ubicado              19
Teléfono fuera de servicio/ Domicilio abandonado      11
Name: outcome, dtype: int64