In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 778kB 2.8MB/s 
[K     |████████████████████████████████| 3.0MB 16.4MB/s 
[K     |████████████████████████████████| 890kB 30.5MB/s 
[K     |████████████████████████████████| 1.1MB 52.8MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import torch

# Select the compute device used by the rest of the notebook:
# CUDA when PyTorch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
!nvidia-smi

Sun Aug 23 18:48:54 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    27W / 250W |     10MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from google.colab import drive
import pandas as pd
import os
import numpy as np

In [None]:
# Mount Google Drive at /content/gdrive; the dataset and output paths used
# further down are all located under this mount point.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# Experiment configuration. The whole mapping is later dumped to params.json
# next to the saved checkpoints.
#
# Other checkpoints that were tried for 'model_name':
#   neuralmind/bert-base-portuguese-cased
#   mrm8488/bert-base-portuguese-cased-finetuned-squad-v1-pt
#   nlptown/bert-base-multilingual-uncased-sentiment
#   distilbert-base-multilingual-cased
#   bert-base-multilingual-cased
params = dict(
    exp='Atendimento-Balanced-Multiclass',   # experiment name; also the dataset folder name
    data='resp-text',                        # name of the text column
    label='Atendimento',                     # name of the target column
    BATCH_SIZE=8,
    MAX_LEN=128,                             # sequence length passed to the tokenizer
    lr=3e-5,
    epochs=1,
    max_vocab_size=100000,
    model_name='neuralmind/bert-large-portuguese-cased',
)

In [None]:
# Resolve the experiment, its folders and the names of the text/target columns.
exp = params.get('exp')
# Experiments with 'Binary' in their name use the two-class label mapping.
binary = 'Binary' in exp

base_path = '/content/gdrive/My Drive/Colab Notebooks/Simple/Datasets/' + exp
save_path = '/content/gdrive/My Drive/Colab Notebooks/Simple/' + exp + '/output/'

data = params.get('data')
label = params.get('label')

x_train_file, y_train_file = 'X_train.csv', 'y_train.csv'
x_test_file, y_test_file = 'X_test.csv', 'y_test.csv'


def _read_split_csv(file_name):
    """Read one ';'-separated, UTF-8 encoded CSV from the experiment's dataset folder."""
    return pd.read_csv(os.path.join(base_path, file_name), sep=';', encoding='utf-8')


# Load the four pre-split files.
X_train = _read_split_csv(x_train_file)
y_train = _read_split_csv(y_train_file)
X_test = _read_split_csv(x_test_file)
y_test = _read_split_csv(y_test_file)

# Tag every row with its split; the test files act as the validation set.
X_train['data_type'] = 'train'
X_test['data_type'] = 'val'

# The target column is read from the X_* frames; cast it to int.
X_train[label] = X_train[label].astype(int)
X_test[label] = X_test[label].astype(int)

In [None]:
# Stack the train and validation rows into one frame, keeping only the id,
# text, target and split columns.
df = pd.concat(
    [split.loc[:, ['pid', data, label, 'data_type']] for split in (X_train, X_test)],
    axis=0,
)
df.head()

In [None]:
# Distribution of the raw target values over train and validation rows combined.
df[label].value_counts()

In [None]:
# Fixed mapping from raw class id to model label index (used instead of one
# derived from df[label].unique(), so the indices do not depend on row order).
# Binary experiments keep only raw classes 0 and 2.
label_dict = {0: 0, 2: 1} if binary else {0: 0, 1: 1, 2: 2}
label_dict

In [None]:
# Translate the raw class ids into model label indices via label_dict.
# NOTE(review): Series.map yields NaN for any value that is not a key of
# label_dict (e.g. raw class 1 under the binary mapping) — confirm such rows
# cannot occur in the loaded data.
df['label'] = df[label].map(label_dict)

In [None]:
# Distribution of the mapped label indices (NaN entries are not counted by default).
df['label'].value_counts()

In [None]:
# Preview the frame now that the 'label' column has been added.
df.head()

In [None]:
# The data arrives pre-split (the 'data_type' column), so no train_test_split
# is performed here. Only the validation labels are kept as a separate Series;
# they are used for the final classification report.
y_val = df.loc[df.data_type == 'val', 'label']

# Row counts per (label, split) as a sanity check on the class balance.
df.groupby(['label', 'data_type']).count()

In [None]:
# Cased checkpoint: keep the original casing of the input text.
tokenizer = BertTokenizer.from_pretrained(params.get('model_name'), do_lower_case=False)


def encode_split(split_name):
    """Tokenize one split of `df` and return its encodings and label tensor.

    Args:
        split_name: value of the `data_type` column to select ('train' or 'val').

    Returns:
        (encoded, labels): the tokenizer output (holding 'input_ids' and
        'attention_mask' as PyTorch tensors) and a tensor built from the
        split's `label` column, in the same row order.
    """
    split = df[df.data_type == split_name]
    encoded = tokenizer.batch_encode_plus(
        # Hand the tokenizer a plain list of strings rather than a NumPy array.
        split[data].values.tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`:
        # every sequence is padded (and, with truncation, cut) to MAX_LEN tokens.
        padding='max_length',
        truncation=True,
        max_length=params.get('MAX_LEN'),
        return_tensors='pt',
    )
    return encoded, torch.tensor(split.label.values)


encoded_data_train, labels_train = encode_split('train')
encoded_data_val, labels_val = encode_split('val')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=209528.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=155.0, style=ProgressStyle(description_…




In [None]:
# Load the pretrained checkpoint with a sequence-classification head sized to
# the number of target classes; attentions and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    params.get('model_name'),
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=648.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1342014951.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at neuralmind/bert-large-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from th

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = params.get('BATCH_SIZE')

# Training batches are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_train),
)

# Validation batches keep the dataset order, so predictions line up with the
# rows of the validation split.
dataloader_validation = DataLoader(
    dataset_val,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_val),
)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

epochs = params.get('epochs')

# AdamW over all model parameters with the configured learning rate.
optimizer = AdamW(model.parameters(), lr=params.get('lr'), eps=1e-8)

# Linear schedule with no warmup, spanning one scheduler step per training
# batch over all epochs.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(dataloader_train),
)

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    Args:
        preds: 2-D array of per-class scores; the arg-max is taken along axis 1.
        labels: array of true label indices; flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for each class present in `labels`, the correct/total prediction count.

    Args:
        preds: 2-D array of per-class scores; the arg-max is taken along axis 1.
        labels: array of true label indices; flattened before use.

    Classes are reported under their raw ids, obtained by inverting the
    module-level `label_dict`.
    """
    label_dict_inverse = {index: raw_id for raw_id, index in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_index in np.unique(actual):
        in_class = actual == class_index
        n_total = np.count_nonzero(in_class)
        n_correct = np.count_nonzero(predicted[in_class] == class_index)
        print(f'Class: {label_dict_inverse[class_index]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [None]:
# Move the model to the device selected earlier (the GPU when one was found,
# otherwise the CPU) and show which one is in use.
print(device)
model.to(device)

cuda


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [None]:
from datetime import datetime
from json import dumps

# Local scratch folder: creation is best effort, so a failure here is ignored.
try:
  os.makedirs('outputs', exist_ok=True)
except OSError:
  pass

# One timestamped folder per run, on Drive.
now = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = '/content/gdrive/My Drive/Colab Notebooks/outputs/bert/' + now

# makedirs also creates the missing parent folders ('outputs/bert'), which
# os.mkdir cannot do. A real failure is raised here rather than surfacing later
# as a confusing error when the first file is written.
os.makedirs(output_dir, exist_ok=True)

# Store the run configuration next to the checkpoints.
with open(output_dir + '/params.json', 'w') as f:
  f.write(dumps(params))

In [None]:
import random

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value.
seed_val = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

def evaluate(dataloader_val):
    """Run the global `model` over `dataloader_val` without gradient tracking.

    Args:
        dataloader_val: iterable of batches whose first three items are
            input ids, attention mask and labels.

    Returns:
        (loss_val_avg, predictions, true_vals): the mean of the per-batch
        losses, the logits of all batches concatenated along axis 0, and the
        matching true labels concatenated the same way (both NumPy arrays).
    """
    model.eval()

    running_loss = 0
    logits_per_batch = []
    labels_per_batch = []

    for batch in dataloader_val:
        # Move the batch tensors to the model's device.
        batch = tuple(tensor.to(device) for tensor in batch)
        input_ids, attention_mask, labels = batch[0], batch[1], batch[2]

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        # With labels supplied, the first two outputs are the loss and the logits.
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        logits_per_batch.append(batch_logits.detach().cpu().numpy())
        labels_per_batch.append(labels.cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)
    predictions = np.concatenate(logits_per_batch, axis=0)
    true_vals = np.concatenate(labels_per_batch, axis=0)

    return loss_val_avg, predictions, true_vals
    
# Fine-tuning loop: one pass over the training data per epoch, then a
# checkpoint and a validation run.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear the gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as returned by the model. `batch` is the
        # (input_ids, attention_mask, labels) tuple, so dividing by len(batch)
        # divided by 3 rather than by the number of samples; the loss needs no
        # further normalisation.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # Checkpoint the weights at the end of every epoch.
    torch.save(model.state_dict(), output_dir + '/finetuned_BERT_epoch_' + str(epoch) + '.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

In [None]:
from sklearn.metrics import classification_report

# Predicted class = index of the largest logit for each validation sample.
flat_predictions = np.argmax(predictions, axis=1)

# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision and recall for every class. Build the report once and
# reuse it for both the notebook output and the saved file.
report = classification_report(y_val, flat_predictions)
print(report)

with open(output_dir + '/classification_report.txt', 'w') as f:
  f.write(report)