In [1]:
!pip install transformers



In [2]:
import torch
import numpy as np
import pandas as pd
import time
import re
from random import sample

from sklearn.model_selection import train_test_split

In [3]:
#Tempo de processamento
tempoInicial = time.time()

#Faz a leitura da base
df = pd.read_csv('dataset7DiasCompleto-En.v1.csv', sep=';')
df.describe()

colunaCorpus='titulo_processado'
colunaResultado='classe'

In [4]:
randomState = sample(range(0, 1000), 1)[0];

#Definindo X, y
X = df[df.columns.difference([colunaResultado])]
y = df[colunaResultado]

#Separa base treinamento e teste
XTreino, XTeste, yTreino, yTeste = train_test_split(X, y, train_size=0.7, 
                                                    stratify=y, shuffle=True, 
                                                    random_state=randomState)
XTreino = XTreino[colunaCorpus].values
yTreino = yTreino.values
XTeste = XTeste[colunaCorpus].values
yTeste = yTeste.values

In [5]:
import transformers

## distil-bert tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [6]:
corpus = XTreino
#corpus = XTreino[colunaCorpus]
maxlen = 512

In [7]:
def tokenize(sentences, tokenizer):
    input_ids, input_masks, input_segments = [],[],[]
    for s in range(len(sentences)):
        sentence = sentences[s]
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=512, pad_to_max_length=True, 
                                             return_attention_mask=True, return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])        
        
    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32')

X = tokenize(corpus, tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
import tensorflow as tf
from transformers import TFBertModel

## inputs
input_ids_in = tf.keras.layers.Input(shape=(512,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(512,), name='masked_token', dtype='int32') 

## pre-trained bert with config
transformer_model = TFBertModel.from_pretrained("bert-base-uncased", output_hidden_states = False)
bert_out = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]

## fine-tuning
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(bert_out)
x = tf.keras.layers.GlobalMaxPool1D()(x)
x = tf.keras.layers.Dense(50, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(len(np.unique(yTreino)), activation='sigmoid')(x)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = x)

for layer in model.layers[:3]:
    layer.trainable = False

model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='adam', metrics=['accuracy'])
model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_token (InputLayer)       [(None, 512)]        0           []                               
                                                                                                  
 masked_token (InputLayer)      [(None, 512)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_token[0][0]',            
                                thPoolingAndCrossAt               'masked_token[0][0]']           
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [9]:
## encode y
#dic_y_mapping = {n:label for n,label in 
#                 enumerate(np.unique(yTreino))}
#inverse_dic = {v:k for k,v in dic_y_mapping.items()}
#yTreino = np.array([inverse_dic[y] for y in yTreino])

## train
training = model.fit(x=X, y=yTreino, batch_size=128, epochs=1, shuffle=True, 
                     verbose=1, validation_split=0.2)



In [10]:
corpus = XTeste
X = tokenize(corpus, tokenizer)



In [11]:
## test
predicted_prob = model.predict(X)
predicted = [np.argmax(pred) for pred in predicted_prob]

In [12]:
from sklearn.metrics import classification_report

print(classification_report(yTeste, predicted, labels=[0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.75      0.90      0.82       452
           1       0.80      0.76      0.78       629
           2       0.86      0.78      0.82       645
           3       0.80      0.81      0.80       600
           4       0.76      0.78      0.77       268

    accuracy                           0.80      2594
   macro avg       0.80      0.80      0.80      2594
weighted avg       0.80      0.80      0.80      2594



In [13]:
print("\n--- %.2f minutos ---" % ((time.time() - tempoInicial) / 60))


--- 304.46 minutos ---
