# Criando modelo para discurso de ódio com BERT pré-treinado

Vamos usar um modelo BERT pré-treinado que funciona com o português (o BERTimbau, carregado mais abaixo) para classificar discursos de ódio.
Na verdade, buscaremos reproduzir com o TensorFlow o código do Diogo Cortiz (que usa o PyTorch); a nossa reprodução do código original está [aqui](https://colab.research.google.com/drive/18YXlk-ZIlAymoOYn5nJQE16I3SsguUwq).

In [33]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
# Hugging Face:
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification
from datasets import load_dataset, Dataset
from transformers import DefaultDataCollator

## Funções

In [5]:
###########################################
### Splitting datasets into random sets ###
###########################################

def shuffled_pos(length, seed):
    """
    Build a random ordering of the positions 0, 1, ..., `length` - 1.

    The ordering is fully determined by `seed`: a dedicated random
    generator is created on every call, so the same (`length`, `seed`)
    pair always yields the same permutation.
    """
    rng = np.random.RandomState(seed=seed)
    return rng.permutation(length)


def random_index_sets(size, set_fracs, seed):
    """
    Partition the indices 0, ..., `size` - 1 into randomly drawn sets
    whose lengths are approximately `size` * `set_fracs`.


    Input
    -----

    size : int
        How many indices there are to distribute among the sets.

    set_fracs : iterable
        Fraction of the indices that goes into each set. The
        fractions are expected to add up to one.

    seed : int
        Seed of the random number generator that shuffles the indices.


    Returns
    -------

    indices : tuple of arrays
        One array of indices per entry in `set_fracs`, in that order.
    """

    assert np.isclose(np.sum(set_fracs), 1), '`set_fracs` should add up to one.'

    # Shuffle every position once; each set is a consecutive slice of this array:
    permuted = shuffled_pos(size, seed)

    # Nominal length of each set. The first set absorbs whatever the
    # rounding of the other ones leaves over, so the lengths add up to `size`:
    counts = [round(size * frac) for frac in set_fracs]
    counts[0] = size - sum(counts[1:])
    assert np.sum(counts) == size, 'Set sizes should add up to total size.'

    indices = []
    start = 0
    # Slice out every set but the last one:
    for count in counts[:-1]:
        stop = start + count
        chunk = permuted[start:stop]
        assert len(chunk) == len(set(chunk)), 'There are repeating indices in a set.'
        indices.append(chunk)
        start = stop

    # The last set takes everything that remains:
    indices.append(permuted[start:])
    total_selected = sum(len(chunk) for chunk in indices)
    assert len(set(np.concatenate(indices))) == total_selected, \
    'There are common indices between sets.'

    return tuple(indices)


def random_set_split(df, set_fracs, seed):
    """
    Randomly distribute the rows of a DataFrame among sub-sets that are
    disjoint and that, together, cover the whole DataFrame.


    Input
    -----

    df : Pandas DataFrame
        The data to be split.

    set_fracs : array-like
        Fraction of the rows of `df` assigned to each sub-set; one
        sub-set is created per entry.

    seed : int
        Seed of the random number generator that drives the split.


    Returns
    -------

    A tuple of DataFrames, one for each fraction in `set_fracs`, in that order.
    """
    # Rows are selected by position (not by index label):
    index_sets = random_index_sets(len(df), set_fracs, seed)

    def take_rows(positions):
        return df.iloc[positions]

    return tuple(map(take_rows, index_sets))


In [79]:
def process_pandas_to_tfdataset(df, tokenizer, max_length=80, shuffle=True, text_col='text', target_col='label', batch_size=8):
    """
    Prepare NLP data in a Pandas DataFrame to be used 
    in a TensorFlow transformer model.
    
    Parameters
    ----------
    df : DataFrame
        The corpus, containing the columns `text_col` 
        (the sentences) and `target_col` (the labels).
    tokenizer : HuggingFace AutoTokenizer
        A tokenizer loaded from 
        `transformers.AutoTokenizer.from_pretrained()`.
    max_length : int
        Length, in tokens, of every encoded sentence: 
        shorter sentences are padded up to it and 
        longer ones are truncated. This is required 
        for training, so batches have instances of 
        the same shape.
    shuffle : bool
        Shuffle the dataset order when loading. 
        Recommended True for training, False for 
        validation/evaluation.
    text_col : str
        Name of `df` column containing the sentences.
    target_col : str
        Name of `df` column containing the labels of 
        the sentences.
    batch_size : int
        The size of the batch in the output 
        tensorflow dataset.
        
    Returns
    -------
    tf_dataset : TF dataset
        A dataset that can be fed into a transformer 
        model.
    """
    
    # Hugging Face requires the target column to be named 'labels':
    renamed_df = df.rename({target_col:'labels'}, axis=1)
    
    # Function that runs the tokenizer over a batch of examples:
    def tokenize_function(examples):
        # `padding='max_length'` pads every sentence up to `max_length`.
        # `padding=True` would only pad up to the longest sentence of each
        # tokenization batch, so different batches could end up with
        # different lengths.
        return tokenizer(examples[text_col], padding='max_length', max_length=max_length, truncation=True)
    
    # pandas -> hugging face:
    hugging_set = Dataset.from_pandas(renamed_df)
    # text -> sequence of token IDs:
    encoded_set = hugging_set.map(tokenize_function, batched=True)
    # Only request the model inputs the tokenizer actually produced
    # (not every tokenizer returns `token_type_ids`):
    input_cols = [col for col in ("attention_mask", "input_ids", "token_type_ids") if col in encoded_set.column_names]
    # hugging face -> tensorflow dataset:
    data_collator = DefaultDataCollator(return_tensors="tf")
    tf_dataset = encoded_set.to_tf_dataset(columns=input_cols, label_cols=["labels"], shuffle=shuffle, collate_fn=data_collator, batch_size=batch_size)
    
    return tf_dataset

In [34]:
def gen_tensorboard_callback(root_dir, run_name):
    """
    Create a TensorBoard callback whose log directory is
    `root_dir` joined with `run_name`.

    To avoid logging into a pre-existing run inadvertently, the user
    is asked for confirmation (through `input`) whenever the log
    directory of the run already exists.

    Parameters
    ----------
    root_dir : str
        Existing directory under which the run logs are stored.
    run_name : str
        Name of the sub-directory that holds the log of this run.

    Returns
    -------
    A `tf.keras.callbacks.TensorBoard` callback pointing to the run log.

    Raises
    ------
    Exception
        If `root_dir` is not an existing directory, or if the run log
        already exists and the user does not answer 'y' or 'Y'.
    """

    # The root directory must already exist:
    if not os.path.isdir(root_dir):
        raise Exception("`root_dir` {} is unknown.".format(root_dir))

    log_path = os.path.join(root_dir, run_name)

    # A brand-new run needs no confirmation:
    if not os.path.isdir(log_path):
        return tf.keras.callbacks.TensorBoard(log_path)

    # The log already exists, so ask whether it should continue:
    answer = input("Run log '{}' already exists. Continue (y/n)?".format(run_name))
    if answer not in ('y', 'Y'):
        raise Exception('Abort so not to mess with tensorboard log.')

    return tf.keras.callbacks.TensorBoard(log_path)

In [73]:
def predict_proba(model, tf_dataset):
    """
    Compute, for each instance, the probability 
    of belonging to the positive class (y = 1 in 
    a binary classification) according to the 
    provided model.

    Parameters
    ----------
    model : TFBertForSequenceClassification
        A Hugging Face implementation of a 
        Tensorflow transformer model.
    tf_dataset : Tensorflow Dataset
        The instances to be scored.
    
    Returns
    -------
    probs : array
        One probability per instance: the sigmoid 
        of the first logit returned by the model.
    """

    # The model returns raw scores (logits), one row per instance:
    logits = model.predict(tf_dataset).logits
    # Map the logits to (0, 1) and keep the first output column:
    positive_probs = tf.sigmoid(logits)[:, 0]

    return positive_probs.numpy()


def predict_class(model, tf_dataset, threshold=0.5):
    """
    Assign a class (0 or 1) to each instance
    using the provided model.

    Parameters
    ----------
    model : TFBertForSequenceClassification
        A Hugging Face implementation of a 
        Tensorflow transformer model.
    tf_dataset : Tensorflow Dataset
        The instances to be classified.
    threshold : float
        An instance is assigned to class 1 when 
        its predicted probability is strictly 
        greater than this value, and to class 0 
        otherwise.
    
    Returns
    -------
    preds : array
        Predicted class (as an integer) for the 
        corresponding instances.
    """

    positive_probs = predict_proba(model, tf_dataset)
    is_positive = positive_probs > threshold

    return is_positive.astype(int)

## Carregando o BERTimbau

In [31]:
# Name of the pre-trained checkpoint to load:
model_name = 'neuralmind/bert-base-portuguese-cased'
# Load the tokenizer (keeping the original casing of the text) and the
# classification model with a single output (num_labels=1):
tokenizer  = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
model      = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Carregando os dados

Fonte: Juntamos os dados de Fortuna e Pelle (veja o notebook do modelo baseline).

In [77]:
# Load the data:
mass_df = pd.read_csv('../dados/processados/hatespeech_fortuna3+offcombr2.csv')

In [78]:
# Randomly split the data into train (70%), validation (15%) and
# test (15%) samples, using a fixed seed:
train_df, val_df, test_df = random_set_split(mass_df, [0.7, 0.15, 0.15], 1323)

In [80]:
# Tokenize the texts and put them in the TensorFlow Dataset format
# (only the training sample is shuffled):
train_tfd = process_pandas_to_tfdataset(train_df, tokenizer, shuffle=True)
val_tfd   = process_pandas_to_tfdataset(val_df, tokenizer, shuffle=False)
test_tfd  = process_pandas_to_tfdataset(test_df, tokenizer, shuffle=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 15.06ba/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.04ba/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.96ba/s]


## Treinando o modelo

In [53]:
# Training parameters:
optimizer  = tf.keras.optimizers.Adam(learning_rate=1e-3)
model_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True) # Hugging Face does not add an activation function to the last layer, so the loss works on 'logits'.
# NOTE: `acc_metric` and `f1_metric` are created but not passed to `compile` below.
acc_metric = tf.keras.metrics.Accuracy()
f1_metric  = tfa.metrics.F1Score(num_classes=2)
# The model outputs logits, so the decision boundary is at 0 (i.e. sigmoid = 0.5).
# The plain 'accuracy' string would threshold the raw logits at 0.5 instead.
metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)]
# Prepare the model with the BERT layer frozen (only the classifier is trained):
model.get_layer('bert').trainable = False
model.compile(optimizer, model_loss, metrics)
model.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108923136 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  769       
                                                                 
Total params: 108,923,905
Trainable params: 769
Non-trainable params: 108,923,136
_________________________________________________________________


In [55]:
# Monitoring with TensorBoard. To open it, run in a terminal:
# tensorboard --logdir=tensor_logs/
board = gen_tensorboard_callback('tensor_logs/', 'first_try')

In [None]:
# Fit the model: 3 epochs of 50 steps each, validating on `val_tfd`
# and logging through the TensorBoard callback:
model.fit(train_tfd, initial_epoch=0, epochs=3, steps_per_epoch=50, validation_data=val_tfd, callbacks=[board])

Daqui em diante, utilizamos o Colab: <https://colab.research.google.com/drive/15MYQdJ8paNyh_OLuEP3W8tg9dMpoWLnb#scrollTo=AwspzRwmFbVX>

## Testando um modelo treinado no colab

In [75]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [72]:
# Load the saved model from a local directory:
saved_model = TFAutoModelForSequenceClassification.from_pretrained('../modelos/bertimbau-hatespeech-v01')

Some layers from the model checkpoint at ../modelos/bertimbau-hatespeech-v01 were not used when initializing TFBertForSequenceClassification: ['dropout_151']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ../modelos/bertimbau-hatespeech-v01.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [81]:
# Predict the class of every test instance (default threshold of 0.5):
y_test_pred = predict_class(saved_model, test_tfd)



In [82]:
# Evaluation metrics on the test sample (true labels vs. predicted classes):
y_true, y_pred = test_df['label'], y_test_pred
for name, scorer in [('acc', accuracy_score), ('f1', f1_score), ('prec', precision_score), ('rec', recall_score)]:
    s = scorer(y_true, y_pred)
    print('{}: {:.3f}'.format(name, s))

acc: 0.903
f1: 0.602
prec: 0.705
rec: 0.525


## Criando uma função de previsão:

In [86]:
# Draw 5 random rows from the test sample (no `random_state` is given,
# so the rows drawn are not fixed by a seed):
sampled_df = test_df.sample(5)

In [90]:
# Plain Python list with the texts of the sampled rows:
input_texts = list(sampled_df['text'])

In [99]:
def rate_speech_for_hate(model, tokenizer, texts):
    """
    Compute the probability, according to `model`, 
    that each text falls in the positive class.

    Parameters
    ----------
    model : TFBertForSequenceClassification
        A Hugging Face implementation of a 
        Tensorflow transformer model.
    tokenizer : HuggingFace AutoTokenizer
        The tokenizer used to encode `texts`.
    texts : list of str
        The sentences to be rated.

    Returns
    -------
    probs : array
        One probability per entry in `texts`, 
        in the same order.
    """

    # The dataset builder needs a label column, so fill it with
    # placeholder values:
    placeholder_labels = [1] * len(texts)
    frame = pd.DataFrame({'text': texts, 'label': placeholder_labels})

    # Keep the original order so probabilities line up with `texts`:
    encoded = process_pandas_to_tfdataset(frame, tokenizer, shuffle=False)

    return predict_proba(model, encoded)

In [97]:
# Display the sampled rows:
sampled_df

Unnamed: 0,label,text
1309,1,Fascistas machistas mimimi mimimi https://t.co...
1857,0,Liberdade e direitos das mulheres não podem se...
5585,1,As Florzinhas ficaram chateadas KKKKKKKKKKKKKK...
214,0,#AlexandraParkClub\nEste paraíso apenas 40min ...
2281,0,no capitalismo eu posso pegar pokemon e aí htt...
