O Bert Neuralmind foi o melhor modelo nos dados de treinamento.

Este caderno usa o Bert Neuralmind já treinado (carregado do checkpoint salvo) para classificar todos os tweets, gerando a classificação final da dissertação.

# Parameters

In [None]:
# Folder that holds the input data and the intermediate CSV files.
source_folder = 'dados'
# Folder that holds the saved model checkpoint. The backslash is escaped
# explicitly because '\m' is not a valid escape sequence and makes the
# interpreter emit a warning; the resulting string is unchanged.
destination_folder = 'dados\\model'

In [None]:
# Alternative checkpoint (disabled): "bert-base-multilingual-cased"
# Name of the pretrained checkpoint used for both the tokenizer and the model.
MODELO = 'neuralmind/bert-base-portuguese-cased'
# NOTE(review): Nr_epochs and lr look like training hyper-parameters, but no
# visible cell reads them — confirm whether they are still needed.
Nr_epochs = 10
lr = 2e-5
# Alias of MODELO.
nome_modelo = MODELO

# Libraries

In [None]:
# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch
import re

# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
from sklearn import model_selection

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, cohen_kappa_score, balanced_accuracy_score, roc_auc_score, recall_score, precision_score
import seaborn as sns

from datetime import datetime

import numpy as np

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
# To force CPU execution, assign torch.device('cpu') unconditionally instead.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Preliminaries

In [None]:
# Load the tokenizer that belongs to the checkpoint named in MODELO.
tokenizer = BertTokenizer.from_pretrained(MODELO)

In [None]:
# Model parameter: fixed length applied to every text sequence.
MAX_SEQ_LEN = 128

# Ids that the tokenizer assigns to its padding and unknown tokens.
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields

# Text column: tokenizer.encode turns each text into ids, so no torchtext
# vocabulary is built; sequences use the fixed length MAX_SEQ_LEN.
text_field = Field(
    use_vocab=False,
    tokenize=tokenizer.encode,
    lower=False,
    include_lengths=False,
    batch_first=True,
    fix_length=MAX_SEQ_LEN,
    pad_token=PAD_INDEX,
    unk_token=UNK_INDEX,
)

# Label column: a single non-sequential value stored as float.
label_field = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    dtype=torch.float,
)

# Column order must match the CSV files: text first, label second.
fields = [('texto', text_field), ('label', label_field)]

In [None]:
# Load the tweet DataFrame from the pickle file and print its summary.
df = pd.read_pickle('dados\\df_processado.pkl')
df.info()

# Keep only the rows whose manual label contains N, E, S or C. Missing labels
# are replaced by the lower-case string 'nan', which the case-sensitive
# pattern does not match, so unlabelled rows are dropped.
manual_label = df['sent_manual'].fillna('nan')
has_wanted_label = manual_label.str.contains('N|E|S|C')
df2 = df[has_wanted_label].copy()
def corrige_label(label):
    """Map the labels 'S' and 'E' to 'N'; return any other label unchanged."""
    return 'N' if label in ('S', 'E') else label

In [None]:
# Optional date filter (disabled): df = df[df['data'] > "2010-01-01"]

# Remove the tweets marked with D (delete). The explicit copy makes `df` an
# independent frame instead of a slice of the loaded one, so columns added to
# it later do not trigger pandas' SettingWithCopyWarning; `df2` is built the
# same way.
df = df[~df['sent_manual'].isin(['D'])].copy()
df.shape

In [None]:
# Merge the 'S' and 'E' annotations into 'N' in the labelled subset.
df2['sent_manual'] = df2['sent_manual'].apply(corrige_label)
df2['sent_manual'].value_counts()

# Plain-Python snapshots of the index, the cleaned text and the labels.
lista_index = df2.index.values.copy()
lista_texto = df2['tweet_limpo'].to_list()
lista_label = df2['sent_manual'].to_list()


def _strip_digits_and_punctuation(text):
    """Remove digit runs, non-word/non-space characters and the 'º' sign."""
    without_digits = re.sub('[0-9]+', '', text)
    without_punctuation = re.sub(r'[^\w\s]', '', without_digits)
    return re.sub('º', '', without_punctuation)


# Text case is left unchanged (the lower-casing step is disabled); only
# numbers and punctuation are removed.
corpus = [_strip_digits_and_punctuation(text) for text in lista_texto]

# Create a DataFrame from the texts and labels; 'N' is encoded as 0 and every
# other label as 1.
trainDF = pd.DataFrame({
    'text': corpus,
    'label': [0 if label == 'N' else 1 for label in lista_label],
})

trainDF['label'].value_counts()

In [None]:
# Split dataset
# NOTE(review): `test_size` is assigned here but never passed to
# train_test_split; the calls below use the literals 0.4 and 0.5 — confirm
# which value is intended.
test_size = 0.30


def _as_frame(texts, labels):
    """Pair texts and labels under the column names used by the CSV files."""
    return pd.DataFrame({'texto': texts, 'label': labels})


# First split: 60% for training, 40% held out.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'], trainDF['label'], test_size=0.4, random_state=100)
df_train = _as_frame(train_x, train_y)
df_valid = _as_frame(valid_x, valid_y)

# Second split: the held-out part is halved into validation and test.
valid_x, test_x, valid_y, test_y = model_selection.train_test_split(
    df_valid['texto'], df_valid['label'], test_size=0.5, random_state=100)
df_valid = _as_frame(valid_x, valid_y)
df_test = _as_frame(test_x, test_y)

# Every tweet kept in `df`, paired with a placeholder label of 1.0.
df_total = pd.DataFrame({
    'texto': df['tweet_limpo'].to_list(),
    'label': [1.0] * df.shape[0],
})

# Write the four partitions as CSV files inside source_folder.
for csv_name, frame in (
    ("\\trainBert.csv", df_train),
    ("\\validBert.csv", df_valid),
    ("\\testBert.csv", df_test),
    ("\\totalBert.csv", df_total),
):
    frame.to_csv(source_folder + csv_name, index=False)

# Report the size of each partition.
for caption, frame in (
    ("df_train.shape =", df_train),
    ("df_valid.shape =", df_valid),
    ("df_test.shape =", df_test),
    ("df_total.shape =", df_total),
):
    print(caption, frame.shape)



In [None]:
# TabularDataset
# The "test" partition is totalBert.csv, i.e. the file with every tweet, so
# the final predictions cover the whole collection.
train, valid, test = TabularDataset.splits(
    path=source_folder,
    train='trainBert.csv',
    validation='validBert.csv',
    test='totalBert.csv',
    format='CSV',
    fields=fields,
    skip_header=True,
)


def _texto_length(example):
    """Sort key for bucketing: length of the example's `texto` field."""
    return len(example.texto)


# Iterators
# Train and validation share the same bucketing configuration.
_bucket_options = dict(
    batch_size=16,
    sort_key=_texto_length,
    device=device,
    train=True,
    sort=True,
    sort_within_batch=True,
)
train_iter = BucketIterator(train, **_bucket_options)
valid_iter = BucketIterator(valid, **_bucket_options)

# Shuffling and sorting are disabled for the prediction pass; the predictions
# are later assigned back to `df` positionally.
test_iter = Iterator(
    test,
    batch_size=16,
    device=device,
    train=False,
    shuffle=False,
    sort=False,
)

In [None]:
class BERT(nn.Module):
    """Wrapper around BertForSequenceClassification loaded from MODELO."""

    def __init__(self):
        """Instantiate the encoder from the pretrained checkpoint MODELO."""
        super(BERT, self).__init__()
        self.encoder = BertForSequenceClassification.from_pretrained(MODELO)

    def forward(self, text, label):
        """Run the encoder with labels and return its first two outputs.

        Args:
            text: Input passed to the encoder as its first argument.
            label: Targets passed to the encoder as ``labels``.

        Returns:
            tuple: ``(loss, text_fea)``, the first two items of the encoder
            output.
        """
        encoder_output = self.encoder(text, labels=label)
        loss, text_fea = encoder_output[:2]
        return loss, text_fea

In [None]:
def load_checkpoint(load_path, model):
    """Restore model weights from a checkpoint file.

    Args:
        load_path: Path of the checkpoint file, or None to skip loading.
        model: Object whose ``load_state_dict`` receives the checkpoint's
            ``'model_state_dict'`` entry.

    Returns:
        The checkpoint's ``'valid_loss'`` entry, or None when ``load_path``
        is None.
    """
    # Identity comparison: None is a singleton, and `==` may be overridden by
    # the object being compared.
    if load_path is None:
        return None

    # Tensors are mapped onto the module-level `device` while loading.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']

In [None]:
# Evaluation Function

def evaluate(model, test_loader):
    """Predict a class index for every example produced by ``test_loader``.

    Args:
        model: Object providing ``eval()`` and callable as
            ``model(texto, label)``, returning a pair whose second item holds
            the per-class scores.
        test_loader: Iterable yielding ``((texto, label), _)`` batches.

    Returns:
        list: Predicted class indices (argmax over dimension 1), in the order
        the batches were produced.
    """
    y_pred = []

    model.eval()
    with torch.no_grad():
        for (texto, label), _ in test_loader:
            # Cast to int64 directly on the target device; going through
            # torch.LongTensor would first move the data to the CPU.
            label = label.to(device, dtype=torch.long)
            texto = texto.to(device, dtype=torch.long)

            # The first item of the pair (the loss) is not needed here.
            _, output = model(texto, label)
            y_pred.extend(torch.argmax(output, 1).tolist())
    return y_pred
    

In [None]:
# Build the classifier, move it to the selected device and restore the
# weights saved in the checkpoint file.
best_model = BERT()
best_model = best_model.to(device)

checkpoint_path = destination_folder + '\\model.pt'
load_checkpoint(checkpoint_path, best_model)

# Predict a class for every example served by test_iter.
y_pred = evaluate(best_model, test_iter)

In [None]:
# Store the predictions next to the tweets: class index 1 becomes "C" and
# every other value becomes "N".
df['BERT_neuralmind'] = ["C" if pred == 1 else "N" for pred in y_pred]

In [None]:
# Number of tweets per predicted class, most frequent first.
class_counts = df['BERT_neuralmind'].value_counts().sort_values(ascending=False)
print(class_counts)

# Share of each predicted class. The real class label is printed instead of a
# hard-coded "N", and iterating over the counts also works when only one
# class was predicted.
total_predictions = class_counts.sum()
for class_label, class_count in zip(class_counts.index, class_counts):
    print("{} {:.2f}".format(class_label, class_count / total_predictions))

In [None]:
# Remove the column-width limit so the full tweet text is displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Show the text of the five most retweeted tweets predicted as class "C".
predicted_c = df[df.BERT_neuralmind == 'C']
by_retweets = predicted_c[['usuario', 'tweet', 'retweets']].sort_values(
    by='retweets', ascending=False)
by_retweets.head(5)['tweet']

In [None]:
# Persist only the prediction column (with the index of `df`) as a pickle file.
df['BERT_neuralmind'].to_pickle('dados\\df_BERT_neuralmind.pkl')