# BERT-Neuralmind

Normal dataset

Fonte:
https://towardsdatascience.com/bert-text-classification-using-pytorch-723dfb8b6b5b

https://colab.research.google.com/drive/1P4Hq0btDUDOTGkCHGzZbAx1lb0bTzMMa?usp=sharing


# Parameters

In [None]:
# Folder that holds the input data and the generated train/valid/test CSV files.
source_folder = 'dados'
# Folder that receives the model checkpoint and the metrics file.
# The backslash is escaped explicitly: '\m' is not a valid escape sequence and
# triggers a warning on recent Python versions. The resulting string is the same.
destination_folder = 'dados\\model'

In [None]:
# Pretrained checkpoint to fine-tune.
# Alternative that was tried: "bert-base-multilingual-cased".
MODELO = "neuralmind/bert-base-portuguese-cased"
# Name recorded next to the scores in the results CSV written by evaluate().
nome_modelo = MODELO

# Optimisation settings: learning rate and number of passes over the training set.
lr = 2e-5
Nr_epochs = 10

# Libraries

In [None]:
# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch
import re

# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
from sklearn import model_selection

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, cohen_kappa_score, balanced_accuracy_score, roc_auc_score, recall_score, precision_score
import seaborn as sns

from datetime import datetime

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# To force the CPU, assign torch.device('cpu') instead.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Preliminaries

In [None]:
# Load the tokenizer that matches the pretrained checkpoint named in MODELO.
tokenizer = BertTokenizer.from_pretrained(MODELO)

In [None]:
# Model parameter
# Every text is padded or cut to MAX_SEQ_LEN token ids by the text field below.
MAX_SEQ_LEN = 128
# Ids the tokenizer assigns to its padding and unknown tokens.
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields

# Label column: a single non-sequential value, no vocabulary, stored as float.
label_field = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    dtype=torch.float,
)

# Text column: already numericalised by tokenizer.encode, so no vocabulary is built.
text_field = Field(
    use_vocab=False,
    tokenize=tokenizer.encode,
    lower=False,
    include_lengths=False,
    batch_first=True,
    fix_length=MAX_SEQ_LEN,
    pad_token=PAD_INDEX,
    unk_token=UNK_INDEX,
)

# (column name, field) pairs, in the column order of the CSV files.
fields = [('texto', text_field), ('label', label_field)]

Load the data frame

In [None]:
# Load the preprocessed tweets and print a summary of the columns.
df = pd.read_pickle('dados\\df_processado.pkl')
df.info()

# Keep only rows whose manual label contains N, E, S or C (case-sensitive regex).
# Missing labels are replaced by the string 'nan', which matches none of them.
# Label D is deliberately left out of the pattern.
rotulos_manuais = df['sent_manual'].fillna('nan')
mascara_rotulados = rotulos_manuais.str.contains('N|E|S|C')
df2 = df[mascara_rotulados].copy()
def corrige_label(label):
    """Map the labels 'S' and 'E' to 'N'; return any other label unchanged."""
    return 'N' if label in ('S', 'E') else label

In [None]:
# Collapse the 'S' and 'E' labels into 'N' (see corrige_label).
df2['sent_manual'] = df2['sent_manual'].apply(corrige_label)
df2['sent_manual'].value_counts()

# Plain-Python copies of the index, the cleaned tweet text and the labels.
lista_index = df2.index.values.copy()
lista_texto = df2['tweet_limpo'].tolist()
lista_label = df2['sent_manual'].tolist()

# Patterns applied in sequence to every tweet: digits, then punctuation
# (anything that is neither a word character nor whitespace), then the
# ordinal sign, which counts as a word character and survives the second step.
_PADROES_REMOVIDOS = (
    re.compile('[0-9]+'),
    re.compile(r'[^\w\s]'),
    re.compile('º'),
)


def _remove_numeros_e_pontuacao(texto):
    """Strip digits, punctuation and the ordinal sign from one text."""
    for padrao in _PADROES_REMOVIDOS:
        texto = padrao.sub('', texto)
    return texto


# The text is intentionally NOT lower-cased (the pretrained model is cased).
corpus = [_remove_numeros_e_pontuacao(texto) for texto in lista_texto]

# Create a dataframe from the cleaned texts and their labels.
trainDF = pd.DataFrame({'text': corpus, 'label': lista_label})

# Binary target: 0 for label 'N', 1 for every other label.
trainDF['label'] = trainDF['label'].apply(lambda rotulo: 0 if rotulo == 'N' else 1)

trainDF['label'].value_counts()

In [None]:
import os

# Split the dataset into train / validation / test and export each part to CSV.

# Fraction of the data kept out of the training set. The held-out part is then
# halved into validation and test sets, i.e. a 60/20/20 split overall.
# This variable used to be 0.30 but was never passed to train_test_split, which
# hard-coded 0.4; it now carries the fraction that is actually applied, so the
# resulting split is unchanged.
test_size = 0.4
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'], trainDF['label'], test_size=test_size, random_state=100)

df_train = pd.DataFrame({'texto': train_x, 'label': train_y})

# Held-out rows, before they are divided between validation and test.
df_holdout = pd.DataFrame({'texto': valid_x, 'label': valid_y})

valid_x, test_x, valid_y, test_y = model_selection.train_test_split(
    df_holdout['texto'], df_holdout['label'], test_size=0.5, random_state=100)

df_valid = pd.DataFrame({'texto': valid_x, 'label': valid_y})
df_test = pd.DataFrame({'texto': test_x, 'label': test_y})

# os.path.join yields the same 'dados\\...' path as before on Windows and a
# real path inside source_folder on other systems, where concatenating '\\'
# would create a file with a backslash in its name in the working directory.
df_train.to_csv(os.path.join(source_folder, "trainBert.csv"), index=False)
df_valid.to_csv(os.path.join(source_folder, "validBert.csv"), index=False)
df_test.to_csv(os.path.join(source_folder, "testBert.csv"), index=False)

print("df_train.shape =", df_train.shape)
print("df_valid.shape =", df_valid.shape)
print("df_test.shape =", df_test.shape)



In [None]:
# TabularDataset
# Read the three CSV files written to source_folder, skipping their header row.
# NOTE: the name `train` is rebound later by the training function definition.
train, valid, test = TabularDataset.splits(
    path=source_folder,
    train='trainBert.csv',
    validation='validBert.csv',
    test='testBert.csv',
    format='CSV',
    fields=fields,
    skip_header=True,
)


def _tamanho_texto(exemplo):
    """Sort key: number of token ids in the example's text."""
    return len(exemplo.texto)


# Iterators
# Train and validation share the same bucketing options.
_opcoes_bucket = dict(
    batch_size=16,
    sort_key=_tamanho_texto,
    device=device,
    train=True,
    sort=True,
    sort_within_batch=True,
)
train_iter = BucketIterator(train, **_opcoes_bucket)
valid_iter = BucketIterator(valid, **_opcoes_bucket)

# The test set is iterated in file order, without shuffling or sorting.
test_iter = Iterator(test, batch_size=16, device=device, train=False, shuffle=False, sort=False)

# Models

In [None]:
class BERT(nn.Module):
    """Thin wrapper around a pretrained BertForSequenceClassification model."""

    def __init__(self):
        """Load the sequence-classification model for the checkpoint in MODELO."""
        super().__init__()
        self.encoder = BertForSequenceClassification.from_pretrained(MODELO)

    def forward(self, text, label):
        """Run the encoder with labels and return its first two outputs.

        Args:
            text: Batch of token ids passed positionally to the encoder.
            label: Batch of targets passed to the encoder as ``labels``.

        Returns:
            Tuple ``(loss, text_fea)`` taken from the first two encoder outputs.
        """
        outputs = self.encoder(text, labels=label)
        loss, text_fea = outputs[:2]
        return loss, text_fea

# Training

In [None]:
# Save and Load Functions

def save_checkpoint(save_path, model, valid_loss):
    """Write the model weights and their validation loss to ``save_path``.

    Args:
        save_path: Destination file, or None to skip saving.
        model: Object exposing ``state_dict()``.
        valid_loss: Validation loss stored next to the weights.

    Returns:
        None.
    """
    # Identity comparison is the correct test for None.
    if save_path is None:
        return

    checkpoint = {
        'model_state_dict': model.state_dict(),
        'valid_loss': valid_loss,
    }
    torch.save(checkpoint, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    """Restore model weights from a checkpoint written by save_checkpoint.

    Tensors are mapped onto the module-level ``device`` while loading.

    Args:
        load_path: Checkpoint file, or None to skip loading.
        model: Object exposing ``load_state_dict()``; updated in place.

    Returns:
        The validation loss stored in the checkpoint, or None when
        ``load_path`` is None.
    """
    # Identity comparison is the correct test for None.
    if load_path is None:
        return None

    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Write the loss history to ``save_path``.

    Args:
        save_path: Destination file, or None to skip saving.
        train_loss_list: Average training loss at each evaluation point.
        valid_loss_list: Average validation loss at each evaluation point.
        global_steps_list: Global step of each evaluation point.

    Returns:
        None.
    """
    # Identity comparison is the correct test for None.
    if save_path is None:
        return

    metrics = {
        'train_loss_list': train_loss_list,
        'valid_loss_list': valid_loss_list,
        'global_steps_list': global_steps_list,
    }
    torch.save(metrics, save_path)
    # The message previously said "Model saved", copied from save_checkpoint.
    print(f'Metrics saved to ==> {save_path}')


def load_metrics(load_path):
    """Read back the loss history written by save_metrics.

    The file is loaded with the module-level ``device`` as ``map_location``.

    Args:
        load_path: Metrics file, or None to skip loading.

    Returns:
        Tuple ``(train_loss_list, valid_loss_list, global_steps_list)``, or
        None when ``load_path`` is None.
    """
    # Identity comparison is the correct test for None.
    if load_path is None:
        return None

    state_dict = torch.load(load_path, map_location=device)
    # The message previously said "Model loaded", copied from load_checkpoint.
    print(f'Metrics loaded from <== {load_path}')

    return (
        state_dict['train_loss_list'],
        state_dict['valid_loss_list'],
        state_dict['global_steps_list'],
    )

In [None]:
# Training Function

def _forward_loss(model, texto, label):
    """Cast one batch to int64, move it to ``device`` and return the model's loss."""
    label = label.type(torch.LongTensor).to(device)
    texto = texto.type(torch.LongTensor).to(device)
    loss, _ = model(texto, label)
    return loss


def _mean_validation_loss(model, valid_loader):
    """Return the loss averaged over all batches of ``valid_loader``.

    Puts the model in eval mode; the caller switches it back to train mode.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for (texto, label), _ in valid_loader:
            total_loss += _forward_loss(model, texto, label).item()
    return total_loss / len(valid_loader)


def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = Nr_epochs,
          eval_every = len(train_iter) // 2,
          file_path = destination_folder,
          best_valid_loss = float("Inf")):
    """Fine-tune ``model`` and checkpoint it whenever the validation loss improves.

    Every ``eval_every`` optimisation steps the model is scored on
    ``valid_loader``. The averaged train and validation losses are recorded
    and, if the validation loss is the lowest seen so far, the weights
    (``model.pt``) and the loss history (``metrics.pt``) are written to
    ``file_path``. The loss history is written once more when training ends.

    Args:
        model: Callable as ``model(texto, label)`` returning ``(loss, output)``.
        optimizer: Optimizer that updates the model parameters.
        criterion: Unused; the loss is the one returned by ``model``. Kept so
            that existing calls keep working.
        train_loader: Iterable of training batches, each unpacking as
            ``(texto, label), _``; must support ``len()``.
        valid_loader: Iterable of validation batches with the same layout;
            must support ``len()``.
        num_epochs: Number of passes over ``train_loader``.
        eval_every: Number of steps between evaluations. Values below 1 are
            raised to 1.
        file_path: Folder that receives ``model.pt`` and ``metrics.pt``.
        best_valid_loss: Validation loss a checkpoint must beat to be saved.

    Returns:
        None.

    Note:
        Default values are evaluated once, when this ``def`` statement runs.
        Re-run the definition after rebuilding the iterators or changing the
        settings they come from.
    """
    # With a single training batch the default ``len(train_iter) // 2`` is 0,
    # and ``global_step % eval_every`` would raise ZeroDivisionError.
    eval_every = max(1, eval_every)

    # Built with the same string concatenation as the loading cells use.
    checkpoint_path = file_path + '\\' + 'model.pt'
    metrics_path = file_path + '\\' + 'metrics.pt'

    # initialize running values
    running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (texto, label), _ in train_loader:
            loss = _forward_loss(model, texto, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every != 0:
                continue

            average_valid_loss = _mean_validation_loss(model, valid_loader)
            average_train_loss = running_loss / eval_every
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # reset the running training loss and resume training mode
            running_loss = 0.0
            model.train()

            # print progress
            print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                  .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                          average_train_loss, average_valid_loss))

            # checkpoint
            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss
                save_checkpoint(checkpoint_path, model, best_valid_loss)
                save_metrics(metrics_path, train_loss_list, valid_loss_list, global_steps_list)

    save_metrics(metrics_path, train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

In [None]:
# Build the classifier on the selected device and optimise all of its
# parameters with Adam at the configured learning rate.
model = BERT()
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=lr)

# Run the training loop with its default loaders, epochs and output folder.
train(model=model, optimizer=optimizer)

In [None]:
# Reload the loss history saved during training and plot both curves
# against the global step.
train_loss_list, valid_loss_list, global_steps_list = load_metrics(destination_folder + '\\metrics.pt')
for perdas, legenda in ((train_loss_list, 'Train'), (valid_loss_list, 'Valid')):
    plt.plot(global_steps_list, perdas, label=legenda)
plt.xlabel('Global Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Evaluation

In [None]:
# Switch the module-level device to the CPU. evaluate() and load_checkpoint()
# read this global, so the evaluation cells below run on the CPU.
device = torch.device('cpu')

In [None]:
# Evaluation Function

def evaluate(model, test_loader):
    """Score ``model`` on ``test_loader`` and report classification metrics.

    Prints a classification report plus kappa, accuracy, weighted F1, balanced
    accuracy, ROC AUC, and recall/precision for class 1 (shown as class C).
    Appends one summary row to the results CSV and draws the confusion matrix
    as a heatmap on the current matplotlib axes.

    Args:
        model: Callable as ``model(texto, label)`` returning ``(loss, logits)``.
        test_loader: Iterable of batches, each unpacking as ``(texto, label), _``.

    Returns:
        None.
    """
    y_pred = []
    y_true = []

    model.eval()
    with torch.no_grad():
        for (texto, label), _ in test_loader:
            label = label.type(torch.LongTensor).to(device)
            texto = texto.type(torch.LongTensor).to(device)
            _, logits = model(texto, label)

            # The predicted class is the index of the largest logit.
            y_pred.extend(torch.argmax(logits, 1).tolist())
            y_true.extend(label.tolist())

    print('Classification Report:')
    print(classification_report(y_true, y_pred, labels=[1,0], digits=4))

    kappa = cohen_kappa_score(y_true, y_pred)
    print("Kappa score: {:.3f}\n".format(kappa))
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy score: {:.3f}\n".format(acc))
    f1 = f1_score(y_true, y_pred, average='weighted')
    # The label previously said "macro" although the score is weighted.
    print("f1 weighted score: {:.3f}\n".format(f1))
    acc_bal = balanced_accuracy_score(y_true, y_pred)
    print("Balanced Accuracy score: {:.3f}\n".format(acc_bal))
    # NOTE(review): y_pred holds hard class labels, not scores, so this is the
    # AUC of a single operating point. Consider passing class-1 probabilities.
    roc = roc_auc_score(y_true, y_pred)
    print("Area under the ROC curve: {:.3f}\n".format(roc))
    rec = recall_score(y_true, y_pred, pos_label = 1, average='binary')
    print("Recall classe C: {:.3f}\n".format(rec))
    prec = precision_score(y_true, y_pred, pos_label = 1, average='binary')
    print("Precision classe C: {:.3f}\n".format(prec))

    # Append one row: model name, sample count, the scores, and today's date.
    campos = [nome_modelo, len(y_pred), acc, kappa, f1, acc_bal, roc, rec, prec,
              datetime.now().strftime("%Y-%m-%d")]
    with open("Classificação de tweets\\resultados-classificacao.csv", "a") as myfile:
        myfile.write(",".join(str(campo) for campo in campos) + "\n")

    # Rows and columns follow labels=[1, 0], i.e. class C first and class N second.
    cm = confusion_matrix(y_true, y_pred, labels=[1,0])
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax = ax, cmap='Blues', fmt="d")

    ax.set_title('Confusion Matrix')

    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    ax.xaxis.set_ticklabels(['C', 'N'])
    ax.yaxis.set_ticklabels(['C', 'N'])

In [None]:
# Build a fresh network on the current device, restore the best checkpoint
# saved during training and score it on the test set.
best_model = BERT()
best_model = best_model.to(device)

load_checkpoint(destination_folder + '\\model.pt', best_model)

evaluate(best_model, test_iter)