#  XAI BERT- Suicide Detection
**Autora:** Lais Carvalho Coutinho

**Orientador:** Adonias Caetano de Oliveira

**Instituição:** IFCE

**Dataset disponível em:**


## Instalação de Pacotes

In [2]:
!pip install Unidecode transformers

Collecting Unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Unidecode
Successfully installed Unidecode-1.4.0


## Importação de Bibliotecas

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
#text preprocessing libraries

import re
import math
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_

from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc, ConfusionMatrixDisplay, confusion_matrix
from scipy.special import expit
from tqdm.notebook import tqdm

# import seaborn as sns
# from imblearn.under_sampling import RandomUnderSampler
# from unidecode import unidecode
# from string import punctuation
# # from wordcloud import WordCloud
# from lime.lime_text import LimeTextExplainer
# from scipy.special import expit

## Carregamento do Dataset

In [None]:
link_train = 'link púbico do Google Drive de acesso ao CSV de treinamento'
link_test = 'link púbico do Google Drive de acesso ao CSV de teste'

file_id_link_train = link_train.split('/')[-2]
file_id_link_test = link_test.split('/')[-2]

read_link_train = 'https://drive.google.com/uc?id=' + file_id_link_train
read_link_test = 'https://drive.google.com/uc?id=' + file_id_link_test

train_dataset = pd.read_csv(read_link_train,  index_col=0)
test_dataset = pd.read_csv(read_link_test,  index_col=0)

In [9]:
print(test_dataset.columns)

Index(['text', 'classification'], dtype='object')


In [10]:
print(train_dataset['classification'].value_counts())

classification
1    930
0    865
Name: count, dtype: int64


## Divisão dos Dados em Conjuntos de Treinamento, Validação e Teste

In [11]:
# Dividindo os dados de treino em treino e validação
train_df, valid_df, train_label, valid_label = train_test_split(
    train_dataset[['text']], train_dataset['classification'], test_size=0.20, random_state=42)

# Dividindo os dados de teste (já definidos)
test_df = test_dataset[['text']]
test_label = test_dataset['classification']

In [12]:
print("Conjunto de Treinamento:")
print(train_df.head(), "\n")
print("Conjunto de Validação:")
print(valid_df.head(), "\n")
print("Conjunto de Teste:")
print(test_df.head(), "\n")

Conjunto de Treinamento:
                                                   text
1023  i have band concert today and i was excited it...
1732  i hate myself because of one stupid mistake i ...
1555  ive always felt suicidal most of my life and i...
1640  i i cant imagine dying its not something that ...
175   m in college homeless in college i feel so sad... 

Conjunto de Validação:
                                                   text
1234  every single night its been so hard just to ex...
220   days ago i slit my wrists again i was feeling ...
1516  ive been depressed quite while now and i just ...
438                                           emptypost
1268  sometimes i wonder if people that stay alive n... 

Conjunto de Teste:
                                                text
0  sometimes life can be too much we can often fe...
1  i really am having hard time with life nothing...
2  i guess you could say its to the point where i...
3  ive been depressed and i seem to be gettin

In [13]:
# Garantir que todos os valores na coluna 'text' sejam strings
train_df['text'] = train_df['text'].fillna('').astype(str)
valid_df['text'] = valid_df['text'].fillna('').astype(str)
test_df['text'] = test_df['text'].fillna('').astype(str)

train_texts = train_df['text'].values.tolist()
valid_texts = valid_df['text'].values.tolist()
test_texts = test_df['text'].values.tolist()

In [14]:
print(f"Train: {len(train_df)} lines")
print(f"Validation: {len(valid_df)} lines")
print(f"Test: {len(test_df)} lines")

Train: 1436 lines
Validation: 359 lines
Test: 100 lines


In [15]:
label_names = ['non-suicide', 'suicide']
label_names

['non-suicide', 'suicide']

## Inicialização do Tokenizador e do Modelo

In [47]:
PRETRAINED_LM = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_LM, do_lower_case=True)
tokenizer

BertTokenizer(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

## Definição de Funções Auxiliares

In [48]:
def encode(docs):
    '''
    This function takes list of texts and returns input_ids and attention_mask of texts
    '''
    encoded_dict = tokenizer.batch_encode_plus(docs,
                                               add_special_tokens=True,
                                               max_length=128,
                                               padding='max_length',
                                               return_attention_mask=True,
                                               truncation=True,
                                               return_tensors='pt')
    input_ids = encoded_dict['input_ids']
    attention_masks = encoded_dict['attention_mask']
    return input_ids, attention_masks

## Preparação dos dados para o treinamento do modelo

In [49]:
train_input_ids, train_att_masks = encode(train_df['text'].values.tolist())
valid_input_ids, valid_att_masks = encode(valid_df['text'].values.tolist())
test_input_ids, test_att_masks = encode(test_df['text'].values.tolist())

KeyError: 'text'

In [None]:
train_y = torch.LongTensor(train_label.values)
valid_y = torch.LongTensor(valid_label.values)
test_y = torch.LongTensor(test_label.values)

In [None]:
N_labels = len(train_label.unique())
model = BertForSequenceClassification.from_pretrained(PRETRAINED_LM,
                                                      num_labels=N_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

In [None]:
BATCH_SIZE = 16

In [None]:
train_dataset = TensorDataset(train_input_ids, train_att_masks, train_y)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

valid_dataset = TensorDataset(valid_input_ids, valid_att_masks, valid_y)
valid_sampler = SequentialSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_input_ids, test_att_masks, test_y)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [None]:
train_label.unique()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
model = model.to(device)

In [None]:
# Best results: 07 and 08
EPOCHS = 10
LEARNING_RATE = 2e-6

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer,
             num_warmup_steps=0,
            num_training_steps=len(train_dataloader)*EPOCHS )

## Treinamento do modelo

In [None]:
train_loss_per_epoch = []
val_loss_per_epoch = []

for epoch_num in range(EPOCHS):
    print('Epoch: ', epoch_num + 1)
    '''
    Training
    '''
    model.train()
    train_loss = 0
    for step_num, batch_data in enumerate(tqdm(train_dataloader,desc='Training')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        output = model(input_ids = input_ids, attention_mask=att_mask, labels= labels)

        loss = output.loss
        train_loss += loss.item()

        model.zero_grad()
        loss.backward()
        del loss

        clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    train_loss_per_epoch.append(train_loss / (step_num + 1))


    '''
    Validation
    '''
    model.eval()
    valid_loss = 0
    valid_pred = []
    with torch.no_grad():
        for step_num_e, batch_data in enumerate(tqdm(valid_dataloader,desc='Validation')):
            input_ids, att_mask, labels = [data.to(device) for data in batch_data]
            output = model(input_ids = input_ids, attention_mask=att_mask, labels= labels)

            loss = output.loss
            valid_loss += loss.item()

            valid_pred.append(np.argmax(output.logits.cpu().detach().numpy(),axis=-1))

    val_loss_per_epoch.append(valid_loss / (step_num_e + 1))
    valid_pred = np.concatenate(valid_pred)

    '''
    Loss message
    '''
    print("{0}/{1} train loss: {2} ".format(step_num+1, math.ceil(len(train_df) / BATCH_SIZE), train_loss / (step_num + 1)))
    print("{0}/{1} val loss: {2} ".format(step_num_e+1, math.ceil(len(valid_df) / BATCH_SIZE), valid_loss / (step_num_e + 1)))

In [None]:
epochs = range(1, EPOCHS +1 )
fig, ax = plt.subplots()
ax.plot(epochs,train_loss_per_epoch,label ='training loss')
ax.plot(epochs, val_loss_per_epoch, label = 'validation loss' )
ax.set_title('Training and Validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

## Avaliação do Modelo

In [None]:
model.eval()
test_pred = []
test_loss= 0

with torch.no_grad():
    for step_num, batch_data in tqdm(enumerate(test_dataloader)):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        output = model(input_ids = input_ids, attention_mask=att_mask, labels= labels)

        loss = output.loss
        test_loss += loss.item()

        test_pred.append(np.argmax(output.logits.cpu().detach().numpy(),axis=-1))
test_pred = np.concatenate(test_pred)

In [None]:
print('classification report')

print(classification_report(test_pred, test_label,target_names=label_names))

In [None]:
model.eval()
probabilities = []

with torch.no_grad():
  for step_num, batch_data in tqdm(enumerate(test_dataloader)):
      input_ids, att_mask, labels = [data.to(device) for data in batch_data]
      output = model(input_ids = input_ids, attention_mask=att_mask, labels= labels)

      prob = expit(output.logits.cpu().detach().numpy())

      probabilities.append( prob )

probabilities = np.concatenate(probabilities)

In [None]:
probabilities

In [None]:
print(type(test_label))
print(type(test_pred))

In [None]:
def plot_roc_curve(fper, tper, AUC):
    plt.plot(fper, tper, color="red", label=f"AUC = {AUC}")
    plt.plot([0, 1], [0, 1], color="green", linestyle="--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic Curve")
    plt.legend()
    plt.show()


fper, tper, thresholds = roc_curve(test_label, test_pred)
AUC = auc(fper, tper)
plot_roc_curve(fper, tper, AUC)

In [None]:
fper, tper, thresholds = roc_curve(test_label, probabilities[::,1])
AUC = auc(fper, tper)
plot_roc_curve(fper, tper, AUC)

In [None]:
def plot_confusion_matrix(y_preds, y_true, labels=None):
  cm = confusion_matrix(y_true, y_preds, normalize="true")
  fig, ax = plt.subplots(figsize=(6, 6))
  disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
  disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
  plt.title("Normalized confusion matrix")
  plt.show()

In [None]:
plot_confusion_matrix(test_pred, test_label)

In [None]:
print(test_df.columns)

In [None]:
d = {'pred', 'label', 'prob'}
test_df = pd.DataFrame()

test_df['pred'] = test_pred
test_df['label'] = test_label
test_df['prob'] = probabilities[::,1]

test_df

## **Saving results**

In [None]:
import shutil
import os

In [None]:
original_df = pd.read_csv(test_file_path)

original_df['pred'] = test_pred
original_df['prob'] = probabilities[::,1]

os.makedirs("./Multilingual", exist_ok=True)
output_file_path = './Multilingual/test_predictions-BERTMultilingual61.csv'
original_df.to_csv(output_file_path, index=False)

print(f'Arquivo com previsões e probabilidades salvo em: {output_file_path}')

In [None]:
MODEL_PATH = "/content/test_results_BERTMultilingual.bin"
torch.save(model.state_dict(), MODEL_PATH)