In [51]:
import pandas as pd
import numpy as np
import torch
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
import warnings
from sklearn.exceptions import UndefinedMetricWarning
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.metrics import classification_report, confusion_matrix



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Caio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
# Text pre-processing applied to every transaction description
def preprocess_text(text, stop_words=None):
    """Normalise a transaction description before tokenisation.

    Steps, in order: lowercase, delete every word made of one or two word
    characters, collapse whitespace runs, then remove stop words.

    Args:
        text: Raw description. ``None`` and float NaN (the missing-value
            marker pandas hands to ``.apply``) yield an empty string; any
            other non-string value is converted with ``str()``.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used. That list is
            loaded once and cached on the function object instead of being
            rebuilt on every call.

    Returns:
        The cleaned text, remaining words separated by single spaces.
    """
    # Missing values would otherwise fail on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if stop_words is None:
        stop_words = getattr(preprocess_text, '_english_stop_words', None)
        if stop_words is None:
            # First call only: read the NLTK corpus and keep the result around.
            stop_words = frozenset(stopwords.words('english'))
            preprocess_text._english_stop_words = stop_words

    text = text.lower()
    text = re.sub(r'\b\w{1,2}\b', '', text)  # drop 1- and 2-character words
    text = re.sub(r'\s+', ' ', text).strip()  # squeeze whitespace left behind
    return ' '.join(word for word in text.split() if word not in stop_words)

# Load the dataset (path is relative to the working directory)
data = pd.read_csv('extended_transactions_large_unique.csv')

# Work on a reproducible random subset of at most 1500 rows. The request is
# capped at the frame length because DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
data_sample = data.sample(min(1500, len(data)), random_state=42)

# Column 'x' holds the transaction descriptions, column 'y' their labels
texts = data_sample['x'].apply(preprocess_text).tolist()
labels = data_sample['y'].tolist()

# Map the labels to integer class ids
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Sanity check: the rest of the notebook expects exactly 10 distinct classes
num_labels = len(set(labels))
print("Número de rótulos únicos:", num_labels)
assert num_labels == 10, "O número de rótulos únicos deve ser 10."

# Hold out 20% of the sample as the test split (fixed seed for reproducibility).
# NOTE(review): the recorded run shows a perfect test score; check whether
# near-identical descriptions land on both sides of this split — TODO confirm.
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# DistilBERT tokenizer for the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode both splits with identical settings: truncate to at most 128 tokens,
# pad, and return PyTorch tensors. Each split is encoded in its own call.
train_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
    for split_texts in (train_texts, test_texts)
)

# Turn the integer label arrays into tensors
train_labels, test_labels = torch.tensor(train_labels), torch.tensor(test_labels)

# DistilBERT sequence classifier with a 50% dropout in front of the classifier layer
class DistilBertForSequenceClassificationWithDropout(DistilBertForSequenceClassification):
    """DistilBERT sequence classifier that applies ``Dropout(p=0.5)`` in its head.

    ``forward`` returns a plain tuple: ``(loss, logits)`` when ``labels`` is
    supplied and ``(logits,)`` otherwise.
    """

    def __init__(self, config):
        """Build the parent model from ``config``, then set the dropout layer."""
        super().__init__(config)
        # Assigned after the parent constructor, so this is the layer that
        # ``forward`` below ends up using.
        self.dropout = torch.nn.Dropout(p=0.5)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token ids passed to ``self.distilbert``.
            attention_mask: Mask passed to ``self.distilbert``.
            labels: Optional class ids; when given, a cross-entropy loss is
                computed against the logits.

        Returns:
            ``(loss, logits)`` if ``labels`` is not ``None``, else ``(logits,)``.
        """
        encoder_outputs = self.distilbert(input_ids, attention_mask=attention_mask)
        # Position 0 along the second axis of the first encoder output (the [CLS] slot)
        cls_state = encoder_outputs[0][:, 0]

        activation = torch.nn.ReLU()
        hidden = activation(self.pre_classifier(cls_state))
        logits = self.classifier(self.dropout(hidden))

        if labels is None:
            return (logits,)

        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss, logits)

# Instantiate the classifier from the pretrained checkpoint, one output per class
model = DistilBertForSequenceClassificationWithDropout.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
)

# Counter class imbalance: every training example is weighted by the inverse
# frequency of its class, and batches are drawn with those weights.
class_counts = np.bincount(train_labels)
class_weights = 1.0 / class_counts
weights = class_weights[train_labels]
sampler = torch.utils.data.WeightedRandomSampler(weights=weights, num_samples=len(weights))

# Training DataLoader over (input_ids, attention_mask, label) triples, fed by the weighted sampler
train_dataset = torch.utils.data.TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels,
)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, sampler=sampler, batch_size=32)

# Training configuration. The epoch count drives both the length of the
# learning-rate schedule and the training loop, so it is defined once here to
# keep the two from drifting apart.
NUM_EPOCHS = 10
LEARNING_RATE = 1e-5

# Select the device and move the model before building the optimizer, so the
# optimizer is created over the parameters in their final location.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# torch.optim.AdamW stands in for the AdamW class exported by transformers,
# which that library deprecated in favour of the PyTorch implementation.
# eps and weight_decay are passed explicitly to keep the hyper-parameters the
# deprecated class used by default (eps=1e-6, weight_decay=0.0 per its docs).
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

# Linear learning-rate decay without warm-up; stepped once per batch below
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * NUM_EPOCHS,
)

# Training loop
model.train()

for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, batch_labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs[0]  # with labels supplied, the loss is the first tuple item
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Mean of the per-batch losses seen during this epoch
    print(f"Epoch {epoch+1} - Average Loss: {total_loss / len(train_loader)}")

# Persist the trained weights (state dict only) to a relative path
torch.save(
    obj=model.state_dict(),
    f='model_weights.pth',
)

# Confirmation message; it is reached only if torch.save did not raise.
# No read-back check of the written file is performed.
print("Pesos do modelo treinado salvos com sucesso")

# Evaluation: put the model in eval mode and score the whole test split in a
# single forward pass with gradient tracking disabled.
model.eval()
test_encodings = {name: tensor.to(device) for name, tensor in test_encodings.items()}
with torch.no_grad():
    outputs = model(**test_encodings)
    # No labels are passed, so the logits are the first item of the returned tuple
    logits = outputs[0]
    predictions = torch.argmax(logits, dim=1)

# Bring predicted and true labels back to the CPU as NumPy arrays
predictions = predictions.cpu().numpy()
test_labels = test_labels.cpu().numpy()


# Table pairing every test text with its decoded true and predicted label
df_predictions = pd.DataFrame(
    {
        'Text': test_texts,
        'Real Label': label_encoder.inverse_transform(test_labels),
        'Predicted Label': label_encoder.inverse_transform(predictions),
    }
)

# Keep only the rows where the prediction disagrees with the true label
df_incorrect_predictions = df_predictions.loc[
    df_predictions['Real Label'] != df_predictions['Predicted Label']
]

# Show the misclassified rows
print(df_incorrect_predictions)


# Detailed per-class classification report.
# `labels` lists every encoded class id so the report rows always line up with
# `target_names`, even when a class is absent from the test split (without it,
# a missing class makes the two lengths disagree and the call fails).
# zero_division=0 scores an undefined metric (e.g. precision of a class that
# was never predicted) as 0 rather than 1, so such classes do not inflate the
# averages.
class_ids = np.arange(len(label_encoder.classes_))
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    report = classification_report(
        test_labels,
        predictions,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )

print(report)

Número de rótulos únicos: 10


Some weights of DistilBertForSequenceClassificationWithDropout were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Average Loss: 2.205546554766203
Epoch 2 - Average Loss: 1.757932041820727
Epoch 3 - Average Loss: 1.1837029849228107
Epoch 4 - Average Loss: 0.7815493141349993
Epoch 5 - Average Loss: 0.541065196457662
Epoch 6 - Average Loss: 0.41603478867756694
Epoch 7 - Average Loss: 0.34453421125286504
Epoch 8 - Average Loss: 0.29953567683696747
Epoch 9 - Average Loss: 0.2794254398659656
Epoch 10 - Average Loss: 0.2694174529690492
Pesos do modelo treinado salvos com sucesso
Empty DataFrame
Columns: [Text, Real Label, Predicted Label]
Index: []
                 precision    recall  f1-score   support

  Entertainment       1.00      1.00      1.00        30
            Gas       1.00      1.00      1.00        28
      Groceries       1.00      1.00      1.00        35
Interest Income       1.00      1.00      1.00        29
       Pharmacy       1.00      1.00      1.00        30
    Restaurants       1.00      1.00      1.00        27
         Salary       1.00      1.00      1.00        