## Dados dos cartões de débito e crédito

In [None]:
# %pip install torcheval
# %pip install nlpaug

In [1]:
import pandas as pd
import re


def read_and_process_file(file_path, card_type):
    """Load one semicolon-separated statement file into a DataFrame.

    The column names are supplied explicitly (``Date``, ``Description``,
    ``Price``, ``Category``) rather than read from the file.

    Args:
        file_path: Path of the file to read.
        card_type: Value stored in the ``Card type`` column of every row.

    Returns:
        The loaded DataFrame with three derived columns: ``Card type``,
        ``Price`` (each raw value passed through ``convert_price``) and
        ``Type`` (``"DESPESA"`` for negative prices, ``"RECEITA"`` otherwise).
    """
    column_names = ['Date', 'Description', 'Price', 'Category']
    frame = pd.read_csv(file_path, sep=';', decimal=',', names=column_names)

    def classify(amount):
        # Negative amounts are expenses; everything else counts as income.
        if amount < 0:
            return "DESPESA"
        return "RECEITA"

    frame["Card type"] = card_type
    frame["Price"] = frame["Price"].apply(convert_price)
    frame["Type"] = frame["Price"].apply(classify)
    return frame


def convert_price(price):
    """Parse a monetary string with two decimal digits into a float.

    Every character other than a digit or a minus sign is discarded
    (currency symbols, spaces, the decimal comma and any ``.`` thousands
    separator); the remaining digits are interpreted as cents and divided
    by 100. For example ``"-112,31"`` -> ``-112.31`` and
    ``"1.234,56"`` -> ``1234.56``.

    Args:
        price: Raw price text. It is expected to always carry exactly two
            decimal digits, since the result is obtained by dividing the
            digit string by 100.

    Returns:
        The price as a float.

    Raises:
        ValueError: If no parsable number remains after stripping.
    """
    # Dots must be stripped as well: keeping them made "1.234,56" parse as
    # 1.23456 and come out as 0.0123456 after the division.
    cents = re.sub(r'[^\d-]', '', price)
    return float(cents) / 100


# Load both statement files, tagging each row with its card type
df_debit = read_and_process_file('raw_data/cc.txt', 'DEBITO')
df_credit = read_and_process_file('raw_data/ca.txt', 'CREDITO')

# Stack the two frames and shuffle the rows (fixed seed for repeatability)
df = pd.concat([df_debit, df_credit], ignore_index=True).sample(
    frac=1, random_state=40)

# Normalise the text columns: upper-case, no surrounding whitespace
df["Description"] = df["Description"].str.upper().str.strip()
df["Category"] = df["Category"].str.upper().str.strip()

# Merge the short category name into its canonical long form
df["Category"] = df["Category"].replace(
    {"ENTRETENIMENTO": "LAZER E ENTRETENIMENTO"})

df.head(5)

Unnamed: 0,Date,Description,Price,Category,Card type,Type
739,03/04/2023,100 BEER,-112.31,RESTAURANTES,DEBITO,DESPESA
1105,08/10/2023,7076 SHOP B-CT CURT,-99.25,COMPRAS DIVERSAS,CREDITO,DESPESA
366,28/11/2023,K2 COMERCIO -CT DU01/02,-99.99,COMPRAS DIVERSAS,DEBITO,DESPESA
872,19/05/2023,PAY -MP *RUEDOCA-19/05,-202.4,RESTAURANTES,DEBITO,DESPESA
823,02/05/2023,PAY -KALUNGA CUR-30/04,-48.6,COMPRAS DIVERSAS,DEBITO,DESPESA


## Fazer o aumento dos dados

In [2]:
import nlpaug.augmenter.word as naw
import pandas as pd


def augment_text(text, augmenter, num_variations=5):
    """Call ``augmenter.augment(text)`` repeatedly and collect the results.

    Args:
        text: The text handed to the augmenter on every call.
        augmenter: Any object exposing an ``augment(text)`` method.
        num_variations: Number of times ``augment`` is called.

    Returns:
        A list with ``num_variations`` entries, each being whatever a single
        ``augmenter.augment(text)`` call returned, in call order.
    """
    variations = []
    for _ in range(num_variations):
        variations.append(augmenter.augment(text))
    return variations


# Augmenter configured with the "swap" action
augmenter = naw.RandomWordAug(action="swap")

# One record per generated variation, carrying over the source row's category.
# Each variation is indexed with [0]: augment() is expected to return a
# sequence whose first item is the augmented text (TODO confirm for the
# installed nlpaug version).
# NOTE(review): variations are generated before the train/test split further
# down, so variants of one description can land on both sides of the split.
new_rows = [
    {'Description': variation[0], 'Category': row['Category']}
    for _, row in df.iterrows()
    for variation in augment_text(row['Description'], augmenter)
]

# DataFrame holding only the generated variations
df_augmented = pd.DataFrame(new_rows)

# Original rows + variations, shuffled, keeping only the columns the
# classifier needs (Description and Category)
df_combined = (
    pd.concat([df, df_augmented], ignore_index=True)
    .sample(frac=1, random_state=40)
    .drop(columns=['Price', 'Date', 'Card type', 'Type'])
)

df_combined.head(5)

Unnamed: 0,Description,Category
2227,- PAY POP SACOLAO - 23 10 /,MERCADO
7167,PIX MARCIO 26 TRANSF / 06,TRANSFERÊNCIA BANCÁRIA
6344,GYMPASSBR GYMPASS,SAÚDE
2676,PAMONHAS RSCSS - SE - / 24 02,RESTAURANTES
3648,PAY - RODA DAGUA,LAZER E ENTRETENIMENTO


## Data transformer

#### Removendo categorias com apenas 1 exemplo

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Number of rows per category in the combined (original + augmented) data.
# Counting the labels directly avoids the LabelEncoder encode/decode round
# trip, which added nothing to the result.
category_counts = df_combined['Category'].value_counts()

# Categories represented by exactly one example
categories_to_remove = category_counts[category_counts == 1].index.tolist()

# Keep only the rows whose category has more than one example
df_filtered = df_combined[~df_combined['Category'].isin(categories_to_remove)]

In [4]:
# Persist the filtered dataset as a semicolon-separated CSV without the index
df_filtered.to_csv("full_dataset.csv", sep=";", decimal=",", index=False)

In [5]:
# Distinct categories remaining after filtering, in order of first appearance
df_filtered.Category.unique()

array(['MERCADO', 'TRANSFERÊNCIA BANCÁRIA', 'SAÚDE', 'RESTAURANTES',
       'LAZER E ENTRETENIMENTO', 'SALÁRIO', 'OUTROS', 'SAQUES',
       'COMPRAS DIVERSAS', 'TRANSPORTE', 'POUPANÇA E INVESTIMENTOS',
       'RECEITA', 'PAGAMENTO CARTÃO', 'TELECOMUNICAÇÕES', 'MORADIA',
       'ROUPAS E ACESSÓRIOS', 'MESADA', 'EDUCAÇÃO', 'CUIDADOS COM O PET',
       'CUIDADOS PESSOAIS', 'DOAÇÕES OU CONTRIBUIÇÕES', 'SEGUROS'],
      dtype=object)

## Faz o processamento do texto

In [6]:
import pickle
import torch
from torchtext.vocab import build_vocab_from_iterator
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


class TextProcessor:
    """Character-level text vectoriser and label encoder.

    Typical call order: ``build_vocab`` -> ``tokenize_and_pad`` ->
    ``encode_labels`` -> ``save_label_mappings`` / ``save_vocab`` ->
    ``get_vocab_info``.

    NOTE(review): ``"<pad>"`` is never registered as a special token, so with
    the default index set to ``<unk>`` the padding positions receive the same
    id as unknown characters. Registering ``"<pad>"`` would shift every id and
    change the vocabulary size, so the existing layout is kept here; confirm
    against the model and production code before changing it.
    """

    def __init__(self, sentences, labels):
        """Store the raw inputs; nothing is computed until the methods run.

        Args:
            sentences: Iterable of strings to vectorise character by character.
            labels: Class labels aligned with ``sentences``.
        """
        self.sentences = sentences
        self.labels = labels
        self.vocab = None
        self.label_encoder = None
        self.one_hot_encoder = None
        self.max_len = 0
        self.num_classes = 0
        self.id_to_label = {}

    def build_vocab(self):
        """Build ``self.vocab`` from the characters of every sentence.

        ``<unk>`` is the only special token and is also set as the default
        index for lookups of tokens that are not in the vocabulary.
        """
        def yield_tokens(data_iter):
            # Stream every character of every sentence.
            for text in data_iter:
                yield from text

        self.vocab = build_vocab_from_iterator(
            yield_tokens(self.sentences), specials=["<unk>"])
        self.vocab.set_default_index(self.vocab["<unk>"])

    def tokenize_and_pad(self):
        """Convert sentences to equal-length rows of character ids.

        ``self.max_len`` is set to the longest sentence length, rounded up to
        the next even number. Shorter sentences are right-padded with the id
        that ``self.vocab["<pad>"]`` resolves to.

        Returns:
            An integer tensor with one row per sentence and ``self.max_len``
            columns.
        """
        pad_index = self.vocab["<pad>"]
        vectorized_sentences = [self.vocab(list(text))
                                for text in self.sentences]

        longest = max(len(seq) for seq in vectorized_sentences)
        # Round up to the next even number when the longest length is odd.
        self.max_len = longest + (longest % 2)

        # Every row is padded to max_len in a single pass, so no second
        # padding step is required before building the tensor.
        padded_rows = [seq + [pad_index] * (self.max_len - len(seq))
                       for seq in vectorized_sentences]
        return torch.tensor(padded_rows, dtype=torch.long)

    def encode_labels(self):
        """Fit the label encoders and return one-hot encoded labels.

        Sets ``self.label_encoder``, ``self.one_hot_encoder`` and
        ``self.num_classes``.

        Returns:
            Dense array with one row per label and one column per class.
        """
        self.label_encoder = LabelEncoder()
        encoded_labels = self.label_encoder.fit_transform(self.labels)

        # Newer scikit-learn releases renamed ``sparse`` to ``sparse_output``
        # and later removed the old keyword; fall back for older installs.
        try:
            self.one_hot_encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            self.one_hot_encoder = OneHotEncoder(sparse=False)
        one_hot_labels = self.one_hot_encoder.fit_transform(
            encoded_labels.reshape(-1, 1))

        self.num_classes = len(self.label_encoder.classes_)
        return one_hot_labels

    def save_label_mappings(self, filepath='production/id_to_label.pkl'):
        """Pickle the ``{class id: label}`` mapping to ``filepath``.

        Requires ``encode_labels`` to have been called first. The mapping is
        also kept on ``self.id_to_label``.
        """
        self.id_to_label = dict(enumerate(self.label_encoder.classes_))
        with open(filepath, 'wb') as f:
            pickle.dump(self.id_to_label, f)

    def save_vocab(self, filepath='production/vocab.pkl'):
        """Pickle this instance's vocabulary to ``filepath``."""
        with open(filepath, 'wb') as f:
            # Use the instance's own vocab rather than a module-level variable.
            pickle.dump(self.vocab, f)

    def get_vocab_info(self):
        """Return a summary of the fitted vocabulary and label state.

        Returns:
            Dict with keys ``vocabulary`` (index-to-token list), ``max_len``,
            ``vocab_size`` and ``num_classes``.
        """
        return {
            "vocabulary": self.vocab.get_itos(),
            "max_len": self.max_len,
            "vocab_size": len(self.vocab),
            "num_classes": self.num_classes
        }


# Run the preprocessing pipeline on the filtered dataset
sentences = df_filtered["Description"]
labels = df_filtered['Category']
text_processor = TextProcessor(sentences, labels)

# Fit the vocabulary, then vectorise the text and encode the labels
text_processor.build_vocab()
padded_sentences = text_processor.tokenize_and_pad()
one_hot_labels = text_processor.encode_labels()

# Persist the artefacts written under production/
text_processor.save_label_mappings()
text_processor.save_vocab()

vocab_info = text_processor.get_vocab_info()

# The padded sequence length doubles as the model's embedding dimension
embed_dim, vocab_size, num_classes = (
    vocab_info[key] for key in ("max_len", "vocab_size", "num_classes"))

print("Vocabulary: ", vocab_info["vocabulary"])
print("Padded tensor sentences: ", padded_sentences.shape)
print("max_len", vocab_info["max_len"])
print("Vocab Size:", vocab_size)
print("Number of Classes:", num_classes)

Vocabulary:  ['<unk>', ' ', 'A', 'P', 'R', 'O', 'I', 'S', 'E', 'T', '-', 'C', 'N', 'L', '0', '1', 'U', 'M', 'Y', '/', 'F', 'D', '2', 'B', 'G', 'H', '*', 'V', 'X', '3', 'K', '6', '5', '4', '8', 'Z', '9', '7', 'Q', 'J', '.', 'W', 'Ç', 'Ê', 'Á', 'Ã', 'Í', 'É', 'Ó', 'a', 'e', 'p']
Padded tensor sentences:  torch.Size([7746, 32])
max_len 32
Vocab Size: 52
Number of Classes: 22




## Criando os datasets de treino e teste

In [7]:
from sklearn.model_selection import train_test_split

# Fraction of the samples held out for evaluation
test_size = 0.25

# Stratified split of the padded sentences and their one-hot labels
(padded_sentences_train,
 padded_sentences_test,
 one_hot_labels_train,
 one_hot_labels_test) = train_test_split(
    padded_sentences,
    one_hot_labels,
    test_size=test_size,
    random_state=42,
    stratify=one_hot_labels,
)

In [8]:
from model.dataset import TextDataset

# Wrap each split in the project's TextDataset
train_dataset, test_dataset = (
    TextDataset(padded_sentences_train, one_hot_labels_train),
    TextDataset(padded_sentences_test, one_hot_labels_test),
)

## Treinamento

In [2]:
from model.device import get_device
from model.model import ModelXT
from model.train import Trainer

# Model hyperparameters
num_heads = 8
dropout = 0.3
num_layers = 2

device = get_device()

# vocab_size, embed_dim and num_classes come from the text-processing cell,
# which must have been run in this kernel before this one.
# NOTE(review): embed_dim is the padded sequence length; if ModelXT requires
# it to be divisible by num_heads, that only holds by coincidence -- confirm.
model = ModelXT(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    dropout=dropout,
    num_classes=num_classes,
    num_layers=num_layers,
)

trainer = Trainer(model, train_dataset, test_dataset, device)
trainer.train()

NameError: name 'vocab_size' is not defined

In [11]:
import json

# Persist the trained weights
torch.save(model.state_dict(), 'production/model.pth')

# Hyperparameters saved alongside the weights
data = dict(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_classes=num_classes,
    num_layers=num_layers,
    num_heads=num_heads,
    dropout=dropout,
)

# Write the configuration to a JSON file
with open('production/config.json', 'w') as json_file:
    json_file.write(json.dumps(data, indent=4))

## Production
- Executa o modelo em modo de produção

In [2]:
from production.model import production

# Hand-written descriptions, several deliberately misspelled, used as a quick
# check of the production pipeline
sentences = [
    "UB-ER* -12/14 TR +IP",
    "PIX TRANSF IS4B3LL",
    "5H0P PET PAY",
    "PAY QRS PARAR SEM",
    "FL3X CL4RO",
    "PAY 1F00D",
    "IF-O0D",
    "PASTEUR",
    "DOCELANDIA",
]

outputs = production(sentences=sentences)
print(outputs)

['TRANSPORTE', 'MESADA', 'CUIDADOS COM O PET', 'TRANSPORTE', 'TELECOMUNICAÇÕES', 'MERCADO', 'RESTAURANTES', 'RESTAURANTES', 'RESTAURANTES']
