In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import re, random, sys, pickle
                                            # tfidf          # bad-of-words
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.corpus import wordnet

sys.path.insert(0,'..')
from fields import *

In [2]:
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Treinamento do modelo

## Gerando dados

In [4]:
def find_synonyms(words, qty):

    new_words = []
    for word in words:
        if len(word.split(' ')) > 1:
            word = word.split(' ')[0]

        synonyms=[];
        for syn in wordnet.synsets(word, lang='por'):
            for lemma in syn.lemmas(lang='por'):
                synonyms.append(lemma.name())

        if len(synonyms) > 0:
            new_words.append(np.random.choice(synonyms, size=qty)[0])
        else:
            new_words.append(word)

    return new_words

def vectorize_csv(df, vectorizer, only_strings=False):
    
    if isinstance(df, np.ndarray): df = pd.DataFrame(df)

    if only_strings is True:
        is_word = r'[A-Za-z\s]+'
        words = []
        for column in df.select_dtypes(include=['object']):
            for cell in df[column]:
                if isinstance(cell, str):
                    matches = re.findall(is_word, cell)
                    words.extend(matches)
        text = ' '.join(words)
    else:
        flatten = df.astype(str).values.flatten()
        text = ' '.join(flatten)
    # Ajusta o vetorizador aos dados somente se não foi antes
    if not hasattr(vectorizer, 'vocabulary_'):
        vectorizer.fit([text])
    
    vector=[]
    # vetorizar vários documentos
    if len(df.columns) == 1:
        for doc in df.iloc[:, 0]:
            vec = vectorizer.transform([doc]).toarray()
            vector.extend(vec)
    # vetorizar apenas um
    else:
        vec = vectorizer.transform([text]).toarray()
        vector.extend(vec)
    vector=np.array(vector)

    if not hasattr(vectorizer, 'vocabulary_'):
        return vector, vectorizer
    return vector


def adjust_vectors(vectors, max_=None):
    # ajustar vetores de tipos distintos documento para concatenar

    adjusted_vectors = []
    if max_ is None:
        lengths = [len(vec) for vec in vectors]
        max_ = np.max(lengths)

    for vec in vectors:
        fill = max_ - len(vec)
        if fill > 0:
            mode = stats.mode(vec, keepdims=True)[0][0]
            fill = np.full(fill, mode)
            vec = np.concatenate([vec, fill], axis=0)
            adjusted_vectors.append(vec)
        else:
            adjusted_vectors.append(vec)

    return np.array(adjusted_vectors)


def generate_accounting_data(qty, type_, how, vectorizer=None):
    # gerar dados
    if type_ in ["dre", 1]: fields, label = dre, 1
    if type_ in ["balancete", 2]: fields, label = balancete, 2

    data = []
    if type_ in ['false', 0]:
        fields = aleatorio; label = 0
        random_words = np.random.choice(fields, size=qty, replace=False)
        random_words = find_synonyms(random_words, qty=qty)
        data.extend(random_words)

    else:
        size = int( len(list(fields.keys())) * random.uniform(0.6, 1) )
        for i in range(0, qty):
            dfs = []; text = []
            
            random_fields = np.random.choice(list(fields.keys()), size)
            for f in random_fields:
                words = [f, *fields[f]]
                words = find_synonyms(words, 1)

                if how=="text":
                    text.extend(words)

                elif how=="sheet":
                    values = [round(val, 2) for val in np.random.uniform(1e5, 2e5, size=len(words))]
                    df = pd.DataFrame(data=[words, values]).T
                    dfs.append(df)
            
            if how=="text":
                data.append(' '.join(text))

            elif how=="sheet":
                dfs = pd.concat(dfs)
        
            # data.append(dfs)
    X = np.array(data); y = np.full(len(data), label)

    if vectorizer:
        X = vectorize_csv(X, vectorizer, True)
        return X, y, vectorizer
    
    return X, y

vectorizer = CountVectorizer()
X=[]; y=[]
for doc in ['dre', 'balancete','false']:
    X_, y_, vectorizer = generate_accounting_data(50, doc, 'text', vectorizer)
    X.extend(X_); y.extend(y_)

X=adjust_vectors(X); y=np.array(y)
X.shape, y.shape

((150, 82), (150,))

## Treinamento e teste dos modelos

### Modelo multiclassificador
Um modelo que classifique em mais duas classes

In [5]:
svm = SVC(kernel='linear', probability=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

print( classification_report(y_test,y_pred) )

(120, 82)
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         9
           1       1.00      0.91      0.95        11
           2       1.00      0.80      0.89        10

    accuracy                           0.90        30
   macro avg       0.92      0.90      0.90        30
weighted avg       0.93      0.90      0.90        30



In [6]:
rf = RandomForestClassifier()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print( classification_report(y_test,y_pred) )

(120, 82)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00        14

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [7]:
# Identificando se são dados aleatórios
for model in rf, svm:
    X_random, y_random, vectorizer = generate_accounting_data(50, 0, 'text', vectorizer)
    y_pred_random=model.predict(X_random)
    print(set(y_pred_random))
    print(accuracy_score(y_random, y_pred_random))

{0}
1.0
{0}
1.0


In [20]:
# pickle.dump(rf, open('../models/rf_1.pkl', 'wb'))

### Modelo especialista
Dois modelos binários:
- Um modelo para validar se é um documento contábil ou não
- Outro modelo para identificar que tipo de documento que é

In [8]:
qty=50
X = []; y= []
for doc in ['dre', 'balancete','false']:
    if 'false': qty = 100 # gerar uma quantidade balanceada
    X_, y_, vectorizer = generate_accounting_data(qty, doc, 'text', vectorizer)
    X.extend(X_); y.extend(y_)

all_data = pd.DataFrame(np.hstack([X, np.array(y).reshape(1,-1).T]))
false = all_data.loc[all_data.iloc[:, -1] == 0]
not_false = all_data.loc[all_data.iloc[:, -1] != 0]
true = not_false.copy()
true.iloc[:, -1] = [1] * len(true)

print( false.shape, set(false.iloc[:,-1]), true.shape, set(true.iloc[:,-1]) )

true_false = pd.concat([false, true])
X = true_false.iloc[:, :-1]; y = true_false.iloc[:, -1]
X.shape, y.shape

(100, 83) {0} (200, 83) {1}


((300, 82), (300,))

#### É ou não documento contábil

In [9]:
rf_validator = RandomForestClassifier()
svm_validator = SVC(kernel='linear', probability=True)

for model in rf_validator, svm_validator:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print( classification_report(y_test,y_pred) )

    X_random, y_random, vectorizer = generate_accounting_data(50, 0, 'text', vectorizer)

    y_pred_random=model.predict(X_random)
    class_ = next(iter(set(y_pred_random)))
    print('não é válido' if class_ == 0 else 'válido')
    print(accuracy_score(y_random, y_pred_random))    

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        23
           1       1.00      0.97      0.99        37

    accuracy                           0.98        60
   macro avg       0.98      0.99      0.98        60
weighted avg       0.98      0.98      0.98        60

não é válido
1.0
              precision    recall  f1-score   support

           0       0.86      1.00      0.93        19
           1       1.00      0.93      0.96        41

    accuracy                           0.95        60
   macro avg       0.93      0.96      0.94        60
weighted avg       0.96      0.95      0.95        60

não é válido
1.0


#### Qual documento é

In [10]:
dre_test = pd.read_excel('../sheets/DRE_07.2022.xlsx', index_col=[0])
balan_test = pd.read_excel('../sheets/Cópia BALANCETE_31.07.2022.xlsx', index_col=[0])

def clean_sheet(sheet):
    #clean nan cols and rows
    sheet = sheet.dropna(how='all', axis=1).dropna(how='all', axis=0)
    #most frequent row nans pattern
    mode = stats.mode(sheet.isna().sum(axis=1), keepdims=True)[0][0]
    #assuming this as where to scrap data
    data = pd.concat([row for _, row in sheet.iterrows() if row.isna().sum() == mode], axis=1).T

    valid_data = pd.concat(
        [
            col for _, col in data.items() 
            if col.isna().sum() < len(col)*0.1
        ], 
    axis=1).reset_index()

    return valid_data


In [11]:
X = not_false.iloc[:, :-1]; y = not_false.iloc[:, -1]

rf_clf = RandomForestClassifier()
svm_clf = SVC(kernel='linear', probability=True)

for model in rf_clf, svm_clf:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print( classification_report(y_test,y_pred) )

    for doc in dre_test, balan_test:
        valid_data = clean_sheet(doc)
        vector = vectorize_csv(valid_data, vectorizer, only_strings=True)
        class_ = model.predict(vector)
        print(class_)
        print('dre' if class_ == 1 else 'balancete')



              precision    recall  f1-score   support

           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        19

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

[1]
dre
[1]
dre
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        19
           2       1.00      1.00      1.00        21

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

[1]
dre
[1]
dre


Requer ajuste

In [None]:
# pickle.dump(svm, open('../models/svm_1.pkl', 'wb'))
# pickle.dump(svm, open('../models/rf_1.pkl', 'wb'))


# Extraindo dados da planilha

In [None]:
dre_test = pd.read_excel('../sheets/DRE_07.2022.xlsx', index_col=[0])
balan_test = pd.read_excel('../sheets/Cópia BALANCETE_31.07.2022.xlsx')

In [None]:
balan_test.head(10)
clean_sheet(balan_test)

Unnamed: 0.1,index,Unnamed: 0,Unnamed: 2,Unnamed: 4,Unnamed: 7,Unnamed: 9,Unnamed: 11,Unnamed: 13
0,5,Conta Contábil,Cod. R.,,,Débito,Crédito,S. Atual
1,7,1.0.00.00.00.000000,1,A T I V O,1899516.77,15189090.34,14995080.36,2093526.75
2,9,1.1.00.00.00.000000,11,ATIVO CIRCULANTE,1725485.06,15180120.94,14973085.5,1932520.5
3,11,1.1.01.00.00.000000,12,CAIXA E EQUIVALENTE DE CAIXA,384989.03,4965261.93,5273272.6,76978.36
4,13,1.1.01.01.00.000000,13,CAIXA GERAL,802.4,17445.66,16513.91,1734.15
...,...,...,...,...,...,...,...,...
354,446,5.1.90.01.00.000000,51061,OUTRAS RECEITAS OPERACIONAIS,0,0,102566.96,-102566.96
355,447,5.1.90.01.05.000000,529559,ICMS S/OUTRAS ENTRADAS,0,0,288,-288
356,448,5.1.90.01.06.000000,529562,IPI S/OUTRAS ENTRADAS,0,0,576,-576
357,449,5.1.90.01.00.000002,5922,RECUPERACAO CUSTOS/DESPESAS,0,0,4272.52,-4272.52


In [None]:
def data_scraping(sheet, doc_type):
    # doc_type => recebe a classificação do modelo para alterar a abordagem de extração

    valid_data = clean_sheet(sheet).astype(str)
    
    # Extraindo campos contábeis
    fields_pattern = r'[^0-9\.,-]+' if doc_type == 2 else r'\D+'

    fields = []
    strings = valid_data.applymap(lambda x: re.findall(fields_pattern, x)) #[^0-9\.,-]
    for _, col in strings.items():
        # check valid columns
        if col.tolist().count([]) < len(col):
            for row in col:
                if len(row) > 0:
                    fields.extend(row)

    # Extraindo valor dos campos
    all_values = []
    numbers = valid_data.applymap(lambda x: re.findall(r'\d+', x))
    for _, col in numbers.items():
        values = []
        # Checar colunas válidas
        if col.tolist().count([]) < len(col):
            for row in col:
                if len(row) > 1:
                    try:
                        decimal = float( int(row[-1])/100 )
                        integer = int(''.join(row[:-1]))
                        values.append(integer + decimal)
                    except:
                        pass
                else:
                    values.append(np.nan)
        if len(values) > 0:    
            all_values.append(values)

    if doc_type == 1:
        data = pd.DataFrame(data=[fields, *all_values]).T
    
    elif doc_type == 2:
        data = pd.DataFrame(data=[fields, *all_values]).T

    return data.dropna(how='all', axis=1)

# data_scraping(dre_test, 1).head(50)
# data_scraping(balan_test, 2).head(50)

Unnamed: 0,0,2,4,5,6,7,8
0,Conta Contábil,,,,,,
1,Cod,10000000.0,,1899516.77,15189090.34,14995080.36,2093526.75
2,R,11000000.0,,1725485.06,15180120.94,14973085.05,1932520.05
3,,11010000.0,,384989.03,4965261.93,5273272.06,76978.36
4,A T I V O,11010100.0,,802.04,17445.66,16513.91,1734.15
5,ATIVO CIRCULANTE,11010101.01,,802.04,17445.66,16513.91,1734.15
6,CAIXA E EQUIVALENTE DE CAIXA,11010200.0,,293350.89,4584831.05,4859522.32,18659.62
7,CAIXA GERAL,11010201.0,,293350.89,4584831.05,4859522.32,18659.62
8,CAIXA,11010201.04,,293350.89,4107843.71,4382534.98,18659.62
9,BANCOS C/MOVIMENTO,11010201.01,,,476987.34,476987.34,
