# Introdução

Esse caderno tem por objetivo a criação de um modelo básico de treinamento, utilizando classificadores lineares, que faça a predição de homologação de arquivamentos de procedimentos enviados à 1A.CAM do MPF.

Nesse modelo vamos passar a considerar os textos das íntegras das peças de promoção de arquivamento.

**Nota**: os dados desse modelo foram recuperados de procedimentos que tiveram suas deliberações realizadas após o dia 02/07/2018, data em que a nova composição tomou posse na 1A.CAM.

# Carga de dados e pré-processamento

Vamos fazer a carga dos dados.

Vamos ler os textos das íntegras, limpá-los e associar ao dataframe com os dados de homologação.

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

from sklearn import metrics

In [3]:
PATH = "../data/"

In [6]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/paperspace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/paperspace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# carga dos textos

from os import listdir
from os.path import isfile, join
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = stopwords.words('portuguese')
stop_words.extend(['n', 'nº', 'n°', 'n.º', ',', '.', '!', '?', ';', ':', '...', 'º', '–', '/', '(', ')'])

def get_text(file):
    with open(file, encoding='utf-8', errors='replace') as f:
        data=' '.join(line.strip() for line in f)
        f.close()
    
    return data.strip()

def get_text_non_stop_words(text):
    filtered_text = ''
    for w in word_tokenize(text, language='portuguese'):
        if w not in stop_words:
            filtered_text += w
            filtered_text += ' '            
    return filtered_text.strip()   
    

folder_integras = f'{PATH}/integras-textos'

texts = {}
for file in listdir(folder_integras):
    if isfile(join(folder_integras, file)):
        texts[file.split('.')[0]] = get_text_non_stop_words(get_text(join(folder_integras,file)))
    else:
        print('is not file', file)



In [8]:
# montando um DataFrame
df_original = pd.read_json(f'{PATH}/1A.CAM.homologacao-arquivamento.json')
df_work = df_original.copy()
cols = ['id', 'homologado']
df_work.drop(inplace=True, columns=[col for col in df_work.columns if col not in cols])
df_work['peca_promocao'] = None
df_work.sample(5)

Unnamed: 0,homologado,id,peca_promocao
1162,1,81265033,
577,1,83665696,
1885,1,73391520,
3978,1,78618699,
3098,1,74075317,


In [9]:
# Associando o texto...
for key, text in zip(texts.keys(), texts.values()):
    if key != '' and text != '':
        df_work.loc[df_work.id == int(key), 'peca_promocao'] = text
        
df_work.sample(5)

Unnamed: 0,homologado,id,peca_promocao
1090,1,77378964,MINISTÉRIO PÚBLICO FEDERAL PROCURADORIA DA REP...
5762,1,64490766,MINISTÉRIO PÚBLICO FEDERAL PROCURADORIA DA REP...
2084,1,84601745,EXCELENTÍSSIMO A SENHOR A DOUTOR A JUIZ A FEDE...
2428,1,83467644,Procedimento Preparatório 1.24.000.000198/2018...
6562,1,53391264,


In [10]:
print(len(df_work[df_work['homologado'] == 1]), len(df_work[df_work['homologado'] == 0]))

8267 134


In [11]:
# removendo os sem textos de peça de promoção (problema nos dados)
print(len(df_work[df_work['peca_promocao'].isnull()]))
df_work.dropna(subset=['peca_promocao'], inplace=True)
df_work.reset_index(drop=True, inplace=True)
print(len(df_work[df_work['peca_promocao'].isnull()]))

1688
0


In [12]:
print(len(df_work[df_work['homologado'] == 1]), len(df_work[df_work['homologado'] == 0]))

6612 101


## Treinando um modelo de Bag of Words

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test  = train_test_split(df_work['peca_promocao'], df_work['homologado'], test_size=0.15, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5706,), (5706,), (1007,), (1007,))

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words=None)
vect = cv.fit(X_train)
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<5706x85085 sparse matrix of type '<class 'numpy.int64'>'
	with 2136432 stored elements in Compressed Sparse Row format>

In [15]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
preds = lr.predict(vect.transform(X_test))

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def print_score(y, preds):
    print('Accuracy score:', accuracy_score(y, preds))
    print('Precision score: ', format(precision_score(y, preds)))
    print('Recall score: ', format(recall_score(y, preds)))
    print('F1 score: ', format(f1_score(y, preds)))
    print('AUC: ', roc_auc_score(y, preds))
    
print_score(y_test, preds)

Accuracy score: 0.98411122145
Precision score:  0.9880239520958084
Recall score:  0.9959758551307847
F1 score:  0.9919839679358717
AUC:  0.536449466027


In [18]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, preds)

array([[  1,  12],
       [  4, 990]])

In [19]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_vectorized, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
preds = nb.predict(vect.transform(X_test))
print_score(y_test, preds)

Accuracy score: 0.987090367428
Precision score:  0.9870903674280039
Recall score:  1.0
F1 score:  0.9935032483758122
AUC:  0.5


In [21]:
confusion_matrix(y_test, preds)

array([[  0,  13],
       [  0, 994]])

### Bag of words com ngrams

In [23]:
vect = CountVectorizer(stop_words=None, ngram_range=(1,3)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<5706x3871439 sparse matrix of type '<class 'numpy.int64'>'
	with 8738487 stored elements in Compressed Sparse Row format>

In [24]:
lr = LogisticRegression()
lr.fit(X_train_vectorized, y_train)

preds = lr.predict(vect.transform(X_test))

print_score(y_test, preds)

Accuracy score: 0.986097318769
Precision score:  0.9870775347912525
Recall score:  0.9989939637826962
F1 score:  0.9930000000000001
AUC:  0.499496981891


In [25]:
confusion_matrix(y_test, preds)

array([[  0,  13],
       [  1, 993]])

In [26]:
nb = MultinomialNB()
nb.fit(X_train_vectorized, y_train)

preds = nb.predict(vect.transform(X_test))
print_score(y_test, preds)

Accuracy score: 0.987090367428
Precision score:  0.9870903674280039
Recall score:  1.0
F1 score:  0.9935032483758122
AUC:  0.5


In [27]:
confusion_matrix(y_test, preds)

array([[  0,  13],
       [  0, 994]])

In [32]:
# liberando memória

%xdel vect
%xdel X_train_vectorized

In [28]:
# Não estou conseguindo rodar essa célula: erro de memória (rodar no Paperspace)

vect = CountVectorizer(stop_words=None, ngram_range=(1,6)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<5706x12651560 sparse matrix of type '<class 'numpy.int64'>'
	with 19527770 stored elements in Compressed Sparse Row format>

In [29]:
lr = LogisticRegression()
lr.fit(X_train_vectorized, y_train)

preds = lr.predict(vect.transform(X_test))

print_score(y_test, preds)

Accuracy score: 0.986097318769
Precision score:  0.9870775347912525
Recall score:  0.9989939637826962
F1 score:  0.9930000000000001
AUC:  0.499496981891


In [30]:
confusion_matrix(y_test, preds)

array([[  0,  13],
       [  1, 993]])

In [31]:
nb = MultinomialNB()
nb.fit(X_train_vectorized, y_train)

preds = nb.predict(vect.transform(X_test))
print_score(y_test, preds)

Accuracy score: 0.987090367428
Precision score:  0.9870903674280039
Recall score:  1.0
F1 score:  0.9935032483758122
AUC:  0.5


In [32]:
confusion_matrix(y_test, preds)

array([[  0,  13],
       [  0, 994]])

## Tfidf

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df=4, ngram_range=(1,7)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<5706x396983 sparse matrix of type '<class 'numpy.float64'>'
	with 6787906 stored elements in Compressed Sparse Row format>

In [52]:
lr = LogisticRegression()
lr.fit(X_train_vectorized, y_train)

preds = lr.predict(vect.transform(X_test))

print_score(y_test, preds)

Accuracy score: 0.987090367428
Precision score:  0.9870903674280039
Recall score:  1.0
F1 score:  0.9935032483758122
AUC:  0.5


In [53]:
confusion_matrix(y_test, preds)

array([[  0,  13],
       [  0, 994]])

In [54]:
nb = MultinomialNB()
nb.fit(X_train_vectorized, y_train)

preds = nb.predict(vect.transform(X_test))
print_score(y_test, preds)

Accuracy score: 0.985104270109
Precision score:  0.9870646766169154
Recall score:  0.9979879275653923
F1 score:  0.9924962481240621
AUC:  0.498993963783


In [55]:
confusion_matrix(y_test, preds)

array([[  0,  13],
       [  2, 992]])