## Bibliotecas utilizadas

In [1]:
import pandas as pd
import pickle
import string
import ast

from gensim import corpora
from six import iteritems
from string import digits
from unicodedata import normalize

## 1. Realização da leitura do dataset que está no formato de entrada para o spacy.

In [4]:
# Load the spaCy-format training set. Later cells read its "text" and
# "classif" columns.
spacy_dataset = pd.read_csv('datasets/spacy_dataset.csv')

## 2. Carregamento das StopWords

In [5]:
# Load the Portuguese stop-word list (one word per line).
# The context manager guarantees the file handle is released even if
# reading fails; the previous version never closed it.
with open("stopwords-pt.txt", 'r') as arq:
    tokens = arq.readlines()

# Strip the newline terminator from every entry.
stoplist = [line.replace('\n', '') for line in tokens]
print(len(stoplist))

560


## 3. Leitura do dicionário, produzido com base em todos os textos utilizados para o conjunto de treino.

In [2]:
def LoadDictionary(File):
  """Load and return the object pickled in ``File``.

  Args:
      File: Path of the pickle file to read (opened in binary mode).

  Returns:
      Whatever object was stored in the pickle.

  Raises:
      OSError: If the file cannot be opened.
      pickle.UnpicklingError: If the content is not a valid pickle.
  """
  # NOTE(security): pickle.load can execute arbitrary code embedded in the
  # file, so this must only be used on files from a trusted source.
  with open(File, "rb") as myFile:
      # The `with` block closes the file; no explicit close() is needed.
      return pickle.load(myFile)

In [3]:
# Load the pickled token dictionary. Later cells use its `token2id` and `dfs`
# mappings and its `filter_tokens` / `compactify` methods.
dictionary = LoadDictionary('dic_ocorerencias.dict')

## 4. Remoção das StopWords do dicionário.

In [6]:
# Ids of the stop words that actually exist in the dictionary.
stop_ids = []
for stopword in stoplist:
    if stopword in dictionary.token2id:
        stop_ids.append(dictionary.token2id[stopword])

# Ids of tokens whose document frequency is exactly 1.
once_ids = []
for tokenid, docfreq in iteritems(dictionary.dfs):
    if docfreq == 1:
        once_ids.append(tokenid)

# Drop both groups from the dictionary, then compact it.
ids_to_remove = stop_ids + once_ids
dictionary.filter_tokens(ids_to_remove)
dictionary.compactify()
print(dictionary)

Dictionary(393049 unique tokens: ['"', '28/09/17,"', '(daniele)', '(filho', '(fotografias),']...)


In [7]:
# Display the current number of entries in the dictionary.
len(dictionary)

393049

## 5. Função responsável por fazer a limpeza do texto (remoção de acentos, pontuação, dígitos etc.).

In [7]:
def clean_string(text):
    """Normalise ``text`` and split it into a list of plain ASCII tokens.

    Steps, in this exact order:
      1. lower-case the text and delete ASCII punctuation characters;
      2. NFKD-normalise and drop every non-ASCII code point (this strips
         accents, e.g. "lesões" -> "lesoes");
      3. delete ASCII digits;
      4. split on whitespace.

    Punctuation and digits are deleted, not replaced by a space, so "a.b"
    becomes the single token "ab".

    Args:
        text: The string to clean.

    Returns:
        list[str]: The cleaned tokens (empty list for blank input).
    """
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    digit_table = str.maketrans('', '', digits)

    without_punctuation = text.lower().translate(punctuation_table)
    decomposed = normalize('NFKD', without_punctuation)
    ascii_only = decomposed.encode('ASCII', 'ignore').decode('ASCII')
    # Digits are removed after normalisation because NFKD can itself
    # produce digits (e.g. a superscript two becomes "2").
    return ascii_only.translate(digit_table).split()


## 6. Função responsável por retornar um vetor com os números que correspondem às palavras no dicionário.

In [8]:
def find_dictionary(vector):
    """Map tokens to their ids in the global ``dictionary``.

    Tokens without an entry in ``dictionary.token2id`` are skipped, so the
    result may be shorter than ``vector``.

    Args:
        vector: Iterable of token strings.

    Returns:
        list: Ids of the known tokens, in their original order.
    """
    lookup = dictionary.token2id
    candidate_ids = (lookup.get(token) for token in vector)
    return [token_id for token_id in candidate_ids if token_id is not None]

In [9]:
def create_x(text):
  """Turn raw text into the list of dictionary ids of its cleaned tokens.

  The text is tokenised with ``clean_string`` and the tokens are mapped to
  ids with ``find_dictionary``.
  """
  return find_dictionary(clean_string(text))

In [10]:
# Empty frame that will hold one row per text: the token-id list ('X_train')
# and the per-token label vectors ('Y_train').
keras_dataset = pd.DataFrame(columns=['X_train', 'Y_train'])

In [11]:
# Add an "x_train" column holding, for every text, the list of dictionary ids
# returned by create_x. This modifies spacy_dataset in place.
spacy_dataset["x_train"] = spacy_dataset["text"].apply(create_x)

## 9. As duas próximas funções são responsáveis por construir o vetor Y_train no formato de entrada aceito pelo Keras.

In [12]:
def duplicate(_list, number):
    """Repeat every element of ``_list`` ``number`` times, keeping order.

    Example: ``duplicate([1, 2], 2)`` returns ``[1, 1, 2, 2]``.
    A ``number`` of zero or less produces an empty list.
    """
    repeated = []
    for element in _list:
        repeated.extend([element] * number)
    return repeated

### 9.1 Definição da quantidade de classes que existem no conjunto de treino do Spacy.

In [13]:
# Number of entity labels. make_vector_y builds vectors with
# NUMBER_OF_CLASSES + 1 entries: the extra leading slot marks "no entity".
NUMBER_OF_CLASSES = 24

In [14]:
def make_vector_y(vector_text):
    """Build the default label matrix for a tokenised text.

    Every token starts with a vector of ``NUMBER_OF_CLASSES + 1`` entries
    whose first slot is 1 and all others 0, i.e. "no entity".

    Args:
        vector_text: Sequence with one item per token (only its length is used).

    Returns:
        list[list[int]]: One independent vector per token.
    """
    return [[1] + [0] * NUMBER_OF_CLASSES for _ in range(len(vector_text))]

## 10. Construção de um objeto (dict) com as entidades e a frase ou string que está relacionada.

In [15]:
def make_entity_object(text, entities):
    """Extract the annotated text of every entity label.

    Args:
        text: The annotated sentence.
        entities: String holding a Python literal of the form
            ``{'entities': [(start, end, label), ...]}``. Offsets are
            character positions with an exclusive ``end``, as in the
            sample rows of the dataset (e.g. ``(24, 36, 'ADF')`` covers
            exactly "ARMA DE FOGO").

    Returns:
        dict[str, str]: Maps each label to the text of its span(s). When a
        label is annotated more than once in the same text, the spans are
        joined with a single space so that none of them is lost.

    Raises:
        ValueError, SyntaxError: If ``entities`` is not a valid literal.
        KeyError: If the literal has no ``'entities'`` key.
    """
    annotations = ast.literal_eval(entities)
    entity_object = {}

    for start, end, entity in annotations['entities']:
        # Slice with an exclusive end; the previous character-by-character
        # scan treated `end` as inclusive and captured one extra character.
        # Negative offsets are clamped so they cannot wrap around.
        span = text[max(start, 0):max(end, 0)]
        if entity in entity_object:
            # Keep earlier spans of the same label instead of overwriting.
            entity_object[entity] += " " + span
        else:
            entity_object[entity] = span

    return entity_object

## 11. Construindo para cada entidade um vector com as strings que estão relacionadas com a entidade.

In [16]:
def make_string_array_with_entity(entity_object):
    """Replace each entity's text by its list of cleaned tokens.

    The dictionary is modified in place: every value is passed through
    ``clean_string``. The same dictionary object is returned.

    Args:
        entity_object: Mapping of entity label to raw text.

    Returns:
        The same mapping, now label -> list of tokens.
    """
    # Snapshot the keys so the values can be reassigned while looping.
    for entity in list(entity_object):
        entity_object[entity] = clean_string(entity_object[entity])

    return entity_object

## 12. Declaração de um objeto com todas as classes e seu vector correspondente.

In [17]:
# Entity labels in the order of their one-hot slot. Slot 0 is reserved for
# tokens that belong to no entity (OTHERS), so the first label uses slot 1.
_ENTITY_LABELS = (
    'ADF', 'AB', 'HOM', 'FEM', 'INTER_POL', 'ACHAD_CAD', 'UNID_PRISIONAL',
    'ACID_TRANSITO', 'LEG_DEFESA', 'MORADOR_RUA', 'DUPLO_HOM', 'LATROC',
    'LEG_DEF_TER', 'MORTE_HOSP', 'MORREU_DEPOIS', 'EXECUCAO', 'CONFLITO_PM',
    'FOLGA', 'TRIPLO_HOM', 'HOM_DOLOSO', 'LES_CORPORAL', 'LOC', 'ORG', 'PER',
)
# One slot per label plus the leading OTHERS slot.
_VECTOR_SIZE = len(_ENTITY_LABELS) + 1

# Maps each label to its one-hot vector; every value is a separate list.
default_entity_object = {
    label: [1 if position == slot else 0 for position in range(_VECTOR_SIZE)]
    for slot, label in enumerate(_ENTITY_LABELS, start=1)
}
# OTHERS = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

## 13. A seguinte função faz o swap entre a entidade OTHERS e as outras entidades predefinidas.

In [24]:
def swap_entities(entity_object, vector_ints, y_vector):
    """Overwrite the default label of entity tokens with their entity vector.

    For every token of every entity, the token is looked up in the global
    ``dictionary``; the FIRST position of that id in ``vector_ints`` gets the
    one-hot vector of the entity from ``default_entity_object``.

    Matching is by token id, not by character position, so a token that
    occurs several times in the text is only labelled at its first
    occurrence.

    This is best-effort: tokens missing from the dictionary or from
    ``vector_ints``, and labels missing from ``default_entity_object``, are
    skipped individually. The previous bare ``except: pass`` silently
    stopped labelling the current and all remaining entities on the first
    such problem.

    Args:
        entity_object: Mapping of entity label -> list of cleaned tokens.
        vector_ints: Token ids of the text, in order.
        y_vector: Per-token label vectors, same length as ``vector_ints``.
            Modified in place.

    Returns:
        The same ``y_vector`` object, updated.
    """
    token_to_id = dictionary.token2id

    # First position of every id, equivalent to vector_ints.index(id) but
    # computed once instead of rescanning the list for each token.
    first_position = {}
    for position, token_id in enumerate(vector_ints):
        first_position.setdefault(token_id, position)

    for entity, vet_entity in entity_object.items():
        one_hot = default_entity_object.get(entity)
        if one_hot is None:
            # Unknown label: nothing to assign, keep processing the others.
            continue

        for token in vet_entity:
            token_id = token_to_id.get(token)
            if token_id is None:
                continue
            position = first_position.get(token_id)
            if position is None or position >= len(y_vector):
                continue
            # Store a copy so rows never share the same list object with
            # default_entity_object (or with each other).
            y_vector[position] = list(one_hot)

    return y_vector

In [19]:
# Preview the first rows, including the newly added "x_train" column.
spacy_dataset.head()

Unnamed: 0.1,Unnamed: 0,text,classif,x_train
0,0,LESÕES: CINCO PERFURAÇÕES NA CABEÇA POR OBJETO...,"{'entities': [(40, 64, 'AB'), (87, 91, 'PER'),...","[36678, 45703, 5191, 67792, 56409, 949, 1808, ..."
1,1,A VÍTIMA FOI LESIONADA COM 5 PERFURAÇÕES A BAL...,"{'entities': [(29, 47, 'ADF'), (130, 134, 'AB'...","[949, 2904, 11289, 189, 26122, 8494, 4791, 160..."
2,2,A VÍTIMA FOI ALVEJADA COM 07 LESÕES POR ARMA D...,"{'entities': [(40, 52, 'ADF'), (445, 452, 'PER...","[949, 28253, 36678, 980, 1004, 33455, 1872, 89..."
3,3,"VÍTIMA DE HOMICÍDIO POR USO DE ARMA DE FOGO, 0...","{'entities': [(10, 19, 'HOM'), (31, 43, 'ADF')...","[949, 26119, 1117, 980, 1004, 36678, 28408, 45..."
4,4,VÍTIMA COM 3 LESOES POR ARMA DE FOGO NA CABEÇA...,"{'entities': [(24, 36, 'ADF')]}","[949, 36678, 980, 1004, 45703, 1655, 68493, 50..."


In [29]:
# NOTE(review): `data` is only referenced by commented-out lines in the
# dataset-building loop; it appears to be an unused leftover.
data = {'X_train': '', 'Y_train': ''}

In [31]:
# Build one (X_train, Y_train) record per annotated text.
# Records are collected in a list and converted to a DataFrame once:
# growing a DataFrame row by row is quadratic, and DataFrame.append was
# removed in pandas 2.0. Rebuilding the frame from scratch also makes this
# cell idempotent (re-running it no longer duplicates rows).
records = []
for _, row in spacy_dataset.iterrows():
    # Start with every token labelled as "no entity".
    vector_y_train = make_vector_y(row['x_train'])
    # label -> list of cleaned tokens for the entities annotated in this text.
    dict_ = make_string_array_with_entity(make_entity_object(row["text"], row["classif"]))

    # Replace the default label of the entity tokens.
    y_train = swap_entities(dict_, row['x_train'], vector_y_train)
    records.append({'X_train': row['x_train'], 'Y_train': y_train})

keras_dataset = pd.DataFrame(records, columns=['X_train', 'Y_train'])

In [34]:
# Inspect the token ids of the first example.
keras_dataset['X_train'][0]

[36678,
 45703,
 5191,
 67792,
 56409,
 949,
 1808,
 4176,
 68493,
 1655,
 8274,
 10601,
 5385,
 949,
 9162,
 4200,
 11792]

In [35]:
# Inspect the per-token label vectors of the first example.
keras_dataset['Y_train'][0]

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [36]:
# Sanity check: translate the ids of the first example back into tokens.
vet = [dictionary[token_id] for token_id in keras_dataset['X_train'][0]]
vet

['lesoes',
 'cabeca',
 'objeto',
 'corto',
 'contundente',
 'vitima',
 'conhecida',
 'joao',
 'informacoes',
 'suspeito',
 'agiu',
 'legitima',
 'defesa',
 'vitima',
 'envolveu',
 'acidente',
 'transito']

In [44]:
# Sanity check: translate the label vectors of the first example back into
# label names. Vectors that match no entry of default_entity_object
# (i.e. the "no entity" vector) produce no output item.
vet = [
    label
    for token_vector in keras_dataset['Y_train'][0]
    for label, label_vector in default_entity_object.items()
    if token_vector == label_vector
]
vet

['AB',
 'AB',
 'AB',
 'PER',
 'LEG_DEFESA',
 'LEG_DEFESA',
 'ACID_TRANSITO',
 'ACID_TRANSITO']

In [43]:
# List the entity labels known to default_entity_object.
default_entity_object.keys()

dict_keys(['ADF', 'AB', 'HOM', 'FEM', 'INTER_POL', 'ACHAD_CAD', 'UNID_PRISIONAL', 'ACID_TRANSITO', 'LEG_DEFESA', 'MORADOR_RUA', 'DUPLO_HOM', 'LATROC', 'LEG_DEF_TER', 'MORTE_HOSP', 'MORREU_DEPOIS', 'EXECUCAO', 'CONFLITO_PM', 'FOLGA', 'TRIPLO_HOM', 'HOM_DOLOSO', 'LES_CORPORAL', 'LOC', 'ORG', 'PER'])

In [46]:
# Persist the dataset as CSV.
# NOTE(review): the index is written as an extra unnamed column (no
# index=False) - presumably the origin of the "Unnamed: 0" columns seen in
# spacy_dataset; confirm whether downstream readers rely on it.
keras_dataset.to_csv('keras_dataset.csv')