# Hierarchical Multiclass and Multilabel Text Classification for Crime Report: A Traditional Machine Learning Approach

## Dataset Analysis

In [3]:
import pandas as pd

In [19]:
# Load the raw crime-report table; the cell displays its (rows, columns) shape.
raw_dataset_path = "./Datasets/01_OriginalDataset.csv"
data = pd.read_csv(raw_dataset_path)
data.shape

(6533919, 2)

In [20]:
# Number of reports whose narrative (DINAMICA_) is missing.
data['DINAMICA_'].isnull().sum()

17458

In [21]:
# Discard every row that has a missing value in any column, then confirm that
# no missing narratives remain.
data = data.dropna(axis=0, how='any')
data['DINAMICA_'].isnull().sum()

0

In [22]:
# Number of reports whose narrative is just the "." placeholder.
int((data['DINAMICA_'] == ".").sum())

632

In [23]:
# Remove the "." placeholder narratives by index label, then confirm none are left.
is_placeholder = data['DINAMICA_'] == "."
data = data.drop(index=data.index[is_placeholder.to_numpy()])
len(data.loc[data['DINAMICA_'] == "."])

0

In [24]:
# Shape of the table after dropping rows with missing values and "." narratives.
data.shape

(6174717, 2)

## Table Data Analysis

In [25]:
# Number of distinct offence types (a missing value, if any, counts as one).
data['TIPO_DELITO'].nunique(dropna=False)

1169

In [26]:
# Number of reports per offence type (TIPO_DELITO).
data['TIPO_DELITO'].value_counts()

TIPO_DELITO
Estelionato (outros)                                                                              535684
Roubo a Transeunte                                                                                396744
Ameaça                                                                                            374633
Furto outros                                                                                      355123
Fato Atípico                                                                                      278940
                                                                                                   ...  
Omissão de Comunicação                                                                                 1
Extorsão Mediante Sequestro Qualificada - Tentativa (Art. 159 §1º c/c Art. 14, II do CP)               1
Crime de Genocídio                                                                                     1
Emprego de Processo Proibido ou de Substânc

In [27]:
# Per-offence counts plus their running total and cumulative share (in percent).
quantidades = (
    data['TIPO_DELITO']
    .value_counts()
    .to_frame("Quantidade")
    .reset_index()
)
total_registros = quantidades['Quantidade'].sum()
quantidades['Acumulativo'] = quantidades['Quantidade'].cumsum()
quantidades['Percentual Acumulado'] = 100*quantidades['Acumulativo']/total_registros
quantidades.head(80)

Unnamed: 0,TIPO_DELITO,Quantidade,Acumulativo,Percentual Acumulado
0,Estelionato (outros),535684,535684,8.675442
1,Roubo a Transeunte,396744,932428,15.100741
2,Ameaça,374633,1307061,21.167950
3,Furto outros,355123,1662184,26.919193
4,Fato Atípico,278940,1941124,31.436647
...,...,...,...,...
75,Maus-Tratos,9541,5498302,89.045409
76,Lesão Corporal Provocada por Projétil de Arma ...,9524,5507826,89.199651
77,Crime de Perseguição (Art 147-A CP),9519,5517345,89.353812
78,Sanções Penais e Administrativas ao Meio Ambiente,9352,5526697,89.505268


In [28]:
# Keep the 80 most frequent offence types; per the saved output they cover
# roughly 89.66% of the reports. Show the last ten of them.
quantidades_90_porcento = quantidades.iloc[:80]
quantidades_90_porcento.tail(10)

Unnamed: 0,TIPO_DELITO,Quantidade,Acumulativo,Percentual Acumulado
70,Apreensão de Adolescente Infrator (Artigo 104 ...,10696,5448772,88.243267
71,Roubo no Interior de Estabelecimento Comercial,10298,5459070,88.410044
72,Estupro,9992,5469062,88.571865
73,Lei de Drogas (Outros) (Lei 11.343/06),9918,5478980,88.732488
74,"Falta de Habilitação, Proibição Dirigir Veícul...",9781,5488761,88.890892
75,Maus-Tratos,9541,5498302,89.045409
76,Lesão Corporal Provocada por Projétil de Arma ...,9524,5507826,89.199651
77,Crime de Perseguição (Art 147-A CP),9519,5517345,89.353812
78,Sanções Penais e Administrativas ao Meio Ambiente,9352,5526697,89.505268
79,Porte Ilegal de Arma de Fogo de Uso Restrito,9344,5536041,89.656595


In [29]:
# Names of the retained offence types, as a plain Python list.
delitos = quantidades_90_porcento['TIPO_DELITO'].tolist()
delitos

['Estelionato (outros)',
 'Roubo a Transeunte',
 'Ameaça',
 'Furto outros',
 'Fato Atípico',
 'Roubo de Veículo',
 'Lesão Corporal (outros)',
 'Injúria (outros)',
 'Medida Assecuratória de Direito Futuro',
 'Roubo outros',
 'Furto de Telefone Celular',
 'Roubo de Telefone Celular',
 'Recuperação de Veículo Roubado',
 'Lesão Corporal Provocada por Socos, Tapas e Pontapés',
 'Furto a Transeunte',
 'Extravio de Documento',
 'Lesão Corporal Culposa (outros) (Lei 9503/97)',
 'Furto de Veículo',
 'Tráfico de Drogas (Lei 11.343/06)',
 'Porte de Droga para Consumo Próprio (Lei 11.343/06)',
 'Roubo no Interior de Coletivo',
 'Cumprimento de Mandado de Prisão',
 'Furto no Interior de Residência',
 'Vias de Fato',
 'Furto no Interior de Estabelecimento Comercial',
 'Apreensão (outros)',
 'Furto no Interior de Coletivo',
 'Furto a Estabelecimento Comercial',
 'Roubo de Veículo - Moto',
 'Extravio de Celular',
 'Furto no Interior de Veículo',
 'Associação para Tráfico de Droga (Lei 11.343/06)',
 'D

In [42]:
# Keep only the reports whose offence type is one of the retained classes.
is_selected_offence = data['TIPO_DELITO'].isin(delitos)
filtered_data = data.loc[is_selected_offence]
filtered_data.shape

(5536041, 2)

In [43]:
# Rename the columns to the names used from here on: narrative -> DINAMICA,
# offence type -> N3. Then count the distinct N3 labels.
column_names = {"DINAMICA_": "DINAMICA", "TIPO_DELITO": "N3"}
filtered_data = filtered_data.rename(columns=column_names)
filtered_data['N3'].nunique(dropna=False)

80

## Stratified Sample

In [32]:
from sklearn.model_selection import train_test_split

In [44]:
# Fixed seed so the stratified split can be reproduced.
SEED = 12345

# Features are every column except the N3 label; the label is the stratification key.
X = filtered_data.drop(columns=['N3'])
y = filtered_data['N3']

# Hold out 5% of the reports, preserving the N3 class proportions.
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    stratify=y,
    random_state=SEED,
)

In [47]:
# Re-attach the labels to the training features as a NEW frame. Aliasing
# df_X_train and adding a column to it would mutate the split output in place
# (and can trigger pandas' SettingWithCopyWarning); assign() leaves it untouched.
train_data = df_X_train.assign(N3=df_y_train)
train_data.shape

(5259238, 2)

In [48]:
# Re-attach the labels to the held-out features as a NEW frame. Aliasing
# df_X_test and adding a column to it would mutate the split output in place
# (and can trigger pandas' SettingWithCopyWarning); assign() leaves it untouched.
test_data = df_X_test.assign(N3=df_y_test)
test_data.shape

(276803, 2)

In [49]:
# Persist both splits: the held-out set first, then the training set.
test_data.to_csv("./Datasets/02_NeverSeenBefore.csv", index=False)
train_data.to_csv("./Datasets/03_TrainDataset.csv", index=False)

## Classification Tree Construction

In [53]:
# Hierarchy table with one row per N3 label and its N2 / N1 ancestors.
hierarchy_path = './Datasets/04_DomainHierarchy.csv'
domain_hierarchy = pd.read_csv(hierarchy_path)
domain_hierarchy

Unnamed: 0,N1,N2,N3
0,Crimes Contra Propriedade,Estelionato,Estelionato (outros)
1,Crimes Contra Propriedade,Estelionato,Estelionato (outros) - Tentativa
2,Crimes Contra Propriedade,Estelionato,Estelionato com Emprego de Cartão de Crédito
3,Crimes Contra Propriedade,Roubo,Roubo a Transeunte
4,Crimes Contra Propriedade,Roubo,Roubo de Veículo
...,...,...,...
75,"Resistência, Desacato ou Desobediência",Resistência qualificada,Resistência qualificada
76,Violação ou Perturbação ou Dano ou Exercício A...,Violação de Domicílio,Violação de Domicílio
77,Violação ou Perturbação ou Dano ou Exercício A...,Perturbação da Tranquilidade,Perturbação da Tranquilidade
78,Violação ou Perturbação ou Dano ou Exercício A...,Exercício Arbitrário das Próprias Razões,Exercício Arbitrário das Próprias Razões


In [59]:
# Attach the N1/N2 ancestors to every training report.
# validate='many_to_one' makes pandas raise if an N3 label occurs more than once
# in the hierarchy, which would otherwise silently duplicate reports.
augmented = pd.merge(train_data, domain_hierarchy, on='N3', how='left', validate='many_to_one')
augmented = augmented[['DINAMICA', 'N1', 'N2', 'N3']].reset_index(drop=True)

# A left join leaves N1/N2 empty for any N3 label missing from the hierarchy;
# fail loudly instead of carrying unlabeled rows into the tree construction.
assert augmented[['N1', 'N2']].notna().all().all(), "some N3 labels have no entry in domain_hierarchy"
augmented

Unnamed: 0,DINAMICA,N1,N2,N3
0,"O presente procedimento, que, em tese, trata d...",Crimes Contra Propriedade,Estelionato,Estelionato (outros)
1,Trata-se de cumprimento de mandado de prisão n...,Recuperação de Veículo ou Atos Administrativos...,Atos Administrativos,Cumprimento de Mandado de Prisão
2,CONSTA NA BASE DE ÍNDICE NACIONAL A OCORRÊNCIA...,Recuperação de Veículo ou Atos Administrativos...,Recuperação de Veículo,Recuperação de Veículo Roubado
3,"O comunicante, ALEXANDRE MUNIZ DA SILVA, notic...",Crimes Contra Pessoa,Lesão Corporal,Lesão Corporal (outros)
4,"Relata o comunicante, SGT/PM MARCOS ALBERTO DE...",Crimes Contra Pessoa,Ameaça ou Injúria ou Perseguição ou Dano ou Ex...,Ameaça
...,...,...,...,...
5259233,"Erick Marcelo Avelino Batista , informa que en...",Crimes Contra Propriedade,Estelionato,Estelionato (outros)
5259234,WALACE COSTA DA SILVA - Rep. Legal de MANOELA ...,Crimes Contra Pessoa,Lesão Corporal,"Lesão Corporal Provocada por Socos, Tapas e Po..."
5259235,ÁREA DA 16ª DP -- BARRA\r\n\r\n\r\nESTAVA DENT...,Crimes Contra Propriedade,Furto,Furto no Interior de Coletivo
5259236,A declarante comparece hoje dia 05/05/2020 pa...,Crimes Contra Pessoa,Ameaça ou Injúria ou Perseguição ou Dano ou Ex...,Injúria (outros)


In [60]:
augmented.to_csv(f"./Datasets/05_AugmentedDataset.csv", index=False)

## Classification Tree Imbalance Analysis

In [12]:
def resume(data):
    """Summarise how often each value occurs in a Series.

    Args:
        data: pandas Series of labels.

    Returns:
        DataFrame with one row per distinct value (in ``value_counts`` order) and
        the columns ``Quantidade`` (count), ``Acumulativo`` (running count),
        ``Percentual Acumulado`` (running share, %) and ``Percentual Absoluto``
        (share of each value, %).
    """
    counts = data.value_counts().to_frame('Quantidade').reset_index()
    total = counts['Quantidade'].sum()
    running = counts['Quantidade'].cumsum()

    counts['Acumulativo'] = running
    counts['Percentual Acumulado'] = 100*running/total
    counts['Percentual Absoluto'] = 100*counts['Quantidade']/total
    return counts

In [13]:
def n_resume(data, nivel1, nivel2):
    """Build one `resume` table of `nivel2` per distinct value of `nivel1`.

    Args:
        data: DataFrame holding both columns.
        nivel1: name of the parent-level column used to group the rows.
        nivel2: name of the child-level column that is summarised.

    Returns:
        List of summary DataFrames, keeping only parents that have more than
        one distinct child value.
    """
    per_parent = (
        resume(data.loc[data[nivel1] == label, nivel2])
        for label in data[nivel1].unique()
    )
    return [summary for summary in per_parent if len(summary) > 1]

In [14]:
# Class balance at the top level (N1) of the hierarchy.
# NOTE(review): the saved output below totals 276,803 rows, which is the size of
# the held-out split rather than the training split - it looks stale; re-run to confirm.
n1_balance = resume(augmented['N1'])
n1_balance

Unnamed: 0,N1,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Crimes Contra Propriedade,139221,139221,50.296059,50.296059
1,Crimes Contra Pessoa,65057,204278,73.799056,23.502997
2,Recuperação de Veículo ou Atos Administrativos...,52092,256370,92.618216,18.81916
3,"Relacionados a Drogas, Entorpecentes e Porte d...",10807,267177,96.522437,3.904221
4,Violação ou Perturbação ou Dano ou Exercício A...,4649,271826,98.20197,1.679534
5,"Resistência, Desacato ou Desobediência",2800,274626,99.21352,1.01155
6,Crimes de Trânsito ou Meio Ambiente,2177,276803,100.0,0.78648


In [15]:
# One table per N1 class showing how its reports split across N2 classes.
n2_summaries = n_resume(augmented, 'N1', 'N2')
for n2_summary in n2_summaries:
    display(n2_summary)

Unnamed: 0,N2,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Roubo,55910,55910,40.159171,40.159171
1,Furto,52016,107926,77.521351,37.36218
2,Estelionato,28979,136905,98.336458,20.815107
3,Receptação,1532,138437,99.436867,1.100409
4,Apropriação Indébita (outros),784,139221,100.0,0.563133


Unnamed: 0,N2,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Tráfico de Drogas (Lei 11.343/06),3649,3649,33.765152,33.765152
1,Porte de Droga para Consumo Próprio (Lei 11.34...,3416,7065,65.374294,31.609142
2,Associação para Tráfico de Droga (Lei 11.343/06),2068,9133,84.51004,19.135745
3,Apreensão de Substância Entorpecente,711,9844,91.089109,6.579069
4,Lei de Drogas (Outros) (Lei 11.343/06),496,10340,95.678727,4.589618
5,Porte Ilegal de Arma de Fogo de Uso Restrito,467,10807,100.0,4.321273


Unnamed: 0,N2,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Ameaça ou Injúria ou Perseguição ou Dano ou Ex...,34195,34195,52.5616,52.5616
1,Lesão Corporal,26712,60907,93.620979,41.059379
2,Homicídio,1621,62528,96.11264,2.491661
3,Estupro,1609,64137,98.585855,2.473216
4,Descumprimento de Medidas Protetivas de Urgência,920,65057,100.0,1.414145


Unnamed: 0,N2,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Dano (outros),1982,1982,42.632824,42.632824
1,Violação de Domicílio,1206,3188,68.573887,25.941063
2,Perturbação da Tranquilidade,906,4094,88.061949,19.488062
3,Exercício Arbitrário das Próprias Razões,555,4649,100.0,11.938051


Unnamed: 0,N2,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Registro de Um Acontecimento,26911,26911,51.660524,51.660524
1,Atos Administrativos,17646,44557,85.535207,33.874683
2,Recuperação de Veículo,7535,52092,100.0,14.464793


Unnamed: 0,N2,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Desacato,789,789,28.178571,28.178571
1,Resistência,757,1546,55.214286,27.035714
2,Desobediência,661,2207,78.821429,23.607143
3,Resistência qualificada,593,2800,100.0,21.178571


Unnamed: 0,N2,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Adulteração de Sinal Identificador de Veículo ...,1220,1220,56.040423,56.040423
1,"Falta de Habilitação, Proibição Dirigir Veícul...",489,1709,78.502526,22.462104
2,Sanções Penais e Administrativas ao Meio Ambiente,468,2177,100.0,21.497474


In [16]:
# One table per N2 class showing how its reports split across N3 classes.
n3_summaries = n_resume(augmented, 'N2', 'N3')
for n3_summary in n3_summaries:
    display(n3_summary)

Unnamed: 0,N3,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Roubo a Transeunte,19837,19837,35.480236,35.480236
1,Roubo de Veículo,10600,30437,54.439277,18.959041
2,Roubo outros,8048,38485,68.83384,14.394563
3,Roubo de Telefone Celular,6849,45334,81.083885,12.250045
4,Roubo no Interior de Coletivo,3392,48726,87.150778,6.066893
5,Roubo de Veículo - Moto,2123,50849,90.947952,3.797174
6,Roubo no Interior de Veículo,1888,52737,94.324808,3.376856
7,Roubo de Carga,1747,54484,97.449472,3.124665
8,Roubo a Estabelecimento Comercial,911,55395,99.078877,1.629404
9,Roubo no Interior de Estabelecimento Comercial,515,55910,100.0,0.921123


Unnamed: 0,N3,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Furto outros,17756,17756,34.135651,34.135651
1,Furto de Telefone Celular,7849,25605,49.225238,15.089588
2,Furto a Transeunte,6019,31624,60.796678,11.57144
3,Furto de Veículo,3989,35613,68.465472,7.668794
4,Furto no Interior de Residência,3344,38957,74.894263,6.428791
5,Furto no Interior de Estabelecimento Comercial,2783,41740,80.24454,5.350277
6,Furto no Interior de Coletivo,2526,44266,85.100738,4.856198
7,Furto a Estabelecimento Comercial,2244,46510,89.414795,4.314057
8,Furto no Interior de Veículo,2091,48601,93.434712,4.019917
9,Furto de Veículo - Moto,1760,50361,96.818287,3.383574


Unnamed: 0,N3,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Lesão Corporal (outros),10560,10560,39.532794,39.532794
1,"Lesão Corporal Provocada por Socos, Tapas e Po...",6224,16784,62.833184,23.300389
2,Lesão Corporal Culposa (outros) (Lei 9503/97),4274,21058,78.833483,16.000299
3,Vias de Fato,3130,24188,90.551063,11.71758
4,Lesão Corporal Culposa Provocada por Colisão d...,1337,25525,95.556304,5.005241
5,Lesão Corporal Culposa Provocada por Atropelam...,711,26236,98.218029,2.661725
6,Lesão Corporal Provocada por Projétil de Arma ...,476,26712,100.0,1.781971


Unnamed: 0,N3,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Estelionato (outros),26784,26784,92.42555,92.42555
1,Estelionato (outros) - Tentativa,1355,28139,97.101349,4.6758
2,Estelionato com Emprego de Cartão de Crédito,840,28979,100.0,2.898651


Unnamed: 0,N3,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Ameaça,18732,18732,54.779939,54.779939
1,Injúria (outros),10027,28759,84.102939,29.323
2,Difamação,1712,30471,89.109519,5.00658
3,Calúnia,1447,31918,93.341132,4.231613
4,Extorsão (outros),787,32705,95.642638,2.301506
5,Injúria por preconceito,537,33242,97.213043,1.570405
6,Maus-Tratos,477,33719,98.607984,1.394941
7,Crime de Perseguição (Art 147-A CP),476,34195,100.0,1.392016


Unnamed: 0,N3,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Fato Atípico,13947,13947,51.826391,51.826391
1,Extravio de Documento,6017,19964,74.185277,22.358887
2,Extravio de Celular,2121,22085,82.066813,7.881535
3,Desaparecimento outros,1543,23628,87.800528,5.733715
4,Encontro de Desaparecido,1384,25012,92.943406,5.142878
5,Morte sem Assistência Médica,951,25963,96.477277,3.533871
6,Extravio de Placa de Veículo,948,26911,100.0,3.522723


Unnamed: 0,N3,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Medida Assecuratória de Direito Futuro,8109,8109,45.953757,45.953757
1,Cumprimento de Mandado de Prisão,3371,11480,65.057237,19.10348
2,Apreensão (outros),2753,14233,80.658506,15.601269
3,Remoção para Verificação de Óbito,1697,15930,90.275417,9.61691
4,Apreensão de Veículo,635,16565,93.873966,3.598549
5,Proveniente de Prisão Preventiva da Unidade Po...,546,17111,96.968151,3.094186
6,Apreensão de Adolescente Infrator (Artigo 104 ...,535,17646,100.0,3.031849


Unnamed: 0,N3,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Recuperação de Veículo Roubado,6281,6281,83.357664,83.357664
1,Recuperação de Veículo Furtado,1254,7535,100.0,16.642336


Unnamed: 0,N3,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Estupro de Vulnerável,1109,1109,68.924798,68.924798
1,Estupro,500,1609,100.0,31.075202


Unnamed: 0,N3,Quantidade,Acumulativo,Percentual Acumulado,Percentual Absoluto
0,Homicídio Provocado por Projétil de Arma de Fogo,977,977,60.271437,60.271437
1,Homicídio Provocado por Projétil de Arma de Fo...,644,1621,100.0,39.728563


## Tree Balance

### Mount

In [61]:
import copy

In [64]:
# Level names from the root (N0) down to the leaves (N3).
levels = ['N0', 'N1', 'N2', 'N3']

# The tree is a flat list of node dicts; children and parents are referenced by
# class name. It starts with the root only.
root_node = {
    "class": "root",
    "count": 0,
    "parent": None,
    "children": [],
    "level": "N0",
}
tree = [root_node]

In [65]:
# Create one N1 node per distinct N1 label and hang it under the root.
root = tree[0]
for categ in domain_hierarchy['N1'].unique():
    # Register the label among the root's children.
    root['children'].append(categ)

    # Add the new node to the flat tree list.
    tree.append({
        "class": categ,
        "count": 0,
        "parent": root["class"],
        "children": [],
        "level": "N1",
    })

In [66]:
# Create one N2 node per distinct N2 label and hang it under its N1 parent.
for categ in domain_hierarchy['N2'].unique():
    # The first hierarchy row that mentions this N2 label determines its N1 parent.
    parent = domain_hierarchy.loc[domain_hierarchy['N2'] == categ, 'N1'].iloc[0]

    # Match on class AND level: labels are reused across levels in this
    # hierarchy, so a name-only lookup could pick a node from the wrong level.
    parent_index = next(
        (index for (index, d) in enumerate(tree) if d["class"] == parent and d["level"] == "N1"),
        None,
    )
    if parent_index is None:
        raise ValueError(f"N1 parent {parent!r} of N2 class {categ!r} is not in the tree")

    # Register the label among the parent's children.
    tree[parent_index]['children'].append(categ)

    # Create the node.
    node = {
        "class": categ,
        "count": 0,
        "parent": tree[parent_index]["class"],
        "children": [],
        "level": "N2"
    }

    # Add it to the tree.
    tree.append(node)

In [67]:
# Create one N3 (leaf) node per distinct N3 label and hang it under its N2 parent.
for categ in domain_hierarchy['N3'].unique():
    # The first hierarchy row that mentions this N3 label determines its N2 parent.
    parent = domain_hierarchy.loc[domain_hierarchy['N3'] == categ, 'N2'].iloc[0]

    # Match on class AND level: several labels exist at both N2 and N3, so a
    # name-only lookup could attach the leaf to a node from the wrong level.
    parent_index = next(
        (index for (index, d) in enumerate(tree) if d["class"] == parent and d["level"] == "N2"),
        None,
    )
    if parent_index is None:
        raise ValueError(f"N2 parent {parent!r} of N3 class {categ!r} is not in the tree")

    # Register the label among the parent's children.
    tree[parent_index]['children'].append(categ)

    # Create the node.
    node = {
        "class": categ,
        "count": 0,
        "parent": tree[parent_index]["class"],
        "children": [],
        "level": "N3"
    }

    # Add it to the tree.
    tree.append(node)

In [68]:
def get_node(categ, level, target):
    """Return the first node of `target` with the given class name and level.

    Args:
        categ: class name to look for (the node's ``'class'`` value).
        level: level name to look for (the node's ``'level'`` value).
        target: list of node dicts.

    Returns:
        The matching node dict itself (not a copy), or None if there is none.
    """
    matches = (
        candidate
        for candidate in target
        if candidate['class'] == categ and candidate['level'] == level
    )
    return next(matches, None)

In [69]:
def update_parents(count, node, target):
    """Add `count` to `node` and to each of its ancestors up to the root.

    Every step is traced with print: the node and its new count, then the
    parent's name and level that will be updated next.

    Args:
        count: amount added to every node on the path.
        node: node dict where the update starts.
        target: list of node dicts in which the ancestors are looked up.
    """
    current = node
    while True:
        print(f"{current['class']} - {current['count'] + count}")
        current['count'] += count

        # The root has no parent: finish the trace with a blank separator.
        if current['class'] == 'root':
            print("\n")
            return

        # The parent sits one level above the current node.
        parent_level = levels[levels.index(current['level']) - 1]
        print(f"{current['parent']} - {parent_level}")
        current = get_node(current['parent'], parent_level, target)

In [71]:
# Number of training reports per N3 label.
values = augmented['N3'].value_counts()

total = 0

# Push each leaf's count up through its ancestors; leaves without any report
# keep a count of zero.
leaves_with_reports = (
    node for node in tree
    if node['level'] == 'N3' and node['class'] in values
)
for node in leaves_with_reports:
    count = values[node['class']]
    total += count
    update_parents(count, node, tree)

print(f"TOTAL: {total}")

Estelionato (outros) - 508900
Estelionato - N2
Estelionato - 508900
Crimes Contra Propriedade - N1
Crimes Contra Propriedade - 508900
root - N0
root - 508900


Estelionato (outros) - Tentativa - 25742
Estelionato - N2
Estelionato - 534642
Crimes Contra Propriedade - N1
Crimes Contra Propriedade - 534642
root - N0
root - 534642


Estelionato com Emprego de Cartão de Crédito - 15961
Estelionato - N2
Estelionato - 550603
Crimes Contra Propriedade - N1
Crimes Contra Propriedade - 550603
root - N0
root - 550603


Roubo a Transeunte - 376907
Roubo - N2
Roubo - 376907
Crimes Contra Propriedade - N1
Crimes Contra Propriedade - 927510
root - N0
root - 927510


Roubo de Veículo - 201397
Roubo - N2
Roubo - 578304
Crimes Contra Propriedade - N1
Crimes Contra Propriedade - 1128907
root - N0
root - 1128907


Roubo outros - 152920
Roubo - N2
Roubo - 731224
Crimes Contra Propriedade - N1
Crimes Contra Propriedade - 1281827
root - N0
root - 1281827


Roubo de Telefone Celular - 130137
Roubo - N2
Roubo 

In [73]:
def remove_node(categ, level, target):
    """Remove a node from `target` and from its parent's list of children.

    Args:
        categ: class name of the node to remove.
        level: level name of the node to remove.
        target: list of node dicts, modified in place.
    """
    removed = get_node(categ, level, target)
    target.remove(removed)

    # The parent sits one level above the removed node.
    parent_level = levels[levels.index(removed['level']) - 1]
    get_node(removed['parent'], parent_level, target)['children'].remove(categ)


In [74]:
# Prune every node that received no training report.
# Snapshot the empty nodes first: removing from `tree` while iterating over it
# makes the loop skip the element that follows each removed node.
# The root is left out because it has no parent to be detached from.
empty_nodes = [node for node in tree if node['count'] == 0 and node['level'] != 'N0']

# Remove the deepest levels first so that a node's parent is still in the tree
# when remove_node detaches the node from it.
for node in sorted(empty_nodes, key=lambda n: levels.index(n['level']), reverse=True):
    print(f"{node['class']} - {node['count']}")
    remove_node(node['class'], node['level'], tree)

In [75]:
def get_level_nodes(level, target):
    """Return the nodes of `target` whose level equals `level`, in tree order.

    Args:
        level: level name to select (the node's ``'level'`` value).
        target: list of node dicts.

    Returns:
        New list holding the matching node dicts themselves (not copies).
    """
    return [candidate for candidate in target if candidate['level'] == level]

In [76]:
def tree_print(target):
    """Print the tree as an indented outline of ``class - count`` lines.

    The first node of `target` is printed as the root, followed by its N1
    children, their N2 children and finally the N3 leaves.

    Args:
        target: list of node dicts whose first element is the root.
    """
    # Line prefix for each depth below the root.
    prefixes = {'N1': "\t|- ", 'N2': "\t|\t|- ", 'N3': "\t|\t|\t|- "}

    def show(name, level):
        """Print one node at the given level and return it."""
        found = get_node(name, level, target)
        print(f"{prefixes[level]}{found['class']} - {found['count']}")
        return found

    root = target[0]
    print(f"{root['class']} - {root['count']}")
    for n1_name in root['children']:
        n1 = show(n1_name, 'N1')
        for n2_name in n1['children']:
            n2 = show(n2_name, 'N2')
            for n3_name in n2['children']:
                show(n3_name, 'N3')

In [77]:
def print_level_count(level, target):
    """Print the counts of the nodes at `level`, one list per parent node.

    For 'N0' the root count is printed instead. An unknown level prints the
    list of valid level names.

    Args:
        level: level name whose counts are printed.
        target: list of node dicts whose first element is the root.
    """
    if level not in levels:
        print(f"Levels: {levels}")
        return

    if level == 'N0':
        print(target[0]['count'])
        return

    # Group the counts by the nodes one level above.
    parent_level = levels[levels.index(level) - 1]
    for parent in get_level_nodes(parent_level, target):
        print([get_node(child, level, target)['count'] for child in parent['children']])

In [78]:
# Show the hierarchy with the report count accumulated on every node.
tree_print(tree)

root - 5259238
	|- Crimes Contra Propriedade - 2645215
	|	|- Estelionato - 550603
	|	|	|- Estelionato (outros) - 508900
	|	|	|- Estelionato (outros) - Tentativa - 25742
	|	|	|- Estelionato com Emprego de Cartão de Crédito - 15961
	|	|- Roubo - 1062306
	|	|	|- Roubo a Transeunte - 376907
	|	|	|- Roubo de Veículo - 201397
	|	|	|- Roubo outros - 152920
	|	|	|- Roubo de Telefone Celular - 130137
	|	|	|- Roubo no Interior de Coletivo - 64440
	|	|	|- Roubo de Veículo - Moto - 40335
	|	|	|- Roubo no Interior de Veículo - 35871
	|	|	|- Roubo de Carga - 33202
	|	|	|- Roubo a Estabelecimento Comercial - 17314
	|	|	|- Roubo no Interior de Estabelecimento Comercial - 9783
	|	|- Furto - 988300
	|	|	|- Furto outros - 337367
	|	|	|- Furto de Telefone Celular - 149122
	|	|	|- Furto a Transeunte - 114355
	|	|	|- Furto de Veículo - 75800
	|	|	|- Furto no Interior de Residência - 63541
	|	|	|- Furto no Interior de Estabelecimento Comercial - 52868
	|	|	|- Furto no Interior de Coletivo - 47997
	|	|	|- Fur

### Balance

In [80]:
# Balance level 3: under every N2 node, cut each N3 leaf down to the size of
# its smallest sibling and subtract the removed amount from all its ancestors.
tree_n3 = copy.deepcopy(tree)

for n2_node in get_level_nodes('N2', tree_n3):
    leaves = [get_node(name, 'N3', tree_n3) for name in n2_node['children']]
    minimum = min(leaf['count'] for leaf in leaves)

    for leaf in leaves:
        diff = leaf['count'] - minimum
        leaf['count'] = minimum

        # Walk up N2 -> N1 -> N0, removing the same amount at each level.
        parent_N2 = get_node(leaf['parent'], 'N2', tree_n3)
        parent_N1 = get_node(parent_N2['parent'], 'N1', tree_n3)
        parent_N0 = get_node(parent_N1['parent'], 'N0', tree_n3)
        for ancestor in (parent_N2, parent_N1, parent_N0):
            ancestor['count'] -= diff

In [81]:
# Balance level 2: under every N1 node, give each N2 child the budget of its
# smallest sibling, split evenly (integer division) among that child's N3 leaves.
tree_n2 = copy.deepcopy(tree_n3)
root_n2 = tree_n2[0]

for node_1 in get_level_nodes('N1', tree_n2):
    n2_nodes = [get_node(name, 'N2', tree_n2) for name in node_1['children']]
    minimum = min(n2['count'] for n2 in n2_nodes)

    for node_2 in n2_nodes:
        # Per-leaf quota for this N2 node.
        delta = minimum // len(node_2['children'])

        for child_3 in node_2['children']:
            node_3 = get_node(child_3, 'N3', tree_n2)
            diff = node_3['count'] - delta

            # Remove the surplus from the leaf and from every ancestor.
            for affected in (node_3, node_2, node_1, root_n2):
                affected['count'] -= diff

In [82]:
# Balance level 1: give every N1 node the budget of the smallest N1 node, split
# evenly (integer division) among its N2 children and then among their N3 leaves.
tree_n1 = copy.deepcopy(tree_n2)

# Read the structure from the copy being balanced, not from the original
# `tree`, so this cell depends only on the tree it actually modifies.
root_n1 = tree_n1[0]

minimum = min([get_node(child, 'N1', tree_n1)['count'] for child in root_n1['children']])

for child_1 in root_n1['children']:
    node_1 = get_node(child_1, 'N1', tree_n1)
    # Quota for each N2 child of this N1 node.
    delta_1 = minimum // len(node_1['children'])

    for child_2 in node_1['children']:
        node_2 = get_node(child_2, 'N2', tree_n1)
        # Quota for each N3 leaf of this N2 node.
        delta_2 = delta_1 // len(node_2['children'])

        for child_3 in node_2['children']:
            node_3 = get_node(child_3, 'N3', tree_n1)
            diff = node_3['count'] - delta_2

            # Remove the surplus from the leaf and from every ancestor.
            node_3['count'] -= diff
            node_2['count'] -= diff
            node_1['count'] -= diff
            root_n1['count'] -= diff

In [83]:
# Counts of the N1 nodes in the fully balanced tree, listed under the root.
print_level_count('N1', tree_n1)

[26646, 26645, 26652, 26650, 26652, 26652, 26652]


In [84]:
# Show the fully balanced hierarchy with the target count of every node.
tree_print(tree_n1)

root - 186549
	|- Crimes Contra Propriedade - 26646
	|	|- Estelionato - 5328
	|	|	|- Estelionato (outros) - 1776
	|	|	|- Estelionato (outros) - Tentativa - 1776
	|	|	|- Estelionato com Emprego de Cartão de Crédito - 1776
	|	|- Roubo - 5330
	|	|	|- Roubo a Transeunte - 533
	|	|	|- Roubo de Veículo - 533
	|	|	|- Roubo outros - 533
	|	|	|- Roubo de Telefone Celular - 533
	|	|	|- Roubo no Interior de Coletivo - 533
	|	|	|- Roubo de Veículo - Moto - 533
	|	|	|- Roubo no Interior de Veículo - 533
	|	|	|- Roubo de Carga - 533
	|	|	|- Roubo a Estabelecimento Comercial - 533
	|	|	|- Roubo no Interior de Estabelecimento Comercial - 533
	|	|- Furto - 5328
	|	|	|- Furto outros - 444
	|	|	|- Furto de Telefone Celular - 444
	|	|	|- Furto a Transeunte - 444
	|	|	|- Furto de Veículo - 444
	|	|	|- Furto no Interior de Residência - 444
	|	|	|- Furto no Interior de Estabelecimento Comercial - 444
	|	|	|- Furto no Interior de Coletivo - 444
	|	|	|- Furto a Estabelecimento Comercial - 444
	|	|	|- Furto no 

## Dataset Balanced Sample

In [85]:
# Compare the number of reports kept by the balanced tree with the original total.
balanced_total = tree_n1[0]['count']
unbalanced_total = tree[0]['count']

print("Total of Registers:")
print(f"N1, N2 and N3 Balanced: {balanced_total}")
print(f"Unbalanced: {unbalanced_total}")

Total of Registers:
N1, N2 and N3 Balanced: 186549
Unbalanced: 5259238


In [86]:
def sample_data(target, dataset, random_state=None):
    """Draw, for every N3 leaf of `target`, as many rows as the leaf's count.

    Args:
        target: list of node dicts; each N3 node's ``'count'`` is the number of
            rows to draw for its class.
        dataset: DataFrame with an ``'N3'`` column holding the class of each row.
        random_state: forwarded to ``DataFrame.sample``. The default (None) keeps
            the draw unseeded; pass a fixed integer to make the balanced sample
            reproducible across runs.

    Returns:
        DataFrame with the sampled rows of all leaves concatenated in tree order.
    """
    sampled_data_list = [
        dataset[dataset['N3'] == node['class']].sample(n=node['count'], random_state=random_state)
        for node in get_level_nodes('N3', target)
    ]

    return pd.concat(sampled_data_list)

In [103]:
# Draw the balanced sample from the labeled training reports.
sampled_data = sample_data(target=tree_n1, dataset=augmented)
sampled_data.shape

(186549, 4)

In [104]:
# Columns carried by the balanced sample.
sampled_data.columns

Index(['DINAMICA', 'N1', 'N2', 'N3'], dtype='object')

In [105]:
# Persist the balanced sample (without the index column).
sampled_data.to_csv("./Datasets/06_SampledBalancedDataset.csv", index=False)

## NLP Preprocessing

In [92]:
!pip install spacy
!python3 -m spacy download "pt_core_news_lg"

Collecting pt-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_lg-3.8.0/pt_core_news_lg-3.8.0-py3-none-any.whl (568.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.2/568.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_lg')


In [94]:
# https://medium.com/@maleeshadesilva21/preprocessing-steps-for-natural-language-processing-nlp-a-beginners-guide-d6d9bf7689c9
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import spacy

In [110]:
# Work on a deep copy so the sampled data stays available without preprocessing.
preprocessed_data = sampled_data.copy(deep=True)

In [111]:
# Convert the narratives to lowercase (always starting from the untouched sample).
preprocessed_data = preprocessed_data.assign(DINAMICA=sampled_data['DINAMICA'].str.lower())

In [112]:
# Remove URLs that may appear in the text: "http://" or "https://" followed by
# any run of non-whitespace characters.
url_pattern = re.compile(r'https?://\S+')

def remove_url(text):
    """Return `text` with every http(s) URL deleted (surrounding spaces are kept)."""
    return url_pattern.sub('', text)

# Strip URLs from every narrative.
preprocessed_data['DINAMICA'] = preprocessed_data['DINAMICA'].map(remove_url)

In [113]:
# Remove odd characters: everything that is neither a word character nor whitespace.
preprocessed_data['DINAMICA'] = preprocessed_data['DINAMICA'].str.replace(r'[^\w\s]', '', regex=True)

In [114]:
# Remove digits.
preprocessed_data['DINAMICA'] = preprocessed_data['DINAMICA'].str.replace(r'\d', '', regex=True)

In [115]:
# Tokenize so stop words can be removed and the tokens stemmed/lemmatized.
# word_tokenize defaults to the English Punkt model; the narratives are in
# Portuguese, so select the Portuguese model explicitly.
preprocessed_data['DINAMICA'] = preprocessed_data['DINAMICA'].apply(word_tokenize, language='portuguese')

In [116]:
# Stop-word removal
# https://www-geeksforgeeks-org.translate.goog/removing-stop-words-nltk-python/?_x_tr_sl=en&_x_tr_tl=pt&_x_tr_hl=pt&_x_tr_pto=tc

# The NLTK stop-word corpus must be available locally; if it is missing, run
# nltk.download('stopwords') once.
stop = set(stopwords.words('portuguese'))

def remove_stopwords(text):
    """Return the tokens of `text` that are not Portuguese stop words, in order.

    Args:
        text: iterable of tokens.

    Returns:
        New list with the remaining tokens.
    """
    return list(filter(lambda word: word not in stop, text))

# Drop the stop words from every token list.
preprocessed_data['DINAMICA'] = preprocessed_data['DINAMICA'].map(remove_stopwords)

In [117]:
# Stemming or lemmatizing
# https://www.alura.com.br/artigos/lemmatization-vs-stemming-quando-usar-cada-uma?srsltid=AfmBOorxWm2lE7ueBG612G7amCSTLy06GjdyjeN6jPhlIGtLoMBdRf3c
nlp = spacy.load('pt_core_news_lg')

def lemmatizer(text):
    """Lemmatize a token list with the spaCy Portuguese pipeline.

    The tokens are joined with spaces and run through `nlp`, so the result
    follows spaCy's own tokenization of the joined text.

    Args:
        text: iterable of string tokens.

    Returns:
        List with the lemma of every spaCy token.
    """
    doc = nlp(' '.join(text))
    return [token.lemma_ for token in doc]
    
# Lemmatize every token list.
preprocessed_data['DINAMICA'] = preprocessed_data['DINAMICA'].map(lemmatizer)

In [118]:
# Undo the tokenization because E5 will tokenize the text again on its own.
def detokenizer(text):
    """Join a token list back into a single space-separated string."""
    return ' '.join(text)
    
# Turn every token list back into plain text.
preprocessed_data['DINAMICA'] = preprocessed_data['DINAMICA'].map(detokenizer)

In [120]:
# Persist the preprocessed balanced sample (without the index column).
preprocessed_data.to_csv("./Datasets/07_PreprocBalancedDataset.csv", index=False)

## Numerical Vectors

In [5]:
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
from tqdm import tqdm

import math

tqdm.pandas()

  from tqdm.autonotebook import tqdm, trange
2025-03-05 13:52:57.203630: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-05 13:52:57.958542: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741193578.253340   10212 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741193578.343256   10212 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-05 13:52:59.128147: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow b

In [6]:
# Sentence-embedding model used by generate_embedding to encode each narrative.
model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

In [7]:
# Names of the embedding columns appended to every row.
# NOTE(review): assumes the model returns 1024-dimensional vectors - confirm.
columns = [f"Embedding_{i}" for i in range(1024)]

def generate_embedding(row):
    """Append the normalized embedding of the row's narrative to the row.

    Args:
        row: pandas Series with a 'DINAMICA' text entry.

    Returns:
        Series made of the original entries followed by the ``Embedding_*`` values.
    """
    vector = model.encode(row['DINAMICA'], convert_to_tensor=False, normalize_embeddings=True)
    embedding_values = pd.Series(vector, index=columns)
    return pd.concat([row, embedding_values])

In [8]:
# start_index is the number of the first file to generate, e.g. 000, 001 or 002,
# but written as a plain integer such as 0, 1 or 2.
def embed(dataset, file_number, slice_size, name_reductor, start_index=0, base_type='Raw'):
    """Embed `dataset` slice by slice, writing one CSV file per slice.

    Args:
        dataset: DataFrame whose rows are passed to `generate_embedding`.
        file_number: prefix of every output file name.
        slice_size: number of rows per output file.
        name_reductor: divisor applied to the slice bounds shown in the file name.
        start_index: index of the first slice to process (earlier ones are skipped).
        base_type: tag included in the file name.

    The bounds printed and used in the file name are nominal: for the last
    slice they may exceed the number of rows actually written.
    """
    n_slices = math.ceil(len(dataset) / slice_size)

    for i in range(start_index, n_slices):
        start, end = i * slice_size, (i + 1) * slice_size
        start_name = int(start / name_reductor)
        end_name = int(end / name_reductor)
        index = f"{i:03d}"

        file_name = f"{file_number}_{index}_{base_type}_embeddings_{start_name}_{end_name}.csv"

        print(f"Index {index}: Start Row: {start} - End Row: {end}")
        print(f"File Name: {file_name}")

        # Embed the slice row by row (with a progress bar) and write it out.
        embedded_slice = dataset.iloc[start:end, :].progress_apply(generate_embedding, axis=1)
        embedded_slice.to_csv(f"./Datasets/{file_name}", index=False)

### Train Datasets

In [128]:
# Embed sampled_data in 50,000-row chunks; name_reductor=1000 makes the row bounds
# in each file name read in thousands (e.g. "0_50"). Files are prefixed "08" and tagged "Raw".
embed(sampled_data, '08', 50000, 1000, start_index=0, base_type='Raw')

Index 000: Start Row: 0 - End Row: 50000
File Name: 08_000_Raw_embeddings_0_50.csv


100%|███████████████████████████████| 50000/50000 [34:58<00:00, 23.83it/s]


Index 001: Start Row: 50000 - End Row: 100000
File Name: 08_001_Raw_embeddings_50_100.csv


100%|███████████████████████████████| 50000/50000 [28:28<00:00, 29.27it/s]


Index 002: Start Row: 100000 - End Row: 150000
File Name: 08_002_Raw_embeddings_100_150.csv


100%|███████████████████████████████| 50000/50000 [36:57<00:00, 22.54it/s]


Index 003: Start Row: 150000 - End Row: 200000
File Name: 08_003_Raw_embeddings_150_200.csv


100%|███████████████████████████████| 36549/36549 [26:25<00:00, 23.06it/s]


In [131]:
# Same chunked embedding run for preprocessed_data; files are prefixed "09" and tagged "Preproc".
embed(preprocessed_data, '09', 50000, 1000, start_index=0, base_type='Preproc')

Index 000: Start Row: 0 - End Row: 50000
File Name: 09_000_Preproc_embeddings_0_50.csv


100%|███████████████████████████████| 50000/50000 [21:40<00:00, 38.46it/s]


Index 001: Start Row: 50000 - End Row: 100000
File Name: 09_001_Preproc_embeddings_50_100.csv


100%|███████████████████████████████| 50000/50000 [18:32<00:00, 44.93it/s]


Index 002: Start Row: 100000 - End Row: 150000
File Name: 09_002_Preproc_embeddings_100_150.csv


100%|███████████████████████████████| 50000/50000 [24:07<00:00, 34.53it/s]


Index 003: Start Row: 150000 - End Row: 200000
File Name: 09_003_Preproc_embeddings_150_200.csv


100%|███████████████████████████████| 36549/36549 [16:43<00:00, 36.41it/s]


### Never-seen-before Dataset

In [9]:
# Same chunked embedding run for test_data; files are prefixed "12" and tagged "NeverSeen".
embed(test_data, '12', 50000, 1000, start_index=0, base_type='NeverSeen')

Index 000: Start Row: 0 - End Row: 50000
File Name: 12_000_NeverSeen_embeddings_0_50.csv


100%|█████████████████████████████████████| 50000/50000 [31:19<00:00, 26.60it/s]


Index 001: Start Row: 50000 - End Row: 100000
File Name: 12_001_NeverSeen_embeddings_50_100.csv


100%|█████████████████████████████████████| 50000/50000 [30:01<00:00, 27.76it/s]


Index 002: Start Row: 100000 - End Row: 150000
File Name: 12_002_NeverSeen_embeddings_100_150.csv


100%|█████████████████████████████████████| 50000/50000 [30:03<00:00, 27.72it/s]


Index 003: Start Row: 150000 - End Row: 200000
File Name: 12_003_NeverSeen_embeddings_150_200.csv


100%|█████████████████████████████████████| 50000/50000 [29:29<00:00, 28.26it/s]


Index 004: Start Row: 200000 - End Row: 250000
File Name: 12_004_NeverSeen_embeddings_200_250.csv


100%|█████████████████████████████████████| 50000/50000 [29:43<00:00, 28.04it/s]


Index 005: Start Row: 250000 - End Row: 300000
File Name: 12_005_NeverSeen_embeddings_250_300.csv


100%|█████████████████████████████████████| 26803/26803 [15:51<00:00, 28.16it/s]


## Dataset Anonymization

In [2]:
import glob

In [3]:
def read_and_concat(dir_path):
    """Load every CSV whose path starts with `dir_path` and stack them into one frame.

    The files matching ``dir_path + '*.csv'`` are read in sorted path order,
    concatenated row-wise, and the free-text 'DINAMICA' column is dropped from
    the result.

    Parameters
    ----------
    dir_path : str
        Path prefix (directory plus the leading part of the file name),
        e.g. "./Datasets/08".

    Returns
    -------
    pandas.DataFrame
        All rows of the matched files, without the 'DINAMICA' column, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    KeyError
        If the concatenated data has no 'DINAMICA' column.
    """
    pattern = dir_path + '*.csv'
    filelist = sorted(glob.glob(pattern))
    if not filelist:
        # pd.concat would raise a generic "No objects to concatenate"; name the pattern instead.
        raise ValueError(f"No CSV files match pattern {pattern!r}")

    dataframes = []
    for filename in filelist:
        print(filename)
        dataframes.append(pd.read_csv(filename))

    # Every part file is read with its own index starting at 0; ignore_index=True
    # avoids duplicate index labels in the combined frame.
    data = pd.concat(dataframes, ignore_index=True)
    data = data.drop(columns=['DINAMICA'])

    return data

### Train Datasets

In [143]:
# Reload the "08" embedding part files as a single frame; read_and_concat drops the DINAMICA text column.
raw = read_and_concat("./Datasets/08")
raw.head()

./Datasets/08_000_Raw_embeddings_0_50.csv
./Datasets/08_001_Raw_embeddings_50_100.csv
./Datasets/08_002_Raw_embeddings_100_150.csv
./Datasets/08_003_Raw_embeddings_150_200.csv


Unnamed: 0,N1,N2,N3,Embedding_0,Embedding_1,Embedding_2,Embedding_3,Embedding_4,Embedding_5,Embedding_6,...,Embedding_1014,Embedding_1015,Embedding_1016,Embedding_1017,Embedding_1018,Embedding_1019,Embedding_1020,Embedding_1021,Embedding_1022,Embedding_1023
0,Crimes Contra Propriedade,Estelionato,Estelionato (outros),0.012348,0.040149,-0.01596,-0.058446,0.001246,-0.016166,-0.041804,...,-0.046289,-0.005526,0.031592,-0.010259,-0.003605,0.007059,0.004279,-0.009982,-0.024476,-0.012056
1,Crimes Contra Propriedade,Estelionato,Estelionato (outros),0.024326,0.026108,-0.019716,-0.079539,0.005686,-0.010184,-0.061117,...,-0.029137,-0.014215,0.021615,-0.012493,-0.025498,0.018614,0.016457,-0.018251,-0.061449,0.007135
2,Crimes Contra Propriedade,Estelionato,Estelionato (outros),0.003191,0.024324,-0.027929,-0.080219,-0.004631,-0.020091,-0.040208,...,-0.020344,-0.033554,0.017295,-0.019509,-0.003888,0.008601,0.015066,-0.017995,-0.046333,-0.000415
3,Crimes Contra Propriedade,Estelionato,Estelionato (outros),-0.010253,0.015941,-0.000855,-0.071614,0.007259,-0.033037,-0.041098,...,-0.039573,-0.013523,0.041449,-0.026685,0.007844,0.00981,0.02653,-0.017604,-0.048347,-0.00366
4,Crimes Contra Propriedade,Estelionato,Estelionato (outros),-0.003626,0.022467,-0.010117,-0.08065,-0.008691,-0.027933,-0.043706,...,-0.02823,-0.021014,0.034443,-0.020132,-0.003016,0.012413,0.025609,-0.026614,-0.043004,-0.008974


In [146]:
# Reload the "09" embedding part files as a single frame; read_and_concat drops the DINAMICA text column.
preproc = read_and_concat("./Datasets/09")
preproc.head()

./Datasets/09_000_Preproc_embeddings_0_50.csv
./Datasets/09_001_Preproc_embeddings_50_100.csv
./Datasets/09_002_Preproc_embeddings_100_150.csv
./Datasets/09_003_Preproc_embeddings_150_200.csv


Unnamed: 0,N1,N2,N3,Embedding_0,Embedding_1,Embedding_2,Embedding_3,Embedding_4,Embedding_5,Embedding_6,...,Embedding_1014,Embedding_1015,Embedding_1016,Embedding_1017,Embedding_1018,Embedding_1019,Embedding_1020,Embedding_1021,Embedding_1022,Embedding_1023
0,Crimes Contra Propriedade,Estelionato,Estelionato (outros),0.024872,0.012243,-0.015403,-0.055005,0.002113,-0.019193,-0.049924,...,-0.051954,-0.015725,0.019631,-0.03388,0.005812,0.001146,0.008117,-0.031364,-0.035207,0.000757
1,Crimes Contra Propriedade,Estelionato,Estelionato (outros),0.012854,0.014167,-0.010214,-0.06745,0.024207,-0.01021,-0.05336,...,-0.040303,-0.039631,0.01625,-0.021304,-0.034026,0.008419,0.022837,-0.035077,-0.051346,0.00826
2,Crimes Contra Propriedade,Estelionato,Estelionato (outros),0.012884,0.025145,-0.028492,-0.078217,0.005706,-0.01733,-0.04849,...,-0.013685,-0.044063,0.022132,-0.034129,-0.003693,0.015664,0.034866,-0.018603,-0.043413,-0.004023
3,Crimes Contra Propriedade,Estelionato,Estelionato (outros),-0.000294,-0.001026,-0.005048,-0.064348,0.018758,-0.027287,-0.036498,...,-0.0335,-0.021881,0.032972,-0.035627,0.004569,0.003972,0.013301,-0.025269,-0.060999,-0.002277
4,Crimes Contra Propriedade,Estelionato,Estelionato (outros),0.001136,0.006518,-0.011555,-0.057545,0.005468,-0.021173,-0.034731,...,-0.034211,-0.03846,0.02068,-0.040388,0.00089,0.004069,0.023184,-0.040071,-0.053121,-0.000497


In [147]:
# Persist the two training frames (labels + embedding columns, DINAMICA already dropped
# by read_and_concat) without the row index.
raw.to_csv("./Datasets/10_RawTrainANON.csv", index=False)
preproc.to_csv("./Datasets/11_PreprocTrainANON.csv", index=False)

### Never-seen-before Dataset

In [5]:
import pandas as pd

In [6]:
# Reload the "12" embedding part files as a single frame; read_and_concat drops the DINAMICA text column.
# NOTE(review): the recorded output of this cell lists files 12_001 ... 12_006, whereas the
# embed() run recorded earlier logged 12_000 ... 12_005 -- confirm which files are on disk.
never_seen = read_and_concat("./Datasets/12")
never_seen.head()

./Datasets/12_001_NeverSeen_embeddings_0_50.csv
./Datasets/12_002_NeverSeen_embeddings_50_100.csv
./Datasets/12_003_NeverSeen_embeddings_100_150.csv
./Datasets/12_004_NeverSeen_embeddings_150_200.csv
./Datasets/12_005_NeverSeen_embeddings_200_250.csv
./Datasets/12_006_NeverSeen_embeddings_250_300.csv


Unnamed: 0,N1,N2,N3,Embedding_0,Embedding_1,Embedding_2,Embedding_3,Embedding_4,Embedding_5,Embedding_6,...,Embedding_1014,Embedding_1015,Embedding_1016,Embedding_1017,Embedding_1018,Embedding_1019,Embedding_1020,Embedding_1021,Embedding_1022,Embedding_1023
0,Crimes Contra Propriedade,Roubo,Roubo outros,0.034882,0.06379,-0.015701,-0.063728,0.000136,-0.022096,-0.070146,...,-0.023706,-0.027641,0.029843,-0.00692,-0.006165,0.030821,0.023737,-0.014206,-0.029136,0.018848
1,"Relacionados a Drogas, Entorpecentes e Porte d...",Tráfico de Drogas (Lei 11.343/06),Tráfico de Drogas (Lei 11.343/06),0.01978,0.044434,-0.027141,-0.051432,-0.002872,-0.033528,-0.040532,...,-0.04699,-0.032765,0.045602,-0.000821,0.002429,0.036821,0.032598,0.000609,-0.057442,0.009181
2,Crimes Contra Propriedade,Furto,Furto de Veículo,0.017073,0.048304,0.003193,-0.06301,0.004186,-0.01859,-0.038764,...,-0.03415,-0.036247,0.015059,-0.000946,0.019414,0.044906,0.031639,-0.01945,-0.067131,0.011651
3,Crimes Contra Pessoa,Lesão Corporal,Lesão Corporal (outros),0.009665,0.037534,-0.02005,-0.075351,0.01053,-0.022181,-0.044396,...,-0.023913,-0.020958,0.014824,-0.004242,-0.004369,0.033687,0.028239,-0.001719,-0.027303,0.018591
4,Crimes Contra Pessoa,Lesão Corporal,Lesão Corporal (outros),0.004789,0.038305,-0.024471,-0.070292,0.025082,-0.007041,-0.04401,...,-0.020124,-0.023722,0.025267,-0.015664,0.004516,0.008269,0.01357,-0.010425,-0.046632,0.004869


In [7]:
# Persist the never-seen frame (DINAMICA already dropped by read_and_concat) without the row index.
never_seen.to_csv("./Datasets/13_NeverSeenANON.csv", index=False)