# Clustering Named Entities with InfoLeg
#### Mining for insights in legal texts

InfoLeg is a legistative database for the *Ministerio de Justicia y Derechos Humanos de la Nación* in Argentina.

In [1]:
import spacy
import nltk
from spacy.tokens import Doc
import re
import os
import numpy as np
import pandas as pd
from nltk import regexp_tokenize
from nltk import sent_tokenize
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [2]:
PATH = './entities_infoleg/'

When dealing with multi-word named entities, such as 'Ley Nº 26.442', in **regular expressions** help us to isolate such words and look for patterns of their use in the texts.  
These regex patterns were created by Lucas Agustin De Francesca, https://github.com/lucasdefrancesca/text-mining.

In [3]:
# regex pattern #these patterns are from Lucas Agustin De Francesca, https://github.com/lucasdefrancesca/text-mining
PATTERN = r'''(?x)
   (?:Ley\sN[ºo°]*\s\d{1,}(?:\.\d+)*)                # Leyes como entidad
   | (?:[Aa]nexo[s]?[\sIVXLCDMy,]*[IVXLCDM]+)        # anexos
   | (?:Nota[\sA-Za-z()ºo°\.]+[\d|/]+)               # Notas como entidad
   | (?:Decreto[A-Zºo°\sa-z]+[\d|/]+)                # Decretos como entidad
   | (?:[Aa]rt[ií]culo[A-Z|\sºo°]+\d+º*)             # Articulos como entidad
   | (?:[Aa]rt\.*[\s+\d+]+º*)                        # abreviacion de articulo
   | (?:Resoluci[óo]n[A-Zºo°\sa-z]+[\d/]+)           # Resoluciones como entidad
   | (?:Disposici[óo]n[A-Zºo°\sa-z]+[\d/]+)          # Disposicion como entidad
   | (?:Expediente[A-Zºo°\s]+[\d/]+)                 # Expediente como entidad
   | (?:punto\s[\d\.]+)                              # punto x.x.x. como entidad
   | (?:\d{1,2}[\sa-z]+\d{4})                        # fechas
   | (?:[A-Z][a-záéíóú]+\s[A-Z]\.\s[A-Z][a-záéíóú]+) # entidades humanas Fulano M. Mengano
   | (?:MINISTERIO[\sA-Z,]*[A-Z]+)                   # Ministerios como entidad
   | (?:REPUBLICA[\sDE]*[A-Z]+)                      # Republica como entidad
   | (?:SECRETARIA[\sA-Z,]*[A-Z]+)                   # Secretaria como entidad
   | (?:SERVICIO[\sA-Z,]*[A-Z]+)                     # Servicios como entidad
   | (?:DIRECCION[\sA-Z,]*[A-Z]+)                    # Direccion como entidad
   | \w+(?:-\w+)*                                    # palabras con '-' opcional
   | \.\.\.                                          # ...
   | [][.,;"'?():-_`]
   | (?:\d+)                                         # numeros
   | (?:[.\n])                                       # punto y aparte
'''

In [4]:
# named entities
NE = r'''(?x)
   (?:Ley\sN[ºo°]*\s\d{1,}(?:\.\d+)*)                # Leyes como entidad
   | (?:Nota[\sA-Za-z()ºo°\.]+[\d|/]+)               # Notas como entidad
   | (?:Decreto[A-Zºo°\sa-z]+[\d|/]+)                # Decretos como entidad
   | (?:Resoluci[óo]n[A-Zºo°\sa-z]+[\d/]+)           # Resoluciones como entidad
   | (?:Disposici[óo]n[A-Zºo°\sa-z]+[\d/]+)          # Disposicion como entidad
   | (?:Expediente[A-Zºo°\s]+[\d/]+)                 # Expediente como entidad
   | (?:punto\s[\d\.]+)                              # punto x.x.x. como entidad
   | (?:[A-Z][a-záéíóú]+\s[A-Z]\.\s[A-Z][a-záéíóú]+) # entidades humanas Fulano M. Mengano
   | (?:MINISTERIO[\sA-Z,]*[A-Z]+)                   # Ministerios como entidad
   | (?:REPUBLICA[\sDE]*[A-Z]+)                      # Republica como entidad
   | (?:SECRETARIA[\sA-Z,]*[A-Z]+)                   # Secretaria como entidad
   | (?:SERVICIO[\sA-Z,]*[A-Z]+)                     # Servicios como entidad
   | (?:DIRECCION[\sA-Z,]*[A-Z]+)                    # Direccion como entidad
'''

In [5]:
# See https://spacy.io/usage/linguistic-features #native-tokenizers
class CustomTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab
    def __call__(self, text):
        words = regexp_tokenize(text, PATTERN)
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

In [6]:
def isEntity(string, pattern):
    return re.match(pattern, string) != None

### Preprocessing with spaCy:
Tokenization with spaCy is prefered here (to nltk) because of it performs well in Spanish with morphological patterns, such as part-of-speech tagging and dependency. 

In [7]:
def preProcessing(dname, PATTERN):
    processed_tokens = []
    nlp = spacy.load("es")
    nlp.tokenizer = CustomTokenizer(nlp.vocab)
    for file in os.listdir(dname):
        if file.endswith(".txt"):
            with open(os.path.join(dname, file), "r") as fd:
                for document in fd.readlines():

                    document = document.strip()
                    nlp.max_length = max(len(document), nlp.max_length)

                    tokens = [ token for token in nlp(document) 
                       if not (token.is_stop or token.is_punct or token.is_digit)
                     ]
            # The complex formatting of these texts makes it hard 
            # to tokenize by sentence so each document will be 
            # one entire law, instead of one sentence.
                    processed_tokens.append(tokens)
    return processed_tokens

## Feature Extraction:
The following features we compose our matrix:
* Part of speech
* Dependency Triples, which show the connection between our named entities and the word in the sentence that has the strongest syntactic relationship to that entity, called here ```tok.head.orth_```

In [8]:
def feature_extraction(processed_tokens):
    database = {}
    for document in preprocessed_tokens:
        has_prev = False
        for idx, tok in enumerate(document):
            if isEntity(tok.text, NE):
                features = {}
                pos = "POS__" + tok.pos_
                if not pos in features:
                    features[pos] = 0
                    features[pos] += 1
                tripla = 'TRIPLA__'+tok.lemma_+'_DEP_' + tok.dep_ + '_HEAD_' + tok.head.orth_
                if not tripla in features:
                    features[tripla] = 0
                    features[tripla] += 1
                database[tok.text] = features
    return database.keys(), database.values()

## Code execution

In [9]:
preprocessed_tokens = preProcessing(PATH, NE)
print(len(preprocessed_tokens))

1783


In [10]:
vocabulary, features = feature_extraction(preprocessed_tokens)

**Vectorization** transforms out words into numerical representations.

In [16]:
# See https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
v = DictVectorizer()
vectors = v.fit_transform(features)

## Clustering

In [17]:
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(vectors)
cluster = kmeans.labels_.tolist()

In [18]:
# Recuento del número de elementos en cada cluster
for i in range(num_clusters):
    print ('Cluster %i has %i elements' % (i, cluster.count(i)))

Cluster 0 has 179 elements
Cluster 1 has 64 elements
Cluster 2 has 1 elements
Cluster 3 has 9 elements
Cluster 4 has 1 elements
Cluster 5 has 1 elements
Cluster 6 has 1 elements
Cluster 7 has 10 elements
Cluster 8 has 6 elements
Cluster 9 has 1 elements


In [19]:
labels = kmeans.predict(vectors)
labels_words = list(zip(labels, vocabulary))

In [20]:
clusters = defaultdict(set)
for idc, word in labels_words:
    clusters[idc].add(word)
    
# show clusters
clusters

defaultdict(set,
            {0: {'DIRECCION DE CONTROL DE BOMBEROS VOLUNTARIOS Y COORDINACION DE ORGANIZACIONES NO GUMERNAMENTALES',
              'DIRECCION DE CONTROL DE BOMBEROS VOLUNTARIOS Y DE COORDINACION DE ORGANIZACIONES NO GUBERNAMENTALES',
              'DIRECCION DE PROGRAMACION Y CONTROL PRESUPUESTARIO',
              'DIRECCION GENERAL DE ADMINISTRACION',
              'DIRECCION GENERAL DE ASUNTOS JURIDICOS',
              'DIRECCION NACIONAL DE PROTECCION CIVIL',
              'DIRECCION NACIONAL DEL REGISTRO OFICIAL',
              'Decreto 618',
              'Decreto Nacional Nº 2089/93',
              'Decreto Nacional Nº 270/97',
              'Decreto N° 27/2018',
              'Decreto N° 565/2008',
              'Decreto N° 618',
              'Decreto N° 903/2014',
              'Decreto Nº 1',
              'Decreto Nº 1474',
              'Decreto Nº 1697/2004',
              'Decreto Nº 1759/72',
              'Decreto Nº 1759/91',
              'Decreto Nº 