In [1]:
import re
from glob import glob
from os import path
from unicodedata import normalize

In [2]:
def normalized(text):
    try:
        return normalize('NFKD', text).encode('ASCII','ignore').decode('ASCII')
    except:
        pass
    return text

In [3]:
def clean_blank_lines(lines):
    cleaned_lines = []
    for line in lines:
        copy = line
        re.sub("(\s)+", "", copy)
        if len(copy) > 0:
            cleaned_lines.append(line)
    return cleaned_lines

In [4]:
def starts_with_lower(string):
    if string[0].lower() == string[0]:
        return True

In [358]:
bad_words = ['processo', 'sa', 'ltda', 'acao', 'vara', 'juiz', 'autos', 'contratuais', 
             'contratos', 'vistos', 'rua',
            'a', 'ante', 'apos', 'ate', 'com', 'contra', 'em', 'entre', 
            'para', 'per', 'por', 'perante', 'sem', 'sob', 'sobre', 'tras']
def check_name(string):
    words = re.split("\W",string)
    words = [word for word in words if word != ""] 
    if len(words) < 2 or len(words) > 6: #supondo que o nome completo de uma pessoa deve ter mais de uma palavra
        return False
    if starts_with_lower(words[0]): #supondo que a primeira letra de todas mesmo do nome TEM que ser maiuscula
        return False
    last_lower = False
    for word in words:
        if word.lower() in bad_words:
            return False
        if len(word) < 2:
            return False
        if len(word) < 3: #eh razoavel supor que um nome deve ter mais que 2 caracteres
            continue
        first_letter_is_lower = starts_with_lower(word)
        if first_letter_is_lower and last_lower: #primeira letra minuscula, provavelmente nao eh um nome
            return False
        last_lower = first_letter_is_lower
    if string.upper() == string: #supondo que se eh tudo maiusculo e nao caiu nos outros casos
        return True
    return True

In [330]:
def check_possible_names(possible_names):
    probable = None
    for possible_name in possible_names:
        if not probable:
            if check_name(possible_name):
                probable = possible_name
                break
            sentences = re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', possible_name)
            for sentence in sentences:
                if check_name(sentence):
                    probable = sentence
                    break
    return probable

In [7]:
def find_name(string):
    probable = None
    normalized_s = normalized(line)
    possible_names = re.findall('[a-zA-Z\s]+', normalized_s)
    probable = check_possible_names(possible_names)
    if not probable:
        possible_names = re.findall('[A-Z\s]+', normalized_s)
        probable = check_possible_names(possible_names)
    return probable

In [8]:
def final_name(string):
    return string.strip().upper()

In [9]:
def get_name_from_signature(line):
    name = None
    #CASO Este documento foi assinado digitalmente por [nome do juiz]
    assinatura_re = re.findall('Este documento foi assinado digitalmente por', line)
    if assinatura_re:
        rest = line.replace('Este documento foi assinado digitalmente por ', '')
        has_name = find_name(rest)
        if has_name:
            name = final_name(has_name)
    return name

In [321]:
# Pattern 1: [nome do juiz] + "Juiz de Direito" na mesma linha
def find_judge_string(name, text): 
    found = re.findall('juiz.*de.*direito.*', text.lower())
    if found:
        return final_name(name)
    found = re.findall('ajuiz', text.lower())
    if found:
        return final_name(name)

In [287]:
def get_line_name(i,line, qt_lines):
    name = get_name_from_signature(line)
    if name:
        return name
    has_name = find_name(line)
    if has_name:
        name = find_judge_string(has_name, line)
        if name:
            return name
        if i+1 != qt_lines:
            name = find_judge_string(has_name, lines[i+1])
        if name:
            return name
        if i != 0:
            name = find_judge_string(has_name, lines[i-1])
    return name

In [366]:
def get_name(lines):
    name = None
    qt_lines = len(lines)
    for i, line in enumerate(lines):
        if name:
            break
        name = get_line_name(i,line,qt_lines)
    return name

In [298]:
def not_judge(text):
    found = re.findall('art.*40.*Lei.*9\.099/95', text)
    if found:
        return True
    return False

In [353]:
def document_iterator(DIR):
    for file in glob(path.join(DIR, '*.txt')):
        if file[-5:] == '1.txt' :
            yield open(file, 'r', encoding='cp1254').read()

In [354]:
DIR = "../data" 

In [370]:
names = []
for c,file in enumerate(document_iterator(DIR)):
    name = None
    if not_judge(file):
        names.append('LEIGO')
        continue
    lines = file.splitlines()
    lines = clean_blank_lines(lines)
    qt_lines = len(lines)
    for i, line in enumerate(lines):
        if name:
            break
        name = get_line_name(i,line, qt_lines)
    names.append(name)
    if c == 100:
        break

In [371]:
count = [name for name in names if name is not None]
print('%d/%d'%(len(count), len(names)))

69/101


In [361]:
names

['RAIMUNDO NONATO BORGES BRAGA',
 'GIOVANI AUGUSTO SERRA AZUL GUIMARAES',
 'CINTHIA SANTOS DA SILVA',
 'ALEXANDRE TORRES DE AGUIAR',
 'RONNIE PAES SANDRE',
 None,
 'CAROLINE QUADROS DA SILVEIRA PEREIRA',
 'KARINA FERRARO AMARANTE INNOCENCIO',
 'ANTONIO RICARDO PAVONNE',
 None,
 None,
 None,
 'SINVAL RIBEIRO DE SOUZA',
 'RODRIGO RAFAEL',
 'MONICA TUCUNDUVA SPERA MANFIO',
 None,
 'IVO RIMONDI',
 None,
 None,
 'LUCAS NOGUEIRA ISRAEL',
 'JOSE LAZARO DA SILVA',
 'CLODAIR EDENILSON BORIN',
 'FRANCISCO DE FARIA  OAB',
 None,
 'ALEX BAIMA SOARES',
 'ALEX BAIMA SOARES',
 None,
 'JOSE AILTON FROTA GONCALVES',
 None,
 None,
 'ANA MARIA RODRIGUES',
 'FRANCISCO FERREIRA DA SILVA',
 None,
 'ANA MARIA RODRIGUES',
 None,
 'ADERSON ROCHA DE FREITAS',
 'ALEX BAIMA SOARES',
 None,
 'SIRLEY CINTIA LIMA PACHECO',
 'JEOVA DE CASTRO SOUSA',
 None,
 None,
 'MARISA IVONE HOFFMEISTER',
 'JOSE CLAUDIO DA LUZ',
 None,
 'CELSO DA ROCHA MARTINS',
 'HERACLITO JOSE DE OLIVEIRA',
 'ALEXANDRE SCHWARTZ MANICA',
 'CARLOS