In [61]:
import re
from unicodedata import normalize


In [62]:
def compile_multiple(regexes):
    """Compile each pattern in ``regexes`` with case-insensitive matching.

    Args:
        regexes: iterable of regular-expression source strings.

    Returns:
        A list of compiled pattern objects, in the same order as the input.
    """
    return [re.compile(pattern, re.IGNORECASE) for pattern in regexes]

In [63]:
from glob import glob
from os import path

DIR = '../data'
def document_iterator(DIR, limit=100):
    """Yield the text of the ``*.txt`` files found directly inside ``DIR``.

    Files are decoded as cp1252 and yielded in whatever order ``glob``
    returns them (filesystem dependent, not sorted).

    Args:
        DIR: directory to scan for ``*.txt`` files.
        limit: maximum number of documents to yield; ``None`` means no cap.
            The default of 100 matches the previous hard-coded cutoff.

    Yields:
        The full decoded content of each file, as a ``str``.
    """
    for index, file in enumerate(glob(path.join(DIR, '*.txt'))):
        if limit is not None and index >= limit:
            break
        # Read inside a context manager so the handle is closed before the
        # generator suspends; the original left every file open until GC.
        with open(file, 'r', encoding='cp1252') as handle:
            content = handle.read()
        yield content

In [64]:
def normalized(text):
    """Return ``text`` reduced to plain ASCII.

    The string is NFKD-decomposed and then encoded to ASCII with
    ``errors='ignore'``, so accented letters lose their accents and any
    character that has no ASCII form after decomposition is dropped.

    Args:
        text: the value to normalize.

    Returns:
        The ASCII-only string, or ``text`` unchanged when it is not a ``str``
        (the same best-effort fallback the previous bare ``except`` gave).
    """
    # Explicit type guard instead of a bare ``except: pass``: the fallback for
    # non-string input is kept, but unrelated errors such as KeyboardInterrupt
    # are no longer swallowed silently.
    if not isinstance(text, str):
        return text
    return normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')

In [65]:
# The word "vara" preceded by start-of-line or a non-word character and
# followed by a non-word character. Written as a raw string: the previous
# non-raw literal relied on the invalid escape "\W".
_VARA_WORD = re.compile(r'(^|\W)vara\W', re.IGNORECASE)


def varas_identify(text):
    """Collect the lines of ``text`` that contain the standalone word "vara".

    Matching is case-insensitive and is done on the raw line. Because the
    pattern requires a non-word character after "vara", a line that ends
    exactly with that word is not selected.

    Args:
        text: full document text.

    Returns:
        A list with each matching line, stripped of surrounding whitespace and
        passed through ``normalized``, in document order.
    """
    return [
        normalized(line.strip())
        for line in text.splitlines()
        if _VARA_WORD.search(line) is not None
    ]

In [66]:
# Patterns a candidate line must match (any one of them) to be kept by
# varas_filter. Compiled once, case-insensitively, from raw strings; the
# previous non-raw literals relied on invalid escapes such as "\d" and "\W".
# NOTE(review): the commas inside `[ª,a]` and `[e,o,a]` are literal members of
# the character class, so a comma is accepted there too — confirm intent.
_VARA_FILTER_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'\d+[ª,a]?.*vara\W',
        r'(^|\W)Vara d[e,o,a]',
        r'(^|\W)Vara:',
        r'(^|\W)Vara civel',
        r'(^|\W)Vara comercial',
        r'(^|\W)Vara regional',
        r'\w+eira vara',
        r'segunda vara',
        r'quarta vara',
        r'quinta vara',
        r'sexta vara',
        r'setima vara',
        r'oitava vara',
        r'decima vara',
        r'nona vara',
        r'\w+esima vara',
        r'vara unica',
        r'vara judicial',
    )
)


def varas_filter(lines):
    """Keep only the lines that match at least one of the "vara" patterns.

    Args:
        lines: iterable of candidate line strings.

    Returns:
        A new list with the matching lines, in their original order. Each
        line appears at most once per occurrence in ``lines``.
    """
    return [
        line
        for line in lines
        if any(program.search(line) is not None for program in _VARA_FILTER_PATTERNS)
    ]

In [67]:
def varas_first(lines):
    """Return the first element of ``lines``, or ``None`` when it is empty."""
    if len(lines) == 0:
        return None
    return lines[0]

In [68]:
# Patterns tried in order by varas_from_matching; the first one that matches
# decides where the extracted text starts. Compiled once, case-insensitively,
# from raw strings; the previous non-raw literals relied on invalid escapes
# such as "\w" and "\d".
# NOTE(review): the comma inside `[ª,a]` is a literal member of the character
# class, so a comma is accepted there too — confirm intent.
_VARA_START_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'primeira vara.*',
        r'segunda vara.*',
        r'terceira vara.*',
        r'quarta vara.*',
        r'quinta vara.*',
        r'sexta vara.*',
        r'setima vara.*',
        r'oitava vara.*',
        r'decima vara.*',
        r'nona vara.*',
        r'\w+esima vara.*',
        r'\d+[ª,a]?(.{2}|\s)vara\W.*',
        r'vara unica',
        r'vara judicial',
        r'(^|\W)Vara civel .*',
        r'(^|\W)Vara comercial',
        r'(^|\W)Vara regional .*',
        r'(^|\W)Vara .*',
    )
)


def varas_from_matching(line):
    """Trim ``line`` down to the text matched by the first applicable pattern.

    The patterns are tried in their declared order and the first one that
    matches wins, so pattern order affects the result.

    Args:
        line: a candidate line, or ``None``.

    Returns:
        The matched text, stripped of surrounding whitespace; ``line``
        unchanged when no pattern matches; ``None`` when ``line`` is ``None``.
    """
    if line is None:
        return line
    for program in _VARA_START_PATTERNS:
        # Search once and reuse the match object instead of searching twice.
        match = program.search(line)
        if match is not None:
            return match.group(0).strip()
    return line

In [69]:
# Separator that ends the value: a run of two or more whitespace characters,
# a comma, or a hyphen. Written as a raw string: the previous non-raw literal
# relied on the invalid escape "\s".
_VARA_SEPARATOR = re.compile(r'(\s\s+|,|-)')


def varas_final_value(line):
    """Return the part of ``line`` that precedes the first separator.

    A separator is two or more consecutive whitespace characters, a comma, or
    a hyphen. Note that a hyphen inside a name therefore also cuts the value.

    Args:
        line: extracted text, or ``None``.

    Returns:
        The leading field stripped of surrounding whitespace, the whole
        stripped line when it contains no separator, or ``None`` when
        ``line`` is ``None``.
    """
    if line is None:
        return line
    # Only the first field is used, so stop splitting after one separator.
    return _VARA_SEPARATOR.split(line, maxsplit=1)[0].strip()
        
        

In [72]:
def get_vara(text):
    """Run the full extraction pipeline on one document.

    The steps are: select lines mentioning "vara", keep those matching a known
    pattern, take the first one, trim it to the matched portion, and cut it at
    the first separator.

    Args:
        text: full document text.

    Returns:
        Whatever ``varas_final_value`` returns for the first surviving line.
    """
    candidates = varas_filter(varas_identify(text))
    matched = varas_from_matching(varas_first(candidates))
    return varas_final_value(matched)

In [73]:
# Run the extraction pipeline over every document yielded by document_iterator
# and collect one result (a string or None) per document, in iteration order.
varas = []
for text in document_iterator(DIR):
    varas.append(get_vara(text))

# Bare last expression so the notebook displays the collected list.
varas
    
    
    

[None,
 'VARA DO JUIZADO ESPECIAL CIVEL E CRIMINAL',
 '2a VARA',
 None,
 None,
 None,
 None,
 '4a VARA CIVEL',
 '4a VARA CIVEL',
 '3a Vara Civel',
 None,
 None,
 None,
 None,
 '15" VARA CEVEL',
 'VARA DO OFICIO DA FAMILIA E SUCESSOES',
 '2a Vara de Direito Bancario',
 '1a VARA CIVEL DE FRANCISCO BELTRAO',
 None,
 None,
 'TERCEIRA VARA CIVEL DE BRASILIA',
 'VARA CIVEL DE PLANALTINA',
 'VIGESIMA VARA CIVEL DE BRASILIA',
 None,
 '1a VARA CIVEL',
 None,
 None,
 '13a Vara Civel',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 '29a Vara Civel',
 None,
 '22a Vara Civel',
 '6a Vara Civel',
 '4a VARA CIVEL',
 '2a VARA CIVEL DA COMARCA DE VIAMAO',
 '2a VARA CIVEL',
 '1a VARA CIVEL',
 '2a VARA CIVEL DO FORO CENTRAL',
 '4a VARA CIVEL',
 '1a VARA CIVEL',
 None,
 '3a VARA CIVEL',
 '5a VARA CIVEL',
 None,
 'TERCEIRA VARA',
 None,
 '7a Vara do Juizado Especial desta Comarca',
 '17a Vara Civel de Competencia Especial',
 None,
 None,
 '3a Vara Civel Residual da Comarca de 