In [1]:
from lxml import etree
import re
import os
import sqlite3
import whoosh.index
import whoosh.fields

In [2]:
conn = sqlite3.connect('rechtspraak.db')
c = conn.cursor()

In [3]:
def retrieve_from_web(ecli):
    link = 'http://data.rechtspraak.nl/uitspraken/content?id='+ecli
    return etree.ElementTree().parse(link)

def retrieve_from_filesystem(ecli, rootpath):
    year = ecli[11:15]
    fn = str(year)+'/'+re.sub(':', '_', ecli)+'.xml'
    path = os.path.join(rootpath, fn)
    try:
        return etree.ElementTree().parse(path)
    except Exception as e: 
        print('Exception: ', path)
        print(e)
        return None

In [2]:
rootpath = '/media/sf_VBox_Shared/CaseLaw/'
xml_path = os.path.join(rootpath, 'OpenDataUitspraken/')

In [3]:
indexpath = os.path.join(rootpath, 'index')
if not os.path.exists(indexpath):
        os.makedirs(indexpath)

In [6]:
schema = whoosh.fields.Schema(path=whoosh.fields.ID(stored=True), 
                              content=whoosh.fields.TEXT)

In [4]:
#ix = whoosh.index.create_in(indexpath, schema)
ix = whoosh.index.open_dir(indexpath)

In [8]:
def get_uitspraak_text(id0, element):
    uitspraken = list(element.iterchildren('{*}uitspraak'))
    if len(uitspraken)>0:
        uitspraak = uitspraken[0]
        uitspraak_xml = etree.tostring(uitspraak)
        uitspraak_text = ' '.join([e.text for e in uitspraak.iterdescendants() if e.text is not None])
        #remove consecutive spaces
        uitspraak_text = re.sub(' +',' ', uitspraak_text)
        return uitspraak_text

In [None]:
ids = c.execute('SELECT id from uitspraken_meta').fetchall()#.fetchmany(10)
writer = ix.writer()
i = 0
batchsize = 100
for row in ids:
    ecli = row[0]
    xml_tree = retrieve_from_filesystem(ecli, xml_path)
    if xml_tree is not None:
        uitspraak_text = get_uitspraak_text(ecli, xml_tree)
        writer.add_document(path=re.sub(':', '_', ecli)+'.xml',
                    content=uitspraak_text)
        i += 1
        if i%batchsize == 0:
            print(i)
            writer.commit()
            writer = ix.writer()
writer.commit()

In [5]:
ix.doc_count()

27000

In [10]:
from whoosh.qparser import QueryParser
query = 'aansprakelijkheid'
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(query)
    results = searcher.search(query)
    print(results.is_empty())

False


In [11]:
results.items()

<generator object Results.items.<locals>.<genexpr> at 0x7f5b8a749e08>

In [14]:
print(results.top_n)

[(8.00556356228712, 13078), (8.002779512479837, 26714), (7.769660772177065, 16000), (7.754754740713833, 20992), (7.686062655462563, 23641), (7.611378204505405, 5856), (7.5558330897620385, 26527), (7.5512970325708055, 26168), (7.523949516783552, 14232), (7.448067248644524, 19224)]


In [30]:
ix.close()

In [13]:
conn.close()