In [1]:
from lxml import etree
import re
import os
import sqlite3
import whoosh.index
import whoosh.fields

In [2]:
conn = sqlite3.connect('../rechtspraak.db')
c = conn.cursor()

In [3]:
def retrieve_from_web(ecli):
    link = 'http://data.rechtspraak.nl/uitspraken/content?id='+ecli
    return etree.ElementTree().parse(link)

def retrieve_from_filesystem(ecli, rootpath):
    year = ecli[11:15]
    fn = str(year)+'/'+re.sub(':', '_', ecli)+'.xml'
    path = os.path.join(rootpath, fn)
    try:
        return etree.ElementTree().parse(path)
    except Exception as e: 
        print('Exception: ', path)
        print(e)
        return None

In [2]:
rootpath = '/media/sf_VBox_Shared/CaseLaw/'
xml_path = os.path.join(rootpath, 'OpenDataUitspraken/')

In [3]:
indexpath = os.path.join(rootpath, 'index')
if not os.path.exists(indexpath):
        os.makedirs(indexpath)

In [6]:
schema = whoosh.fields.Schema(path=whoosh.fields.ID(stored=True), 
                              content=whoosh.fields.TEXT)

In [4]:
#ix = whoosh.index.create_in(indexpath, schema)
ix = whoosh.index.open_dir(indexpath)

In [8]:
def get_uitspraak_text(id0, element):
    uitspraken = list(element.iterchildren('{*}uitspraak'))
    if len(uitspraken)>0:
        uitspraak = uitspraken[0]
        uitspraak_xml = etree.tostring(uitspraak)
        uitspraak_text = ' '.join([e.text for e in uitspraak.iterdescendants() if e.text is not None])
        #remove consecutive spaces
        uitspraak_text = re.sub(' +',' ', uitspraak_text)
        return uitspraak_text

In [None]:
ids = c.execute('SELECT id from uitspraken_meta').fetchall()#.fetchmany(10)
writer = ix.writer()
i = 0
batchsize = 100
for row in ids:
    ecli = row[0]
    xml_tree = retrieve_from_filesystem(ecli, xml_path)
    if xml_tree is not None:
        uitspraak_text = get_uitspraak_text(ecli, xml_tree)
        writer.add_document(path=re.sub(':', '_', ecli)+'.xml',
                    content=uitspraak_text)
        i += 1
        if i%batchsize == 0:
            print(i)
            writer.commit()
            writer = ix.writer()
writer.commit()

In [5]:
ix.doc_count()

27000

In [41]:
from whoosh.qparser import QueryParser
#query = 'werkgever aansprakelijkheid'
query = '7:658'
searcher = ix.searcher()
query = QueryParser("content", ix.schema).parse(query)
results = searcher.search(query, limit=None)
print(results.is_empty())
print(len(results))
print(results[0])

False
90
<Hit {'path': 'ECLI_NL_HR_2012_BV0616.xml'}>


In [20]:
for it in results.items():
    print(it)

(13078, 16.87403299472294)
(10291, 15.319609413568433)
(12075, 15.276825549466025)
(17727, 14.310729606720846)
(4471, 14.298012191872104)
(7925, 14.045200000942202)
(9715, 13.698638314053806)
(9588, 13.548476398463475)
(9693, 13.53002331979911)
(5632, 13.514518668271261)


In [37]:
results.scored_length()

122

In [38]:
res0 = results[0]
print(res0)

<Hit {'path': 'ECLI_NL_HR_2012_BV1295.xml'}>


In [39]:
res0.matched_terms()

NoTermsException: 

In [42]:
searcher.close()

In [43]:
ix.close()

In [13]:
conn.close()