In [3]:
from lxml import etree
import re
import os
import sqlite3

In [6]:
# Open the SQLite database file (relative path, so resolved against the current
# working directory); sqlite3 creates the file if it does not exist yet.
conn = sqlite3.connect('rechtspraak.db')
# Main cursor, used by the later cells for schema setup and read queries.
c = conn.cursor()

In [41]:
# (Re)create the uitspraken table from scratch.
# WARNING: re-running this cell drops the existing table together with all rows in it.
c.execute(''' DROP TABLE IF EXISTS uitspraken''')
# Columns: id (primary key), xml and text -- the three values that
# insert_into_uitspraken writes per row.
c.execute(''' CREATE TABLE uitspraken
            (id text PRIMARY KEY, 
            xml text,
            text text
            )
        ''')

<sqlite3.Cursor at 0x7f9154dda340>

In [40]:
def retrieve_from_web(ecli):
    """Download and parse the XML document for one ECLI from data.rechtspraak.nl.

    ecli -- identifier string; it is appended unchanged to the request URL.

    Returns the result of ``ElementTree().parse(url)`` (in lxml this is the
    root element of the parsed document). Any error raised while fetching or
    parsing propagates to the caller.
    """
    base_url = 'http://data.rechtspraak.nl/uitspraken/content?id='
    tree = etree.ElementTree()
    return tree.parse(base_url + ecli)

def retrieve_from_filesystem(ecli, rootpath):
    """Parse the locally stored XML file that belongs to one ECLI.

    The file is looked up at ``<rootpath>/<year>/<ecli with ':' -> '_'>.xml``.

    ecli     -- identifier string of the form ``ECLI:NL:<court>:<year>:<number>``.
    rootpath -- directory that contains the per-year sub-directories.

    Returns the result of ``ElementTree().parse(path)`` (in lxml this is the
    root element), or None when the file cannot be read or parsed; in that
    case the offending path is printed.
    """
    # Take the year from the fourth ':'-separated field, the same way
    # year_from_id does. The previous fixed slice ecli[11:15] was only correct
    # for two-character court codes ("HR"); e.g. "ECLI:NL:RBAMS:2014:1"
    # produced the directory "MS:2".
    parts = ecli.split(':')
    # Malformed ids keep the old best-effort behaviour: the lookup simply
    # fails below and None is returned.
    year = parts[3] if len(parts) > 3 else ''
    fn = year + '/' + ecli.replace(':', '_') + '.xml'
    path = os.path.join(rootpath, fn)
    try:
        return etree.ElementTree().parse(path)
    except Exception:
        # Deliberately best-effort: report the file and carry on. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, so a long-running import loop can still be interrupted.
        print('Exception: ', path)
        return None

In [5]:
# Root directory passed to retrieve_from_filesystem, which looks for the XML
# files in per-year sub-directories below it.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
rootpath = '/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/'

In [21]:

def year_from_id(text):
    """Return the fourth ':'-separated field of ``text`` converted to int.

    For an id such as 'ECLI:NL:HR:2014:1' this is 2014. Raises IndexError
    when there are fewer than four fields and ValueError when the field is
    not an integer.
    """
    fields = text.split(':')
    year_field = fields[3]
    return int(year_field)

def instantie_from_id(text):
    """Return the third ':'-separated field of ``text``.

    For an id such as 'ECLI:NL:HR:2014:1' this is 'HR'. Raises IndexError
    when there are fewer than three fields.
    """
    fields = text.split(':')
    instantie = fields[2]
    return instantie

# Expose the two Python helpers as one-argument SQL functions on this connection
# so they can be called from inside queries.
conn.create_function('year_from_id', 1, year_from_id)
conn.create_function('instantie_from_id', 1, instantie_from_id)

# Fetch at most 10 ids from uitspraken_meta whose year field is 2014.
# NOTE(review): uitspraken_meta is not created in this notebook; it must already
# exist in the database.
ECLIs = c.execute('SELECT id FROM uitspraken_meta where year_from_id(id) = 2014').fetchmany(10)
# Each fetched row is a 1-tuple; keep only the id value.
ECLIs = [s[0] for s in ECLIs]
# Bare expression so the notebook displays the list.
ECLIs

[u'ECLI:NL:HR:2014:1',
 u'ECLI:NL:HR:2014:10',
 u'ECLI:NL:HR:2014:100',
 u'ECLI:NL:HR:2014:1000',
 u'ECLI:NL:HR:2014:1001',
 u'ECLI:NL:HR:2014:1002',
 u'ECLI:NL:HR:2014:1003',
 u'ECLI:NL:HR:2014:1004',
 u'ECLI:NL:HR:2014:1005',
 u'ECLI:NL:HR:2014:1006']

In [74]:
def insert_into_uitspraken(id0, element, curs):
    """Store the first ``uitspraak`` child of ``element`` in the uitspraken table.

    id0     -- value written to the ``id`` column (the import loop passes the ECLI).
    element -- parsed document element; its direct children are searched for
               a tag with local name ``uitspraak`` in any namespace.
    curs    -- cursor on which the INSERT is executed; nothing is committed here.

    Does nothing when ``element`` has no such child. An existing row with the
    same id is replaced. Returns None.
    """
    # Use the `element` argument. This function previously read the
    # notebook-global `el`, which only worked because the calling loop happened
    # to use that name; on a fresh kernel (or with any other caller) it raised
    # NameError or silently processed the wrong document.
    uitspraak = next(element.iterchildren('{*}uitspraak'), None)
    if uitspraak is None:
        return
    uitspraak_xml = etree.tostring(uitspraak)
    # itertext() yields both .text and .tail of every node in the subtree.
    # Collecting only `.text` of the descendants dropped all tail text, i.e.
    # everything that follows an inline child element inside a paragraph.
    uitspraak_text = ' '.join(uitspraak.itertext())
    # remove consecutive spaces
    uitspraak_text = re.sub(' +', ' ', uitspraak_text)
    query = ''' INSERT OR REPLACE INTO uitspraken
        VALUES (?, ?, ?)
        '''
    curs.execute(query, (id0, uitspraak_xml, uitspraak_text))
        

In [None]:
# Walk over every id in uitspraken_meta, load its XML file from disk and store
# the uitspraak in the uitspraken table.
ids = c.execute('SELECT id from uitspraken_meta')
# Separate cursor for the inserts, so executing them does not reset the SELECT
# that is still being iterated on `c`.
c2 = conn.cursor()
for row in ids:
    ecli = row[0]
    # NOTE(review): insert_into_uitspraken may rely on the global name `el`
    # rather than on its `element` argument -- keep this variable name unless
    # that function is confirmed to use its parameter.
    el = retrieve_from_filesystem(ecli, rootpath)
    # None means the file could not be loaded (retrieve_from_filesystem has
    # already printed the path); skip it.
    if el is not None:
        insert_into_uitspraken(ecli, el, c2)
# Single commit after the whole loop.
conn.commit()

('Exception: ', u'/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/1994/ECLI_NL_HR_1994_AA2975.xml')
('Exception: ', u'/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/1994/ECLI_NL_HR_1994_AA2980.xml')
('Exception: ', u'/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/1994/ECLI_NL_HR_1994_AA2992.xml')
('Exception: ', u'/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/1994/ECLI_NL_HR_1994_AA2998.xml')
('Exception: ', u'/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/1994/ECLI_NL_HR_1994_AA3003.xml')
('Exception: ', u'/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/1995/ECLI_NL_HR_1995_AA1634.xml')
('Exception: ', u'/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/1995/ECLI_NL_HR_1995_AA1638.xml')
('Exception: ', u'/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/1996/ECLI_NL_HR_1996_AA1888.xml')
('Exception: ', u'/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/1996/ECLI_NL_HR_1996_AA1896.xml')
('Exception: ', u'/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/1996/ECLI_NL_HR_1996_AA1

In [76]:
# Sanity check: number of rows now present in uitspraken.
c.execute('SELECT count(*) from uitspraken').fetchall()

[(27056,)]

In [78]:
# Persist any pending changes, then release the database connection.
# After close() the cursors created from `conn` can no longer be used.
conn.commit()
conn.close()