In [1]:
from lxml import etree
import re
import os
import sqlite3

In [2]:
# Open the SQLite database holding the rulings and create the cursor that
# the read queries further down run on.
database_path = 'rechtspraak.db'
conn = sqlite3.connect(database_path)
c = conn.cursor()

Dit zijn de mogelijke wetboeken.
Ieder wetboek heeft een naam (afkorting):
* Strafrecht (sw of sr),
* Strafvordering (sv),
* Burgerlijk Wetboek (bwb, 7 boeken, bijv. 7:658),
* AWB (Algemene wet bestuursrecht, meerdere gedeelten),
* EVRM (evrm, Europees Verdrag voor de Rechten van de Mens)

In [4]:
import nltk
# Fetch every ruling as an (id, text) pair, then turn each lower-cased text
# into an nltk.Text of word tokens; texts[i] corresponds to rows[i].
select_uitspraken = 'SELECT id, text from uitspraken'
rows = c.execute(select_uitspraken).fetchall()
texts = [
    nltk.Text(nltk.word_tokenize(uitspraak_text.lower()))
    for _uitspraak_id, uitspraak_text in rows
]

In [94]:
# Small hand-written sentence with several article references, tokenized the
# same way as the rulings (lower-cased, word_tokenize) for trying out the regex.
example_sentence = "art. 1:23 BWB en artikel 4 van de flora- en faunawet, art 4, lid 4 awb"
example = nltk.Text(nltk.word_tokenize(example_sentence.lower()))
print(example)

<Text: art . 1:23 bwb en artikel 4 van...>


In [100]:
# Abbreviations of the law books that may directly follow an article number.
known_books = ['sw', 'sr', 'sv', 'bwb', 'awb', 'evrm', 'ro']

# Token pattern (nltk TokenSearcher syntax: one <...> per token) that matches
# exactly one of the known abbreviations.
regex_knownbooks = r"(?:<{}>)".format("|".join(known_books))
print(regex_knownbooks)

# Token pattern for a full article reference, assembled piece by piece.
article_regex = "".join([
    r"<art(?:ikel)?> ",               # "art" or "artikel"
    r"<\.>? ",                        # optional "." token after the abbreviation
    r"<[0-9]+(?::[0-9]+)?> ",         # article number, optionally "book:article"
    r"(?:<,>? <lid> <[0-9]+>)?",      # optional ", lid N"
    r"(?:<van> <het|de> <.*> | ",     # either "van het/de <any token>" ...
    regex_knownbooks,                 # ... or a known law-book abbreviation
    ")",
])
print(article_regex)

(?:<sw|sr|sv|bwb|awb|evrm|ro>)
<art(?:ikel)?> <\.>? <[0-9]+(?::[0-9]+)?> (?:<,>? <lid> <[0-9]+>)?(?:<van> <het|de> <.*> | (?:<sw|sr|sv|bwb|awb|evrm|ro>))


In [114]:
def get_articles(text):
    """Extract the article references found in a tokenized text.

    Searches ``text`` with the module-level ``article_regex`` using
    ``nltk.TokenSearcher``. For every match, the article number is the first
    token of the match that looks like ``<digits>`` or ``<digits>:<digits>``,
    and the article name is the last token of the match.

    Parameters
    ----------
    text
        A token sequence accepted by ``nltk.TokenSearcher`` (the notebook
        passes ``nltk.Text`` objects).

    Returns
    -------
    list of tuple
        One ``(art_number, art_name)`` pair per match, in match order;
        duplicates are kept.
    """
    number_pattern = "<[0-9]+(?::[0-9]+)?> "
    references = nltk.TokenSearcher(text).findall(article_regex)
    return [
        (nltk.TokenSearcher(tokens).findall(number_pattern)[0][0], tokens[-1])
        for tokens in references
    ]

In [118]:
from collections import Counter
# Build one (ruling id, article number, article name, count) record for every
# distinct article reference in every ruling, most frequent first per ruling.
uitspraak_article = []
for row, tokens in zip(rows, texts):
    reference_counts = Counter(get_articles(tokens))
    uitspraak_article.extend(
        (row[0], art_number, art_name, cnt)
        for (art_number, art_name), cnt in reference_counts.most_common()
    )

In [119]:
import pandas as pd
# Turn the collected records into a DataFrame with named columns and show
# the first rows.
article_columns = ['id', 'art_number', 'art_name', 'cnt']
uitspraak_article = pd.DataFrame(data=uitspraak_article, columns=article_columns)
uitspraak_article.head()

Unnamed: 0,id,art_number,art_name,cnt
0,ECLI:NL:HR:1988:AD0289,288,sr,3
1,ECLI:NL:HR:1988:AD0289,225,sr,2
2,ECLI:NL:HR:1988:AD0289,422,sv,2
3,ECLI:NL:HR:1988:AD0289,287,sr,1
4,ECLI:NL:HR:1988:AD0289,297,sv,1


In [123]:
# (Re)create the uitspraken_articles table: any existing table is dropped
# first, so rerunning this cell starts from an empty table.
drop_articles_sql = '''
        DROP TABLE IF EXISTS uitspraken_articles
'''
create_articles_sql = ''' CREATE TABLE uitspraken_articles
            (id text, 
            article_name text,
            article_number text,
            cnt integer
            )
        '''
c2 = conn.cursor()
c2.execute(drop_articles_sql)
c2.execute(create_articles_sql)

<sqlite3.Cursor at 0x7f7c6aa1e2d0>

In [144]:
from collections import Counter

# Store the extracted references in uitspraken_articles, one row per
# (ruling id, article) pair.
#
# The column list is spelled out so the insert does not rely on the physical
# column order of the table.
insert_query = ''' INSERT INTO uitspraken_articles
        (id, article_name, article_number, cnt)
        VALUES (?, ?, ?, ?)
        '''

# Iterate over the column values directly instead of label-indexing with
# uitspraak_article[col][i], and convert cnt to a native int: pandas yields
# numpy integer scalars, which sqlite3 does not treat as a native int on
# Python 3.
insert_values = [
    (uitspraak_id, art_name, art_number, int(cnt))
    for uitspraak_id, art_name, art_number, cnt in zip(
        uitspraak_article['id'],
        uitspraak_article['art_name'],
        uitspraak_article['art_number'],
        uitspraak_article['cnt'],
    )
]

# A single executemany call replaces the per-row execute loop.
c2.executemany(insert_query, insert_values)

In [122]:
# Total number of references per (article number, law book), most cited first.
# Only `cnt` is aggregated so the string `id` column is never summed, and
# sort_values is used because DataFrame.sort is deprecated and absent from
# newer pandas releases.
(
    uitspraak_article
    .groupby(['art_number', 'art_name'])[['cnt']]
    .sum()
    .sort_values('cnt', ascending=False)
    .head(30)
)

  if __name__ == '__main__':


Unnamed: 0_level_0,Unnamed: 1_level_0,cnt
art_number,art_name,Unnamed: 2_level_1
81,ro,6896
6,evrm,1307
81,wet,997
457,sv,511
8,evrm,457
440,sv,369
415,sv,276
15,wet,232
10,evrm,208
94,sv,200


In [145]:
# How many links do we have?
# Returns a single row: (number of distinct ruling ids, total number of rows)
# in uitspraken_articles.
link_count_query = '''select count(distinct id), count(*) from uitspraken_articles'''
c.execute(link_count_query).fetchall()

[(13934, 22159)]

In [146]:
# Commit the pending changes on the connection, then close it. The commit
# must come first: closing does not commit on its own.
conn.commit()
conn.close()