In [1]:
from lxml import etree
import re
import os
import sqlite3

In [42]:
conn = sqlite3.connect('rechtspraak.db')
c = conn.cursor()

Dit zijn de mogelijke wetboeken:
Ieder wetboek heeft een naam: 
* Strafrecht (sw of sr), 
* Strafvorderingen (sv), 
* Burgelijk WB (bwb, 7 boeken 7:658), 
* AWB (algemene wet bestuursrecht, meerdere gedeelten)
* evrm = europees verdrag rechten van de mens

In [31]:
import nltk
rows = c.execute('SELECT id, text from uitspraken').fetchall()

In [3]:
texts = [nltk.Text(nltk.word_tokenize(text.lower())) for id0, text in rows]

In [19]:
example = nltk.Text(nltk.word_tokenize("art. 1:23 BWB en artikel 4b van de flora- en faunawet, art 4, lid 4 awb en ook art 42 van de wet bladibla".lower()))
print(example)

<Text: art . 1:23 bwb en artikel 4b van...>


In [24]:
known_books = ['sw','sr', 'sv', 'bwb', 'bw', 'awb', 'evrm', 'ro', 'zfw']
regex_knownbooks = r"(?:<" + "|".join(known_books) +r">)"
print(regex_knownbooks)
article_regex = r"<art(?:ikel)?> " +\
                r"<\.>? " +\
                r"<[0-9]+(?::[0-9]+)?[a-z]*> " +\
                r"(?:<,>? <lid> <[0-9]+>)?" +\
                r"(?:<van> <het|de> <wet>? <.*> | " +\
                regex_knownbooks + ")"
print(article_regex)

(?:<sw|sr|sv|bwb|bw|awb|evrm|ro|zfw>)
<art(?:ikel)?> <\.>? <[0-9]+(?::[0-9]+)?[a-z]*> (?:<,>? <lid> <[0-9]+>)?(?:<van> <het|de> <wet>? <.*> | (?:<sw|sr|sv|bwb|bw|awb|evrm|ro|zfw>))


In [25]:
example.findall(article_regex)

art . 1:23 bwb; artikel 4b van de flora-; art 4 , lid 4 awb; art 42
van de wet bladibla


In [37]:
def get_articles(text):
    verwijzingen = nltk.TokenSearcher(text).findall(article_regex)
    result = []
    for v in verwijzingen:
        art_number = nltk.TokenSearcher(v).findall("<[0-9]+(?::[0-9]+)?[a-z]*> ")[0][0]
        art_name = v[-1]
        result.append((art_number, art_name))
    return result

In [38]:
from collections import Counter
uitspraak_article = []
for i in range(len(texts)):
    articles = get_articles(texts[i])
    for (art_number, art_name), cnt in Counter(articles).most_common():
        uitspraak_article.append((rows[i][0], art_number, art_name, cnt))

In [39]:
import pandas as pd
uitspraak_article = pd.DataFrame(uitspraak_article, columns=['id', 'art_number', 'art_name', 'cnt'])
uitspraak_article.head()

Unnamed: 0,id,art_number,art_name,cnt
0,ECLI:NL:HR:1988:AD0289,288,sr,3
1,ECLI:NL:HR:1988:AD0289,225,sr,2
2,ECLI:NL:HR:1988:AD0289,422,sv,2
3,ECLI:NL:HR:1988:AD0289,287,sr,1
4,ECLI:NL:HR:1988:AD0289,297,sv,1


In [44]:
c2 = conn.cursor()
c2.execute('''
        DROP TABLE IF EXISTS uitspraken_articles
''')
c2.execute(''' CREATE TABLE uitspraken_articles
            (id text, 
            article_name text,
            article_number text,
            cnt integer
            )
        ''')

<sqlite3.Cursor at 0x7f6570230d50>

In [45]:
from collections import Counter

for i in range(len(uitspraak_article)):
    values = (uitspraak_article['id'][i], uitspraak_article['art_name'][i], 
              uitspraak_article['art_number'][i], uitspraak_article['cnt'][i])
    query = ''' INSERT INTO uitspraken_articles
        VALUES (?, ?, ?, ?)
        '''
    c2.execute(query, values)

In [46]:
uitspraak_article.groupby(['art_number', 'art_name']).sum().sort('cnt', ascending=False).head(30)

  if __name__ == '__main__':


Unnamed: 0_level_0,Unnamed: 1_level_0,cnt
art_number,art_name,Unnamed: 2_level_1
81,ro,6917
5a,administratieve,1471
6,evrm,1307
81,op,976
80a,op,623
80a,ro,599
457,sv,511
359a,sv,469
8,evrm,457
552a,sv,420


In [47]:
uitspraak_article.groupby(['art_name']).sum().sort('cnt', ascending=False).head(30)

  if __name__ == '__main__':


Unnamed: 0_level_0,cnt
art_name,Unnamed: 1_level_1
ro,8106
bw,7011
sv,6612
op,3392
sr,2999
evrm,2499
wetboek,2282
administratieve,1497
awb,913
algemene,841


In [None]:
#How many links do we have?
c.execute('''select count(distinct id), count(*) from uitspraken_articles''').fetchall()

In [49]:
conn.commit()
conn.close()

ProgrammingError: Cannot operate on a closed database.