In [1]:
from lxml import etree
import re
import os
import sqlite3
import lda
from sklearn.feature_extraction.text import CountVectorizer
import itertools
import pandas as pd
import gensim
import nltk

In [2]:
#nltk.download()

In [3]:
# Open the SQLite database file (the path is relative to the notebook's working
# directory) and create the cursor that the query cell uses.
conn = sqlite3.connect('rechtspraak.db')
c = conn.cursor()

In [4]:
# Read every (id, text) pair from the `uitspraken` table into memory at once.
rows = c.execute('SELECT id, text from uitspraken').fetchall()

In [5]:
# Snowball stemmer for Dutch; with ignore_stopwords=True stop words are returned
# unstemmed instead of being reduced to a stem.
stemmer = nltk.stem.snowball.DutchStemmer(ignore_stopwords=True)
def tokenize(text, language='dutch'):
    """Split ``text`` into stemmed, purely alphanumeric tokens.

    The text is tokenized with NLTK using the Punkt model for ``language``
    (Dutch by default, matching the Dutch stemmer; NLTK's own default is
    English). Tokens containing any non-alphanumeric character, such as
    punctuation, are dropped, and the remaining tokens are stemmed.

    Parameters
    ----------
    text : str
        Raw document text.
    language : str, optional
        Name of the Punkt model passed to ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        The stemmed tokens in document order.
    """
    tokenized = nltk.word_tokenize(text, language=language)
    return [stemmer.stem(w) for w in tokenized if w.isalnum()]

Gensim

In [6]:
# Tokenize and stem the text of every row; the ids are not needed at this stage.
texts = [tokenize(doc_text) for _doc_id, doc_text in rows]

In [7]:
# Quick sanity check of the tokenizer on a string mixing letters, digits,
# brackets and a '#' prefix.
s = "abd321a 324 [23a] #ABD aDc"
stemmer.stem('23a')  # result is discarded: only the cell's last expression is displayed
tokenize(s)

['abd321a', '324', '23a', 'abd', 'adc']

In [8]:
# Build the gensim Dictionary (token <-> integer id mapping) from all tokenized documents.
dictionary = gensim.corpora.Dictionary(texts)

In [9]:
# Prune the vocabulary by document frequency: drop tokens that occur in fewer
# than `min_count` documents or in more than 70% of all documents, then keep at
# most `max_features` of the most frequent remaining tokens. No explicit
# stop-word list is applied here; very common words are only removed through
# the `no_above` cap.
from six import iteritems  # NOTE(review): not used in any cell visible here; confirm before removing
min_count = 20
max_features = 100000
dictionary.filter_extremes(no_below=min_count, no_above=0.7, keep_n=max_features)

In [10]:
# NOTE(review): gensim documents that filter_extremes already shrinks the id
# gaps it creates, so this call is presumably a harmless no-op; confirm against
# the installed gensim version.
dictionary.compactify()  # remove gaps in id sequence after words that were removed

In [11]:
# Show the vocabulary size and a few sample tokens after pruning.
print(dictionary)

Dictionary(12488 unique tokens: ['vervolgen', 'neerslag', 'juwelier', 'academisch', 'efficient']...)


In [22]:
# Persist the pruned dictionary so it can be reloaded without rebuilding it.
# NOTE(review): hard-coded path under /tmp; consider a configurable output directory.
dictionary.save('/tmp/dictionary.dict')

In [12]:
# Convert each tokenized document into a sparse bag-of-words vector, i.e. a list
# of (token_id, count) pairs restricted to the pruned dictionary. The already
# tokenized `texts` are reused so the tokenize/stem pass over every document is
# not run a second time.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

In [23]:
# Write the bag-of-words corpus to disk in Matrix Market format.
# NOTE(review): hard-coded path under /tmp; consider a configurable output directory.
gensim.corpora.MmCorpus.serialize('/tmp/hr.mm', corpus)

scipy

In [13]:
#vectorizer = CountVectorizer(max_df=0.8, tokenizer=tokenize)
#vectorizer = CountVectorizer(max_df=0.8, min_df=10, max_features=100000)

In [14]:
#wordcounts = vectorizer.fit_transform((text for (id0, text) in rows))

In [15]:
#wordcounts.shape

In [16]:
#len(vectorizer.get_feature_names())

In [17]:
#corpus = gensim.matutils.Sparse2Corpus(wordcounts)

In [18]:
# featurenames = vectorizer.get_feature_names()
# id2word = {i:featurenames[i] for i in range(len(featurenames))}
# len(id2word.keys())

In [19]:
# Fit a TF-IDF weighting model on the bag-of-words corpus, then apply it to that
# same corpus to obtain TF-IDF weighted document vectors.
tfidf = gensim.models.TfidfModel(corpus, id2word=dictionary)
corpus_tfidf = tfidf[corpus]

In [20]:
# Train a 200-topic LSI (latent semantic indexing) model on the TF-IDF weighted
# corpus, then project that same corpus into the LSI topic space.
lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
corpus_lsi = lsi[corpus_tfidf]

In [21]:
# Display the highest-weighted (stemmed) terms of the LSI topics.
lsi.print_topics()

[(0,
  '0.250*"belanghebb" + 0.214*"eiser" + 0.197*"verdacht" + 0.131*"verweerder" + 0.125*"beschik" + 0.124*"aanslag" + 0.090*"inspecteur" + 0.088*"rechtbank" + 0.084*"ik" + 0.082*"klacht"'),
 (1,
  '0.475*"eiser" + -0.379*"belanghebb" + 0.243*"verweerder" + -0.222*"aanslag" + -0.160*"volksverzeker" + -0.135*"inspecteur" + -0.133*"z" + -0.128*"x" + -0.125*"staatssecretaris" + 0.125*"verweerster"'),
 (2,
  '0.466*"verdacht" + -0.378*"eiser" + 0.227*"ik" + -0.181*"verweerder" + -0.156*"belanghebb" + 0.138*"slachtoffer" + 0.125*"schriftur" + -0.104*"aanslag" + 0.097*"terechtzit" + -0.093*"verweerster"'),
 (3,
  '-0.262*"klaarblijk" + -0.237*"2014" + -0.233*"2015" + -0.218*"80a" + -0.181*"verzoeker" + -0.178*"treur" + -0.173*"cassatieberoep" + -0.166*"th" + -0.165*"groeneveld" + -0.160*"organisatie"'),
 (4,
  '0.423*"beschik" + -0.379*"eiser" + 0.365*"vrouw" + 0.330*"verzoeker" + 0.330*"man" + 0.128*"verzoekster" + -0.103*"klaarblijk" + 0.099*"cassatierekest" + -0.097*"2015" + -0.097*"201

In [26]:
# Persist the trained LSI model.
# NOTE(review): hard-coded path under /tmp; consider a configurable output directory.
lsi.save('/tmp/model.lsi')

In [27]:
# Train a 100-topic LDA model with 3 passes over the TF-IDF weighted corpus.
# Earlier variant kept for reference (bag-of-words input, 20 passes):
#ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=100, passes=20, id2word=id2word)
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics=100, passes=3, id2word=dictionary)
# Infer topic distributions on the same representation the model was trained on
# (TF-IDF weights) rather than on the raw counts in `corpus`.
corpus_lda = ldamodel[corpus_tfidf]

In [28]:
# Display the highest-probability (stemmed) terms for a selection of LDA topics.
ldamodel.print_topics()

[(58,
  '0.164*onroer + 0.134*waarder + 0.064*beschik + 0.036*grond + 0.033*wet + 0.020*z + 0.019*81 + 0.018*betreff + 0.018*x + 0.015*184'),
 (56,
  '0.041*eer + 0.039*voorkeursrecht + 0.039*schenck + 0.032*verzwaard + 0.021*domein + 0.018*vervaltermijn + 0.014*185 + 0.014*gunn + 0.013*dm + 0.012*429'),
 (32,
  '0.081*the + 0.031*to + 0.029*and + 0.021*or + 0.019*hasjiesj + 0.018*jeugdig + 0.014*zittingsplat + 0.013*that + 0.012*for + 0.011*brak'),
 (85,
  '0.050*eiser + 0.032*hag + 0.028*verweerder + 0.024*eiseres + 0.021*verweerster + 0.020*schap + 0.015*verlop + 0.013*e + 0.012*aangeduid + 0.012*buruma'),
 (71,
  '0.236*onroerendezaakbelast + 0.041*sluiting + 0.037*heropen + 0.036*vandag + 0.036*navorderingstermijn + 0.029*audi + 0.024*bell + 0.024*vught + 0.020*operatie + 0.019*tek'),
 (83,
  '0.696*2014 + 0.058*2012 + 0.028*verweerschrift + 0.015*478 + 0.010*dexia + 0.007*belastingdruk + 0.006*klei + 0.006*wro + 0.006*rotterdam + 0.005*huwelijksgoederengemeenschap'),
 (39,
  '0.0

In [29]:
# Persist the trained LDA model.
# NOTE(review): hard-coded path under /tmp; consider a configurable output directory.
ldamodel.save('/tmp/model.lda')