## Further text preprocessing

### Pre-requisites: Downloading nltk Dutch stopwords, data handling tools, model preprocessing & plotting tools, and SpaCy model

In [1]:
import nltk
nltk.download('stopwords', 'dutch')

[nltk_data] Downloading package stopwords to dutch...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [3]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
# SpaCy for lemmatization
import spacy

In [5]:
# Plotting tools
!pip install pyLDAvis
###!pip install gensimvis.py

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
###import pyLDAvis.gensim as gensimvis

import matplotlib.pyplot as plt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  from collections import Iterable


In [6]:
# Ignoring warnings
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

### Preparing stopwords

In [7]:
# NLTK Stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = nltk.corpus.stopwords.words('dutch')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Tokenizing words and cleaning-up text

In [8]:
# Importing file handling library
import os

# Opening one sample Dutch legal text document
file = open('drive/MyDrive/numac=2019041722.txt','rt')
dutch_text = file.read()
file.close()

In [9]:
# Putting all words in lowercase
text = dutch_text.replace('\n', '')
raw_text = text.lower()
print(raw_text)

verslag aan de koning sire, het ontwerp van koninklijk besluit dat wij de eer hebben aan uwe majesteit voor te leggen, beoogt de uitvoering van de artikelen 93 ter tot 93 quinquies van het wetboek van de belasting over de toegevoegde waarde (hierna "wbtw"), de artikelen 412 bis, 433 tot 435 van het wetboek van de inkomstenbelastingen 1992 (hierna "wib 92"), de artikelen 35 tot 37 en 43 tot 45 en 47 van het wetboek van de minnelijke en gedwongen invordering van fiscale en niet-fiscale schuldvorderingen (hierna "invorderingswetboek") en de artikelen 157 tot 159 en 161 van de programmawet (i) van 29 maart 2012 (hierna "programmawet", zoals gewijzigd door de wet van 11 februari 2019 houdende fiscale, fraude bestrijdende, financiële alsook diverse bepalingen en de wet van 23 april 2020 houdende wijzigingen van het wetboek van de belasting over de toegevoegde waarde, het wetboek van de inkomstenbelastingen 1992, het wetboek van de minnelijke en gedwongen invordering van fiscale en niet-fisca

In [10]:
# Splitting text document on the basis of each word
content = raw_text.split(" ")
print(content)

['verslag', 'aan', 'de', 'koning', 'sire,', 'het', 'ontwerp', 'van', 'koninklijk', 'besluit', 'dat', 'wij', 'de', 'eer', 'hebben', 'aan', 'uwe', 'majesteit', 'voor', 'te', 'leggen,', 'beoogt', 'de', 'uitvoering', 'van', 'de', 'artikelen', '93', 'ter', 'tot', '93', 'quinquies', 'van', 'het', 'wetboek', 'van', 'de', 'belasting', 'over', 'de', 'toegevoegde', 'waarde', '(hierna', '"wbtw"),', 'de', 'artikelen', '412', 'bis,', '433', 'tot', '435', 'van', 'het', 'wetboek', 'van', 'de', 'inkomstenbelastingen', '1992', '(hierna', '"wib', '92"),', 'de', 'artikelen', '35', 'tot', '37', 'en', '43', 'tot', '45', 'en', '47', 'van', 'het', 'wetboek', 'van', 'de', 'minnelijke', 'en', 'gedwongen', 'invordering', 'van', 'fiscale', 'en', 'niet-fiscale', 'schuldvorderingen', '(hierna', '"invorderingswetboek")', 'en', 'de', 'artikelen', '157', 'tot', '159', 'en', '161', 'van', 'de', 'programmawet', '(i)', 'van', '29', 'maart', '2012', '(hierna', '"programmawet",', 'zoals', 'gewijzigd', 'door', 'de', 'wet',

In [18]:
# Using list comprehension + split()
# Tokenizing strings in list of strings
data_words = [sub.split() for sub in content]
print(data_words)

[['verslag'], ['aan'], ['de'], ['koning'], ['sire,'], ['het'], ['ontwerp'], ['van'], ['koninklijk'], ['besluit'], ['dat'], ['wij'], ['de'], ['eer'], ['hebben'], ['aan'], ['uwe'], ['majesteit'], ['voor'], ['te'], ['leggen,'], ['beoogt'], ['de'], ['uitvoering'], ['van'], ['de'], ['artikelen'], ['93'], ['ter'], ['tot'], ['93'], ['quinquies'], ['van'], ['het'], ['wetboek'], ['van'], ['de'], ['belasting'], ['over'], ['de'], ['toegevoegde'], ['waarde'], ['(hierna'], ['"wbtw"),'], ['de'], ['artikelen'], ['412'], ['bis,'], ['433'], ['tot'], ['435'], ['van'], ['het'], ['wetboek'], ['van'], ['de'], ['inkomstenbelastingen'], ['1992'], ['(hierna'], ['"wib'], ['92"),'], ['de'], ['artikelen'], ['35'], ['tot'], ['37'], ['en'], ['43'], ['tot'], ['45'], ['en'], ['47'], ['van'], ['het'], ['wetboek'], ['van'], ['de'], ['minnelijke'], ['en'], ['gedwongen'], ['invordering'], ['van'], ['fiscale'], ['en'], ['niet-fiscale'], ['schuldvorderingen'], ['(hierna'], ['"invorderingswetboek")'], ['en'], ['de'], ['art

### Creating bigrams and trigrams

In [19]:
# Building the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
###print(trigram_mod[bigram_mod[data_words]])

<gensim.interfaces.TransformedCorpus object at 0x7f028dfeb4d0>




###  Removing stopwords, making bigrams & trigtrams, and lemmatizing

In [20]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

### Initializing SpaCy's Dutch NLP model (large size)

In [16]:
 !pip install -U spacy
 !python -m spacy download nl_core_news_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 5.1 MB/s 
[?25hCollecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.5 MB/s 
[?25hCollecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 32.9 MB/s 
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (457 kB)
[K     |████████████████████████████████| 457 kB 60.3 MB/s 
[?25hCollecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.1-py3-none-any.whl (27 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting spacy-legacy<3.1.0,>=3.0.9

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nl-core-news-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/nl_core_news_lg-3.3.0/nl_core_news_lg-3.3.0-py3-none-any.whl (568.1 MB)
[K     |████████████████████████████████| 568.1 MB 8.5 kB/s 
Installing collected packages: nl-core-news-lg
Successfully installed nl-core-news-lg-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('nl_core_news_lg')


### Calling the functions in order

In [21]:
 # Removing Stop Words
data_words_nostops = remove_stopwords(data_words)

# Forming Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Loading the SpaCy 'nl' model, keeping only tagger component (for efficiency)
nlp = spacy.load('nl_core_news_lg', disable=['parser', 'ner'])

# Doing lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized)

[['verslag'], [], [], ['koning'], ['sire'], [], ['ontwerp'], [], ['koninklijk'], ['besluit'], [], [], [], ['eer'], [], [], ['uwe'], ['majesteit'], [], [], ['leggen'], ['beogen'], [], ['uitvoering'], [], [], ['artikel'], [], [], [], [], ['quinquie'], [], [], ['wetboek'], [], [], ['belasting'], [], [], ['toegevoegde'], ['waarde'], ['hierna'], ['wbtw'], [], ['artikel'], [], ['bis'], [], [], [], [], [], ['wetboek'], [], [], [], [], ['hierna'], [], [], [], ['artikel'], [], [], [], [], [], [], [], [], [], [], [], ['wetboek'], [], [], ['minnelijk'], [], ['dwingen'], ['invordering'], [], ['fiscaal'], [], ['fiscaal'], [], ['hierna'], [], [], [], ['artikel'], [], [], [], [], [], [], [], ['programma_wet'], [], [], [], [], [], ['hierna'], ['programma_wet'], [], ['wijzigen'], [], [], ['wet'], [], [], [], [], ['houdenen'], ['fiscaal'], ['_fraude'], ['bestrijden'], ['financieel'], [], ['divers'], ['bepaling'], [], [], ['wet'], [], [], [], [], ['houdenen'], ['wijziging'], [], [], ['wetboek'], [], [], 

**NOTE**: The empty lists (within this extensive python list of individual Dutch words) are negligible down the line, especially when it comes to ***computing model perplexity and coherence score***. These empty lists are indices where the stopwords used to be (and removed henceforth). There's an option to strip this extensive list of these empty lists, but for another time. 

### Creating the Dictionary and Corpus needed for Topic Modeling

In [22]:
# Creating Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Creating Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Viewing corpus
print(corpus)

[[(0, 1)], [], [], [(1, 1)], [(2, 1)], [], [(3, 1)], [], [(4, 1)], [(5, 1)], [], [], [], [(6, 1)], [], [], [(7, 1)], [(8, 1)], [], [], [(9, 1)], [(10, 1)], [], [(11, 1)], [], [], [(12, 1)], [], [], [], [], [(13, 1)], [], [], [(14, 1)], [], [], [(15, 1)], [], [], [(16, 1)], [(17, 1)], [(18, 1)], [(19, 1)], [], [(12, 1)], [], [(20, 1)], [], [], [], [], [], [(14, 1)], [], [], [], [], [(18, 1)], [], [], [], [(12, 1)], [], [], [], [], [], [], [], [], [], [], [], [(14, 1)], [], [], [(21, 1)], [], [(22, 1)], [(23, 1)], [], [(24, 1)], [], [(24, 1)], [], [(18, 1)], [], [], [], [(12, 1)], [], [], [], [], [], [], [], [(25, 1)], [], [], [], [], [], [(18, 1)], [(25, 1)], [], [(26, 1)], [], [], [(27, 1)], [], [], [], [], [(28, 1)], [(24, 1)], [(29, 1)], [(30, 1)], [(31, 1)], [], [(32, 1)], [(33, 1)], [], [], [(27, 1)], [], [], [], [], [(28, 1)], [(34, 1)], [], [], [(14, 1)], [], [], [(15, 1)], [], [], [(16, 1)], [(17, 1)], [], [(14, 1)], [], [], [], [], [], [(14, 1)], [], [], [(21, 1)], [], [(22, 1)

Gensim creates a unique id for each word in the document. The produced corpus shown above is a mapping of (word_id, word_frequency).

For example, (0, 1) above implies, **word id 0 occurs once** in the document. Likewise, **word id 1 occurs once too**, and so on.

This is used as the input by the LDA model.

If you want to see what word a given id corresponds to, pass the id as a key to the dictionary. (From https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)

In [23]:
# Passing the id as a key to the dictionary to see what word a given ID corresponds to
id2word[0]

'verslag'

In [25]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus]

[[('verslag', 1)],
 [],
 [],
 [('koning', 1)],
 [('sire', 1)],
 [],
 [('ontwerp', 1)],
 [],
 [('koninklijk', 1)],
 [('besluit', 1)],
 [],
 [],
 [],
 [('eer', 1)],
 [],
 [],
 [('uwe', 1)],
 [('majesteit', 1)],
 [],
 [],
 [('leggen', 1)],
 [('beogen', 1)],
 [],
 [('uitvoering', 1)],
 [],
 [],
 [('artikel', 1)],
 [],
 [],
 [],
 [],
 [('quinquie', 1)],
 [],
 [],
 [('wetboek', 1)],
 [],
 [],
 [('belasting', 1)],
 [],
 [],
 [('toegevoegde', 1)],
 [('waarde', 1)],
 [('hierna', 1)],
 [('wbtw', 1)],
 [],
 [('artikel', 1)],
 [],
 [('bis', 1)],
 [],
 [],
 [],
 [],
 [],
 [('wetboek', 1)],
 [],
 [],
 [],
 [],
 [('hierna', 1)],
 [],
 [],
 [],
 [('artikel', 1)],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('wetboek', 1)],
 [],
 [],
 [('minnelijk', 1)],
 [],
 [('dwingen', 1)],
 [('invordering', 1)],
 [],
 [('fiscaal', 1)],
 [],
 [('fiscaal', 1)],
 [],
 [('hierna', 1)],
 [],
 [],
 [],
 [('artikel', 1)],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('programma_wet', 1)],
 [],
 [],
 [],
 [],
 [],
 [

### Building the topic model

In [26]:
# Building LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

###  Viewing the topics in LDA model

You can see the keywords for each topic and the weightage(importance) of each keyword using lda_model.print_topics() as shown next.

In [27]:
# Printing keywords for index 0 to 10
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.273*"wijzigen" + 0.161*"wet" + 0.112*"overheid_dienst" + 0.112*"federaal" '
  '+ 0.038*"notariaat" + 0.038*"hetzelfde" + 0.020*"streepje" + 0.010*"divers" '
  '+ 0.003*"financieel" + 0.003*"_fraude"'),
 (1,
  '0.292*"zien" + 0.159*"raadpleging" + 0.159*"voegen" + 0.159*"tabel" + '
  '0.051*"wbtw" + 0.027*"Beeldgezien" + 0.025*"kader" + 0.013*"associeren" + '
  '0.012*"vallen" + 0.004*"begunstigen"'),
 (2,
  '0.382*"bedoelen" + 0.081*"erfop_volging" + 0.065*"btw" + 0.065*"eenheid" + '
  '0.064*"verplichting" + 0.064*"bijzonder" + 0.002*"sire" + 0.002*"volledig" '
  '+ 0.002*"opnieuw" + 0.002*"lang"'),
 (3,
  '0.165*"quinquie" + 0.125*"belasten" + 0.090*"eigenaar" + 0.079*"houden" + '
  '0.069*"Hypotheek" + 0.041*"volgen" + 0.041*"opmaak" + 0.031*"machtigden" + '
  '0.031*"beheers_ysteem" + 0.012*"vermellen"'),
 (4,
  '0.729*"artikel" + 0.058*"zending" + 0.046*"bepalen" + 0.001*"maken" + '
  '0.001*"Beeldgezien" + 0.001*"hetzelfde" + 0.001*"woord" + '
  '0.001*"associeren" + 0.

**NOTE**: For each index (0 to 10), there's a top 10 keywords that contribute to this topic.

The weights reflect how important a keyword is to that topic.

Looking at these keywords, you can guess what this topic could be.

### Compute Model Perplexity and Coherence Score

In [28]:
# Computing Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Computing Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.148686727401539

Coherence Score:  0.8031396675007777
