# Information extraction (30th October 2021)

This notebook applies word2vec to the corpus of tribunal decisions.

In particular, the notebook does:

1. Data preparation for word2vec.

2. Implementation of word2vec.

3. Topic model with Latent Dirichlet Allocation (LDA) and Latent Semantic Indexing (LSI).

The resulting trained model is... .

This notebook should run in the tfm environment, which can be created with the environment.yml file.

In [3]:
from os import listdir
from os.path import isfile, join, getsize
import numpy as np

import time
import re
import json
import pickle
import pandas as pd
import whois
import sys
import datetime
from tqdm import tqdm
import textract
import gensim
import spacy
import scipy as sp
import sys
import multiprocessing
import gensim
from gensim import corpora
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import remove_stopwords

import logging

from smart_open import smart_open
import nltk
#nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer


import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)



IN_COLAB = 'google.colab' in sys.modules


# What environment am I using?
print(f'Current environment: {sys.executable}')

# Change the current working directory
os.chdir('/Users/albertamurgopacheco/Documents/GitHub/TFM')
# What's my working directory?
print(f'Current working directory: {os.getcwd()}')


Current environment: /Users/albertamurgopacheco/anaconda3/envs/tfm/bin/python
Current working directory: /Users/albertamurgopacheco/Documents/GitHub/TFM


In [4]:
# Define working directories in colab and local execution

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    docs_path = '/content/gdrive/MyDrive/TFM/data/raw'
    input_path = '/content/gdrive/MyDrive/TFM'
    output_path = '/content/gdrive/MyDrive/TFM/output'

else:
    docs_path = './data/raw'
    input_path = '.'
    output_path = './output'

# WORD2VEC

# 1. The data preparation

Loading the data to a list of documents (corpus) where each document is a judicial decision.

In [6]:
# Open jsonDataFinal file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# List to store the judicial decisions
corpus = []

corruptFiles = ['HU077022015', 'HU029682017']

# Search data list of dictionaries for dict where {"File":} = file_name
for d in tqdm(data):
    # Dealing with corrupt and empty files
    if d.get('File') not in corruptFiles:
        doc = d.get('String')
        if doc:
            corpus.append(doc)
        else:
            continue

print(f'The corpus includes {len(corpus)} documents.')
print(f'The documents are {type(corpus[0])}.')

100%|██████████| 35308/35308 [00:00<00:00, 880564.67it/s]

The corpus includes 35305 documents.
The documents are <class 'str'>.





CLeaning each document

In [34]:
# Gensim-implemented filters for preprocessing data
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, 
strip_multiple_whitespaces, strip_non_alphanum, strip_numeric, remove_stopwords]

# List storing thr preprocessed documents
corpus_clean = [preprocess_string(doc, CUSTOM_FILTERS) for doc in corpus]

# Removing non-numerical characters
#brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in corpus_clean)

In [35]:
# Number of available processing cores
cores = multiprocessing.cpu_count()
print(f'Available cores {cores}.')

Available cores 4.


Lemmatizing and removing the stopwords and non-alphabetic characters.

In [58]:
# IF RAN, DO IT BEFORE CLEANING... 


import spacy
#nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed
#nlp = spacy.load('en_core_web_sm') # disabling Named Entity Recognition for speed
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

def stem_doc(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    stem_doc = [porter_stemmer.stem(wd) for wd in doc]
    # if len(wd) > 2
    return stem_doc

    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small

#txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size = 50, n_process = cores -1)]
stemmed_corpus = [stem_doc(doc) for doc in corpus_clean]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 587.13 mins


In [60]:
print(stemmed_corpus[0][:15])

['pic', 'iac', 'fh', 'ck', 'v', 'upper', 'tribun', 'immigr', 'asylum', 'chamber', 'appeal', 'number', 'hu', 'immigr', 'act']


In [61]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer_doc(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    lemm_doc = [wordnet_lemmatizer.lemmatize(wd) for wd in doc]
    # if len(wd) > 2
    return lemm_doc

    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small

#txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size = 50, n_process = cores -1)]
lemmatized_corpus = [lemmatizer_doc(doc) for doc in stemmed_corpus]

In [63]:
lemmatized_corpus = lemmatized
print(lemmatized_corpus[0][:15])

['pic', 'iac', 'fh', 'ck', 'v', 'upper', 'tribun', 'immigr', 'asylum', 'chamber', 'appeal', 'number', 'hu', 'immigr', 'act']


Detecting common phrases (multi-word or collocations) expressions from the stream of sentences.

In [66]:
from gensim.models.phrases import Phrases, Phraser

sent = [row.split() for row in corpus if row]

phrases = Phrases(sent, min_count = 30, progress_per = 10000)

AttributeError: 'list' object has no attribute 'split'

In [67]:
from collections import defaultdict  # For word frequency
bigram = Phraser(phrases)

#sentences = bigram[corpus_clean]
sentences = bigram[lemmatized_corpus]

word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

90189

In [68]:
sorted(word_freq, key = word_freq.get, reverse = True)[:20]

['appel',
 's',
 'judg',
 'decis',
 'tribun',
 'appeal',
 'evid',
 'state',
 'respond',
 'mr',
 'applic',
 'reason',
 'paragraph',
 'case',
 'immigr',
 'tier',
 'consid',
 'famili',
 'law',
 'rule']

# 2. Implementation of word2vec model

The word embedding model is a model that can provide numerical vectors for a given word.

In [69]:
import multiprocessing

from gensim.models import Word2Vec

w2v_model = Word2Vec(min_count = 20,
                     window = 10,
                     vector_size = 300,
                     sample = 6e-5,
                     alpha = 0.03,
                     min_alpha = 0.0007,
                     negative = 20,
                     workers = cores-1)

In [70]:
t = time()

w2v_model.build_vocab(sentences, progress_per = 10000)

print('Time to build the vocabulary: {} mins'.format(round((time() - t) / 60, 2)))

Time to build the vocabulary: 1.12 mins


In [71]:
t = time()

w2v_model.train(sentences, total_examples = w2v_model.corpus_count, epochs = 30, report_delay = 1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 40.47 mins


In [72]:
w2v_model.save('./output/gensim-model-All_bigrams_lemmatized')

Similarities:

In [74]:
w2v_model.wv.similarity('refuge', 'error')

-0.06634754

In [None]:
w2v_model.wv.similarity('refugee', 'error')

Most similar:

In [76]:
w2v_model.wv.most_similar(positive = ["judg"])

[('find', 0.6643282175064087),
 ('er', 0.640164315700531),
 ('arguabl', 0.6103781461715698),
 ('determin', 0.6081475019454956),
 ('ground', 0.6040764451026917),
 ('second_ground', 0.5902823805809021),
 ('reason', 0.5790618658065796),
 ('error', 0.578513503074646),
 ('tier', 0.5736409425735474),
 ('properli', 0.5588219165802002)]

In [77]:
w2v_model.wv.most_similar(positive = ["refuge"])

[('humanitarian', 0.6074868440628052),
 ('geneva', 0.5982367992401123),
 ('convent', 0.571603536605835),
 ('protect', 0.5548973083496094),
 ('cessat', 0.5113057494163513),
 ('refoul', 0.4869057834148407),
 ('persecut', 0.465663343667984),
 ('stateless_person', 0.463575541973114),
 ('seek_refuge', 0.4553128778934479),
 ('found', 0.4405970871448517)]

In [80]:
w2v_model.wv.most_similar(positive=["biometr"])

[('brp', 0.507870078086853),
 ('biograph', 0.4448765218257904),
 ('student_migrant', 0.40723922848701477),
 ('aadhaar', 0.3919180929660797),
 ('inid', 0.38970157504081726),
 ('invalid', 0.382639080286026),
 ('valid', 0.3679819107055664),
 ('permit', 0.3635072410106659),
 ('applic', 0.3578976094722748),
 ('centim', 0.34924787282943726)]

End of gensim tutorial

In [81]:
w2v_model.wv.most_similar(positive=["uk"])

[('unit', 0.6758801341056824),
 ('kingdom', 0.6745372414588928),
 ('remain', 0.6378411650657654),
 ('year', 0.5883458256721497),
 ('leav', 0.5810246467590332),
 ('live', 0.5688135623931885),
 ('british', 0.5632806420326233),
 ('continu', 0.5620144009590149),
 ('enter', 0.5400956273078918),
 ('visitor', 0.5124759078025818)]

Odd-One-Out:

In [None]:
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])

In [34]:
data = [d for d in dataset]
print(type(data[0][0]))

<class 'str'>


In [15]:
# Tokenize(split) the sentences into words
texts = [[text for text in doc.split()] for doc in corpus_pre]

# Create dictionary
dictionary = corpora.Dictionary(texts)

# Get information about the dictionary
print(dictionary)

Dictionary(1613 unique tokens: ['abandoned', 'ability', 'able', 'absence', 'accept']...)


In [17]:
# Show the word to id map
#print(dictionary.token2id)


create a bag of words corpus

In [18]:
# Tokenize the docs
tokenized_list = [simple_preprocess(doc) for doc in corpus_pre]

# Create the Corpus
mydict = corpora.Dictionary()
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list]
#pprint(mycorpus)

word_counts = [[(mydict[id], count) for id, count in line] for line in mycorpus]
#pprint(word_counts)

In [12]:
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])

# 3. Topic model with Latent Dirichlet Allocation (LDA) and Latent Semantic Indexing (LSI)

The objective of topic models is to extract the underlying topics from a collection of text documents. Each document in the text is considered as a combination of topics and each topic is considered as a combination of related words.

Topic modeling can be done by algorithms like Latent Dirichlet Allocation (LDA) and Latent Semantic Indexing (LSI).

3.1. Latent Dirichlet Allocation (LDA)
Each document can be described by a distribution of topics and each topic can be described by a distribution of words. https://towardsdatascience.com/light-on-math-machine-learning-intuitive-guide-to-latent-dirichlet-allocation-437c81220158

In [None]:
from gensim.models import LdaModel, LdaMulticore

In [83]:
# Starting from corpus_clean

# Build the bigram and trigram models
bigram = gensim.models.Phrases(corpus_clean, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[corpus_clean], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



NameError: name 'data_words' is not defined

In [85]:
# See trigram example
#print(trigram_mod[bigram_mod[corpus_clean[0]]])

In [86]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [88]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# Remove Stop Words
data_words_nostops = remove_stopwords(corpus_clean)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en_core_web_sm' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])



[['pic', 'upper', 'tribunal', 'immigration', 'asylum', 'chamber', 'appeal', 'number', 'immigration', 'act', 'hear', 'field_house', 'decision', 'reason', 'promulgate', 'extempore', 'upper', 'tribunal', 'judge', 'anonymity', 'direction', 'appellant', 'respondent', 'counsel_instructe', 'consultant', 'office', 'present', 'officer', 'decision', 'reason', 'appellant', 'bear', 'section', 'nationality', 'act', 'decision', 'secretary', 'state', 'refuse', 'application', 'leave', 'remain', 'appeal', 'decision', 'hear', 'tier', 'tribunal', 'march', 'reason', 'set', 'decision', 'refuse', 'reason', 'set', 'decision', 'copy', 'annex', 'decision', 'set', 'aside', 'appellant', 'enter', 'visit', 'visa', 'remain', 'leave', 'form', 'relationship', 'refer', 'sponsor', 'son', 'british', 'previous', 'relationship', 'case', 'long', 'contact', 'child', 'father', 'appellant', 'form', 'parental', 'relationship', 'appellant', 'family', 'live', 'relatively', 'recently', 'sponsor', 'employ', 'secretary', 'state', '

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corp = [id2word.doc2bow(text) for text in texts]

# View
print(corp[:1])

In [None]:
id2word[0]

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corp[:1]]

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corp,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corp]

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corp))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corp, id2word)
vis

Building LDA Mallet Model

In [None]:
# Building LDA Mallet Model

# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = './data/mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

In [None]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

Optimal number of topics

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)

In [None]:
# Show graph
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

What topic each document?

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Most representative document for each topic

In [None]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics