# Information extraction (28th October 2021)

This notebook extracts additional information from the text of the tribunal decisions and stores it in the relevant dictionary.

In particular, the notebook performs information extraction on:

1. The label included in the name of the file.

2. The court where the case was heard ("Heard at").

3. The judges.

4. The legal representation for the appellant and the respondent.

5. The decision/ruling by the judge.

Each of these filds is added to the dictionary of each judicial decision.

The resulting data set - a list of updated dictionaries -  is serialised as a json object (jsonDataFinal.json).

This notebook should run in the tfm environment, which can be created with the environment.yml file.

In [1]:
import ipykernel
from os import listdir
from os.path import isfile, join, getsize
import numpy as np
import time
import re
import json
import pickle
import pandas as pd
import sys
import datetime
from tqdm import tqdm

from gensim.models import Word2Vec
import multiprocessing
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score


from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import remove_stopwords


import sys
IN_COLAB = 'google.colab' in sys.modules


# What environment am I using?
print(f'Current environment: {sys.executable}')

# Change the current working directory
os.chdir('/Users/albertamurgopacheco/Documents/GitHub/TFM')
# What's my working directory?
print(f'Current working directory: {os.getcwd()}')


Current environment: /Users/albertamurgopacheco/anaconda3/envs/tfm/bin/python
Current working directory: /Users/albertamurgopacheco/Documents/GitHub/TFM


In [2]:
# Define working directories in colab and local execution

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    docs_path = '/content/gdrive/MyDrive/TFM/data/raw'
    input_path = '/content/gdrive/MyDrive/TFM'
    output_path = '/content/gdrive/MyDrive/TFM/output'

else:
    docs_path = './data/raw'
    input_path = '.'
    output_path = './output'

# DOC2VEC: PARTIAL & FULL TEXT OF THE RULING

# 1. The data and functions needed for the averaged word2vec and the doc2vec

There are two 



In [3]:
# Open jsonDataFinal file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# List to store the judicial decisions
corpus = []

corruptFiles = ['HU077022015', 'HU029682017']

# Search data list of dictionaries for dict where {"File":} = file_name
for d in tqdm(data):
    # Dealing with corrupt and empty files
    if d.get('File') not in corruptFiles:
        doc = d.get('String')
        #dec = d.get('Decision:')
        #dec_label = d.get('Decision label:')
        if doc:
            corpus.append(doc)
            #decisions.append(dec)
            #decisions_labels.append(dec_label)
        else:
            continue

print(f'The corpus includes {len(corpus)} documents.')
#print(f'The number of documents with a decision: {len(decisions)}.')
#print(f'The number of decisions with a label: {len(decisions_labels)} documents.')

print(f'The documents are of type: {type(corpus[0])}.')

100%|██████████| 35305/35305 [00:00<00:00, 211566.90it/s]

The corpus includes 35305 documents.
The documents are of type: <class 'str'>.





The corpus needs to be cleaned. The decisions and the labels are already clean. They were cleaned during the extraction process as their exctraction required tokenization (stanza).

In [4]:
# Gensim-implemented filters for preprocessing data
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, 
strip_multiple_whitespaces, strip_non_alphanum, strip_numeric, remove_stopwords]

# List storing thr preprocessed documents
corpus_clean = [preprocess_string(doc, CUSTOM_FILTERS) for doc in corpus]


Some functions:

In [5]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding
    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features


def mbkmeans_clusters(
	X, 
    k, 
    mb, 
    print_silhouette_values, 
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

# 2. AVERAGED WORD2VEC ON THE FULL TEXT OF THE RULING

The strategy consists on training a word2vec model using the entire text of all the 35305 rulings. The results are averaged per document. Clustering algorithms are used to identify types of documents and their topics. 

In [6]:
# Number of processing cores
cores = multiprocessing.cpu_count()

w2v_model = Word2Vec(min_count = 20,
                     window = 10,
                     vector_size = 300,
                     sample = 6e-5,
                     alpha = 0.03,
                     min_alpha = 0.0007,
                     negative = 20,
                     workers = cores-1)

model = Word2Vec.load('./output/gensim-model-All')

vectorized_docs = vectorize(corpus_clean, model = model)
len(vectorized_docs), len(vectorized_docs[0])

(35305, 300)

In [None]:
# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Create df with data
df = pd.DataFrame(data)

# Replace np.nan values for empty list where no decision 
df['Decision:'] = [ [] if isinstance(x, float) else x for x in df['Decision:']]

# Original texts and labels of the decisions tokenized and ready to train a w2v
decisions = df['Decision:'].tolist()
decisions_labels = df['Decision label:'].tolist()


In [62]:


vectorized_decisions = vectorize(decisions, model = model)
len(vectorized_decisions), len(vectorized_decisions[0])

(35305, 300)

In [73]:


docs = df['Decision:'].tolist()
decision_labels = df['Decision label:'].tolist()


clustering, cluster_labels = mbkmeans_clusters(
	X = vectorized_decisions,
    k = 2,
    mb = 500,
    print_silhouette_values = True,
)
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in corpus_clean],
    "cluster": cluster_labels,
    "label": decision_labels
})

For n_clusters = 2
Silhouette coefficient: 0.42
Inertia:422917.35099671595
Silhouette values:
    Cluster 1: Size:9328 | Avg:0.59 | Min:0.03 | Max: 0.78
    Cluster 0: Size:25977 | Avg:0.35 | Min:-0.11 | Max: 0.53


In [74]:
df_clusters.head(100)

Unnamed: 0,text,tokens,cluster,label
0,"[notice, of, decision, directions, the, decisi...",pic iac fh ck v upper tribunal immigration asy...,1,Accepted
1,"[in, light, of, my, conclusions, on, that, poi...",utijr jr upper tribunal immigration asylum cha...,1,Rejected
2,"[notice, of, decision, the, decision, of, the,...",pic upper tribunal immigration asylum chamber ...,0,Rejected
3,"[notice, of, decision, the, decision, of, the,...",pic upper tribunal immigration asylum chamber ...,0,Accepted
4,"[decision, the, decision, of, tribunal, judge,...",pic upper tribunal immigration asylum chamber ...,0,Accepted
...,...,...,...,...
95,"[decision, the, judge, materially, erred, in, ...",pic upper tribunal immigration asylum chamber ...,0,Accepted
96,"[decision, the, judge, materially, erred, in, ...",pic upper tribunal immigration asylum chamber ...,0,Accepted
97,"[from, the, point, at, which, the, applicant, ...",upper tribunal pic jr field house breams build...,1,Neutral
98,[],pic case jr v upper tribunal immigration asylu...,1,


In [70]:
print("Most representative terms per cluster (based on centroids):")
for i in range(50):
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=10)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")


Most representative terms per cluster (based on centroids):
Cluster 0: decision tribunal appeal tier aside dismiss remade determination ftt remake 
Cluster 1: decision tribunal appeal tier ftt accordingly aside determination judge error 


IndexError: index 2 is out of bounds for axis 0 with size 2

# 3. AVERAGED WORD2VEC ON TEXT OF THE RULING DESCRIBING THE DECISION

The strategy consists on training a word2vec model using the part of the text describing the judges decision from each the 35305 rulings. The results are averaged per document. Clustering algorithms are used to identify types of documents and their topics. There should be three types of documents: "Accepted", "Rejected" and "Salomonic/cannot tell".

In [10]:
# Number of processing cores
cores = multiprocessing.cpu_count()

w2v_model = Word2Vec(min_count = 20,
                     window = 10,
                     vector_size = 300,
                     sample = 6e-5,
                     alpha = 0.03,
                     min_alpha = 0.0007,
                     negative = 20,
                     workers = cores-1)

#
#model_decisions = w2v_model.build_vocab(decisions, progress_per = 10000)
model_decisions = Word2Vec(sentences = decisions, vector_size = 100, workers = 1, sg = 1)

# print('Time to build the vocabulary: {} mins'.format(round((time() - t) / 60, 2)))

model_decisions.save('./output/gensim-model-decisions')

NameError: name 'decisions' is not defined

In [None]:
clean_decisions = [x for x in decisions if x != None]

vectorized_decisions = vectorize(decisions, model = model_decisions)

len(vectorized_decisions), len(vectorized_decisions[0])

(35305, 100)

In [None]:

clustering, cluster_labels = mbkmeans_clusters(
	X = vectorized_decisions,
    k = 2,
    mb = 500,
    print_silhouette_values = True,
)
df_clusters = pd.DataFrame({
    "text": decisions,
    "tokens": [" ".join(text) for text in decisions],
    "cluster": cluster_labels,
    "label": decisions_labels
})

For n_clusters = 2
Silhouette coefficient: 0.38
Inertia:275689.5366233938
Silhouette values:
    Cluster 1: Size:8680 | Avg:0.58 | Min:0.05 | Max: 0.76
    Cluster 0: Size:26625 | Avg:0.32 | Min:-0.13 | Max: 0.51


In [7]:
print("Most representative terms per cluster (based on centroids):")
for i in range(2):
    tokens_per_cluster = ""
    most_representative = model_decisions.wv.most_similar(positive = [clustering.cluster_centers_[i]], topn = 5)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):


NameError: name 'model_decisions' is not defined

In [11]:
df_clusters

NameError: name 'df_clusters' is not defined

doc2vec

In [104]:
from gensim.models.doc2vec import Doc2Vec

model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 35305/35305 [00:00<00:00, 722309.28it/s]


AttributeError: 'list' object has no attribute 'words'