# Information extraction (20th November 2021)

This notebook extracts additional information from the text of the tribunal decisions and stores it in the relevant dictionary.

In particular, the notebook performs information extraction on:

1. The label included in the name of the file ('Code label:').

2. The court where the case was heard ('Heard at').

3. The judges ('Judges:').

4. The legal representation ('Representation:') for the appellant ('Representation appellant:') and the respondent ('Representation respondent:').

5. The decision/ruling by the judge ('Decision:').

6. The sense of the decision/ruling ('Decision label:').

7. The nationality of the the subject of the case (appellant or respondent).

Each of these fields is added to the dictionary of each judicial decision.

The resulting data set - a list of updated dictionaries -  is serialised as a json object (jsonDataFinal.json).

This notebook should run in the tfm environment, which can be created with the environment.yml file.

In [2]:
from os import listdir
from os.path import isfile, join, getsize
import numpy as np
import re
import json
import pickle
import pandas as pd
import whois
import sys
import datetime
from tqdm import tqdm
import textract
import re
from pprint import pprint

from nltk.tokenize import sent_tokenize, word_tokenize
import stanza
import spacy

import sys
IN_COLAB = 'google.colab' in sys.modules


# What environment am I using?
print(f'Current environment: {sys.executable}')

# Change the current working directory
os.chdir('/Users/albertamurgopacheco/Documents/GitHub/TFM')
# What's my working directory?
print(f'Current working directory: {os.getcwd()}')


Current environment: /Users/albertamurgopacheco/anaconda3/envs/tfm/bin/python
Current working directory: /Users/albertamurgopacheco/Documents/GitHub/TFM


In [3]:
# Define working directories in colab and local execution

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    docs_path = '/content/gdrive/MyDrive/TFM/data/raw'
    input_path = '/content/gdrive/MyDrive/TFM'
    output_path = '/content/gdrive/MyDrive/TFM/output'

else:
    docs_path = './data/raw'
    input_path = '.'
    output_path = './output'

# INFORMATION EXTRACTION

# 0. Auxiliary functions and files

In [None]:
# Function to capture whether all elements exist in a list
def sublist(sublist, lst):
    """
    Given a list of sentences/lists all_exist checks whether sublist list exists in lst

    :sublist: list to search
    :lst: list to be searched
    :return: the list with the match
    """
    if not isinstance(sublist, list):
        raise ValueError("sublist must be a list")
    if not isinstance(lst, list):
        raise ValueError("lst must be a list")

    sublist_len = len(sublist)
    k=0
    s=None

    if (sublist_len > len(lst)):
        return False
    elif (sublist_len == 0):
        return True

    for x in lst:
        if x == sublist[k]:
            if (k == 0): s = x
            elif (x != s): s = None
            k += 1
            if k == sublist_len:
                return True
        elif k > 0 and sublist[k-1] != s:
            k = 0

    return False

# Function to capture if all elements exist in a list
def all_exist(avalue, bvalue):
    """
    Given a list of sentences/lists all_exist checks whether avalue list exists in bvalue

    :avalue: list to search
    :bvalue: list to be searched
    :return: the list with the match
    """
    return all(any(x in y for y in bvalue) for x in avalue)

# 1. The label included in the name of the file

There are two categories of cases: the reported and the unreported ones. The reported cases include richer data while the unreported ones (the vast majority of cases) miss several data fields due to a request for annonimity from any of the parties involved in the legal dispute.

The first two letters in the file name seem to follow some logic. Inspecting the documents reveals the following meanings:

In [40]:
# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each decision and extract first two characters of the file's name
for decision in tqdm(data):
    # Only 'unteported' decisions include this 2-letter code
    if decision.get('Status of case:') == 'Unreported':
        string_code = decision.get('File')[:2]
    else:
        string_code = 'NA'
    
    # Add dictionary key 'Code label' with value string to the dictionary
    decision.update({'Code label:': string_code})

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)

100%|██████████| 35305/35305 [00:00<00:00, 1588874.25it/s]


# 2. The court where the case was heard

An inspection of a sample of judicial decisions reveals that the name of the court is located in the first part of the document and it usually follows the expression "Heard at".

The strategy to capture this field will consist of a search using regular expressions. 

In [41]:
# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):
    # Obtain the text of the court decision
    decision_string = decision.get('String')
    # Deal with empty/corrupt files that didn't upload a sentence string
    if decision_string:
        # Regex expression: What comes after "Heard at" until hitting 3 balnks or new line
        #regex = '(?<=Heard at).*[^\S\r\n]{3,}'
        regex = 'Heard at(.*)[\S\r\n]| (?<=Heard at).*[^\S\r\n]{3,}'
        catch = re.search(regex, decision_string)

        # If the catch is successful
        if catch :
            string = catch.group(0)
            # Remove ':' if included in the catch
            string = string.replace(':','')
            # Remove leading and trailing spaces
            string = string.strip()
            # Avoids picking up parts of tables and '|'
            string = string.split('   ')
            string = string[0]
            # Remove 'Heard at' if included in the catch
            string = string.replace('Heard at ','')
            # Remove 'manually' some strings often included in the catch
            string = string.replace('|Decision & Reasons Promulgated','')
            string = string.replace('|Decision and Reasons Promulgated','')
            string = string.replace('| Decision & Reasons Promulgated','')
            string = string.replace('Decision Promulgated','')
            string = string.replace('|Decision & Reasons promulgated','')
            string = string.replace('|Determination Promulgated','')
            string = string.replace('Decision and Reasons Promulgated','')
            string = string.replace('|Decision & Reasons  Promulgated','')
            string = string.replace(' on 4 July 2003','')
            string = string.replace('Determination Promulgated','')
            string = string.replace('Decision & Reasons Promulgated','')
            string = string.replace('|Decisions and Reasons Promulgated','')
            string = string.replace('|Decision and Reasons','')
            string = string.replace('UT(IAC)','')
            string = string.replace('UT (IAC) ','')
            string = string.replace('Date of Hearing  9 December 2005','')
            string = string.replace(' | |SS (Risk-Manastry) Iran CG [2003] UKIAT 00035 |','')
            # Strip of often found trailing characters
            string = string.rstrip(',')
            string = string.rstrip('|')
            # Remove leading and trailing spaces (again)
            string = string.strip()
        else:
            string = 'NA'
        #print(string)
        # Add dictionary key 'Heard at' with value string to the dictionary
        decision.update({'Heard at:': string})
    else:
        continue
# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)

100%|██████████| 35305/35305 [00:01<00:00, 27089.94it/s]


# 3. The judges



In [42]:
# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):
    # Obtain the text of the court decision
    decision_string = decision.get('String')
    # Deal with empty/corrupt files that didn't upload a sentence string
    if decision_string:
        # Regex expression: What comes in between 'Before' and 'Between'
        regex = '(?<=Before)([\s\S]*?)(?=Between)'
        catch = re.search(regex, decision_string)
        #If the catch is successful
        if catch :
            string = catch.group(0)

            # Get rid of some table delimiters
            string = string.replace('|','')
            string = string.replace('?','')
            string = string.replace(',','')

            # Remove leading and trailing spaces
            string = string.strip()
            
            # Split strings (spaces > 3 usually indicates two "joint" names)
            # Alternative approach tried and discarded base on sentence tokenization 
            # from nltk.tokenize import sent_tokenize
            listNames = string.split("   ")
            # Make list of names with strings containijng names
            # Capitalize the first letter of each word & delete 
            listNames = [name.strip().title() for name in listNames if name.strip()]

            # Discard content in brackets as it's mostly titles and clutter
            listNames = [re.sub('[\(\[].*?[\)\]]', '', x).strip() for x in listNames]

            # Finally, delete titles, positions held and other clutter around the name
            clutter = ['Judge', 'Tribunal', 'Court', 'Upper', 'Deputy', 'Senior', 'Of', 'The', 'Mr', 'Dr', 'Vice', 'President',
            ':', 'Honourable', 'Hon.', '', '- - - - - - - - - - - - - - - - - - - -', 'Ut', 'Trinbunal', '-And-', 'Mrs', 'President,',
            'Tribnunal', '-', 'Hon', 'And', 'Chairman', 'Vice-President', 'Immigration', 'Asylum Chamber', '-Vice', '(Senior',
            '...............', 'Designated', 'His Honour', 'Respondent Representation: For Appellant', 'Secretary State For Home Department',
            'Appellant', 'Lord', 'Sir', 'In Matter An Application For Judicial Review', 'I) Eu Regulation Number 604/2013 Human',
            'Miss', 'Ms.', ':-']

            # 
            listNames = [' '.join(filter(lambda x: x not in clutter,  name.split())) for name in listNames]
            # Remove remaining 'issues' with empty strings ''
            listNames = list(filter(None, listNames))
            # Add a . following individual letters

            #print(listNames)
            
        else:
            listNames = ['NA']
        
        #print(decision.get('File'))
        #print(listNames)
        # Add dictionary key 'Judges:' with value list of strings to the dictionary
        decision.update({'Judges:': listNames})
    else:
        continue

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)

100%|██████████| 35305/35305 [00:01<00:00, 24965.24it/s]


In [43]:
# 'Manually' fix some mistakes with some judges

# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):

    if decision.get('File') == '00046_ukut_iac_2020_ps_iran_cg':
        listNames = ['J Barnes', 'A R Mackey', 'S L Batiste']
        decision.update({'Judges:': listNames})
    else:
        continue
    if decision.get('File') == '00393_ukut_iac_2019__jw_ors_ijr':
        listNames = ['Rimington Jackson']
        decision.update({'Judges:': listNames})
    else:
        continue
    if decision.get('File') == '2004_ukiat_00248_gh_iraq_cg':
        listNames = ['Rintoul', 'Bruce']
        decision.update({'Judges:': listNames})
    else:
        continue
    if decision.get('File') == '00270_ukut_iac_2015_mmw_ijr':
        listNames = ['Justice Mccloskey']
        decision.update({'Judges:': listNames})
    else:
        continue
    if decision.get('File') == '00271_ukut_iac_2015_bh_ijr':
        listNames = ['Justice Mccloskey', "O'Connor"]
        decision.update({'Judges:': listNames})
    else:
        continue
    if decision.get('File') == 'AA082212015':
        listNames = ['Alis', 'I K']
        decision.update({'Judges:': listNames})
    else:
        continue

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)


100%|██████████| 35305/35305 [00:00<00:00, 2771630.50it/s]


# 4. The legal representation for the appellant and the respondent

The legal team consists of the representation for the appellant and the respondent.

In [242]:
representation = []
files_legal = []

# nlp sentence tokenizer with Stanford
nlp = stanza.Pipeline(lang = 'en', processors = 'tokenize', tokenize_no_ssplit = True)

# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):
    # Obtain the text of the court decision
    decision_string = decision.get('String')
    file_name = decision.get('File')
    files_legal.append(file_name)
    #print(file_name)
    # Use only first third of text
    string = decision_string[:len(decision_string)//3]
    # All text in lower
    string = string.lower()
    # Apply stanford nlp to string
    doc = nlp(string)

    # List to store the ruling sentences
    catch = []

    # Make sentences
    for i, sentence in enumerate(doc.sentences):
        sente = [token.text for token in sentence.tokens]
        # Keep only the alpha tokens
        sente = [e for e in sente if e.isalpha()]
        catch.append(sente)
        #print(catch)
    
    # Look for partial hits (representation_leads_part) in string 
    representation_leads_part = [['representation', 'for', 'the', 'appellant'], ['representation', 'for', 'the', 'claimant'],
    ['for', 'the', 'appellant'], ['representation', 'for', 'the', 'appellants'], ['for', 'the', 'first', 'appellant']]
    
    # Representation has not been found yet (flag = 0)
    flag = 0

    for element in catch:
        for part in representation_leads_part:
            # find index of part hit
            idx_part = representation_leads_part.index(part)
            # Condition flag == 0 to avoid greedy behaviour (several matches) Only matters 1st hit
            if sublist(representation_leads_part[idx_part], element) and flag == 0:
                index = catch.index(element)
                # representaion lead found in catch
                flag = 1
                # Keep only sentence with the hit (it includes all needed info)
                new_catch = catch[index]
                representation.append(new_catch)
                decision.update({'Representation:': new_catch})
                #print(new_catch)
                break
            else:
                continue
                
    # If information on the representation has not been found (flag = 0)
    if flag == 0:
        #print(f'Did not find a nationality {file_name} in catch: {catch}')
        representation.append(np.nan)
        decision.update({'Representation:': np.nan})
        #print('Did not find a representation')
        #print(catch)
    else:
        continue

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)


100%|██████████| 35305/35305 [5:05:36<00:00,  1.93it/s]


The information on the legal representatives has been captured for a large number of decisions. 

In [244]:
dict_representation = {'File':files_legal,'Representation':representation}

df_representation = pd.DataFrame(dict_representation, columns=['File','Representation'])
df_representation.isna().sum()


File                 0
Representation    3372
dtype: int64

The field Representation: includes a string with the information on the legal representatives. The following breaks it down into two pieces:
- The legal representation of the appellant (legalAppellant).
- The legal representation of the defendant (legalDefendant).

In [91]:
files_appellant = []
appellants = []
respondents = []

# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):
    # Obtain the text of the court decision
    representation_string = decision.get('Representation:')
    file_name = decision.get('File')

    #file_name = decision.get('File')
    if isinstance(representation_string, float):
        continue
    else:
        # If a label has not been found (flag = 0)
        flag = 0
        # The decisions are stored as a listt of tokens
        string = ' '.join(x for x in representation_string)
        #print(string)
        files_appellant.append(string)
        
        # Catch what's between 'for the appellant/s and for the correspondent' 
        regex_appellant = '(?<=for the appellant)([\s\S]*?)(?=for the respondent)'
        catch_appellant = re.search(regex_appellant, string)
        if catch_appellant :
            string_appellant = catch_appellant.group(0)
            # Remove leading and trailing spaces
            string_appellant = string_appellant.strip()
            
            if string_appellant.startswith('s'):
                string_appellant = string_appellant[1:].strip()
        
        # Catch what's between 'for the claimant/s and for the correspondent' 
        regex_claimant = '(?<=for the claimant)([\s\S]*?)(?=for the respondent)'
        catch_claimant = re.search(regex_claimant, string)
        if catch_claimant :
            string_appellant = catch_claimant.group(0)
            # Remove leading and trailing spaces
            string_appellant = string_appellant.strip()
            
            if string_appellant.startswith('s'):
                string_appellant = string_appellant[1:].strip()
                print(string_appellant)
                
        #print(string_appellant)
        appellants.append(string_appellant)
        decision.update({'Appellant:': string_appellant})

        # Catch what's after 'for the correspondent' 
        regex_respondent = '(?<=for the respondent)([\s\S]*)'
        catch_respondent = re.search(regex_respondent, string)
        if catch_respondent :
            string_respondent = catch_respondent.group(0)
            # Remove leading and trailing spaces
            string_respondent = string_respondent.strip()
                
            if string_respondent.startswith('s'):
                string_respondent = string_respondent[1:].strip()

        #print(string_respondent)
        respondents.append(string_respondent)
        decision.update({'Respondent:': string_respondent})

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)


100%|██████████| 35305/35305 [00:00<00:00, 101300.39it/s]


In [92]:
print(len(files_appellant))
print(len(appellants))
print(len(respondents))

dict_sense_representation = {'Representation':files_appellant,'Appellant':appellants, 'Respondent':respondents}

df_sense_representation = pd.DataFrame(dict_sense_representation, columns=['Representation','Appellant', 'Respondent'])
df_sense_representation.isna().sum()
df_sense_representation.sum()

print(df_sense_representation)

df_sense_representation.describe()
#df_sense_representation.to_clipboard()

31933
31933
31933
                                          Representation  \
0      representation for the appellant mr j gajjar c...   
1      representation for the appellant ms sardar cou...   
2      representation for the appellant mr hussain fo...   
3      representation for the appellant mr t melvin s...   
4      for the appellant ms e rutherford instructed b...   
...                                                  ...   
31928  on august he lodged an appeal against the refu...   
31929  the adjudicator rejected all the appellant cla...   
31930  the appellant describes himself variously as a...   
31931  ms s panagiotopoulou of counsel instructed by ...   
31932  we are however concerned with the relevance of...   

                                               Appellant  \
0      mr j gajjar counsel instructed by m a consulta...   
1        ms sardar counsel instructed by duncan lewis co   
2                                             mr hussain   
3      mr t melvin se

Unnamed: 0,Representation,Appellant,Respondent
count,31933,31933,31933
unique,30550,17626,8486
top,for the appellant in person,no appearance,mr p duffy senior home office presenting officer
freq,35,423,550


In [137]:
# Get the legal firm or entitity involved as an appellant or respondent

appellants = []
appellants_firm = []
respondents = []
respondents_firm = []

home_office_tags = ['senior presenting officer', 'senior home office presenting officer', 'home office presenting officer', 'home office']

# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):
    # Obtain the text of the court decision
    appellant_string = decision.get('Appellant:')
    respondent_string = decision.get('Respondent:')
    representation_string = decision.get('Representation:')

    if isinstance(representation_string, float):
        continue
    else:
        # First, proceed with appellant's related firm info
        flag_appellant = 0
        
        # Catch what's after ' of ' 
        regex_appellant = '(?<= of )([\s\S]*)'
        catch_appellant = re.search(regex_appellant, appellant_string)
        if catch_appellant :
            string_appellant_firm = catch_appellant.group(0)
            # Remove leading and trailing spaces
            string_appellant_firm = string_appellant_firm.strip()
            flag_appellant = 1
        
        # Catch what's after 'legal representative' 
        regex_appellant = '(?<=legal representative)([\s\S]*)'
        catch_appellant = re.search(regex_appellant, appellant_string)
        if catch_appellant :
            string_appellant_firm = catch_appellant.group(0)
            # Remove leading and trailing spaces
            string_appellant_firm = string_appellant_firm.strip()
            flag_appellant = 1

        # Catch what's after 'instructed by' 
        regex_appellant = '(?<=instructed by)([\s\S]*)'
        catch_appellant = re.search(regex_appellant, appellant_string)
        if catch_appellant :
            string_appellant_firm = catch_appellant.group(0)
            # Remove leading and trailing spaces
            string_appellant_firm = string_appellant_firm.strip()
            flag_appellant = 1

        #print(appellant_string)
        if flag_appellant == 1:
            string_appellant_firm = string_appellant_firm
            #print(string_appellant_firm)
        else:
            for tag in home_office_tags:
                if appellant_string != None and tag in appellant_string:
                    string_appellant_firm = 'Home Office'
                    flag_appellant = 1
                    break
                else:
                    string_appellant_firm = np.nan
            #print(string_appellant_firm)
        appellants_firm.append(string_appellant_firm)
        appellants.append(appellant_string)
        decision.update({'Appellant entity:': string_appellant_firm})


        # Second, proceed with respondent's related firm info
        flag_respondent = 0

        # Catch what's after ' of ' 
        regex_respondent = '(?<= of )([\s\S]*)'
        catch_respondent = re.search(regex_respondent, respondent_string)
        if catch_respondent :
            string_respondent_firm = catch_respondent.group(0)
            # Remove leading and trailing spaces
            string_respondent_firm = string_respondent_firm.strip()
            flag_respondent = 1
        
        # Catch what's after 'legal representative' 
        regex_respondent = '(?<=legal representative)([\s\S]*)'
        catch_respondent = re.search(regex_respondent, respondent_string)
        if catch_respondent:
            string_respondent_firm = catch_respondent.group(0)
            # Remove leading and trailing spaces
            string_respondent_firm = string_respondent_firm.strip()
            flag_respondent = 1

        # Catch what's after 'instructed by' 
        regex_respondent = '(?<=instructed by)([\s\S]*)'
        catch_respondent = re.search(regex_respondent, respondent_string)
        if catch_respondent :
            string_respondent_firm = catch_respondent.group(0)
            # Remove leading and trailing spaces
            string_respondent_firm = string_respondent_firm.strip()
            flag_respondent = 1

        #print(respondent_string)
        if flag_respondent == 1:
            string_respondent_firm = string_respondent_firm
            #print(string_respondent_firm)
        else:
            for tag in home_office_tags:
                if respondent_string != None and tag in respondent_string:
                    string_respondent_firm = 'Home Office'
                    flag_respondent = 1
                    break
                else:
                    string_respondent_firm = np.nan
            #print(string_respondent_firm)
        respondents_firm.append(string_respondent_firm)
        respondents.append(respondent_string)
        decision.update({'Respondent entity:': string_respondent_firm})

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)


100%|██████████| 35305/35305 [00:00<00:00, 116561.27it/s]


In [139]:

print(len(appellants))
print(len(appellants_firm))

print(len(respondents))
print(len(respondents_firm))

#data[0]

31933
31933
31933
31933


# 5. The decision of the judge

The decision of the judge is the most challenging piece of information to extract from the documents. # First isolate the part of the document most likely to include the decission the second half of the document. Second, get rid of annexes and appendixes. third, # classifying judgments is not the same as classifying cases.


In [60]:
# nlp sentence tokenizer with Stanford
nlp = stanza.Pipeline(lang = 'en', processors = 'tokenize', tokenize_no_ssplit = True)

# Store decisions in a list to make a df
decisions = []
files_judge = []

# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):
    # Obtain the full text of the court decision
    string = decision.get('String')
    file_name = decision.get('File')
    files_judge.append(file_name)

    # Use only second half of text (skip references to annxes and appendixes)
    string = string[len(string)//2:]

    # Discard text following appendix and annexes
    string = string.rsplit("appendix", 1)
    string = string[0]
    string = string.rsplit("annex", 1)
    string = string[0]

    # Narrow down the search from the end
    # Split on last occurrence of "Signed"
    string = string.rsplit("Signed", 1)
    string = string[0].lower()


    # Keep a max of 2000 characters
    string = string[ min(-2000, len(string)):]

    # Get rid of text after the last occurrence of 'anonymity'
    string = string.rsplit("anonymity", 1)
    string = string[0]

    # Apply stanford nlp
    doc = nlp(string)

    # List to store the ruling sentences
    catch = []
    # Flag = 1 when decision found
    flag = 0
        
    # Make sentences
    for i, sentence in enumerate(doc.sentences):
        sente = [token.text for token in sentence.tokens]
        # Keep only the alpha tokens
        sente = [e for e in sente if e.isalpha()]
        #print(type(sente))
        catch.append(sente)
        
    # Identify decision leads in sentences
    decision_leads = [['notice', 'of', 'decision'], ['decision'], ['decisions'], ['conclusions'], ['conclusion']]
        
    # When decision lead found, trim catch and update flag value 
    for lead in decision_leads:
        try:
            # Find index of decision lead in ruling
            index = catch.index(lead)
            # Remove sentences before the decision lead sentence
            del catch[0:index]
            # Flatten the list of lists/sentences
            flat_catch = [item for sublist in catch for item in sublist]
            # Decision found
            flag = 1
            # Store decision in decisions list
            decisions.append(flat_catch)
            decision.update({'Decision:': flat_catch})
            #print('Found decision 1')
            #print(flat_catch)
            break
        except ValueError:
            continue
    
    # If a decision has not been found yet (flag = 0)
    if flag == 0:
    # Look for partial hits in text 
        decision_leads_part = [['for', 'the', 'above', 'reasons'], ['for', 'the', 'reasons', 'i', 'have', 'given'], ['general', 'conclusions'],
        ['for', 'the', 'reasons', 'set', 'out', 'above'], ['for', 'all', 'of', 'these', 'reasons'], ['decision', 'and', 'directions'], ['conclusions'],
        ['notice', 'of', 'decision'], ['decision','the', 'application', 'for', 'judicial', 'review', 'is'], ['there', 'is', 'no', 'material', 'error', 'of', 'law', 'in'],
        ['decision', 'the', 'decision', 'of', 'tribunal', 'judge', 'dean', 'promulgated'], ['the', 'decision', 'of', 'the', 'ftt', 'is', 'set', 'aside'],
        ['i', 'grant', 'permission', 'to', 'appeal', 'i', 'set', 'aside', 'the', 'decision', 'of', 'the', 'tribunal'], ['i', 'set', 'aside', 'that', 'decision'],
        ['the', 'appellant', 'appeal', 'as', 'originally', 'brought', 'to', 'the', 'ftt', 'is', 'dismissed'], ['i', 'do', 'not', 'set', 'aside', 'the', 'decision']]
            
        for element in catch:
            for part in decision_leads_part:
                idx_part = decision_leads_part.index(part)
                if all_exist(decision_leads_part[idx_part], element):
                    index = catch.index(element)
                    # Decision found in catch
                    flag = 1
                    # Remove sentences before the decision lead sentence
                    del catch[0:index]
                    # Flatten the list of lists/sentences
                    flat_catch = [item for sublist in catch for item in sublist]
                    #print('Found decision 2')
                    #print(flat_catch)
                    break
                
                else:
                    continue
                
        # If a decision has still not been found (flag = 0)
        if flag == 0:
            decisions.append(np.nan)
            decision.update({'Decision:': np.nan})
            #print('Did not find a decision')
            #print(catch)
        else:
            # Store decision in decisions list
            decisions.append(flat_catch)
            decision.update({'Decision:': flat_catch})
            continue


# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)

2021-11-13 20:28:12 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-11-13 20:28:12 INFO: Use device: cpu
2021-11-13 20:28:12 INFO: Loading: tokenize
2021-11-13 20:28:12 INFO: Done loading processors!
100%|██████████| 35305/35305 [2:00:38<00:00,  4.88it/s]


In [245]:
dict_decisions = {'File':files_judge,'Decision':decisions}

df = pd.DataFrame(dict_decisions, columns=['File','Decision'])
df.isna().sum()
#print(data[49])
print(len(files_judge))
print(len(decisions))
#print(decisions[32488])
#print(files[5000])
#rint(decisions[5000])
#print(files[6000])
print(decisions[6002])
#print(df[df['Decision'].isnull()])
print(df.isnull().sum(axis = 0))

#print(json.dumps(data[32554], indent = 4, sort_keys = True))

35305
35305
['decision', 'the', 'determination', 'of', 'the', 'tribunal', 'having', 'been', 'found', 'to', 'contain', 'a', 'material', 'error', 'of', 'law', 'i', 'substitute', 'the', 'following', 'decision', 'the', 'appellant', 'appeal', 'is', 'allowed', 'under', 'the', 'immigration', 'rules']
HU166042017
['the', 'judge', 'also', 'went', 'on', 'to', 'consider', 'whether', 'or', 'not', 'any', 'exceptional', 'circumstances', 'existed', 'in', 'this', 'particular', 'case', 'the', 'findings', 'and', 'conclusions', 'are', 'comprehensive', 'and', 'when', 'the', 'decision', 'is', 'viewed', 'holistically', 'the', 'judge', 'consideration', 'is', 'entirely', 'sound', 'in', 'light', 'of', 'the', 'above', 'the', 'appellant', 'appeal', 'to', 'the', 'upper', 'tribunal', 'is', 'dismissed', 'and', 'the', 'decision', 'of', 'the', 'tribunal', 'stands', 'anonymity', 'i', 'make', 'no']
PA098412018
['notice', 'of', 'decision', 'for', 'the', 'above', 'reasons', 'the', 'decision', 'i', 'on', 'the', 'appellant

# 6. Sense of the decision.

The decision has been isolated. However, no information on whether the sentence accepts/rejects or is neutral. Sense of decision depends on the appellent. If appellent is home office, then... The decision of the First-tier Tribunal did not involve the  making  of an error of law and I uphold it
is accepted, otherwise is rejected.

In [94]:
# Store decisions in a list to make a df
decision_text = []
decision_label = []

# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):
    # Obtain the full text of the court decision
    string = decision.get('Decision:')
    #file_name = decision.get('File')
    if isinstance(string, float):
        continue
    else:
        # If a label has not been found (flag = 0)
        flag = 0
        # The decisions are stored as a listt of tokens
        string = ' '.join(x for x in string)
        # print(string)
        decision_text.append(string)
        # Look for partial hits in text 
        decision_labels_reject = ['appeal dismissed', 'appeal is dismissed','this application is refused', 'the appeal is dismissed', 'the decision of the first-tier Tribunal stands',
            'the decision of the tribunal did not involve the making of a material error of law', 'the original decision shall stand',
            'did not involve the making of an error on a point of law', 'appeal remains dismissed', 'decision stands', 'not satisfied that the judge erred',
            'not involve an error on', 'do not set aside the decision', 'this application is refused', 'i therefore uphold the decision of',
            'does not contain a material error of law and shall stand', 'it shall stand', 'did not contain any error of law', 'did not involve the making of a material error of law',
            'dismissing the claimant appeal', 'does not contain a material error of law','no material error of law has been established',
            'the decision to dismiss the appellant appeal shall stand', 'the appeal is statutorily abandoned', 'tribunal decision is not vitiated by legal error',
            'there is no material error of law in the tribunal judge decision', 'the appeal by the secretary of state is dismissed',
            'decision of the tribunal does not disclose an error on a point of law', 'errors are not material such that the decision should be set aside',
            'the appeal of the appellant is dismissed', 'reveals no error of law and stands', 'the appeal to the upper tribunal is dismissed',
            'the respondent appeal be dismissed', 'the judge did not materially err in law', 'did not involve a making of or a material error of law',
            'appeal to the upper tribunal is dismissed', 'i am dismissing', 'did not contain a material error of law', 'the decision of the ftt judge must stand',
            'there are no errors of law', 'there is no material error of law', 'this appeal is dismissed', 'i dismiss the appeal', 'tribunal did not involve the making of an error of law',
            'contains no material error of law', 'the decision of the tribunal stands', 'the decision of the tribunal did not involve the making of an error of law',
            'this appeal is therefore dismissed', 'i refuse permission to appeal', 'the applicant appeal to the upper tribunal is therefore dismissed',
            'decision of the ftt did not involve the making of an error of law', 'i do not set it aside', 'the decision of the tribunal did not involve the making of any error on a point of law',
            'the appellant appeal to the upper tribunal is therefore dismissed', 'permission to appeal to the court of appeal is refused', 'the decision of the tribunal is upheld', 'the appeals are dismissed',
            'did not make a material error of law', 'the decision of the tribunal shall stand', 'the appeal of the secretary of state is dismissed',
            'the appeal of the secretary of state is dismissed', 'do not disclose any material error of law', 'the tribunal did not err in law',
            'judge did not make an error of law', 'i uphold the decision to dismiss', 'no material error of law was made', 'did not make a material error on a point of law',
            'there was no error of law made by the judge', 'we dismiss the secretary of state appeal', 'the appeals to the upper tribunal are dismissed',
            'decision of the tribunal does not contain an error on a point', 'is disused', 'decision of the tribunal does not contain an error on a point of law',
            'the appellants appeals are each dismissed', 'the decision by dismissing the appeal', 'i uphold the tribunal determination and dismiss the appeal'
            'this appeal is dismissed', 'appeals dismissed for all the appellants', 'decision of the tribunal does not disclose an error in law', 'tribunal does not show a material error on a point of law',
            'i uphold the tribunal determination and dismiss the appeal', 'these appeals are dismissed', 'determination does not disclose any material error of law',
            'tribunal did not involve the making of a material error on a point of law', 'the decision of the tribunal containing no material error of law shall stand',
            'decision of the tribunal does not disclose a material error on a point of law', 'there was no material error of law made by the judge',
            'is free of legal error accordingly it must stand', 'decision of the tribunal does not contain errors of law that decision shall stand',
            'the ftt judge did not err in law', 'i find no error of law', 'there is no error in law', 'no material error has been established',
            'no material error of law has been demonstrated', 'there being no material error of law in the decision', 'the appeals of the appellants are dismissed',
            'no material error of law is established in the decision', 'i dismissed the appellant appeal', 'i dismiss the appellant appeal', 'the decision of the tribunal contained a material error of law',
            'i dismiss the appellant appeal', 'there is no error of law in the judge findings', 'i find no material error of law', 'the appeal of the claimant is dismissed',
            'the decision of the original judge will stand', 'the appeal is to the upper tribunal is dismissed', 'did not involve the making of an error of law',
            'the appeal is therefore dismissed', 'is dismissed', 'is refused', 'the appellants appeals are dismissed', 'i remake the decision on the appeal dismissing it',
            'determination of the tribunal contains no error of law and it is upheld', 'the judge did not err in law', 'the appeal is as dismissed',
            'the decision of the tribunal contains no error of law and shall stand', 'the claim is therefore dismissed', 'i find that there is no valid appeal before the tribunal',
            'we have given we dismiss this appeal', 'this appeal must be dismissed', 'we conclude that no material error of law has been shown', 'there is no error of law',
            'decision of the tribunal contains no error of law']

        decision_labels_accept = ['i set it aside', 'did involve the making of an', 'decision of the tribunal is set aside', 'is set aside',
            'i allow the claimant', 'appeal remains allowed', 'the appeal is allowed', 'appeal is granted', 'the appeal is allowed', 'the appellant is granted',
            'i set aside the judge decision', 'i therefore set aside the decision', 'i set aside the decision', 'appeal allowed', 'appeal is allowed',
            'set aside the decision', 'the decision of the first-tier tribunal has already been set aside', 'i allow the claimant', 'appeals allowed',
            'i allow the appeal', 'is allowed', 'the first-tier tribunal erred in law', 'these appeals are allowed', 'the judge materially erred in law',
            'does not contain an error of law', 'the appeals are allowed', 'the tribunal erred in law', 'the decision of the tribunal contained an error of law',
            'involved the making of a material error of law', 'decision of the tribunal is tainted by material errors of law', 'i set the decision aside',
            'tribunal involved an error on a point of law', 'did involve a material error of law', 'the appellants are granted', 'there is no material error of law in the determination',
            'the judge erred in allowing this appeal', 'i remake the decision allowing the appellant appeal', 'did err in the making of', 'there are material errors of law',
            'is therefore set aside', 'the appellant is granted', 'there is a material error of law', 'i allow the appellant eea appeal', 'should be set aside', 'by allowing the appeal',
            'the tribunal decision involved the making of an error on a point of law', 'involved the making of an error on a point of law', 'by allowing the appellant appeal',
            'involved the making of an error of law', 'the decision of the tribunal does not contain errors of law and it is upheld', 'decision of the tribunal has been set aside',
            'the claimant appeal to the ftt is remade and allowed', 'tribunal was vitiated by legal error', 'discloses an error of law', 'we set aside that decision',
            'the appeal is remitted to the tribunal for a hearing afresh', 'contains a material error of law', 'the tribunal made errors of law', 'the determination of the tribunal does not disclose a material error of law',
            'the determination of the tribunal contained an error of law', 'the judge made an error on a point of law', 'the decision of the tribunal is hereby set aside for material error',
            'the tribunal judge made errors of law', 'the appeal against the judge decision is therefore allowed', 'i allow the claim for asylum and on human rights grounds',
            'the tribunal decision is vitiated by a material error of law', 'i find material error in law', 'the appellants appeals are allowed', 'the human rights appeals are allowed',
            'i therefore allow the appeal', 'does not disclose an error of law and stands', 'determination does contain a material error of law', 'we allow the appeal',
            'we have decided to allow this appeal', 'this appeal is accordingly allowed', 'both appeals are allowed', 'the appeals of the four appellants are allowed']

        # Check first evidence of reject, if reject be done with it

        for label in decision_labels_reject:
            if string != None and label in string:
                #print("Rejected!")
                flag = -1
                decision_label.append('Rejected')
                decision.update({'Decision label:': 'Rejected'})
                break
            else:
                'Not found!'
        if flag == -1:
            continue
        else:
            # Check evidence for Accept
            for label in decision_labels_accept:
                if string != None and label in string:
                    #print("Accepted!")
                    flag = 1
                    decision_label.append('Accepted')
                    decision.update({'Decision label:': 'Accepted'})
                    break
                else:
                    'Not found!'
        if flag == 0:
            decision_label.append('Neutral')
            decision.update({'Decision label:': 'Neutral'})

    #print(flag)
# The decision/ruling by the judge ('Decision:').

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)


100%|██████████| 35305/35305 [00:01<00:00, 28238.15it/s]


In [95]:
print(len(decision_text))
print(len(decision_label))

dict_sense_decisions = {'Decision text':decision_text,'Decision label':decision_label}

df_decision_sense = pd.DataFrame(dict_sense_decisions, columns=['Decision text','Decision label'])
df_decision_sense.isna().sum()
df_decision_sense.sum()

print(df_decision_sense)

df_decision_sense.describe()


30085
30085
                                           Decision text Decision label
0      notice of decision directions the decision of ...       Accepted
1      in light of my conclusions on that point littl...       Rejected
2      notice of decision the decision of the tribuna...       Rejected
3      notice of decision the decision of the tribuna...       Accepted
4      decision the decision of tribunal judge malcol...       Accepted
...                                                  ...            ...
30080  for the reasons we have given in paragraph we ...       Accepted
30081  conclusions a northern cyprus is not capable o...       Accepted
30082  for the reasons we have given we dismiss this ...       Rejected
30083  our conclusions on the general issues relating...       Rejected
30084  conclusions for each of the main reason a and ...       Accepted

[30085 rows x 2 columns]


Unnamed: 0,Decision text,Decision label
count,30085,30085
unique,24093,3
top,notice of decision the appeal is dismissed no,Accepted
freq,231,13711


In [96]:
df_decision_sense[['Decision label']].value_counts()


Decision label
Accepted          13711
Rejected          12778
Neutral            3596
dtype: int64

In [48]:
rslt_df = df_decision_sense[df_decision_sense['Decision label'] == 'Neutral']
rslt_df.to_clipboard()

In [None]:
for label in decision_labels_accept:
    idx_part = decision_labels_accept.index(label)
    if all_exist(decision_labels_accept[idx_part], example):
        flag = 1
        break
        #print('Accept')
    else:
        flag = 0
        #print('Reject')
        continue

# 7. Nationality of the appellant. 
The field country is empty to a large extent.

In [182]:
# List of countries and nationalities to be checked against the text
countries = ['Afghanistan', 'Aland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua', 
'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bolivia, Plurinational State of', 'Bonaire', 'Bonaire, Sint Eustatius and Saba', 
'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 
'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island', 
'Cocos (Keeling) Islands', 'Colombia', 'Comoros', 'Congo', 'Congo, The Democratic Republic of the', 'Congo', 'Cook Islands', 'Costa Rica', "Côte d'Ivoire", 
'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 
'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Falkland Islands (Malvinas)', 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 
'French Polynesia', 'French Southern Territories', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Greenland', 'Grenada', 
'Guadeloupe', 'Guam', 'Guatemala', 'Guernsey', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Heard Island and McDonald Islands', 
'Holy See (Vatican City State)', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran, Islamic Republic of', 'Iraq', 
'Ireland', 'Isle of Man', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jersey', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 
"Korea, Democratic People's Republic of", 'Korea, Republic of', 'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic", 'Latvia', 
'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macao', 'Macedonia, Republic of', 'Madagascar', 'Malawi', 
'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Martinique', 'Mauritania', 'Mauritius', 'Mayotte', 'Mexico', 'Micronesia', 
'Federated States of', 'Micronesia', 'Moldova, Republic of', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Montserrat', 'Morocco', 'Mozambique', 
'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Caledonia', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'Norfolk Island', 
'Northern Mariana Islands', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestinian Territory, Occupied', 'Palestine', 'Panama', 'Papua New Guinea', 
'Paraguay', 'Peru', 'Philippines', 'Pitcairn', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Réunion', 'Romania', 'Russian Federation', 'Russia', 
'Rwanda', 'Saint Barthélemy', 'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Martin (French part)', 
'Saint Pierre and Miquelon', 'Saint Vincent and the Grenadines', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 
'Seychelles', 'Sierra Leone', 'Singapore', 'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 
'South Georgia and the South Sandwich Islands', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'South Sudan', 'Svalbard and Jan Mayen', 'Swaziland', 
'Sweden', 'Switzerland', 'Syria', 'Syrian Arab Republic', 'Taiwan', 'Taiwan, Province of China', 'Tajikistan', 'Tanzania', 
'Tanzania, United Republic of', 'Thailand', 'Timor-Leste', 'Togo', 'Tokelau', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 
'Turks and Caicos Islands', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United States', 'United States Minor Outlying Islands', 'Uruguay', 
'Uzbekistan', 'Vanuatu', 'Venezuela', 'Venezuela, Bolivarian Republic of', 'Vietnam', 'Viet Nam', 'Virgin Islands, British', 'Virgin Islands, U.S.', 
'Wallis and Futuna', 'Yemen', 'Zambia', 'Zimbabwe', 'Pakistani', 'Iranian', 'Bangladeshi', 'Indian', 'Egyptian', 'Afghan', 'Albanian', 'Algerian', 
'American', 'Andorran', 'Angolan', 'Antiguans', 'Argentinean', 'Armenian', 'Australian', 'Austrian', 'Azerbaijani', 'Bahamian', 'Bahraini', 'Bangladeshi', 'Barbadian', 'Barbudans', 'Batswana', 'Belarusian', 'Belgian', 'Belizean', 'Beninese', 'Bhutanese', 'Bolivian', 'Bosnian', 'Brazilian', 'Bruneian', 'Bulgarian', 'Burkinabe', 'Burmese', 'Burundian', 'Cambodian', 'Cameroonian', 'Canadian', 'Cape Verdean', 'Central African', 'Chadian', 'Chilean', 'Chinese', 'Colombian', 'Comoran', 'Congolese', 'Costa Rican', 'Croatian', 'Cuban', 'Cypriot', 'Czech', 'Danish', 'Djibouti', 'Dominican', 'Dutch', 'Dutchman', 'Dutchwoman', 'East Timorese', 'Ecuadorean', 'Egyptian', 'Emirian', 'Equatorial Guinean', 'Eritrean', 'Estonian', 'Ethiopian', 'Fijian', 'Filipino', 'Finnish', 'French', 'Gabonese', 'Gambian', 'Georgian', 'German', 'Ghanaian', 'Greek', 'Grenadian', 'Guatemalan', 'Guinea-Bissauan', 'Guinean', 'Guyanese', 'Haitian', 'Herzegovinian', 'Honduran', 'Hungarian', 'I-Kiribati', 'Icelander', 'Indian', 'Indonesian', 'Iranian', 'Iraqi', 'Irish', 'Israeli', 'Italian', 'Ivorian', 'Jamaican', 'Japanese', 'Jordanian', 'Kazakhstani', 'Kenyan', 'Kittian and Nevisian', 'Kuwaiti', 'Kyrgyz', 'Laotian', 'Latvian', 'Lebanese', 'Liberian', 'Libyan', 'Liechtensteiner', 'Lithuanian', 'Luxembourger', 'Macedonian', 'Malagasy', 'Malawian', 'Malaysian', 'Maldivan', 'Malian', 'Maltese', 'Marshallese', 'Mauritanian', 'Mauritian', 'Mexican', 'Micronesian', 'Moldovan', 'Monacan', 'Mongolian', 'Moroccan', 'Mosotho', 'Motswana', 'Mozambican', 'Namibian', 'Nauruan', 'Nepalese', 'Netherlander', 'New Zealander', 'Ni-Vanuatu', 'Nicaraguan', 'Nigerian', 'Nigerien', 'North Korean', 'Northern Irish', 'Norwegian', 'Omani', 'Pakistani', 'Palauan', 'Panamanian', 'Papua New Guinean', 'Paraguayan', 'Peruvian', 'Polish', 'Portuguese', 'Qatari', 'Romanian', 'Russian', 'Rwandan', 'Saint Lucian', 'Salvadoran', 'Samoan', 'San Marinese', 'Sao Tomean', 'Saudi', 'Scottish', 'Senegalese', 'Serbian', 'Seychellois', 'Sierra Leonean', 'Singaporean', 'Slovakian', 'Slovenian', 'Solomon Islander', 'Somali', 'South African', 'South Korean', 'Spanish', 'Sri Lankan', 'Sudanese', 'Surinamer', 'Swazi', 'Swedish', 'Swiss', 'Syrian', 'Taiwanese', 'Tajik', 'Tanzanian', 'Thai', 'Togolese', 'Tongan', 'Trinidadian or Tobagonian', 'Tunisian', 'Turkish', 'Tuvaluan', 'Ugandan', 'Ukrainian', 'Uruguayan', 'Uzbekistani', 'Venezuelan', 'Vietnamese', 'Welsh', 'Yemenite', 'Zambian', 'Zimbabwean']

countriesLower = [x.lower() for x in countries]
#print(countriesLower)

In [183]:
# Store decisions in a list to make a df
nationalities = []
files_nationalities = []

# nlp sentence tokenizer with Stanford
nlp = stanza.Pipeline(lang = 'en', processors = 'tokenize', tokenize_no_ssplit = True)

# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):
    # Obtain the full text of the court decision
    string = decision.get('String')
    file_name = decision.get('File')
    files_nationalities.append(file_name)

    # Use only first third of text
    string = string[:len(string)//3]
    # All text in lower
    string = string.lower()

    # Apply stanford nlp
    doc = nlp(string)

    # List to store the ruling sentences
    catch = []

    # Make sentences
    for i, sentence in enumerate(doc.sentences):
        sente = [token.text for token in sentence.tokens]
        # Keep only the alpha tokens
        sente = [e for e in sente if e.isalpha()]
        #print(type(sente))
        catch.append(sente)
    # Look for partial hits in text
    nationality_leads_part = [['the', 'appellant', 'is', 'a', 'national', 'of'], ['the', 'appellant', 'is', 'a', 'citizen', 'of'],
    ['the', 'respondent', 'is', 'a', 'citizen', 'of'], ['the', 'appellants', 'are', 'all', 'citizens', 'of'], ['citizen', 'of'],
    ['national', 'of'], ['citizens', 'of']]
    # Nationality not yet found (flag = 0)
    flag = 0
    for element in catch:
        for part in nationality_leads_part:
            idx_part = nationality_leads_part.index(part)
            if sublist(nationality_leads_part[idx_part], element):
                #print(nationality_leads_part[idx_part])
                index = catch.index(element)
                # Nationality lead found in catch
                # Remove sentences before sentence with decision lead
                new_catch = catch[index]
                # flag2 = 1 when nationality is found in 1
                for token in new_catch:
                    #indx_country = countriesLower.index(country)
                    if sublist([token], countriesLower):
                        # Nationality found (flag = 1)
                        flag = 1
                        nationalities.append(token)
                        decision.update({'Nationality:': token})
                        #print(f'FOUND A NATIONALITY {token} in {file_name}')
                        break
                    else:
                        continue
                    break
            if flag == 1:
                break
        if flag == 1:
            break
    # If a decision has still not been found (flag = 0)
    if flag == 0:
        #print(f'Did not find a nationality {file_name} in catch: {catch}')
        nationalities.append(np.nan)
        decision.update({'Nationality:': np.nan})
        #print(f'Did not find a nationality in {catch}')
    else:
        continue

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)


2021-11-14 18:00:58 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-11-14 18:00:58 INFO: Use device: cpu
2021-11-14 18:00:58 INFO: Loading: tokenize
2021-11-14 18:00:58 INFO: Done loading processors!
100%|██████████| 35305/35305 [5:04:39<00:00,  1.93it/s]


The field "nationality" includes a mix of country names and nationalities. Nationalities are transformed to country names to harmonize the field. 

In [185]:
# Create dictionary with nationality key and country name as value
dict_nationality_csv = pd.read_csv('data/countries.csv', header = None, index_col = 4, squeeze = False).to_dict()
dict_nationality = dict_nationality_csv[3]
#print(dict_nationality)

# Dictionary from country name key to code3 as value
dict_country_code3 = pd.read_csv('data/countries.csv', header = None, index_col = 3, squeeze = False).to_dict()
dict_country = dict_country_code3[2]
#print(dict_country)

In [188]:
# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):
    # Obtain the full text of the court decision
    string = decision.get('Nationality:')
    #file_name = decision.get('File')
    if isinstance(string, float):
        continue
    else:
        # If we are dealing with a nationality and not a country name
        if string.capitalize() in dict_nationality:
            country = dict_nationality[string.capitalize()]
        else:
            # Simply capitalize the name of the country
            country = string.capitalize()
        # Update decision in dict
        decision.update({'Nationality:': country})
        #print(country)

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)


100%|██████████| 35305/35305 [00:00<00:00, 962063.83it/s]


Add a "country" field to the decision which includes the 3-digit code of the country of the subject.

In [191]:
# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data[:10]):
    # Obtain the full text of the court decision
    string = decision.get('Nationality:')
    #file_name = decision.get('File')
    if isinstance(string, float):
        continue
    else:
        if string in dict_country:
            # Get country code from dict_country
            country = dict_country[string]
        else:
            country = np.nan
        # Update decision in dict
        decision.update({'Country:': country})
        #print(country)

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)

100%|██████████| 10/10 [00:00<00:00, 131482.88it/s]


In [192]:
data[0]

{'Case title:': '',
 'Appellant name:': '',
 'Status of case:': 'Unreported',
 'Hearing date:': '27 Aug 2021',
 'Promulgation date:': '11 Oct 2021',
 'Publication date:': '26 Oct 2021',
 'Last updated on:': '26 Oct 2021',
 'Country:': 'PAK',
 'Judges:': ['Rintoul'],
 'Document': 'https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/73739/HU202322019.doc',
 'Reference': ['HU/20232/2019'],
 'Download': 'Yes',
 'File': 'HU202322019',
 'String': '\n                                    [pic]\nIAC-FH-CK-V1\n\nUpper Tribunal\n(Immigration and Asylum Chamber)    Appeal Number: HU/20232/2019\n\n\n                            THE IMMIGRATION ACTS\n\n\n|Heard at Field House                 |Decision & Reasons Promulgated       |\n|On 27 August 2021                    |On the 11th October 2021             |\n|Extempore                            |                                     |\n\n\n                                   Before\n\n                        UPPER TRIBUNAL JUDGE RI

In [184]:
print(len(files_nationalities))
print(len(nationalities))
#print(nationalities)
dict_nationalities = {'File':files_nationalities,'Nationality':nationalities}
df = pd.DataFrame(dict_nationalities, columns=['File','Nationality'])
df.isna().sum()

35305
35305


File               0
Nationality    12537
dtype: int64

# SANDBOX

In [None]:
# Deal with empty/corrupt files that didn't upload a sentence string
# Regex expression: What comes in between 'Before' and 'Between'
# regex = '(?<=Before)([\s\S]*?)(?=Between)'
regex = 'representation:([\S\s]*)for the respondent'
catch = re.search(regex, decision_string.lower())
#If the catch is successful
if catch :
    string = catch.group(0)
    delimiters = ['|', '?', ':']
    # Get rid of some table delimiters
    for i in delimiters:
        string = string.replace(i,'')
    # Remove leading and trailing spaces
    string = string.strip()
    print(string)

# Path to the txt documents
txt_path = './data/processed/txt_files_test/'
print(os.listdir(txt_path))
# Loop over each text file and extract Court information
for text in os.listdir(txt_path):
    print(text)

    with open(txt_path + text, 'r') as file:
        decision_string = file.read()
        # Regex expression: What comes after "Heard at" until hitting 3 balnks or new line
        #regex = '(?<=Heard at).*[^\S\r\n]{3,}'
        #regex = 'Before([\S\s]*)Between'
        regex = '(?<=Before)([\s\S]*?)(?=Between)'

        catch = re.search(regex, decision_string)
        #If the catch is successful
        if catch :
            string = catch.group(0)

            # Keep only alpha numeric
            string = string.replace('|','')
            #string = re.sub(r'[^A-Za-z0-9 ]+', '', string)
            # Remove leading and trailing spaces
            string = string.strip()
            print(string)
        else:
            continue


    # Loading string with court decision to data
for txt_file in  tqdm(os.listdir(txt_path)):
    
    # Open file and obtain string and file_name
    with open(txt_path + txt_file, 'r') as file:
        string = file.read()
        f_name, f_ext = os.path.splitext(file.name)
        head, file_name = os.path.split(f_name)
    # Search data list of dictionaries for dict where {"File":} = file_name
    for d in data:
        if d.get('File') == file_name:
            # Add dictionary key 'String' with value string
            d.update({'String': string})

In [175]:
# look for it in first half of string
# GPE Countries, cities, states.
# LOC Non-GPE locations, mountain ranges, bodies of water.
#
sp = spacy.load("en_core_web_sm")
# loop over every row in the 'Bio' column
for text in df['Bio'].tolist():
    # use spacy to extract the entities
    doc = sp(text)
    for ent in doc.ents:    
        # check if entity is equal 'LOC' or 'GPE'
        if ent.label_ in ['GPE']:
            print(ent.text, ent.label_)  