# Information extraction (30th October 2021)

This notebook applies word2vec to the corpus of tribunal decisions.

In particular, the notebook does:

1. Data preparation for word2vec.

2. Implementation of word2vec.

3. Unsupervised clustering.

The resulting trained model is... .

This notebook should run in the tfm environment, which can be created with the environment.yml file.

In [1]:
from os import listdir
from os.path import isfile, join, getsize
import numpy as np

import time
import re
import json
import pickle
import pandas as pd
import whois
import sys
import datetime
from tqdm import tqdm
import textract
import gensim
import spacy
import scipy as sp
import sys

import gensim
from gensim import corpora
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import remove_stopwords

import logging

from smart_open import smart_open
import nltk
#nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer




IN_COLAB = 'google.colab' in sys.modules


# What environment am I using?
print(f'Current environment: {sys.executable}')

# Change the current working directory
os.chdir('/Users/albertamurgopacheco/Documents/GitHub/TFM')
# What's my working directory?
print(f'Current working directory: {os.getcwd()}')


Current environment: /Users/albertamurgopacheco/anaconda3/envs/tfm/bin/python
Current working directory: /Users/albertamurgopacheco/Documents/GitHub/TFM


In [2]:
# Define working directories in colab and local execution

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    docs_path = '/content/gdrive/MyDrive/TFM/data/raw'
    input_path = '/content/gdrive/MyDrive/TFM'
    output_path = '/content/gdrive/MyDrive/TFM/output'

else:
    docs_path = './data/raw'
    input_path = '.'
    output_path = './output'

# WORD2VEC

# 1. The data preparation

There are two categories of cases: the reported and the unreported ones. The reported cases include richer data while the unreported ones (the vast majority of cases) miss several data fields due to a request for annonimity from any of the parties involved in the legal dispute.

The first two letters in the file name seem to follow some logic. Inspecting the documents reveals the following meanings:

In [3]:
# Open jsonDataFinal file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)


corpus = []

corruptFiles = ['HU077022015', 'HU029682017']

# Search data list of dictionaries for dict where {"File":} = file_name
for d in tqdm(data):
    if d.get('File') not in corruptFiles:
        doc = d.get('String')
        if doc:
            corpus.append(doc)
        else:
            continue


print(len(corpus))
print(type(corpus[0]))



100%|██████████| 35308/35308 [00:00<00:00, 158620.44it/s]

35305
<class 'str'>





CLeaning each document

In [11]:


# Gensim-implemented filters for preprocessing data
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, 
strip_multiple_whitespaces, strip_non_alphanum, strip_numeric, remove_stopwords]


corpus_clean = [preprocess_string(s, CUSTOM_FILTERS) for s in corpus]

brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in corpus_clean)


    

In [12]:
#print(corpus_clean[0])
import multiprocessing
cores = multiprocessing.cpu_count()
print(cores)

4


In [13]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc ]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [14]:
from time import time 
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size = 50, n_process = cores -1)]
#txt = [cleaning(doc) for doc in nlp.pipe(corpus_clean, batch_size = 50, n_process = cores -1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

In [143]:
from gensim.models.phrases import Phrases, Phraser
sent = [row.split() for row in txt]



In [144]:
phrases = Phrases(sent, min_count = 30, progress_per = 10000)

In [145]:
from collections import defaultdict  # For word frequency
bigram = Phraser(phrases)

sentences = bigram[sent]

word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

20208

In [146]:
sorted(word_freq, key = word_freq.get, reverse=True)[:10]

["'",
 'appellant',
 'judge',
 'decision',
 'tribunal',
 'evidence',
 'appeal',
 'respondent',
 'family',
 'find']

In [147]:
import multiprocessing

from gensim.models import Word2Vec

w2v_model = Word2Vec(min_count = 20,
                     window = 2,
                     vector_size = 300,
                     sample = 6e-5, 
                     alpha = 0.03, 
                     min_alpha = 0.0007, 
                     negative = 20,
                     workers = cores-1)

In [148]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.21 mins


In [149]:
t = time()

w2v_model.train(sentences, total_examples = w2v_model.corpus_count, epochs = 30, report_delay = 1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 5.62 mins


In [150]:
w2v_model.init_sims(replace = True)

  w2v_model.init_sims(replace = True)


In [151]:
w2v_model.wv.most_similar(positive = ["judge"])

[('fttj', 0.5355463027954102),
 ('ftt', 0.5189005136489868),
 ('ij', 0.48209068179130554),
 ('finding', 0.4739972949028015),
 ('tier', 0.46665164828300476),
 ('tribunal', 0.4650040864944458),
 ('shergill', 0.44893842935562134),
 ('obhi', 0.44763654470443726),
 ('parke', 0.445686936378479),
 ('err', 0.4407684803009033)]

In [152]:
w2v_model.wv.most_similar(positive = ["refugee"])

[('stateless', 0.41961172223091125),
 ('subsidiary', 0.39647388458251953),
 ('humanitarian', 0.39330193400382996),
 ('hp', 0.37861526012420654),
 ('quo', 0.312267541885376),
 ('protection', 0.3078048527240753),
 ('eligible', 0.30390167236328125),
 ('cessation', 0.2974592447280884),
 ('qd', 0.2675744891166687),
 ('clause', 0.26459071040153503)]

In [117]:
w2v_model.wv.most_similar(positive=["error"])

[('material_error', 0.48005980253219604),
 ('material_outcome', 0.47925588488578796),
 ('error_law', 0.4521803855895996),
 ('vitiate', 0.4454037547111511),
 ('material_misdirection', 0.4356999099254608),
 ('infect', 0.4234890937805176),
 ('err', 0.4159732460975647),
 ('erroneous', 0.4065679907798767),
 ('flaw', 0.4028043746948242),
 ('misdirection_law', 0.3956824541091919)]

End of gensim tutorial

In [20]:

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace('{html}','') 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url = re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words = [stemmer.stem(w) for w in filtered_words]
    lemma_words = [lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)

#corpus_pre = [preprocess(d) for d in corpus]

# Step 2: Prepare Data (Remove stopwords and lemmatize)
data_processed = []

for i, doc in enumerate(corpus[:2]):
    doc_out = []
    for sentence in doc:
        doc_out = preprocess(doc)
    data_processed.append(doc_out)



KeyboardInterrupt: 

In [30]:
data_processed = []


for i, doc in enumerate(corpus):
    doc_out = []
    for wd in doc:
        if wd not in stopwords.words('english'):  # remove stopwords
            lemmatized_word = lemmatizer.lemmatize(wd)
            print(type(lemmatized_word))
            if lemmatized_word:
                doc_out = doc_out + [lemmatized_word[0].split(b'/')[0].decode('utf-8')]
        else:
            continue
    data_processed.append(doc_out)

print(data_processed[0][:5]) 

<class 'str'>


TypeError: must be str or None, not bytes

In [31]:
dataset = api.load("text8")
print(dataset)

<text8.Dataset object at 0x7fc7e8c82c10>


In [34]:
data = [d for d in dataset]
print(type(data[0][0]))

<class 'str'>


In [15]:
# Tokenize(split) the sentences into words
texts = [[text for text in doc.split()] for doc in corpus_pre]

# Create dictionary
dictionary = corpora.Dictionary(texts)

# Get information about the dictionary
print(dictionary)

Dictionary(1613 unique tokens: ['abandoned', 'ability', 'able', 'absence', 'accept']...)


In [17]:
# Show the word to id map
#print(dictionary.token2id)


create a bag of words corpus

In [18]:
# Tokenize the docs
tokenized_list = [simple_preprocess(doc) for doc in corpus_pre]

# Create the Corpus
mydict = corpora.Dictionary()
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list]
#pprint(mycorpus)

word_counts = [[(mydict[id], count) for id, count in line] for line in mycorpus]
#pprint(word_counts)

In [1]:
# Step 0: Import packages and stopwords


#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
#logging.root.setLevel(level=logging.INFO)
stop_words = stopwords.words('english')
stop_words = stop_words + ['com', 'edu', 'subject', 'lines', 'would', 'article', 'could']

print(stop_words)



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# 2. The court where the case was heard

An inspection of a sample of judicial decisions reveals that the name of the court is located in the first part of the document and it usually follows the expression "Heard at".

The strategy to capture this field will consist of a search using regular expressions. 

In [106]:
# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Loop over each text file and extract Court information
for decision in tqdm(data):
    # Obtain the text of the court decision
    decision_string = decision.get('String')

    # Regex expression: What comes after "Heard at" until hitting 3 balnks or new line
    #regex = '(?<=Heard at).*[^\S\r\n]{3,}'
    regex = 'Heard at(.*)[\S\r\n]| (?<=Heard at).*[^\S\r\n]{3,}'
    catch = re.search(regex, decision_string)

    # If the catch is successful
    if catch :
        string = catch.group(0)
        # Remove ':' if included in the catch
        string = string.replace(':','')
        # Remove leading and trailing spaces
        string = string.strip()
        # Avoids picking up parts of tables and '|'
        string = string.split('   ')
        string = string[0]
        # Remove 'Heard at' if included in the catch
        string = string.replace('Heard at ','')
        # Remove 'manually' some strings often included in the catch
        string = string.replace('|Decision & Reasons Promulgated','')
        string = string.replace('|Decision and Reasons Promulgated','')
        string = string.replace('| Decision & Reasons Promulgated','')
        string = string.replace('Decision Promulgated','')
        string = string.replace('|Decision & Reasons promulgated','')
        string = string.replace('|Determination Promulgated','')
        string = string.replace('Decision and Reasons Promulgated','')
        string = string.replace('|Decision & Reasons  Promulgated','')
        string = string.replace(' on 4 July 2003','')
        string = string.replace('Determination Promulgated','')
        string = string.replace('Decision & Reasons Promulgated','')
        string = string.replace('|Decisions and Reasons Promulgated','')
        string = string.replace('|Decision and Reasons','')
        string = string.replace('UT(IAC)','')
        string = string.replace('UT (IAC) ','')
        string = string.replace('Date of Hearing  9 December 2005','')
        string = string.replace(' | |SS (Risk-Manastry) Iran CG [2003] UKIAT 00035 |','')
        # Strip trailing characters found often
        string = string.rstrip(',')
        string = string.rstrip('|')
        # Remove leading and trailing spaces (again)
        string = string.strip()
        
    else:
        string = 'NA'
    
    #print(string)
    # Add dictionary key 'Heard at' with value string to the dictionary
    decision.update({'Heard at:': string})

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)

 30%|██▉       | 10481/35308 [00:01<00:03, 7523.58it/s] 


TypeError: expected string or bytes-like object

print(data[10481])

In [109]:
print(data[10481])

{'Case title:': '', 'Appellant name:': '', 'Status of case:': 'Unreported', 'Hearing date:': '6 Jul 2018', 'Promulgation date:': '17 Jul 2018', 'Publication date:': '1 Aug 2018', 'Last updated on:': '1 Aug 2018', 'Country:': '', 'Judges:': '', 'Document': 'https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/60888/HU029682017.docx', 'Reference': ['HU/02968/2017'], 'Download': 'Yes', 'File': 'HU029682017'}


In [105]:
# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

# Path to the txt documents
txt_path = './data/processed/txt_files/'

# Loop over each text file and extract Court information
for text in os.listdir(txt_path):

    with open(txt_path + text, 'r') as file:
        data = file.read()
        #print(data)

    # Regex expression: What comes after "Heard at" until hitting 3 balnks or new line
    #regex = '(?<=Heard at).*[^\S\r\n]{3,}'
    regex = 'Heard at(.*)[\S\r\n]| (?<=Heard at).*[^\S\r\n]{3,}'
    catch = re.search(regex, data)

    # If the catch is successful
    if catch :
        string = catch.group(0)
        # Remove ':' if included in the catch
        string = string.replace(':','')
        # Remove leading and trailing spaces
        string = string.strip()
        # Avoids picking up parts of tables and '|'
        string = string.split('   ')
        string = string[0]
        # Remove 'Heard at' if included in the catch
        string = string.replace('Heard at ','')
        # Remove 'manually' some strings often included in the catch
        string = string.replace('|Decision & Reasons Promulgated','')
        string = string.replace('|Decision and Reasons Promulgated','')
        string = string.replace('| Decision & Reasons Promulgated','')
        string = string.replace('Decision Promulgated','')
        string = string.replace('|Decision & Reasons promulgated','')
        string = string.replace('|Determination Promulgated','')
        string = string.replace('Decision and Reasons Promulgated','')
        string = string.replace('|Decision & Reasons  Promulgated','')
        string = string.replace(' on 4 July 2003','')
        string = string.replace('Determination Promulgated','')
        string = string.replace('Decision & Reasons Promulgated','')
        string = string.replace('|Decisions and Reasons Promulgated','')
        string = string.replace('|Decision and Reasons','')
        string = string.replace('UT(IAC)','')
        string = string.replace('UT (IAC) ','')
        string = string.replace('Date of Hearing  9 December 2005','')
        string = string.replace(' | |SS (Risk-Manastry) Iran CG [2003] UKIAT 00035 |','')
        # Strip trailing characters found often
        string = string.rstrip(',')
        string = string.rstrip('|')
        # Remove leading and trailing spaces (again)
        string = string.strip()
        
    else:
        string = 'NA'
    
    #print(string)
    # Add dictionary key 'Heard at' with value string to the dictionary
    data.update({'Heard at:': string})

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)

AttributeError: 'str' object has no attribute 'update'

In [104]:
# Open jsonData file as data
with open('./data/jsonDataFinal.json') as json_file:
    data = json.load(json_file)

print(data[190])

{'Case title:': '', 'Appellant name:': '', 'Status of case:': 'Unreported', 'Hearing date:': '27 Jul 2021', 'Promulgation date:': '13 Aug 2021', 'Publication date:': '1 Sep 2021', 'Last updated on:': '1 Sep 2021', 'Country:': '', 'Judges:': '', 'Document': 'https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/73329/HU208952018.doc', 'Reference': ['HU/20895/2018'], 'Download': 'Yes', 'File': 'HU208952018', 'String': '\n                                    [pic]\n\nUpper Tribunal\n(Immigration and Asylum Chamber)    Appeal Number: HU/20895/2018\n\n\n                            THE IMMIGRATION ACTS\n\n\n|Heard at Field House                          |Decision & Reasons Promulgated|\n|On Tuesday, 27 July 2021                      |On Friday 13, August 2021     |\n|                                              |                              |\n\n                                   Before\n\n                          UPPER TRIBUNAL JUDGE PITT\n\n\n                           

In [70]:
# Use regex on sample list
l =['00010_ukait_2009_gs_afghanistan_cg.txt', '00003_ukait_2008_aa_others_pakistan.txt', 
'IA411142014.txt', 'IA417362014___Others.txt', 'PA047742016.txt', 'PA053522017.txt',
'IA124652014.txt', 'IA125982015.txt', 'PA085102018.txt']

# Use regex on entire list
ll = os.listdir(txt_path)
print(len(ll))



def find_location(txt_file):



    return string

    # Loading string with court decision to data
for txt_file in  tqdm(os.listdir(txt_path)):
    
    # Open file and obtain string and file_name
    with open(txt_path + txt_file, 'r') as file:
        string = file.read()
        f_name, f_ext = os.path.splitext(file.name)
        head, file_name = os.path.split(f_name)
    # Search data list of dictionaries for dict where {"File":} = file_name
    for d in data:
        if d.get('File') == file_name:
            # Add dictionary key 'String' with value string
            d.update({'String': string})
            


Heard at Newport


# 3. The judges



100%|██████████| 35244/35244 [58:16<00:00, 10.08it/s]


# 4. The legal representation for the appellant and the respondent

The legal team consists of the representation for the appellant and the respondent.

# 5. The decision/ruling of the judge

# Adding the text of each decision to jsonData
A string with sentence text is added to each object in the list.

In [117]:
# Paths to jsonData & txt files
jsonData_path = os.path.join(os.getcwd(), 'data/jsonData.json')
txt_path = './data/processed/txt_files/'

# Open jsonData file as data
with open(jsonData_path) as json_file:
    data = json.load(json_file)

# Loading string with court decision to data
for txt_file in  tqdm(os.listdir(txt_path)):
    
    # Open file and obtain string and file_name
    with open(txt_path + txt_file, 'r') as file:
        string = file.read()
        f_name, f_ext = os.path.splitext(file.name)
        head, file_name = os.path.split(f_name)
    # Search data list of dictionaries for dict where {"File":} = file_name
    for d in data:
        if d.get('File') == file_name:
            # Add dictionary key 'String' with value string
            d.update({'String': string})


100%|██████████| 35087/35087 [05:32<00:00, 105.43it/s]


In [121]:
# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w') as fout:
    json.dump(data, fout)



ASYLUM AND IMMIGRATION TRIBUNAL

AA and Others (Highly skilled migrants: legitimate expectation) Pakistan
[2008] UKAIT 00003

                            THE IMMIGRATION ACTS

Heard at: Field House                             Date of Hearing:  22
October 2007

                                   Before

    Miss E Arfon-Jones DL, Deputy President of the Asylum and Immigration
                                  Tribunal
                       Senior Immigration Judge Grubb

                                   Between

                                                                  Appellants
                                     and

                 SECRETARY OF STATE FOR THE HOME DEPARTMENT
                                                                  Respondent

                                                                   Appellant
                                     and

                 SECRETARY OF STATE FOR THE HOME DEPARTMENT


                                         

In [97]:
print(f'Current working directory: {os.getcwd()}')

# Open jsonData file
jsonData_path = os.path.join(os.getcwd(), 'data/jsonData.json')
with open(jsonData_path) as json_file:
    data = json.load(json_file)
    print(json.dumps(data[32554], indent = 4, sort_keys = True))

#parsed = json.loads(jsonData)
#print(json.dumps(parsed[16366], indent = 4, sort_keys = True))

Current working directory: /Users/albertamurgopacheco/Documents/GitHub/TFM
{
    "Appellant name:": "",
    "Case title:": "",
    "Country:": "",
    "Document": "https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/39898/DA000192013.doc",
    "Download": "Yes",
    "Hearing date:": "",
    "Judges:": "",
    "Last updated on:": "4 Dec 2013",
    "Promulgation date:": "23 Oct 2013",
    "Publication date:": "4 Dec 2013",
    "Reference": [
        "DA/00019/2013"
    ],
    "Status of case:": "Unreported"
}


In [110]:
import docx2txt

# AA036502013.doc
# AA036222013.doc
# DA000192013.doc

result = textract.process(os.path.join( '/Users/albertamurgopacheco/Documents/GitHub/TFM/data/raw/HU029682017.docx'))
print(result)
print(file)
print(dest_file_name)


txt_path = './data/processed/txt_files/'
with open(txt_path + '00003_ukait_2008_aa_others_pakistan.txt', 'r') as file:
    data = file.read()
    print(data)
#f = open('./data/processed/txt_files/00003_ukait_2008_aa_others_pakistan.txt', "r")


BadZipFile: File is not a zip file

In [112]:
type(jsonData)

_io.TextIOWrapper

# Information extraction

In [23]:
# Regex per trobar jutges
Before([\S\s]*)Between?





# Regex for the Appellant
For the Appellant: ([\S\s]*)For the Respondent

# Regex for the Respondent
For the Respondent: (.*)\n\n 
# OR
For the Respondent: (.*)


Decisions 
https://docs.microsoft.com/en-us/dotnet/api/system.text.regularexpressions.match?view=net-5.0

In [None]:
# Extraction of decisions

https://research.iclr.co.uk/blog/blackstone-goes-live


# keyword extraction for issues in strategies
https://www.airpair.com/nlp/keyword-extraction-tutorial


In [117]:
# Decision extraction

f = open('./data/processed/txt_files/00003_ukait_2008_aa_others_pakistan.txt', "r")
for number, paragraph in enumerate(f.read().split("\n\n"), 1):
    print(number)
    print(paragraph)
    pattern = "Decisions"
    if paragraph.find(pattern) != -1:
        print("save to file") 
    else:
        print("don't save to file")

        

1

ASYLUM AND IMMIGRATION TRIBUNAL
don't save to file
2
AA and Others (Highly skilled migrants: legitimate expectation) Pakistan
[2008] UKAIT 00003
don't save to file
3
                            THE IMMIGRATION ACTS
don't save to file
4
Heard at: Field House                             Date of Hearing:  22
October 2007
don't save to file
5
                                   Before
don't save to file
6
    Miss E Arfon-Jones DL, Deputy President of the Asylum and Immigration
                                  Tribunal
                       Senior Immigration Judge Grubb
don't save to file
7
                                   Between
don't save to file
8
                                                                  Appellants
                                     and
don't save to file
9
                 SECRETARY OF STATE FOR THE HOME DEPARTMENT
                                                                  Respondent
don't save to file
10
                                       

In [116]:
import spacy

# Load the model
nlp = spacy.load("en_blackstone_proto")



OSError: [E053] Could not read config.cfg from /Users/albertamurgopacheco/anaconda3/envs/tfm/lib/python3.8/site-packages/en_blackstone_proto/en_blackstone_proto-0.0.1/config.cfg