In [1]:
import nltk, re, pprint
from nltk import word_tokenize, sent_tokenize
import os
import json
import xml.etree.ElementTree as ET
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import find_corpus_fileids


In [4]:
class CustomCorpusReader(CorpusReader):
    def __init__(self, root):
        super().__init__(root, fileids=None)
        self.annotations_dir = os.path.join(root, 'annotations')
        self.cases_dir = os.path.join(root, 'cases')

    def annotations(self):
        annotations = {}
        for fname in os.listdir(self.annotations_dir):
            if fname.endswith('.json'):
                fpath = os.path.join(self.annotations_dir, fname)
                with open(fpath, 'r', encoding='utf-8') as f:
                    annotations[fname] = json.load(f)
        return annotations

    def case_files(self):
        """Recursively walk through group/year folders and parse XML files."""
        cases = {}
        for root, _, files in os.walk(self.cases_dir):
            for file in files:
                if file.endswith('.xml'):
                    full_path = os.path.join(root, file)
                    try:
                        tree = ET.parse(full_path)
                        root_elem = tree.getroot()

                        # optional: create a case ID from its path
                        rel_path = os.path.relpath(full_path, self.cases_dir)
                        case_id = rel_path.replace(os.sep, '_').replace('.xml', '')

                        cases[case_id] = root_elem
                    except ET.ParseError as e:
                        print(f"Error parsing {full_path}: {e}")
        return cases
    
    def extract_text(self, xml_root):
        """Extract all text from an XML tree, ignoring tags."""
        return ' '.join(xml_root.itertext())

    def case_text_stats(self):
        """
        Return a dict of sentence and word counts per case.
        Structure: { case_id: {'sentences': x, 'words': y} }
        """
        stats = {}
        case_xml = self.case_files()
        
        for case_id, root in case_xml.items():
            raw_text = self.extract_text(root)
            sentences = sent_tokenize(raw_text)
            words = word_tokenize(raw_text)
            
            stats[case_id] = {
                'sentences': len(sentences),
                'words': len(words)
            }

        return stats

In [5]:
reader_clc = CustomCorpusReader('/Users/arniexx/Desktop/datasets & nlp/nlp final project/CLCmini/corpus')
stats_clc = reader_clc.case_text_stats()

# print all sentence and word counts, 15 cases/xml files
print("Case Text Statistics:\n")
for case_id, counts in sorted(stats_clc.items()):
    print(f"{case_id}: {counts['sentences']} sentences, {counts['words']} words")

Case Text Statistics:

EWCA-Civ_1892_-ew-cases-EWCA-Civ-1892-1: 192 sentences, 6720 words
EWCA-Civ_1977_-ew-cases-EWCA-Civ-1977-6: 416 sentences, 9296 words
EWCA-Civ_2020_-ew-cases-EWCA-Civ-2020-1605: 400 sentences, 12648 words
EWCA-Crim_2005_-ew-cases-EWCA-Crim-2005-2826: 354 sentences, 9439 words
EWHC-Admin_2016_-ew-cases-EWHC-Admin-2016-2768: 464 sentences, 19549 words
EWHC-Admin_2019_-ew-cases-EWHC-Admin-2019-2341: 1057 sentences, 34170 words
EWHC-Exch_1845_-ew-cases-EWHC-Exch-1845-J83: 240 sentences, 7781 words
UKEAT_2017_-uk-cases-UKEAT-2017-0334_16_3107: 323 sentences, 9777 words
UKEAT_2019_-uk-cases-UKEAT-2019-0149_18_2506: 127 sentences, 4029 words
UKHL_1932_-uk-cases-UKHL-1932-100: 590 sentences, 24717 words
UKHL_1999_-uk-cases-UKHL-1999-52: 468 sentences, 13433 words
UKSC_2016_-uk-cases-UKSC-2016-14: 348 sentences, 12330 words
UKSC_2018_-uk-cases-UKSC-2018-49: 477 sentences, 15294 words
UKSC_2019_-uk-cases-UKSC-2019-41: 457 sentences, 12411 words
UKVAT_2004_-uk-cases-UKVAT-2

In [6]:
def clean_text(text):
    # remove special characters (keep basic punctuation)
    text = re.sub(r"[^a-zA-Z0-9\s.,;:!?'\"]+", '', text)
    
    # normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def export_combined_txt(reader, output_path):
    with open(output_path, 'w', encoding='utf-8') as f:
        for case_id, xml_root in reader.case_files().items():
            text = reader.extract_text(xml_root)
            cleaned = clean_text(text)
            f.write(f"--- {case_id} ---\n")
            f.write(cleaned + "\n\n")

export_combined_txt(reader_clc, '/Users/arniexx/Desktop/datasets & nlp/nlp final project/corpus_combined.txt')

In [18]:
import spacy
nlp = spacy.load("en_core_web_sm")
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [19]:
# read the corpus
with open("corpus_combined.txt", "r", encoding="utf-8") as file:
    text = file.read()

# process the text
doc = nlp(text)

# iterate over sentences and print POS-tagged tokens
for sent in doc.sents:
    print(f"\nSentence: {sent.text}")
    for token in sent:
        print(f"{token.text}\t{token.pos_}\t{token.tag_}\t{token.dep_}")


Sentence: --- EWHC-Exch_1845_-ew-cases-EWHC-Exch-1845-J83 ---
79c54aa6739044a6aa8e186e64b8c176 html England and Wales High Court Exchequer Court 18450222 153 ER 351 1845 13 M W 838 1845 EWHC Exch J83 England and Wales High Court Exchequer Court Decisions England and Wales High Court Exchequer Court Decisions Wood v Leadbitter 1845 EWHC Exch J83 22 February 1845 153 ER 351, 1845 13 M W 838, 1845 EWHC Exch J83 Neutral Citation Number: 1845 EWHC Exch J83 1845 13 M W 838; 153 ER 351 IN THE COURT OF EXCHEQUER 22 February 1845 Between: WOOD v LEADBITTER Trespass for assault and false imprisonment.
---	PUNCT	:	punct
EWHC	PROPN	NNP	nmod
-	PUNCT	HYPH	punct
Exch_1845_-ew	VERB	VBN	nummod
-	PUNCT	HYPH	punct
cases	NOUN	NNS	prep
-	PUNCT	HYPH	punct
EWHC	NOUN	NN	compound
-	PUNCT	HYPH	punct
Exch-1845	NOUN	NN	compound
-	PUNCT	HYPH	punct
J83	PROPN	NNP	appos
---	PUNCT	:	punct

	SPACE	_SP	dep
79c54aa6739044a6aa8e186e64b8c176	NUM	CD	nummod
html	NOUN	NN	nmod
England	PROPN	NNP	nmod
and	CCONJ	CC	cc
Wales	PROP

In [30]:
# loading in a model w vectors + defining anchor phrases
nlp2 = spacy.load("en_core_web_md")
anchor_phrases = [
    "in accordance with",
    "on behalf of",
    "in respect of",
    "by virtue of",
    "in relation to",
    "with regard to"
]

anchor_docs = [nlp2(p) for p in anchor_phrases]

In [31]:
# extracting candidate phrases from the text
from itertools import islice

def get_candidate_phrases(doc, min_len=3, max_len=5):
    candidates = []
    for i in range(len(doc)):
        for size in range(min_len, max_len + 1):
            span = doc[i:i+size]
            if len(span) < size:
                continue
            if any(token.pos_ == "ADP" for token in span):
                candidates.append(span)
    return candidates

In [32]:
# load the text
with open("corpus_combined.txt", "r", encoding="utf-8") as f:
    text = f.read()

doc = nlp2(text)
candidates = get_candidate_phrases(doc)

similar_matches = []

# compare candidates to anchors
for span in candidates:
    for anchor_doc in anchor_docs:
        similarity = span.similarity(anchor_doc)
        if similarity > 0.85:  # Tune this threshold
            similar_matches.append((span.text, similarity))


In [33]:
for phrase, score in sorted(similar_matches, key=lambda x: -x[1]):
    print(f"Matched: '{phrase}' (Similarity: {score:.2f})")

Matched: 'By virtue of' (Similarity: 1.00)
Matched: 'By virtue of' (Similarity: 1.00)
Matched: 'by consideration of' (Similarity: 1.00)
Matched: 'in respect of' (Similarity: 1.00)
Matched: 'by virtue of' (Similarity: 1.00)
Matched: 'in relation to' (Similarity: 1.00)
Matched: 'on behalf of' (Similarity: 1.00)
Matched: 'on behalf of' (Similarity: 1.00)
Matched: 'on behalf of' (Similarity: 1.00)
Matched: 'in relation to' (Similarity: 1.00)
Matched: 'in relation to Ajaz' (Similarity: 1.00)
Matched: 'on behalf of' (Similarity: 1.00)
Matched: 'in accordance with' (Similarity: 1.00)
Matched: 'in relation to' (Similarity: 1.00)
Matched: 'in relation to' (Similarity: 1.00)
Matched: 'in relation to' (Similarity: 1.00)
Matched: 'with regard to' (Similarity: 1.00)
Matched: 'in accordance with' (Similarity: 1.00)
Matched: 'in accordance with' (Similarity: 1.00)
Matched: 'in respect of' (Similarity: 1.00)
Matched: 'in accordance with' (Similarity: 1.00)
Matched: 'in accordance with' (Similarity: 1.