Run these one at a time

In [18]:
#!/usr/bin/env python
"""
Find up to five files that appear in BOTH the “perturbed_*.json” set and the
“modified_*.txt” set of every category folder that contains such files
(names are compared after stripping the prefixes and the extensions).

Results are returned in:
    json_common_file_names  – the JSON‑side basenames (with prefix & ext)
    txt_common_file_names   – the TXT‑side basenames (with prefix & ext)
"""

import os
from pathlib import Path
from typing import List, Tuple

# --- settings ---------------------------------------------------------------
# Folder whose immediate sub-directories are treated as the categories to scan.
ROOT_DIR = Path(r"C:\Users\user\Programming\temporary\InText_Doc_Location")
# Prefix/extension pairs used both to recognise candidate files and to strip
# their names down to a comparable canonical form.
JSON_PREFIX, JSON_EXT = "perturbed_", ".json"
TXT_PREFIX,  TXT_EXT  = "modified_", ".txt"
# Intended maximum number of common files to report.
TARGET_COUNT = 5
# ---------------------------------------------------------------------------

# ---------------------------------------------------------------------------
#  UNIVERSAL collectors: keep a file only if it exists in **all** categories
# ---------------------------------------------------------------------------
from pathlib import Path
from typing import Dict, List, Set

# NOTE: JSON_PREFIX/JSON_EXT and TXT_PREFIX/TXT_EXT are defined exactly once,
# in the settings section above; they used to be repeated here, which made it
# possible for the two copies to drift apart.


def canonical_name(fname: str, prefix: str, ext: str) -> str:
    """
    Strip *prefix* from the front and *ext* from the end of *fname*.

    The stripping is purely positional: ``len(prefix)`` leading characters and
    ``len(ext)`` trailing characters are removed without checking that they
    actually match, so callers are expected to have tested ``startswith`` /
    ``endswith`` beforehand.

    An empty *ext* leaves the tail of the name untouched (a plain
    ``fname[len(prefix):-len(ext)]`` would slice to ``-0`` and always yield
    an empty string).
    """
    stem = fname[len(prefix):]
    # Guard the zero-length case: stem[:-0] == stem[:0] == "".
    return stem[:-len(ext)] if ext else stem


def _per_category_sets(root: Path) -> tuple[Dict[str, Set[str]], Dict[str, Set[str]]]:
    """
    Scan every sub-directory of *root* and collect canonical file names.

    Two dictionaries keyed by the sub-directory (category) name are returned:

        first  -> canonical names of the JSON_PREFIX…JSON_EXT files that sit
                  directly inside the category folder
        second -> canonical names of the TXT_PREFIX…TXT_EXT files inside the
                  category's ``modified_files`` sub-folder (an empty set when
                  that sub-folder does not exist)

    Canonical names have the prefix and extension stripped, so entries from
    the two dictionaries can be compared with each other directly.
    """
    json_by_category: Dict[str, Set[str]] = {}
    txt_by_category: Dict[str, Set[str]] = {}

    for category in root.iterdir():
        if not category.is_dir():
            continue

        # JSON files live directly in the category folder.
        json_names: Set[str] = set()
        for entry in category.iterdir():
            if (
                entry.is_file()
                and entry.name.startswith(JSON_PREFIX)
                and entry.name.endswith(JSON_EXT)
            ):
                json_names.add(canonical_name(entry.name, JSON_PREFIX, JSON_EXT))
        json_by_category[category.name] = json_names

        # TXT files live one level deeper, in <category>/modified_files/.
        txt_names: Set[str] = set()
        modified_dir = category / "modified_files"
        if modified_dir.is_dir():
            for entry in modified_dir.iterdir():
                if (
                    entry.is_file()
                    and entry.name.startswith(TXT_PREFIX)
                    and entry.name.endswith(TXT_EXT)
                ):
                    txt_names.add(canonical_name(entry.name, TXT_PREFIX, TXT_EXT))
        txt_by_category[category.name] = txt_names

    return json_by_category, txt_by_category


def collect_universal_json_and_txt(root: Path) -> tuple[List[str], List[str]]:
    """
    Return the basenames of files that are common to the category folders.

    A canonical name (prefix and extension stripped) is kept only if it is
    present in the JSON set of every category that has at least one matching
    JSON file AND in the TXT set of every category that has at least one
    matching TXT file. Categories with no matching files of a kind are
    ignored for that kind rather than emptying the result.

    Returns
    -------
    json_common_file_names, txt_common_file_names :
        Two parallel lists of basenames (with their original prefix and
        extension restored), sorted by canonical name. Both lists are empty
        when no category contains any matching JSON files or any matching
        TXT files.
    """
    json_keys, txt_keys = _per_category_sets(root)

    json_sets = [names for names in json_keys.values() if names]
    txt_sets = [names for names in txt_keys.values() if names]

    # set.intersection() needs at least one argument; calling it with an
    # empty argument list raises TypeError, so bail out early instead.
    if not json_sets or not txt_sets:
        return [], []

    # Only keep names that survive BOTH universality tests.
    universal = set.intersection(*json_sets) & set.intersection(*txt_sets)

    # Every canonical key was produced by stripping the fixed prefix and
    # extension from a name that starts/ends with them, so the original
    # basename can be rebuilt directly; no second directory scan is needed.
    ordered_keys = sorted(universal)
    json_common_file_names = [f"{JSON_PREFIX}{key}{JSON_EXT}" for key in ordered_keys]
    txt_common_file_names = [f"{TXT_PREFIX}{key}{TXT_EXT}" for key in ordered_keys]

    return json_common_file_names, txt_common_file_names


def find_common_lists(root: Path, target_count: int = 5):
    """
    Return the first *target_count* common JSON and TXT basenames under *root*.

    The full lists come from ``collect_universal_json_and_txt``; both are cut
    with the same slice, and the result is the pair
    ``(json_common_file_names, txt_common_file_names)``.
    """
    all_json, all_txt = collect_universal_json_and_txt(root)
    head = slice(None, target_count)
    return all_json[head], all_txt[head]


# ----------------------------- main / demo ----------------------------------
if __name__ == "__main__":
    # Pass the configured cap explicitly so that editing TARGET_COUNT in the
    # settings section actually changes how many names are returned.
    json_common_file_names, txt_common_file_names = find_common_lists(
        ROOT_DIR, TARGET_COUNT
    )

    print("json_common_file_names =", json_common_file_names)
    print("txt_common_file_names  =", txt_common_file_names)


json_common_file_names = ['perturbed_ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt.json', 'perturbed_BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt.json', 'perturbed_CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt.json', 'perturbed_DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt.json', 'perturbed_Freecook_20180605_S-1_EX-10.3_11233807_EX-10.3_HostingAgreement.txt.json']
txt_common_file_names  = ['modified_ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt.txt', 'modified_BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt.txt', 'modified_CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt.txt', 'modified_DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt.txt', 'modified_Freecook_20180605_S-1_EX-10.3_11233807_EX-10.3_HostingAgreement.txt.txt']


In [19]:
import shutil
from pathlib import Path
from typing import Iterable, List, Tuple

def copy_selected_files(
    file_names: Iterable[str],
    src_dir: str | Path,
    dst_dir: str | Path
) -> Tuple[List[Path], List[Path]]:
    """
    Copy the files listed in *file_names* from *src_dir* to *dst_dir*.

    • *src_dir*   – folder that already contains the files (searched non‑recursively).
    • *dst_dir*   – folder that will receive the copies; created automatically
                    (including any parent directories) if it doesn’t exist.

    Returns
    -------
    copied, skipped : two lists with the Path objects that were copied
                      and those that were skipped because they already existed.
    """
    src_path = Path(src_dir).expanduser().resolve()
    dst_path = Path(dst_dir).expanduser().resolve()
    dst_path.mkdir(parents=True, exist_ok=True)

    copied: List[Path] = []
    skipped: List[Path] = []

    for name in file_names:
        source_file = src_path / name
        target_file = dst_path / name
        try:
            if not target_file.exists():
                shutil.copy2(source_file, target_file)
                copied.append(target_file)
            else:
                skipped.append(target_file)
        except FileNotFoundError:
            # If the file isn’t present in src_dir we just continue.
            skipped.append(source_file)

    return copied, skipped

# json_common_file_names, txt_common_file_names

# copied, skipped = copy_selected_files(
#     json_common_file_names,
#     src_dir=r"",
#     dst_dir=r""
# )

# print("Copied:", copied)

# One entry per copy run: (basenames to copy, source folder, destination folder).
# The answer JSON files of every category are copied first, then the modified
# TXT documents, in the same order as the original sequence of calls.
copy_jobs = [
    (
        json_common_file_names,
        r"C:\Users\user\Programming\temporary\InText_Doc_Location\ambiguity_InText",
        r"C:\Users\user\Desktop\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\answers_v2\ambiguity_inText",
    ),
    (
        json_common_file_names,
        r"C:\Users\user\Programming\temporary\InText_Doc_Location\inconsistencies_InText",
        r"C:\Users\user\Desktop\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\answers_v2\inconsistencies_inText",
    ),
    (
        json_common_file_names,
        r"C:\Users\user\Programming\temporary\InText_Doc_Location\misaligned_term_InText",
        r"C:\Users\user\Desktop\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\answers_v2\misaligned_terminalogy_inText",
    ),
    (
        json_common_file_names,
        r"C:\Users\user\Programming\temporary\InText_Doc_Location\omission_InText",
        r"C:\Users\user\Desktop\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\answers_v2\omissions_inText",
    ),
    (
        json_common_file_names,
        r"C:\Users\user\Programming\temporary\InText_Doc_Location\structural_flaws_InText",
        r"C:\Users\user\Desktop\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\answers_v2\structural_flaws_inText",
    ),
    (
        txt_common_file_names,
        r"C:\Users\user\Programming\temporary\InText_Doc_Location\structural_flaws_InText\modified_files",
        r"C:\Users\user\Desktop\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\documents_v2\structural_flaws_inText",
    ),
    (
        txt_common_file_names,
        r"C:\Users\user\Programming\temporary\InText_Doc_Location\omission_InText\modified_files",
        r"C:\Users\user\Desktop\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\documents_v2\omissions_inText",
    ),
    (
        txt_common_file_names,
        r"C:\Users\user\Programming\temporary\InText_Doc_Location\misaligned_term_InText\modified_files",
        r"C:\Users\user\Desktop\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\documents_v2\misaligned_terminalogy_inText",
    ),
    (
        txt_common_file_names,
        r"C:\Users\user\Programming\temporary\InText_Doc_Location\inconsistencies_InText\modified_files",
        r"C:\Users\user\Desktop\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\documents_v2\inconsistencies_inText",
    ),
    (
        txt_common_file_names,
        r"C:\Users\user\Programming\temporary\InText_Doc_Location\ambiguity_InText\modified_files",
        r"C:\Users\user\Desktop\Legal-Document-Discrepancy-Benchmark-Dataset\mini-eval\documents_v2\ambiguity_inText",
    ),
]

# Run the jobs in order, reporting what each one copied.
for job_names, job_src, job_dst in copy_jobs:
    copied, skipped = copy_selected_files(
        job_names,
        src_dir=job_src,
        dst_dir=job_dst
    )

    print("Copied:", copied)



Copied: [WindowsPath('C:/Users/user/Desktop/Legal-Document-Discrepancy-Benchmark-Dataset/mini-eval/answers_v2/ambiguity_inText/perturbed_ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt.json'), WindowsPath('C:/Users/user/Desktop/Legal-Document-Discrepancy-Benchmark-Dataset/mini-eval/answers_v2/ambiguity_inText/perturbed_BORROWMONEYCOM,INC_06_11_2020-EX-10.1-JOINTVENTUREAGREEMENT.txt.json'), WindowsPath('C:/Users/user/Desktop/Legal-Document-Discrepancy-Benchmark-Dataset/mini-eval/answers_v2/ambiguity_inText/perturbed_CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEBSITEHOSTINGAGREEMENT.txt.json'), WindowsPath('C:/Users/user/Desktop/Legal-Document-Discrepancy-Benchmark-Dataset/mini-eval/answers_v2/ambiguity_inText/perturbed_DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt.json'), WindowsPath('C:/Users/user/Desktop/Legal-Document-Discrepancy-Benchmark-Dataset/mini-eval/answers_v2/ambiguity_inText/perturbed_Freecook_20180605_S-1_EX-10.3_11233807_EX-10.3_HostingAgr