In [1]:
import pandas as pd
from pathlib import Path
import shutil
import random

# --- INPUT PATHS ---
# csv_in is the CSV that gets sampled; csv_out receives the sampled rows.
csv_in = Path(r"D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\combined_extracted_dataAICoded.csv")
csv_out = Path(r"D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\combined_extracted_dataAICHECKSAMPLE.csv")

# Folders that are searched (recursively) for the files belonging to the sampled rows.
batch_dirs = [
    Path(r"D:\vc-research\vc-research\Batch1"),
    Path(r"D:\vc-research\vc-research\Batch2"),
    Path(r"D:\vc-research\vc-research\Batch3"),
    Path(r"D:\vc-research\vc-research\Batch4"),
]
# Folder that receives the copied files and the filename list.
dest_dir = Path(r"D:\vc-research\vc-research\BatchAISAMPLE")
dest_dir.mkdir(parents=True, exist_ok=True)

# --- PARAMETERS ---
SAMPLE_N = 200
RANDOM_SEED = 45  # fixed seed => the same rows are drawn on every run; use None for a fresh random draw

# --- STEP 1: Read CSV, validate it, and randomly select SAMPLE_N rows ---
df = pd.read_csv(csv_in, dtype=str)  # read all as string to protect IDs
if len(df) < SAMPLE_N:
    raise ValueError(f"CSV has only {len(df)} rows; cannot sample {SAMPLE_N}.")
# Validate the schema BEFORE writing anything: the later steps need 'custom_id',
# and failing here avoids leaving a sample CSV on disk that has no matching file copies.
if "custom_id" not in df.columns:
    raise KeyError("Column 'custom_id' not found in the CSV.")

df_sample = df.sample(n=SAMPLE_N, random_state=RANDOM_SEED)
df_sample.to_csv(csv_out, index=False)

# --- STEP 2: Build filename list from 'custom_id' ---

def transform_id(s: str) -> str | None:
    if pd.isna(s):
        return None
    s = str(s)
    # remove first 8 chars and last 9 chars
    if len(s) < 17:  # must be at least 8 + 9
        return None
    return s[8:-9]

# Transform every sampled custom_id; drop ids that came back as None (missing or
# too short) and ids that reduce to an empty / whitespace-only name.
names = [n for n in map(transform_id, df_sample["custom_id"]) if n and n.strip()]
# Dedup & normalize case for matching on Windows
name_set = {n.strip().lower() for n in names}

# Also save the list for reference (sorted so the file is stable between runs)
names_txt = dest_dir / "sample_filenames_list.txt"
with names_txt.open("w", encoding="utf-8") as f:
    f.writelines(f"{n}\n" for n in sorted(name_set))

# --- STEP 3: Index all files in Batch1-Batch4 by stem (case-insensitive) ---
# Maps normalized stem -> list of file Paths, because the same name can appear
# in more than one directory (or with more than one extension).
from collections import defaultdict
index = defaultdict(list)

for bdir in batch_dirs:
    if not bdir.exists():
        # Say so instead of skipping silently: every name that lives only in this
        # folder will otherwise be reported as "missing" with no explanation.
        print(f"WARNING: Batch directory not found: {bdir}")
        continue
    for p in bdir.rglob("*"):
        if p.is_file():
            stem_norm = p.stem.lower()
            index[stem_norm].append(p)

# --- STEP 4: Copy matches to destination, avoiding overwrites ---
def safe_copy(src: Path, dst_folder: Path):
    """
    Copy `src` into `dst_folder` without ever overwriting an existing file.

    The copy keeps the source file name when that name is free. Otherwise the
    first free name of the form <stem>_1<ext>, <stem>_2<ext>, ... is used.
    The copy is done with shutil.copy2.

    Returns the Path the file was actually copied to.
    """
    target = dst_folder / src.name
    attempt = 0
    # Walk through numbered names until one is not taken yet.
    while target.exists():
        attempt += 1
        target = dst_folder / f"{src.stem}_{attempt}{src.suffix}"
    shutil.copy2(src, target)
    return target

copied = []   # destination paths of every file that was copied
missing = []  # normalized names for which no file was found in the index

# Iterate in sorted order: plain iteration over a set of strings can differ from
# one interpreter session to the next, which would make the copy order and the
# "First few" preview below change between otherwise identical runs.
for target_name in sorted(name_set):
    if target_name not in index:
        missing.append(target_name)
        continue
    # Copy every file that shares this stem (it may exist in several folders).
    for src_path in index[target_name]:
        dst_path = safe_copy(src_path, dest_dir)
        copied.append(dst_path)

# --- Optional: brief console report ---
print(f"Sampled {len(df_sample)} rows and saved to:\n  {csv_out}")
print(f"Filename list written to:\n  {names_txt}")
print(f"Copied {len(copied)} files to:\n  {dest_dir}")
if missing:
    print(f"{len(missing)} names had no matching files in Batch1–4. "
          f"First few: {missing[:10]}")


Sampled 200 rows and saved to:
  D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\combined_extracted_dataAICHECKSAMPLE.csv
Filename list written to:
  D:\vc-research\vc-research\BatchAISAMPLE\sample_filenames_list.txt
Copied 200 files to:
  D:\vc-research\vc-research\BatchAISAMPLE


In [2]:
# Yash stuff: copy the PDFs named in checkedlist.txt out of Batch1-4 into the YashPDFs folder
import shutil
from pathlib import Path

# --- INPUTS ---
# Text file read line by line in main(); each line is passed through
# transform_name() to obtain the PDF stem to look for.
checklist_path = Path(r"D:\vc-research\vc-research\Reese's contributions\reese data\checkedlist.txt")
# Directories that main() scans recursively for *.pdf files.
search_dirs = [
    Path(r"D:\vc-research\vc-research\Batch1"),
    Path(r"D:\vc-research\vc-research\Batch2"),
    Path(r"D:\vc-research\vc-research\Batch3"),
    Path(r"D:\vc-research\vc-research\Batch4"),
]
# Folder that receives the copied PDFs (created in main() if it does not exist).
# NOTE(review): this rebinds the module-level name `dest_dir` that the sampling
# cell above also assigns (to ...\BatchAISAMPLE). Re-running parts of that cell
# after this one would target this folder instead.
dest_dir = Path(r"D:\vc-research\vc-research\Reese's contributions\reese data\YashPDFs")

# --- HELPERS ---
def transform_name(raw_line: str) -> str | None:
    """
    For a line like 'checked_XXXX...YYYY', remove the leading 'checked_' (if present)
    and then drop the last 9 characters. Returns the resulting base name (no extension).
    """
    s = raw_line.strip()
    if not s:
        return None
    if s.startswith("checked_"):
        s = s[len("checked_"):]
    if len(s) < 9:
        return None  # can't drop last 9 safely
    s = s[:-9]
    # Remove a trailing extension if present; we will compare to PDF stems anyway.
    if "." in s:
        s = Path(s).stem
    return s

def unique_destination(base: Path) -> Path:
    """
    Return a path at which nothing exists yet, derived from `base`.

    `base` itself is returned when it is free. Otherwise _1, _2, ... is
    appended to the stem (the extension is kept) until an unused path is found.
    Nothing is created on disk; only existence is checked.
    """
    candidate = base
    attempt = 0
    while candidate.exists():
        attempt += 1
        candidate = base.with_stem(f"{base.stem}_{attempt}")
    return candidate

# --- MAIN ---
def main():
    """
    Copy every PDF named in the checklist from the search directories to dest_dir.

    Reads `checklist_path` line by line, converts each line to a stem with
    transform_name(), and matches those stems case-insensitively (casefold)
    against the stems of all *.pdf files found recursively under `search_dirs`.
    Every match is copied with shutil.copy2 into `dest_dir`; an existing file is
    never overwritten because the destination goes through unique_destination().

    Prints one line per copied file, followed by a summary of how many stems were
    requested, copied, missing, or matched by more than one source PDF.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)

    # Read and transform checklist lines into desired stems.
    # utf-8-sig strips a leading byte-order mark if the file has one.
    with checklist_path.open("r", encoding="utf-8-sig") as f:
        targets = [t for t in (transform_name(line) for line in f) if t]

    target_set = {t.casefold() for t in targets}  # case-insensitive matching

    # Build an index of all PDFs across search directories: {stem_lower: [paths]}
    stem_to_paths: dict[str, list[Path]] = {}
    for root in search_dirs:
        if not root.exists():
            print(f"WARNING: Search directory not found: {root}")
            continue
        for pdf in root.rglob("*.pdf"):
            # The glob matches on the name only; skip anything that is not a
            # regular file (e.g. a folder called "x.pdf"), which copy2 cannot copy.
            if not pdf.is_file():
                continue
            stem_to_paths.setdefault(pdf.stem.casefold(), []).append(pdf)

    # Copy matches
    copied = 0
    missing = []
    multi = 0

    # Sorted so the "Copied:" log comes out in the same order on every run
    # (iteration order of a set of strings is not stable across sessions).
    for stem_lower in sorted(target_set):
        matches = stem_to_paths.get(stem_lower, [])
        if not matches:
            missing.append(stem_lower)
            continue

        # If multiple PDFs share the same stem, copy them all (from different folders)
        if len(matches) > 1:
            multi += 1

        for src in matches:
            out = unique_destination(dest_dir / src.name)  # avoid overwriting if duplicate names
            shutil.copy2(src, out)
            copied += 1
            print(f"Copied: {src} -> {out}")

    print("\n--- SUMMARY ---")
    print(f"Requested stems: {len(target_set)}")
    print(f"PDFs copied:     {copied}")
    print(f"Stems missing:   {len(missing)}")
    if missing:
        # Show a few for quick inspection
        preview = ", ".join(sorted(missing)[:15])
        print(f"Missing (sample): {preview}{' ...' if len(missing) > 15 else ''}")
    print(f"Stems with multiple source PDFs: {multi}")

if __name__ == "__main__":
    main()


Copied: D:\vc-research\vc-research\Batch2\110_2010-02-09_Certificates of Incorporation.pdf -> D:\vc-research\vc-research\Reese's contributions\reese data\YashPDFs\110_2010-02-09_Certificates of Incorporation.pdf
Copied: D:\vc-research\vc-research\Batch2\188_2014-04-14_Certificates of Incorporation.pdf -> D:\vc-research\vc-research\Reese's contributions\reese data\YashPDFs\188_2014-04-14_Certificates of Incorporation.pdf
Copied: D:\vc-research\vc-research\Batch2\560_2010-05-28_Certificates of Incorporation.pdf -> D:\vc-research\vc-research\Reese's contributions\reese data\YashPDFs\560_2010-05-28_Certificates of Incorporation.pdf
Copied: D:\vc-research\vc-research\Batch2\640_2006-02-07_Certificates of Incorporation.pdf -> D:\vc-research\vc-research\Reese's contributions\reese data\YashPDFs\640_2006-02-07_Certificates of Incorporation.pdf
Copied: D:\vc-research\vc-research\Batch2\166_2003-11-05_Certificates of Incorporation.pdf -> D:\vc-research\vc-research\Reese's contributions\reese dat