## Inspect files to get total counts, unique specimens, and check regex matching
---
*Last edited 23 Oct 2025 by K. Wolcott*

In [18]:
# Read in data from latent codes for training attempt
import os, re, json, numpy as np, pandas as pd

# Load config
train_dir = 'run_v30'
config_path = train_dir + '/model_params_config.json'
try:
    with open(config_path, 'r') as f:
        cfg = json.load(f)
    print(f"\033[92mLoaded config from {config_path}\033[0m")
except FileNotFoundError:
    print(f"\033[31mError: model_params_config.json not found in {config_path}. Exiting.\033[0m")

# Get training paths from config
train_paths = cfg['list_mesh_paths']
all_vtk_files = [os.path.basename(f) for f in train_paths]

# Parse species and vertebra info from filenames (e.g. "H-capensis-22-L8.ply_align.vtk").
pat = re.compile(r"^(?P<species>[\w\s\-]+)[\-_ ]+\d+[\-_ ]+(?P<vertebra>[CTL]\d+)", re.IGNORECASE)
labels = []
unmatched_files = []
for f in all_vtk_files:
    fname = os.path.basename(f)
    m = pat.match(fname)
    if m:
        species = m.group("species").strip()
        vertebra = m.group("vertebra").strip()  # e.g. "C2", "L8"
        labels.append((species, vertebra))
    else:
        labels.append((None, None))
        unmatched_files.append(fname)

# Inspect training dataset filepaths
print("\nInspecting info for training data...\n")

# List summary of files matched by regex
all_species = []
for i, (species, vertebra) in enumerate(labels):
    all_species.append(species)
res = list(dict.fromkeys(all_species))
res_sorted = sorted(res, key=lambda x: str(x) if x is not None else '')
print(f"Running analysis for {len(all_vtk_files)} vertebrae from {len(res)} specimens")
print(f"\n{len(unmatched_files)} files were found that don't match the regex pattern.")
print("\nFiles not matching the regex: ", unmatched_files)
#print("\nUnique specimens found matching the regex: ")
#for item in res_sorted:
    #print(item)

[92mLoaded config from run_v30/model_params_config.json[0m

Inspecting info for training data...

Running analysis for 1818 vertebrae from 89 specimens

4 files were found that don't match the regex pattern.

Files not matching the regex:  ['agamidae_agama_atra_uf180711_21.l7_align.vtk', 'boiidae_eryx conicus uf-herp-66735_align.vtk', 'homalopsidae_homalopsis_buccata_uf61845_align.vtk', 'typhlopidae_aniolios_erycinus_uf113561_align.vtk']


In [19]:
# Inspect all filenames
print("\nInspecting info for all data...\n")

# Folder to where mesh paths are stored
folder_vtk = "vertebrae_meshes"

# Get paths from folder
train_paths = os.listdir(folder_vtk)
all_vtk_files = [os.path.basename(f) for f in train_paths]

# Parse species and vertebra info from filenames (e.g. "H-capensis-22-L8.ply_align.vtk").
pat = re.compile(r"^(?P<species>[\w\s\-]+)[\-_ ]+\d+[\-_ ]+(?P<vertebra>[CTL]\d+)", re.IGNORECASE)
labels = []
unmatched_files = []
for f in all_vtk_files:
    fname = os.path.basename(f)
    m = pat.match(fname)
    if m:
        species = m.group("species").strip()
        vertebra = m.group("vertebra").strip()  # e.g. "C2", "L8"
        labels.append((species, vertebra))
    else:
        labels.append((None, None))
        unmatched_files.append(fname)

# List summary of files matched by regex
all_species = []
for i, (species, vertebra) in enumerate(labels):
    all_species.append(species)
res = list(dict.fromkeys(all_species))
res_sorted = sorted(res, key=lambda x: str(x) if x is not None else '')
print(f"Running analysis for {len(all_vtk_files)} vertebrae from {len(res)} specimens")
print(f"\n{len(unmatched_files)} files were found that don't match the regex pattern.")
print("\nFiles not matching the regex: ", unmatched_files)
#print("\nUnique specimens found matching the regex: ")
#for item in res_sorted:
    #print(item)


Inspecting info for all data...

Running analysis for 2273 vertebrae from 90 specimens

4 files were found that don't match the regex pattern.

Files not matching the regex:  ['boiidae_eryx conicus uf-herp-66735_align.vtk', 'agamidae_agama_atra_uf180711_21.l7_align.vtk', 'typhlopidae_aniolios_erycinus_uf113561_align.vtk', 'homalopsidae_homalopsis_buccata_uf61845_align.vtk']
