In [2]:
%load_ext autoreload

In [3]:
%autoreload 2
import json
from pathlib import Path
from loguru import logger
from constants import SEED
from pipeline import path_from_config
from copy import deepcopy
from generate_experiment import hash_experiment_config, recursive_sort
from pipeline_components import preprocessers
import shutil
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [17]:


def default_seed(experiment: dict, seed: int):
    """Ensure ``experiment`` has a "seed" entry.

    An existing "seed" value is left untouched; otherwise ``seed`` is stored
    under that key. The dict is modified in place and the same object is
    returned.
    """
    experiment.setdefault("seed", seed)
    return experiment

def extract_preprocessers(component: str):
    """Split a concatenated preprocesser string into individual names.

    The first ``len("preprocessers_")`` characters of ``component`` are
    dropped (the prefix is assumed, not checked). The remainder is consumed
    left to right: at each step the first still-unused name from the
    module-level ``preprocessers`` collection that the remainder starts with
    is taken. Each name can be used at most once.

    Returns:
        The matched names, in the order they appear in ``component``.

    Raises:
        ValueError: if the remaining text starts with none of the unused names.
    """
    remaining = component[len("preprocessers_"):]
    unused_names = [entry.name for entry in preprocessers]
    found = []
    while remaining:
        # First match wins, in the iteration order of `preprocessers`.
        match = next(
            (name for name in unused_names if remaining.startswith(name)),
            None,
        )
        if match is None:
            raise ValueError(f"Invalid preprocesser: {remaining}")
        found.append(match)
        unused_names.remove(match)
        remaining = remaining[len(match):]
    return found

def normalise_preprocessers(path: Path):
    """Return ``path`` with every preprocesser component put in sorted order.

    Any path part starting with "preprocessers_" is parsed into its
    preprocesser names, the names are passed through ``recursive_sort`` and
    re-joined behind the prefix. All other parts are kept unchanged.
    """
    marker = "preprocessers_"

    def _canonical(part: str) -> str:
        # Only parts carrying the prefix are rewritten.
        if not part.startswith(marker):
            return part
        ordered_names = recursive_sort(extract_preprocessers(part))
        return marker + "".join(ordered_names)

    return Path(*(_canonical(part) for part in path.parts))

def update_seeds(results_dir: Path):
    """Rebuild the experiment log with default seeds and fresh hash keys.

    Reads ``results_dir / "experiment_log.json"`` and, for every entry:

    1. adds ``SEED`` under "seed" when the entry has no seed,
    2. replaces "preprocesser_order" with its ``recursive_sort``-ed form,
    3. hashes the normalised entry (without the "status" and "error"
       bookkeeping keys) and stores the entry under that hash.

    The hash is taken AFTER normalisation so that the key describes the
    configuration that is actually stored; hashing first would key a
    seedless entry by the hash of a config that no longer matches it.
    Entries lacking "status" or "error" are tolerated.

    Nothing is written to disk; the caller decides whether to persist the
    returned log. Entries that normalise to the same hash collapse into one
    (the last one read wins).

    Args:
        results_dir: Directory containing ``experiment_log.json``.

    Returns:
        A new dict mapping hash -> normalised experiment entry.
    """
    experiment_log_file = results_dir / "experiment_log.json"
    with open(experiment_log_file) as f:
        experiment_log: dict = json.load(f)

    bookkeeping_keys = ("status", "error")
    new_log = {}
    for experiment in experiment_log.values():
        # Normalise first so the hash below reflects the stored config.
        experiment = default_seed(experiment, SEED)
        experiment["preprocesser_order"] = recursive_sort(experiment["preprocesser_order"])
        to_hash = {
            key: value
            for key, value in experiment.items()
            if key not in bookkeeping_keys
        }
        new_log[hash_experiment_config(to_hash)] = experiment
    return new_log


def clear_results_by_pattern(
    results_dir: Path,
    preprocessers: list[str] = None,
    tokenisers: list[str] = None,
    featurisers: list[str] = None,
    models: list[str] = None,
    backup: bool = True,
    verbose: bool = False
):
    """
    Select experiment-log entries matching the given patterns for removal.

    An entry matches when it satisfies EVERY filter that was supplied; a
    filter left as ``None`` (or given as an empty list) is not applied, so
    calling with no filters matches every entry. Nothing is deleted here:
    the function returns the pruned log and the results files it found for
    the matched entries, and the caller persists/deletes them. The only
    write is the optional backup copy of the log.

    Args:
        results_dir: Directory containing ``experiment_log.json`` and the
            results files.
        preprocessers: Match entries whose "preprocesser_order" contains at
            least one of these names.
        tokenisers: Match entries whose "tokeniser" is one of these names.
        featurisers: Match entries whose "featuriser" is one of these names.
        models: Match entries whose "model" is one of these names.
        backup: Copy the log to ``results_dir / "backup"`` before doing
            anything else.
        verbose: Log each matched entry, the backup location, and entries
            for which no results path was found.
    Returns:
        New experiment log with the matched entries removed
        List of paths to results files to be removed
    Raises:
        FileExistsError: if ``backup`` is true and a backup copy of the log
            already exists (it is never overwritten).
    """
    logger.info(f"Clearing results matching patterns in {results_dir}")
    experiment_log_file = results_dir / "experiment_log.json"

    if backup:
        backup_dir = results_dir / "backup"
        backup_dir.mkdir(exist_ok=True)
        if verbose:
            logger.info(f"Backing up results to {backup_dir}")
        backup_path = backup_dir / experiment_log_file.name
        if backup_path.exists():
            # Abort rather than overwrite an earlier backup.
            raise FileExistsError(f"Backup file already exists: {backup_path}")
        shutil.copy2(experiment_log_file, backup_path)

    with open(experiment_log_file) as f:
        experiment_log: dict[str, dict] = json.load(f)

    # Map each on-disk results directory, with its preprocesser component
    # put in sorted order, back to the real results.json path. This lets a
    # log entry find its directory regardless of preprocesser ordering.
    ordered_to_real_map = {
        str(normalise_preprocessers(path.parent)): path
        for path in results_dir.glob("**/results.json")
    }

    removed = 0
    new_log = {}
    paths_to_remove = []
    for experiment_hash, experiment in experiment_log.items():
        # An entry is kept as soon as it fails any supplied filter.
        matched = True
        if preprocessers and not any(p in experiment["preprocesser_order"] for p in preprocessers):
            matched = False
        elif tokenisers and experiment["tokeniser"] not in tokenisers:
            matched = False
        elif featurisers and experiment["featuriser"] not in featurisers:
            matched = False
        elif models and experiment["model"] not in models:
            matched = False
        if not matched:
            new_log[experiment_hash] = experiment
            continue

        # Every supplied filter matched - drop the entry from the new log.
        if verbose:
            logger.info(f"Removing matched experiment: {experiment}")
        candidate_path = path_from_config(
            results_dir, experiment["preprocesser_order"], experiment["tokeniser"],
            experiment["featuriser"], experiment["model"],
            experiment.get("seed", SEED),
            experiment.get("finetune", False)
        )
        normalised_key = str(normalise_preprocessers(candidate_path))
        if normalised_key in ordered_to_real_map:
            paths_to_remove.append(ordered_to_real_map[normalised_key])
        elif verbose:
            logger.warning(f"No matching path found for {candidate_path}")
        removed += 1

    # `removed` counts log entries; files are only listed, not deleted here.
    logger.info(f"Removed {removed} experiment log entries matching patterns")
    logger.info(f"Found {len(paths_to_remove)} paths to remove")
    return new_log, paths_to_remove

def update_log(dataset: str, log: dict):
    """Overwrite ``results/<dataset>/experiment_log.json`` with ``log``.

    The log is serialised as JSON with ``indent=0``.
    """
    log_path = f"results/{dataset}/experiment_log.json"
    with open(log_path, "w") as handle:
        json.dump(log, handle, indent=0)

def delete_paths(paths: list[Path]):
    """Recursively delete the parent directory of every path in ``paths``.

    Deletion is best-effort: a failure for one directory is logged as a
    warning and the loop carries on with the next path. A progress bar is
    shown while iterating.
    """
    for results_file in tqdm(paths):
        target_dir = results_file.parent
        try:
            if target_dir.is_dir() and target_dir.exists():
                shutil.rmtree(target_dir)
        except Exception as e:
            logger.warning(f"Failed to remove directory: {target_dir} - {e!r}")

In [18]:
# Update seeds throughout the log of every dataset: rebuild each log with
# update_seeds and write it back to results/<dataset>/experiment_log.json.
datasets = ["jc_penney_products", "online_boat_listings", "california_house_prices"]
for dataset in datasets:
    clean_log = update_seeds(Path(f"results/{dataset}"))
    update_log(dataset, clean_log)


In [38]:
# Use this when we want to remove all results for a given model.
# We might remove MLP as we don't have any activation functions.
# This only computes the pruned log and the results paths to delete;
# nothing is written or removed until the following cells run.
dataset = datasets[2]
clean_log, to_delete = clear_results_by_pattern(Path(f"results/{dataset}"), models=["mlp", "resnet"])

[32m2024-11-19 19:54:30.580[0m | [1mINFO    [0m | [36m__main__[0m:[36mclear_results_by_pattern[0m:[36m80[0m - [1mClearing results matching patterns in results/california_house_prices[0m
[32m2024-11-19 19:54:30.669[0m | [1mINFO    [0m | [36m__main__[0m:[36mclear_results_by_pattern[0m:[36m134[0m - [1mRemoved 63 results files matching patterns[0m
[32m2024-11-19 19:54:30.674[0m | [1mINFO    [0m | [36m__main__[0m:[36mclear_results_by_pattern[0m:[36m135[0m - [1mFound 32 paths to remove[0m


In [28]:
# Deleted 288/288 from JC Penney on 19/11/2024
# Deleted 288/288 from online boat listings on 19/11/2024
# Deleted 32/63 from california house prices on 19/11/2024 - 31 failures due to OOM -> Use a different GPU/drop batch size

0

In [40]:
# Write the pruned log back to results/<dataset>/experiment_log.json.
update_log(dataset, clean_log)

In [43]:
# Remove the parent directory of each results file queued for deletion.
delete_paths(to_delete)

100%|██████████| 32/32 [00:00<00:00, 54.11it/s]


In [11]:
# Debugging duplicates with different hashes: hash the same configuration
# with and without its "seed" key and compare the two results.

sample_config = {
    "preprocesser_order": [],
    "tokeniser": "whitespace",
    "featuriser": "bow_binary",
    "model": "catboost",
    "seed": 97,
    "finetune": False,
}

new_hash = hash_experiment_config(sample_config)  # hash with "seed" present
del sample_config["seed"]  # NOTE: mutates sample_config for the rest of the session
old_hash = hash_experiment_config(sample_config)  # hash of the same config without "seed"
print(f"New hash: {new_hash}")
print(f"Old hash: {old_hash}")

New hash: a61ac505c6c9892e67248db77eef960e9fdb1c1342b57380f381e6bd6ed148f7
Old hash: 464ed3feb10b102352769b13bc8eb10d85680b9ea6a71ccc475987636f972e82


In [9]:
# Rebuild the JC Penney log in memory (nothing is written to disk here).
new_log = update_seeds(Path("results/jc_penney_products"))


In [13]:
# Look up the sample config's seeded hash in the rebuilt log.
new_log[new_hash]

{'preprocesser_order': [],
 'tokeniser': 'whitespace',
 'featuriser': 'bow_binary',
 'model': 'catboost',
 'finetune': False,
 'seed': 97,
 'status': 'success',
 'error': None}

In [5]:
def sync_log_to_results(log: dict, results_dir: Path):
    """Compare the experiment log with the results files found on disk.

    Every ``results.json`` under ``results_dir`` is loaded and hashed with
    ``hash_experiment_config``; the hash is then checked against the keys
    of ``log``.

    Args:
        log: Experiment log keyed by experiment hash.
        results_dir: Directory searched recursively for ``results.json``.

    Returns:
        A 3-tuple of dicts keyed by hash:
        ``in_log_not_in_results`` (log entries with no results file),
        ``in_results_not_in_log`` (results contents whose hash is not in
        the log), and ``in_both`` (results contents whose hash is in the
        log).
    """
    # NOTE(review): the recorded run of this cell ended in KeyError: 'status';
    # presumably the results.json contents are not in the shape
    # hash_experiment_config expects - confirm before relying on the output.
    in_results_not_in_log = {}
    in_both = {}
    for result_file in results_dir.glob("**/results.json"):
        complete_result = json.loads(result_file.read_text())
        result_hash = hash_experiment_config(complete_result)
        if result_hash in log:
            in_both[result_hash] = complete_result
        else:
            in_results_not_in_log[result_hash] = complete_result

    # A log entry is missing on disk exactly when no results file hashed to
    # its key, i.e. when it is absent from `in_both`.
    in_log_not_in_results = {
        log_hash: experiment
        for log_hash, experiment in log.items()
        if log_hash not in in_both
    }
    return in_log_not_in_results, in_results_not_in_log, in_both

# Load the JC Penney log and report how it lines up with the results on disk.
current_log = json.loads(Path("results/jc_penney_products/experiment_log.json").read_text())
in_log_not_in_results, in_results_not_in_log, in_both = sync_log_to_results(
    current_log,
    Path("results/jc_penney_products"),
)
print(f"In log not in results: {len(in_log_not_in_results)}")
print(f"In results not in log: {len(in_results_not_in_log)}")
print(f"In both: {len(in_both)}")

KeyError: 'status'

In [4]:
import subprocess

def get_staged_files():
    """Return the files currently staged in git as a list of ``Path`` objects.

    Runs ``git diff --name-only --cached`` and turns each non-empty output
    line into a ``Path``. The command's exit status is not checked, so a
    failing git call yields whatever (possibly empty) stdout it produced.
    """
    completed = subprocess.run(
        ["git", "diff", "--name-only", "--cached"],
        capture_output=True,
        text=True,
    )
    # One file name per line; blank lines (e.g. from empty output) are dropped.
    return [Path(name) for name in completed.stdout.strip().split("\n") if name]

# Collect the files currently staged in git and report how many there are.
staged_files = get_staged_files()
print(f"Found {len(staged_files)} staged files:")


Found 260 staged files:


In [5]:
def delete_parent_dir(path: Path):
    """Recursively delete the directory that contains ``path``.

    Does nothing when the parent is not an existing directory.

    A path with no directory component (e.g. ``Path("README.md")``) has
    ``.`` as its parent, which would otherwise wipe the current working
    directory, so such targets are rejected.

    Args:
        path: A file path; its parent directory is the deletion target.

    Raises:
        ValueError: if the parent directory is the current working
            directory or one of its ancestors.
    """
    parent = path.parent
    if not parent.is_dir():
        return

    working_dir = Path.cwd().resolve()
    target = parent.resolve()
    # Never remove the directory we are running from (or anything above it).
    if target == working_dir or target in working_dir.parents:
        raise ValueError(
            f"Refusing to delete {parent}: it is, or contains, the current working directory"
        )
    shutil.rmtree(parent)

# Delete the parent directory of every staged file (destructive: each
# directory is removed recursively, not just the staged file itself).
for file in tqdm(staged_files):
    delete_parent_dir(file)

100%|██████████| 260/260 [00:02<00:00, 118.90it/s]
