In [None]:
import asyncio
import json
import re
import shutil
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import mlflow
from tqdm.auto import tqdm

## Configuration

In [None]:
# Base paths. They are relative, so they resolve against the kernel's current
# working directory (normally the folder that contains this notebook).
OUTPUTS_DIR = Path("../../../outputs")
MLRUNS_DIR = Path("../../../mlruns")

# Point MLflow at the local file-based tracking store.
mlflow.set_tracking_uri(f"file:{MLRUNS_DIR}")

print(f"Outputs directory: {OUTPUTS_DIR.absolute()}")
print(f"MLruns directory: {MLRUNS_DIR.absolute()}")
# Report the URI MLflow actually uses rather than re-printing the directory path.
print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")

# Surface a wrong working directory here instead of as a FileNotFoundError
# in the discovery cell further down.
if not OUTPUTS_DIR.is_dir():
    print(f"WARNING: outputs directory does not exist: {OUTPUTS_DIR.absolute()}")

## Permanently Remove Soft-Deleted Experiments (if any)

In [None]:
# Permanently purge experiments that MLflow has only soft-deleted, so that
# experiments with the same names can be created again.
client = mlflow.tracking.MlflowClient()

try:
    deleted_experiments = client.search_experiments(view_type=mlflow.entities.ViewType.DELETED_ONLY)

    if deleted_experiments:
        print(f"Found {len(deleted_experiments)} deleted experiment(s). Permanently deleting...")
        # The local file store keeps soft-deleted experiments under
        # <tracking root>/.trash/<experiment_id>, not under <tracking root>/<experiment_id>.
        # Removing that directory is what makes the deletion permanent; a
        # restore/delete round trip would only move it out of and back into the trash.
        trash_dir = MLRUNS_DIR / ".trash"
        not_removed = []
        for exp in deleted_experiments:
            print(f"  - Deleting: {exp.name} (ID: {exp.experiment_id})")
            exp_path = trash_dir / exp.experiment_id
            if exp_path.exists():
                shutil.rmtree(exp_path)
                print(f"    Removed directory: {exp_path}")
            else:
                not_removed.append(exp)
                print(f"    WARNING: nothing found at {exp_path}; experiment left untouched")

        # Only claim success when every experiment was really removed from disk.
        if not_removed:
            print(f"{len(not_removed)} deleted experiment(s) could not be removed from disk.")
        else:
            print("All deleted experiments permanently removed!")
    else:
        print("No deleted experiments found.")
except Exception as e:
    # Best-effort clean-up: report the problem but let the rest of the notebook run.
    print(f"Error: {e}")

## Helper Functions

In [None]:
# Hyperparameter suffix shared by every supported directory naming scheme.
_MODEL_NAME_TAIL = r'_lora_r(\d+)_lr([\d.e-]+)_ep([\d.]+)_bs(\d+)_ga(\d+)_warmup(\d+)_seq(\d+)'

# (task type, compiled pattern) pairs, tried in this order.
_MODEL_NAME_PATTERNS = (
    ('proofreading',
     re.compile(r'qwen3_(\w+)_polish_inclusive_proofreading' + _MODEL_NAME_TAIL)),
    ('translation_standalone',
     re.compile(r'qwen3_(\w+)_translation_standalone' + _MODEL_NAME_TAIL)),
    ('multitask',
     re.compile(r'qwen3_(\w+)_multitask_proofreading\+translation' + _MODEL_NAME_TAIL)),
)


def parse_model_name(model_dir_name):
    """
    Parse a model directory name into its training configuration.

    Supported formats (all share the same hyperparameter suffix
    ``_lora_r{rank}_lr{lr}_ep{ep}_bs{bs}_ga{ga}_warmup{warmup}_seq{seq}``):

    - ``qwen3_{size}_polish_inclusive_proofreading...``      -> type ``proofreading``
    - ``qwen3_{size}_translation_standalone...``             -> type ``translation_standalone``
    - ``qwen3_{size}_multitask_proofreading+translation...`` -> type ``multitask``

    Matching is anchored at the start of the name only, so trailing text after
    the ``seq`` value is tolerated.

    Args:
        model_dir_name: Directory name (not a full path).

    Returns:
        A dict with keys ``type``, ``model_size``, ``lora_rank``,
        ``learning_rate``, ``epochs``, ``batch_size``,
        ``gradient_accumulation``, ``warmup_steps`` and ``max_seq_length``,
        or ``None`` when the name matches no known format or one of its
        numeric fields is not a valid number.
    """
    for task_type, pattern in _MODEL_NAME_PATTERNS:
        match = pattern.match(model_dir_name)
        if match is None:
            continue

        size, rank, lr, epochs, batch, accum, warmup, seq_len = match.groups()
        try:
            return {
                'type': task_type,
                'model_size': size,
                'lora_rank': int(rank),
                'learning_rate': float(lr),
                'epochs': float(epochs),
                'batch_size': int(batch),
                'gradient_accumulation': int(accum),
                'warmup_steps': int(warmup),
                'max_seq_length': int(seq_len),
            }
        except ValueError:
            # The character classes for lr/ep also accept text such as "-" or
            # "1.2.3"; treat that as "not a model directory" instead of crashing.
            return None

    return None


def _find_last_checkpoint(model_dir):
    """Return the highest-numbered ``checkpoint-<step>`` directory in model_dir, or None."""
    numbered = []
    for path in model_dir.glob("checkpoint-*"):
        match = re.fullmatch(r"checkpoint-(\d+)", path.name)
        # Ignore entries such as "checkpoint-best" that carry no step number.
        if match and path.is_dir():
            numbered.append((int(match.group(1)), path))
    if not numbered:
        return None
    return max(numbered, key=lambda pair: pair[0])[1]


def _is_loggable_number(value):
    """Return True for ints/floats that are not NaN (NaN is the only value unequal to itself)."""
    return isinstance(value, (int, float)) and value == value


def log_model_to_mlflow(model_dir, config):
    """
    Log a single model's training metrics to MLflow.

    Reads ``trainer_state.json`` from the highest-numbered ``checkpoint-<step>``
    directory inside ``model_dir`` and replays its ``log_history`` into a new
    MLflow run named after the hyperparameters (suffix ``_retroactive``).

    The function is called from several threads at once, so the run is created
    with an explicit experiment id and everything is logged through an explicit
    run id; neither depends on the process-wide "active experiment" that
    ``mlflow.set_experiment`` sets.

    Args:
        model_dir: ``pathlib.Path`` of the model output directory.
        config: Dict as returned by ``parse_model_name``.

    Returns:
        ``(True, message)`` when a run was created, otherwise ``(False, reason)``.
        The reason contains "already exists" when a run with the same name is
        already present in the experiment.
    """
    last_checkpoint = _find_last_checkpoint(model_dir)
    if last_checkpoint is None:
        return False, "No checkpoints found"

    trainer_state_file = last_checkpoint / "trainer_state.json"
    if not trainer_state_file.exists():
        return False, f"No trainer_state.json in {last_checkpoint.name}"

    # Load trainer state
    with open(trainer_state_file, 'r', encoding='utf-8') as f:
        trainer_state = json.load(f)

    # Determine experiment name based on type
    suffix_by_type = {
        'translation_standalone': 'polish-translation',
        'multitask': 'polish-multitask',
    }
    suffix = suffix_by_type.get(config['type'], 'polish-inclusive-proofreading')
    experiment_name = f"qwen3-{config['model_size']}-{suffix}"

    # Creates the experiment when it does not exist yet. The "active experiment"
    # it sets is shared by all threads, so it is NOT relied upon below.
    mlflow.set_experiment(experiment_name)

    # Create run name
    run_name = f"lora_r{config['lora_rank']}_lr{config['learning_rate']}_ep{config['epochs']}_bs{config['batch_size']}_ga{config['gradient_accumulation']}_warmup{config['warmup_steps']}_seq{config['max_seq_length']}_retroactive"

    # Check if run already exists
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    experiment_id = experiment.experiment_id if experiment else None
    if experiment:
        existing_runs = client.search_runs(
            experiment_ids=[experiment.experiment_id],
            filter_string=f"tags.mlflow.runName = '{run_name}'"
        )
        if existing_runs:
            return False, "Run already exists (skipped)"

    # Values that could not be written; reported in the returned message
    # instead of being silently dropped.
    problems = []

    # Pin the run to this model's experiment explicitly.
    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name) as run:
        run_id = run.info.run_id

        # Log hyperparameters
        params = {
            "task_type": config['type'],
            "model_size": config['model_size'],
            "lora_rank": config['lora_rank'],
            "epochs": config['epochs'],
            "batch_size": config['batch_size'],
            "gradient_accumulation_steps": config['gradient_accumulation'],
            "learning_rate": config['learning_rate'],
            "warmup_steps": config['warmup_steps'],
            "max_seq_length": config['max_seq_length'],
        }
        for param_name, param_value in params.items():
            client.log_param(run_id, param_name, param_value)

        # Log all training metrics from log_history
        for entry in trainer_state.get('log_history') or []:
            step = entry.get('step')
            if step is None:
                continue

            for key, value in entry.items():
                if key == 'step' or not _is_loggable_number(value):
                    continue
                try:
                    client.log_metric(run_id, key, value, step=step)
                except Exception as exc:
                    problems.append(f"{key}@{step}: {exc}")

        # Log final best metric (only if it is a real number)
        best_val = trainer_state.get('best_metric')
        if _is_loggable_number(best_val):
            try:
                client.log_metric(run_id, "best_metric", best_val)
            except Exception as exc:
                problems.append(f"best_metric: {exc}")

        # Log best checkpoint info (as param since it's a string path)
        best_ckpt = trainer_state.get('best_model_checkpoint')
        if best_ckpt is not None:
            try:
                client.log_param(run_id, "best_model_checkpoint", str(best_ckpt))
                # Extract the step from the last "checkpoint-<n>" component,
                # independent of the path separator used when it was saved.
                if isinstance(best_ckpt, str):
                    step_matches = re.findall(r'checkpoint-(\d+)', best_ckpt)
                    if step_matches:
                        client.log_metric(run_id, "best_checkpoint_step", int(step_matches[-1]))
            except Exception as exc:
                problems.append(f"best_model_checkpoint: {exc}")

        # Log model artifacts (skip to save time - uncomment if needed)
        # mlflow.log_artifacts(str(last_checkpoint), artifact_path="final_checkpoint")

        # A missing global_step should not fail a run that is otherwise fully logged.
        total_steps = trainer_state.get('global_step', 'unknown')

    message = f"Logged {total_steps} steps (run_id: {run_id[:8]}...)"
    if problems:
        message += f"; {len(problems)} value(s) could not be logged, first: {problems[0]}"
    return True, message

## Discover All Model Directories

In [None]:
# Every sub-directory of the outputs folder is a candidate model directory
all_model_dirs = [entry for entry in OUTPUTS_DIR.iterdir() if entry.is_dir()]

# Split candidates into parseable models and everything else
models_to_log, skipped_dirs = [], []

for candidate in all_model_dirs:
    parsed = parse_model_name(candidate.name)
    if not parsed:
        skipped_dirs.append(candidate.name)
        continue
    models_to_log.append((candidate, parsed))

print(f"Found {len(models_to_log)} models to log")
print(f"Skipped {len(skipped_dirs)} directories (not matching pattern)")

if skipped_dirs:
    print("\nSkipped directories:")
    for skipped_name in skipped_dirs:
        print(f"  - {skipped_name}")

## Log All Models to MLflow

In [None]:
import concurrent.futures

# Maximum number of worker threads given to the ThreadPoolExecutor that logs models concurrently.
MAX_WORKERS = 10

def log_model_wrapper(model_dir, config):
    """
    Run ``log_model_to_mlflow`` for one model and turn the outcome into a result dict.

    Never raises for errors inside the logging call: any exception becomes a
    ``'failed'`` result so that a single broken model cannot take down the
    thread pool.

    Returns a dict with ``status`` set to ``'success'`` (plus ``name``,
    ``type``, ``rank``, ``message``), or to ``'skipped'`` / ``'failed'``
    (plus ``name`` and ``reason``).
    """
    try:
        ok, message = log_model_to_mlflow(model_dir, config)

        if not ok:
            # An already-present run is a skip; every other refusal is a failure.
            status = 'skipped' if "already exists" in message else 'failed'
            return {
                'status': status,
                'name': model_dir.name,
                'reason': message
            }

        return {
            'status': 'success',
            'name': model_dir.name,
            'type': config['type'],
            'rank': config['lora_rank'],
            'message': message
        }
    except Exception as exc:
        return {
            'status': 'failed',
            'name': model_dir.name,
            'reason': str(exc)
        }

# Outcome buckets consumed by the summary cell
results = {'success': [], 'failed': [], 'skipped': []}

# status -> (console tag, key of the result dict that holds the detail text)
status_display = {
    'success': ('[SUCCESS]', 'message'),
    'skipped': ('[SKIPPED]', 'reason'),
}

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    # Remember which model each future belongs to so failures can be attributed
    pending = {
        pool.submit(log_model_wrapper, model_dir, config): (model_dir.name, config)
        for model_dir, config in models_to_log
    }

    with tqdm(total=len(models_to_log), desc="Logging models to MLflow") as progress:
        for finished in concurrent.futures.as_completed(pending):
            dir_name, _ = pending[finished]
            try:
                outcome = finished.result()

                # Anything that is neither success nor skipped counts as failed
                bucket = outcome['status'] if outcome['status'] in status_display else 'failed'
                tag, detail_key = status_display.get(bucket, ('[FAILED]', 'reason'))

                results[bucket].append(outcome)
                tqdm.write(f"{tag} {outcome['name']}: {outcome[detail_key]}")

            except Exception as e:
                results['failed'].append({
                    'name': dir_name,
                    'reason': f"Thread execution failed: {str(e)}"
                })
                tqdm.write(f"[FAILED] {dir_name}: Thread execution failed - {e}")

            progress.update(1)

## Summary

In [None]:
# Final report: totals first, then the details for each outcome bucket
banner = "=" * 80

print("\n" + banner)
print("SUMMARY")
print(banner)
print(f"\nSuccessfully logged: {len(results['success'])} models")
print(f"Skipped (already exist): {len(results['skipped'])} models")
print(f"Failed: {len(results['failed'])} models")

if results['success']:
    print("\nSuccessfully logged models:")
    # Bucket the successful runs by task type, keeping first-seen order
    grouped = {}
    for entry in results['success']:
        grouped.setdefault(entry['type'], []).append(entry)

    for kind, members in grouped.items():
        print(f"\n  {kind.upper()}:")
        # Within a task type, list runs by ascending LoRA rank
        for entry in sorted(members, key=lambda m: m['rank']):
            print(f"    - LoRA rank {entry['rank']}: {entry['message']}")

if results['skipped']:
    print("\nSkipped models (already exist):")
    for entry in results['skipped']:
        print(f"  - {entry['name']}")

if results['failed']:
    print("\nFailed models:")
    for entry in results['failed']:
        print(f"  - {entry['name']}: {entry['reason']}")

print("\n" + banner)
print("MLflow logging complete! Access the UI at: http://localhost:5000")
print(banner)