# Trehalose Biomarker Data Annotations Workflow

**Complete workflow for managing file annotations using the data model**

## Workflow Steps:
1. **Libraries & Configuration** - Load dependencies and configure paths
2. **File Enumeration** - Connect to Synapse and enumerate files/folders
3. **Annotation Management** - Create/update annotation templates dynamically
4. **Validation** - Validate annotations against data model schemas
5. **Application** - Apply validated annotations to Synapse entities

## Key Features:
- 🔄 **Dynamic annotation management**: Only adds new files, preserves existing annotations
- 📋 **Schema-driven validation**: Validates against ClinicalFile and OmicFile schemas
- 🎯 **Smart file type detection**: Automatically determines Clinical vs Omic data
- 💾 **Persistent storage**: Saves annotations to `./annotations/{folder_name}_annotations.json`
- ✅ **Complete validation**: Blocks progression until all annotations are valid

In [92]:
# CELL 1: LIBRARIES & IMPORTS
import pandas as pd
import numpy as np
import json
import re
import os
import yaml
import glob
from pathlib import Path
from datetime import datetime
import synapseclient
from synapseclient.models import (
    Column, ColumnType, Dataset, EntityRef, File, Folder, Project, FacetType, DatasetCollection
)
from typing import Dict, List, Any, Set, Union
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

print("üìö Libraries loaded successfully")
print(f"üêç Python version: {pd.__version__} (pandas)")
print(f"üîó Synapse client version: {synapseclient.__version__}")

üìö Libraries loaded successfully
üêç Python version: 2.3.2 (pandas)
üîó Synapse client version: 4.8.0


In [None]:
# CELL 2: CONFIGURATION SETUP

# Synapse Configuration
STAGING_FOLDER_ID = "syn68927891"  # Trehalose Biomarker Data folder
PROJECT_ID = "syn68702804"
RELEASE_FOLDER_ID = "syn68885183"
DATASETS_COLLECTION_ID = "syn66496326"
DRY_RUN = False  # Set to False to actually apply changes

# Data Model Configuration
DATA_MODEL_PATH = "../model_schemas"  # Path to YAML schema directory
DATA_MODEL_FILE = "../dist/ALS.yaml"  # Main compiled data model (optional)

# Annotation Management Configuration
ANNOTATIONS_DIR = "./annotations"  # Local directory for annotation files

# Authentication Token: read from the environment so no secret is stored in the
# notebook. The name is always defined (possibly as None) because
# connect_to_synapse() passes it to syn.login(authToken=...).
# NOTE(review): with None the Synapse client is expected to fall back to its
# own credential lookup (.synapseConfig) - confirm against the client docs.
SYNAPSE_AUTH_TOKEN = os.environ.get("SYNAPSE_AUTH_TOKEN")

print("‚öôÔ∏è  CONFIGURATION LOADED")
print("=" * 40)
print(f"üìÅ Staging Folder: {STAGING_FOLDER_ID}")
print(f"üèóÔ∏è  Project: {PROJECT_ID}")
print(f"üìÑ Data Model Path: {DATA_MODEL_PATH}")
print(f"üíæ Annotations Directory: {ANNOTATIONS_DIR}")
print(f"üîç Dry Run Mode: {DRY_RUN}")

# Create annotations directory if it doesn't exist
os.makedirs(ANNOTATIONS_DIR, exist_ok=True)
print(f"üìÇ Annotations directory ready: {ANNOTATIONS_DIR}")

‚öôÔ∏è  CONFIGURATION LOADED
üìÅ Staging Folder: syn68927891
üèóÔ∏è  Project: syn68702804
üìÑ Data Model Path: ../model_schemas
üíæ Annotations Directory: ./annotations
üîç Dry Run Mode: False
üìÇ Annotations directory ready: ./annotations


In [94]:
# CELL 3: FILE ENUMERATION AND ANNOTATION CREATION

def connect_to_synapse():
    """Log in to Synapse and return the client, or ``None`` on failure.

    The auth token is taken from a module-level ``SYNAPSE_AUTH_TOKEN`` when one
    is defined and non-empty, otherwise from the ``SYNAPSE_AUTH_TOKEN``
    environment variable. If neither is set, ``authToken=None`` is passed.

    Returns:
        The logged-in ``synapseclient.Synapse`` instance, or ``None`` if
        creating the client or logging in raised.
    """
    # Look the constant up dynamically: referencing an undefined global would
    # raise NameError, which the handler below would misreport as a
    # connection failure.
    # NOTE(review): with authToken=None the client is expected to fall back to
    # its own credential lookup (.synapseConfig) - confirm against the docs.
    token = globals().get('SYNAPSE_AUTH_TOKEN') or os.environ.get('SYNAPSE_AUTH_TOKEN')
    try:
        syn = synapseclient.Synapse()
        syn.login(authToken=token)
        print("‚úÖ Connected to Synapse")
        return syn
    except Exception as e:
        # Deliberate best-effort: callers test the return value for truthiness.
        print(f"‚ùå Failed to connect: {e}")
        return None

def enumerate_files_with_folders(syn, folder_id, include_folders=True, recursive=True):
    """Walk a Synapse folder and collect metadata for its files and subfolders.

    Args:
        syn: Logged-in Synapse client; a falsy value short-circuits to ``{}``.
        folder_id: Synapse ID of the folder to enumerate.
        include_folders: When true, subfolders get their own entries.
        recursive: When true, subfolders are descended into.

    Returns:
        Dict keyed by Synapse ID. File entries carry ``name``, ``base_name``,
        ``id``, ``type`` and ``path``; folder entries carry the same keys minus
        ``base_name``. Returns ``{}`` if the enumeration raises.
    """
    if not syn:
        return {}

    known_extensions = ['.csv', '.txt', '.json', '.xml', '.tsv', '.xlsx', '.pdf', '.docx', '.html', '.md', '.adat']
    collected = {}

    def _join(prefix, leaf):
        """Build the display path of *leaf* below *prefix*."""
        return f"{prefix}/{leaf}" if prefix else leaf

    def _strip_known_extension(file_name):
        """Drop the first matching known extension (case-insensitive)."""
        lowered = file_name.lower()
        for ext in known_extensions:
            if lowered.endswith(ext.lower()):
                return file_name[:-(len(ext))]
        return file_name

    def _record_file(file, path_prefix):
        """Fetch one file entity (no download) and store its metadata."""
        try:
            entity = syn.get(file.id, downloadFile=False)
            file_name = entity.name if hasattr(entity, 'name') else file.name
            base_name = _strip_known_extension(file_name)
            full_path = _join(path_prefix, file_name)
            collected[file.id] = {
                'name': file_name,
                'base_name': base_name,
                'id': file.id,
                'type': 'file',
                'path': full_path
            }
            print(f"   üìÑ {full_path} ({file.id})")
        except Exception as e:
            # One unreadable file must not abort the whole walk.
            print(f"   ‚ö†Ô∏è  Error getting file {file.id}: {e}")

    def _process_folder(folder_obj, path_prefix=""):
        """Record the files of *folder_obj*, then its subfolders."""
        for file in folder_obj.files or []:
            _record_file(file, path_prefix)

        for subfolder in folder_obj.folders or []:
            sub_path = _join(path_prefix, subfolder.name)

            if include_folders:
                collected[subfolder.id] = {
                    'name': subfolder.name,
                    'id': subfolder.id,
                    'type': 'folder',
                    'path': sub_path
                }
                print(f"   üìÅ {sub_path} ({subfolder.id})")

            if recursive:
                print(f"   üîç Processing subfolder: {sub_path}")
                _process_folder(subfolder, sub_path)

    try:
        print(f"üîç Starting enumeration of folder {folder_id}...")

        # Pull the folder tree from Synapse without downloading file contents.
        root = Folder(id=folder_id)
        root = root.sync_from_synapse(download_file=False, recursive=recursive)

        _process_folder(root)

        kinds = [entry['type'] for entry in collected.values()]
        file_count = kinds.count('file')
        folder_count = kinds.count('folder')

        print(f"üìä Found {file_count} files and {folder_count} folders")
        return collected

    except Exception as e:
        print(f"‚ùå Error enumerating files: {e}")
        import traceback
        traceback.print_exc()
        return {}

def get_staging_folder_name(syn, folder_id):
    """Return the name of the Synapse entity *folder_id*.

    Falls back to ``"unknown_folder"`` (after printing the error) when the
    lookup raises.
    """
    fallback_name = "unknown_folder"
    try:
        entity = syn.get(folder_id, downloadFile=False)
        resolved_name = entity.name
    except Exception as err:
        print(f"‚ùå Error getting folder name: {err}")
        return fallback_name
    return resolved_name

def create_annotations_file_path(staging_folder_name):
    """Build the annotation file path for a staging folder.

    The name is lower-cased, spaces and hyphens become underscores, and any
    character outside ``a-z0-9_`` is dropped. The result has the form
    ``{ANNOTATIONS_DIR}/{clean_name}_annotations.json``.
    """
    slug = staging_folder_name.lower()
    for separator in (' ', '-'):
        slug = slug.replace(separator, '_')
    slug = re.sub(r'[^a-z0-9_]', '', slug)  # Remove special chars
    return f"{ANNOTATIONS_DIR}/{slug}_annotations.json"

def load_existing_annotations(file_path):
    """Load the annotations JSON at *file_path*.

    Args:
        file_path: Path of the annotations file.

    Returns:
        The decoded JSON content, or ``{}`` when the file does not exist or
        contains only whitespace.

    Raises:
        json.JSONDecodeError: If the file has content that is not valid JSON.
    """
    if not os.path.exists(file_path):
        return {}
    # Explicit UTF-8: the file is meant to be edited by hand, so it must not be
    # decoded with a platform-dependent default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        raw = f.read()
    if not raw.strip():
        # A zero-byte file carries no annotations; treat it like a missing
        # file instead of failing with a decode error.
        print(f"Warning: annotation file is empty, starting fresh: {file_path}")
        return {}
    return json.loads(raw)

def save_annotations(annotations, file_path):
    """Write *annotations* to *file_path* as indented JSON.

    Args:
        annotations: JSON-serializable annotation data.
        file_path: Destination path; missing parent directories are created.

    Raises:
        TypeError: If *annotations* is not JSON-serializable. The existing
            file is left untouched in that case.
    """
    # Serialize before opening the file: opening with 'w' truncates, so a
    # serialization error afterwards would destroy the previous annotations.
    payload = json.dumps(annotations, indent=2)

    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # os.makedirs('') raises, so skip it for a bare filename.
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(payload)

# Execute file enumeration
# This cell defines the notebook globals `syn`, `files_folders`,
# `staging_folder_name` and `annotation_file_path`, in both branches below.
print("üîó SYNAPSE CONNECTION & FILE ENUMERATION")
print("=" * 45)

syn = connect_to_synapse()
if syn:
    # Connected: enumerate the staging folder recursively, folders included.
    files_folders = enumerate_files_with_folders(syn, STAGING_FOLDER_ID, include_folders=True, recursive=True)
    staging_folder_name = get_staging_folder_name(syn, STAGING_FOLDER_ID)
    annotation_file_path = create_annotations_file_path(staging_folder_name)
    
    print(f"\nüìã Enumeration Complete:")
    print(f"   üìÅ Folder: {staging_folder_name}")
    print(f"   üìä Found: {len(files_folders)} files/folders")
    print(f"   üíæ Annotation file: {annotation_file_path}")
else:
    # NOTE(review): the message mentions simulated data, but the fallback is an
    # empty listing plus a hard-coded folder name.
    print("‚ùå Could not connect to Synapse - using simulated data")
    files_folders = {}
    staging_folder_name = "trehalose_biomarker_data"
    annotation_file_path = create_annotations_file_path(staging_folder_name)

üîó SYNAPSE CONNECTION & FILE ENUMERATION

UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.10.0) is available. Your version (4.8.0) can be upgraded by typing:
   pip install --upgrade synapseclient

Python Synapse Client version 4.10.0 release notes

https://python-docs.synapse.org/news/



UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.10.0) is available. Your version (4.8.0) can be upgraded by typing:
   pip install --upgrade synapseclient

Python Synapse Client version 4.10.0 release notes

https://python-docs.synapse.org/news/


Welcome, ram.ayyala!

‚úÖ Connected to Synapse
üîç Starting enumeration of folder syn68927891...
Welcome, ram.ayyala!

‚úÖ Connected to Synapse
üîç Starting enumeration of folder syn68927891...


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927891:Trehalose Biomarker Data]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927979:Answer Clinical Data]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927983:Protavio]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927982:Metabolomics]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927980:ICON]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927984:Somalogic]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:01<?, ?B/s]



   üìÅ Answer Clinical Data (syn68927979)
   üîç Processing subfolder: Answer Clinical Data
   üìÑ Answer Clinical Data/2024.T.15_Answer_ClinicalData_External.xlsx (syn68929229)
   üìÅ ICON (syn68927980)
   üîç Processing subfolder: ICON
   üìÑ ICON/0869-0004-B_pT181_SAMPLE RESULTS_FINAL_09May2025.xlsx (syn68929235)
   üìÑ ICON/0869-0004-C_Total Tau_SAMPLE RESULTS_FINAL_24Apr2025 (1).xlsx (syn68929236)
   üìÑ ICON/0869-0004-D_miR-206_SAMPLE RESULTS_FINAL_10JUL2025.xlsx (syn68929234)
   üìÑ ICON/0869-0004-C_Total Tau_SAMPLE RESULTS_FINAL_24Apr2025 (1).xlsx (syn68929236)
   üìÑ ICON/0869-0004-D_miR-206_SAMPLE RESULTS_FINAL_10JUL2025.xlsx (syn68929234)
   üìÑ ICON/Total File_0869-0004-A.xlsx (syn68929233)
   üìÅ Metabolomics (syn68927982)
   üîç Processing subfolder: Metabolomics
   üìÑ Metabolomics/25_0805_Trehalose_EAP_C18-neg.xlsx (syn68929241)
   üìÑ Metabolomics/25_0805_Trehalose_EAP_C8-pos.xlsx (syn68929242)
   üìÑ ICON/Total File_0869-0004-A.xlsx (syn68929233)
   ü

In [95]:
# CELL 4: ENHANCED SCHEMA LOADING FUNCTIONS

def get_all_schemas(schema_base_path=None):
    """Load every schema class from the ``*.yaml`` files under a directory.

    Args:
        schema_base_path: Directory to search recursively, as a ``str`` or
            ``Path``. When ``None``, the first ``model_schemas`` directory
            found in the current working directory or one of its parents is
            used, falling back to the relative path ``model_schemas``.

    Returns:
        Dict mapping class name to class definition, merged across all files.
        Files are processed in sorted path order; when two files define the
        same class name, the later file wins and a warning is printed.
    """
    if schema_base_path is None:
        notebook_dir = Path.cwd()
        # Look for model_schemas in the current directory or parent directories
        for parent in [notebook_dir, *notebook_dir.parents]:
            candidate = parent / 'model_schemas'
            if candidate.exists():
                schema_base_path = candidate
                break
        else:
            # Fallback to relative path
            schema_base_path = Path('model_schemas')
    else:
        # Accept plain strings too; a str cannot be joined with `/`.
        schema_base_path = Path(schema_base_path)

    schema_path = str(schema_base_path / '**' / '*.yaml')
    all_schemas = {}

    print(f"Looking for schemas in: {schema_path}")

    # Sort so the winner of a duplicate class name does not depend on the
    # order in which the filesystem happens to list files.
    for schema_file in sorted(glob.glob(schema_path, recursive=True)):
        print(f"Processing: {schema_file}")
        with open(schema_file, 'r', encoding='utf-8') as f:
            try:
                schema = yaml.safe_load(f)
            except yaml.YAMLError as e:
                print(f"Error parsing {schema_file}: {e}")
                continue

        # Skip empty documents, non-mapping documents and `classes: null`.
        classes = schema.get('classes') if isinstance(schema, dict) else None
        if not isinstance(classes, dict):
            continue

        for class_name, class_def in classes.items():
            if class_name in all_schemas:
                print(f"  Warning: class {class_name} redefined in {schema_file}")
            all_schemas[class_name] = class_def
            print(f"  Found class: {class_name}")

    print(f"Total classes loaded: {len(all_schemas)}")
    return all_schemas

def get_full_schema(class_name, all_schemas, visited=None):
    """Collect the attributes of a class, including inherited and mixin ones.

    Args:
        class_name: Name of the class to resolve.
        all_schemas: Class-name -> class-definition mapping.
        visited: Class names already on the current resolution path; used to
            stop on cyclic ``is_a``/``mixins`` references. Callers normally
            leave this as ``None``.

    Returns:
        Dict of attribute name -> attribute definition. The class's own
        attributes override those of its ``is_a`` parent, and both override
        mixin attributes. Returns ``{}`` for an unknown, empty or already
        visited class.
    """
    if visited is None:
        visited = set()

    # Prevent infinite recursion
    if class_name in visited:
        return {}
    visited.add(class_name)

    if class_name not in all_schemas:
        print(f"Warning: Class '{class_name}' not found in schemas")
        return {}

    class_def = all_schemas[class_name]
    if not class_def:
        return {}

    # `attributes:` with no value loads from YAML as None, so `or {}` is
    # needed in addition to the .get() default.
    attributes = dict(class_def.get('attributes') or {})

    # Add parent class attributes (is_a relationship)
    parent_name = class_def.get('is_a')
    if parent_name:
        print(f"  {class_name} inherits from {parent_name}")
        # Each branch gets its own copy of `visited`, so a class reachable
        # through two branches is still resolved in both.
        parent_attributes = get_full_schema(parent_name, all_schemas, visited.copy())
        # Parent attributes come first, then are overridden by child attributes
        attributes = {**parent_attributes, **attributes}

    # Add mixin attributes; tolerate `mixins: null` and a single bare name.
    mixins = class_def.get('mixins') or []
    if isinstance(mixins, str):
        mixins = [mixins]
    for mixin in mixins:
        print(f"  {class_name} uses mixin {mixin}")
        mixin_attributes = get_full_schema(mixin, all_schemas, visited.copy())
        # Mixins come first, then are overridden by class attributes
        attributes = {**mixin_attributes, **attributes}

    return attributes

def detect_file_type(file_info):
    """Pick the schema class name for a file or folder entry.

    The lower-cased ``name`` and ``path`` of *file_info* are searched for
    keyword substrings. Any omic keyword yields ``'OmicFile'``; a clinical
    keyword, or no match at all, yields ``'ClinicalFile'``.
    """
    searched_texts = (
        file_info.get('name', '').lower(),
        file_info.get('path', '').lower(),
    )

    omic_keywords = (
        'metabolomics', 'proteomics', 'genomics', 'transcriptomics',
        'somalogic', 'protavio', 'sequencing', 'omics',
        '.adat', '.fastq', '.bam', '.vcf', '.bed',
    )
    clinical_keywords = (
        'clinical', 'assessment', 'medical', 'treatment', 'visit',
        'demographic', 'alsfrs', 'vital', 'neurological', 'answer',
    )

    def _mentions(keywords):
        """True when any keyword occurs in the name or the path."""
        return any(keyword in text for keyword in keywords for text in searched_texts)

    # Omic keywords are tested first, so they win over clinical ones.
    if _mentions(omic_keywords):
        return 'OmicFile'

    if _mentions(clinical_keywords):
        return 'ClinicalFile'

    # Unknown types default to the clinical schema.
    return 'ClinicalFile'

# Load all schemas from the data model
# Defines the notebook global `all_schemas`; `clinical_schema` and
# `omic_schema` are only defined when at least one schema class was loaded.
print("üîç LOADING DATA MODEL SCHEMAS")
print("=" * 30)

# Called without a path, so the model_schemas directory is auto-discovered
# from the current working directory.
all_schemas = get_all_schemas()

if not all_schemas:
    print("‚ùå No schemas found! Check the path to model_schemas directory.")
else:
    print(f"\n‚úÖ Loaded {len(all_schemas)} schema classes")
    
    # Test schema loading for key classes
    for class_name in ['ClinicalFile', 'OmicFile', 'BaseFile']:
        if class_name in all_schemas:
            print(f"‚úÖ Found {class_name}")
        else:
            print(f"‚ùå Missing {class_name}")
    
    # Test schema inheritance
    clinical_schema = get_full_schema('ClinicalFile', all_schemas)
    omic_schema = get_full_schema('OmicFile', all_schemas)
    
    print(f"\nüìä Schema Analysis:")
    print(f"   ClinicalFile attributes: {len(clinical_schema)}")
    print(f"   OmicFile attributes: {len(omic_schema)}")

üîç LOADING DATA MODEL SCHEMAS
Looking for schemas in: /home/ramayyala/Documents/data-model/model_schemas/**/*.yaml
Processing: /home/ramayyala/Documents/data-model/model_schemas/base/BaseDataset.yaml
  Found class: BaseDataset
Processing: /home/ramayyala/Documents/data-model/model_schemas/base/BaseFile.yaml
  Found class: BaseFile
Processing: /home/ramayyala/Documents/data-model/model_schemas/clinical/data-management.yaml
  Found class: DataQuality
  Found class: AssessmentAdministration
Processing: /home/ramayyala/Documents/data-model/model_schemas/clinical/data-types.yaml
Processing: /home/ramayyala/Documents/data-model/model_schemas/clinical/domains.yaml
Processing: /home/ramayyala/Documents/data-model/model_schemas/clinical/genetic-profile.yaml
  Found class: GeneticProfile
  Found class: GeneticVariant
  Found class: FamilyHistory
Processing: /home/ramayyala/Documents/data-model/model_schemas/clinical/laboratory.yaml
  Found class: LaboratoryCollection
  Found class: LaboratoryR

In [96]:
# CELL 5: DYNAMIC ANNOTATION STRUCTURE GENERATION

def create_annotation_template(file_info, all_schemas):
    """Build an empty annotation dict for one file or folder entry.

    The schema (OmicFile or ClinicalFile) is chosen by ``detect_file_type``.
    Multivalued attributes start as ``['']``, all others as ``''``. Three
    metadata keys are added: ``_file_type``, ``_schema_source`` and
    ``_created_timestamp``.
    """
    file_type = detect_file_type(file_info)

    # Anything that is not an omic file is annotated with the clinical schema.
    schema_class = 'OmicFile' if file_type == 'OmicFile' else 'ClinicalFile'
    schema_attributes = get_full_schema(schema_class, all_schemas)

    def _blank_value(attr_def):
        """Return a fresh placeholder matching the attribute's cardinality."""
        if isinstance(attr_def, dict) and attr_def.get('multivalued', False):
            return ['']
        return ''

    template = {
        attr_name: _blank_value(attr_def)
        for attr_name, attr_def in schema_attributes.items()
    }

    # Bookkeeping about how and when the template was generated.
    template.update({
        '_file_type': file_type,
        '_schema_source': 'data-model',
        '_created_timestamp': datetime.now().isoformat(),
    })

    return template

def merge_annotations_smartly(existing_annotations, new_files_folders, all_schemas):
    """
    Combine stored annotations with the current Synapse listing.

    Entries whose Synapse ID is already present are left untouched. Every
    other ID gets a blank template stored under the entry's name. The input
    mapping is shallow-copied, so it is not modified.
    """
    merged = existing_annotations.copy()
    added = 0
    kept = 0

    for syn_id, file_info in new_files_folders.items():
        entry_name = file_info['name']

        if syn_id in merged:
            print(f"‚úÖ Existing annotations kept for: {entry_name}")
            kept += 1
            continue

        # Unseen entity: start it off with an empty template.
        template = create_annotation_template(file_info, all_schemas)
        merged[syn_id] = {entry_name: template}
        print(f"‚ûï Added template for: {entry_name} (detected as {template['_file_type']})")
        added += 1

    print(f"\nüìä Merge Summary:")
    print(f"   ‚ûï New entries: {added}")
    print(f"   ‚úÖ Existing entries: {kept}")
    print(f"   üìã Total entries: {len(merged)}")

    return merged

# Execute annotation creation/merging
# Defines the notebook global `updated_annotations` in both branches.
print("üèóÔ∏è  ANNOTATION STRUCTURE GENERATION")
print("=" * 35)

if all_schemas and files_folders:
    # Load existing annotations
    existing_annotations = load_existing_annotations(annotation_file_path)
    print(f"üìÇ Existing annotations: {len(existing_annotations)} entries")
    
    # Smart merge with new files/folders
    updated_annotations = merge_annotations_smartly(existing_annotations, files_folders, all_schemas)
    
    # Save updated annotations
    save_annotations(updated_annotations, annotation_file_path)
    print(f"\nüíæ Annotations saved to: {annotation_file_path}")
    
    # Show sample annotation structure
    if updated_annotations:
        # Entries are shaped {syn_id: {entry_name: annotation_dict}}; preview
        # the first one.
        # NOTE(review): an entry with an empty inner dict would raise
        # IndexError on the [0] lookup below.
        first_entry_id = list(updated_annotations.keys())[0]
        first_entry = updated_annotations[first_entry_id]
        first_file_name = list(first_entry.keys())[0]
        sample_annotation = first_entry[first_file_name]
        
        print(f"\nüìã Sample annotation structure for '{first_file_name}':")
        # Show first 10 attributes
        sample_attrs = list(sample_annotation.items())[:10]
        for key, value in sample_attrs:
            # Long lists are abbreviated to their first three items.
            if isinstance(value, list) and len(value) > 3:
                display_value = f"{value[:3]}... (+{len(value)-3} more)"
            else:
                display_value = str(value)
            print(f"   {key}: {display_value}")
        
        if len(sample_annotation) > 10:
            print(f"   ... and {len(sample_annotation) - 10} more attributes")
        
        print(f"\nüéØ File type detected: {sample_annotation.get('_file_type', 'Unknown')}")
        
else:
    print("‚ö†Ô∏è  Cannot create annotations: missing schemas or files")
    updated_annotations = {}

üèóÔ∏è  ANNOTATION STRUCTURE GENERATION
üìÇ Existing annotations: 23 entries
‚úÖ Existing annotations kept for: Answer Clinical Data
‚úÖ Existing annotations kept for: 2024.T.15_Answer_ClinicalData_External.xlsx
‚úÖ Existing annotations kept for: ICON
‚úÖ Existing annotations kept for: 0869-0004-B_pT181_SAMPLE RESULTS_FINAL_09May2025.xlsx
‚úÖ Existing annotations kept for: 0869-0004-C_Total Tau_SAMPLE RESULTS_FINAL_24Apr2025 (1).xlsx
‚úÖ Existing annotations kept for: 0869-0004-D_miR-206_SAMPLE RESULTS_FINAL_10JUL2025.xlsx
‚úÖ Existing annotations kept for: Total File_0869-0004-A.xlsx
‚úÖ Existing annotations kept for: Metabolomics
‚úÖ Existing annotations kept for: 25_0805_Trehalose_EAP_C18-neg.xlsx
‚úÖ Existing annotations kept for: 25_0805_Trehalose_EAP_C8-pos.xlsx
‚úÖ Existing annotations kept for: 25_0805_Trehalose_EAP_HILIC-neg.xlsx
‚úÖ Existing annotations kept for: 25_0805_Trehalose_EAP_HILIC-pos.xlsx
‚úÖ Existing annotations kept for: Protavio
‚úÖ Existing annotations kept f

In [99]:
# CELL 6: LOAD AND VALIDATE ANNOTATIONS

def validate_annotation_against_schema(annotation, file_type, all_schemas):
    """
    Validate one annotation dict against the schema for its file type.

    Args:
        annotation: Mapping of attribute name to value. Keys starting with
            ``_`` are metadata and are skipped by the per-field checks.
        file_type: ``'OmicFile'`` selects the OmicFile schema; any other value
            validates against ClinicalFile.
        all_schemas: Class-name -> class-definition mapping handed to
            ``get_full_schema``.

    Returns:
        ``(is_valid, errors, warnings)`` where ``is_valid`` is true when
        ``errors`` is empty; both lists hold human-readable messages.
    """
    errors = []
    # Not named `warnings`: that would shadow the module imported by this file.
    warning_messages = []

    def _is_blank_scalar(item):
        """True for None and for empty or whitespace-only strings."""
        return item is None or (isinstance(item, str) and item.strip() == '')

    def _is_blank(value):
        """True when *value* carries no content.

        Uses the same notion of "empty" as the cleaning step applied before
        upload, so a required field that cleaning would drop is reported here,
        while legitimate values such as 0 or False are not.
        """
        if isinstance(value, list):
            return all(_is_blank_scalar(item) for item in value)
        return _is_blank_scalar(value)

    # Get expected schema
    schema_class = 'OmicFile' if file_type == 'OmicFile' else 'ClinicalFile'
    expected_schema = get_full_schema(schema_class, all_schemas)

    # Check for required fields
    for attr_name, attr_def in expected_schema.items():
        if not (isinstance(attr_def, dict) and attr_def.get('required', False)):
            continue
        if attr_name not in annotation:
            errors.append(f"Missing required field: {attr_name}")
        elif _is_blank(annotation[attr_name]):
            errors.append(f"Required field '{attr_name}' is empty")

    # Check multivalued field constraints
    for attr_name, value in annotation.items():
        if attr_name.startswith('_'):  # Skip metadata fields
            continue

        if attr_name not in expected_schema:
            warning_messages.append(f"Field '{attr_name}' not found in schema (may be deprecated)")
            continue

        attr_def = expected_schema[attr_name]
        if not isinstance(attr_def, dict):
            continue

        is_multivalued = attr_def.get('multivalued', False)
        if is_multivalued and not isinstance(value, list):
            errors.append(f"Field '{attr_name}' should be a list (multivalued)")
        elif not is_multivalued and isinstance(value, list):
            warning_messages.append(f"Field '{attr_name}' is a list but should be single value")

    # Check for completely empty annotations
    has_content = any(
        not _is_blank(value)
        for key, value in annotation.items()
        if not key.startswith('_')
    )
    if not has_content:
        warning_messages.append("No fields have been filled out yet")

    return len(errors) == 0, errors, warning_messages

def validate_all_annotations(annotations_data, all_schemas):
    """Validate every stored annotation and print a per-entry verdict.

    Returns:
        ``(validation_results, total_errors, total_warnings, valid_count)``.
        ``validation_results`` is keyed by Synapse ID and holds ``file_name``,
        ``is_valid``, ``errors``, ``warnings`` and ``file_type``.
    """
    max_errors_shown = 3
    results = {}
    error_total = 0
    warning_total = 0
    passed = 0

    for syn_id, file_data in annotations_data.items():
        for file_name, annotation in file_data.items():
            # Entries without a recorded type are validated as clinical files.
            file_type = annotation.get('_file_type', 'ClinicalFile')
            is_valid, errors, notes = validate_annotation_against_schema(annotation, file_type, all_schemas)

            results[syn_id] = {
                'file_name': file_name,
                'is_valid': is_valid,
                'errors': errors,
                'warnings': notes,
                'file_type': file_type
            }

            error_total += len(errors)
            warning_total += len(notes)

            if not is_valid:
                print(f"‚ùå {file_name}: {len(errors)} validation errors")
                for error in errors[:max_errors_shown]:  # Show first 3 errors
                    print(f"   ‚Ä¢ {error}")
                if len(errors) > max_errors_shown:
                    print(f"   ... and {len(errors) - 3} more errors")
                continue

            passed += 1
            if notes:
                print(f"‚ö†Ô∏è  {file_name}: Valid with {len(notes)} warnings")
            else:
                print(f"‚úÖ {file_name}: Valid")

    return results, error_total, warning_total, passed


def preview_annotation_cleaning(annotations_data, sample_limit=3):
    """Simulate the pre-upload cleaning and summarize what it would drop.

    Args:
        annotations_data: ``{syn_id: {entry_name: annotation_dict}}``.
        sample_limit: Maximum number of per-entry examples to return.

    Returns:
        ``(stats, examples)``. ``stats`` counts ``total_fields``,
        ``cleaned_fields`` (kept), ``removed_fields`` and
        ``preserved_meaningful`` (kept fields holding a placeholder such as
        ``'Unknown'`` or ``'N/A'``). ``examples`` lists, for the first
        *sample_limit* entries, the field counts and up to two kept and two
        removed ``"key: value"`` previews.
    """
    # Placeholder spellings that are kept and counted as "preserved".
    preserved_values = {'Unknown', 'N/A', 'unknown', 'n/a', 'NA', 'na'}

    def is_meaningful_value(val):
        """True unless *val* is None or an empty/whitespace-only string."""
        # No set-membership test here: every placeholder is a non-blank string
        # and passes anyway, and testing an unhashable value (dict, nested
        # list) against a set raises TypeError.
        if val is None:
            return False
        if isinstance(val, str):
            return val.strip() != ''
        return True

    def is_placeholder(val):
        """True when *val* is one of the preserved placeholder strings."""
        return isinstance(val, str) and val in preserved_values

    stats = {
        'total_fields': 0,
        'cleaned_fields': 0,
        'removed_fields': 0,
        'preserved_meaningful': 0
    }

    examples = []

    for syn_id, file_data in annotations_data.items():
        for file_name, annotation in file_data.items():
            original_count = 0
            cleaned_count = 0
            preserved_examples = []
            removed_examples = []

            for key, value in annotation.items():
                if key.startswith('_'):
                    # Metadata keys are never uploaded, so they are not counted.
                    continue

                original_count += 1
                stats['total_fields'] += 1

                # Simulate cleaning logic
                if isinstance(value, list):
                    meaningful_items = [v for v in value if is_meaningful_value(v)]
                    will_keep = bool(meaningful_items)
                    has_placeholder = any(is_placeholder(v) for v in meaningful_items)
                else:
                    will_keep = is_meaningful_value(value)
                    has_placeholder = is_placeholder(value)

                preview = f'{key}: {str(value)[:30]}'
                if will_keep:
                    cleaned_count += 1
                    stats['cleaned_fields'] += 1
                    if has_placeholder:
                        stats['preserved_meaningful'] += 1
                    if len(preserved_examples) < 2:
                        preserved_examples.append(preview)
                else:
                    stats['removed_fields'] += 1
                    if len(removed_examples) < 2:
                        removed_examples.append(preview)

            if len(examples) < sample_limit:
                examples.append({
                    'file_name': file_name,
                    'original': original_count,
                    'cleaned': cleaned_count,
                    'removed': original_count - cleaned_count,
                    'preserved_examples': preserved_examples,
                    'removed_examples': removed_examples
                })

    return stats, examples

# Execute validation
# Defines the notebook globals `validation_results` and `total_errors` on every
# path; `total_warnings` and `valid_count` are only defined when validation
# actually ran. The failure paths set total_errors = 1 to block progression.
print("üîç ANNOTATION VALIDATION")
print("=" * 25)

if os.path.exists(annotation_file_path):
    # Load annotations for validation
    annotations_data = load_existing_annotations(annotation_file_path)
    
    # Preview cleaning effects
    cleaning_stats, cleaning_examples = preview_annotation_cleaning(annotations_data)
    print(f"üßπ Cleaning Preview:")
    print(f"   üìä Total fields: {cleaning_stats['total_fields']}")
    print(f"   ‚úÖ Will keep: {cleaning_stats['cleaned_fields']}")
    print(f"   üóëÔ∏è Will remove: {cleaning_stats['removed_fields']}")
    print(f"   üéØ Preserved meaningful: {cleaning_stats['preserved_meaningful']}")
    
    for example in cleaning_examples:
        # Only the first kept and first removed example are printed per entry.
        print(f"üìÑ {example['file_name'][:50]}...")
        print(f"   Original: {example['original']} ‚Üí Cleaned: {example['cleaned']} ({example['removed']} removed)")
        if example['preserved_examples']:
            print(f"   ‚úÖ Keeping: {example['preserved_examples'][0]}")
        if example['removed_examples']:
            print(f"   üóëÔ∏è Removing: {example['removed_examples'][0]}")
    print(f"üìã Loaded {len(annotations_data)} annotation entries from: {annotation_file_path}")
    
    if all_schemas and annotations_data:
        # Run validation
        validation_results, total_errors, total_warnings, valid_count = validate_all_annotations(annotations_data, all_schemas)
        
        print(f"\nüìä Validation Summary:")
        print(f"   ‚úÖ Valid: {valid_count}")
        print(f"   ‚ùå Invalid: {len(validation_results) - valid_count}")
        print(f"   üî¢ Total errors: {total_errors}")
        print(f"   ‚ö†Ô∏è  Total warnings: {total_warnings}")
        
        if total_errors > 0:
            print(f"\nüõë Please fix validation errors before proceeding to annotation application")
            print(f"üìù Edit the annotation file: {annotation_file_path}")
            print(f"üîÑ Re-run this cell after making changes")
        else:
            print(f"\nüöÄ All annotations are valid! Ready to apply to Synapse entities")
            if total_warnings > 0:
                print(f"üìù Note: {total_warnings} warnings found (non-blocking)")
    else:
        print("‚ùå Cannot validate: missing schemas or annotation data")
        validation_results = {}
        total_errors = 1  # Block progression
else:
    print(f"‚ùå Annotation file not found: {annotation_file_path}")
    print("üìù Please run the previous cells to create annotations first")
    validation_results = {}
    total_errors = 1  # Block progression

üîç ANNOTATION VALIDATION
üßπ Cleaning Preview:
   üìä Total fields: 731
   ‚úÖ Will keep: 63
   üóëÔ∏è Will remove: 668
   üéØ Preserved meaningful: 4
üìÑ Answer Clinical Data...
   Original: 29 ‚Üí Cleaned: 5 (24 removed)
   ‚úÖ Keeping: clinicalDomain: ['subject_management', 'diseas
   üóëÔ∏è Removing: visitType: 
üìÑ 2024.T.15_Answer_ClinicalData_External.xlsx...
   Original: 29 ‚Üí Cleaned: 5 (24 removed)
   ‚úÖ Keeping: clinicalDomain: ['subject_management', 'diseas
   üóëÔ∏è Removing: visitType: 
üìÑ ICON...
   Original: 29 ‚Üí Cleaned: 6 (23 removed)
   ‚úÖ Keeping: clinicalDomain: ['biomarkers']
   üóëÔ∏è Removing: visitType: 
üìã Loaded 23 annotation entries from: ./annotations/trehalose_biomarker_data_annotations.json
  ClinicalFile inherits from BaseFile
  ClinicalFile uses mixin ClinicalFileMixin
‚ùå Answer Clinical Data: 5 validation errors
   ‚Ä¢ Required field 'title' is empty
   ‚Ä¢ Required field 'creator' is empty
   ‚Ä¢ Required field 'keywords' is empty

In [None]:
# CELL 7: APPLY ANNOTATIONS TO SYNAPSE ENTITIES

def get_existing_synapse_annotations(syn, entity_id):
    """Return the annotations currently held by a Synapse entity as a plain dict.

    Args:
        syn: Client object exposing ``get(entity_id, downloadFile=False)``.
        entity_id: Identifier of the entity to look up.

    Returns:
        A ``dict`` copy of the entity's ``annotations`` attribute. An empty
        dict is returned when that attribute is missing or falsy, and also
        when the lookup raises (the error is printed, not propagated).
    """
    try:
        # downloadFile=False is forwarded to the client; presumably this
        # fetches entity metadata only -- confirm against synapseclient docs.
        fetched = syn.get(entity_id, downloadFile=False)
        stored = getattr(fetched, 'annotations', None)
        if not stored:
            return {}
        return dict(stored)
    except Exception as err:
        print(f"‚ùå Error getting annotations for {entity_id}: {err}")
        return {}

def clean_annotations_for_synapse(annotation):
    """Strip metadata keys and truly empty values from one annotation record.

    Placeholder strings such as 'Unknown' or 'N/A' are ordinary non-blank
    strings and are therefore kept. What gets removed:
    - keys starting with an underscore (metadata fields)
    - None values
    - empty or whitespace-only strings ('')
    - empty lists ([]) and lists whose items are all None/blank ([''])

    Args:
        annotation: Mapping of field name to a single value or a list of values.

    Returns:
        A new dict containing only the surviving fields. List values are
        rebuilt with their None/blank items filtered out; the input mapping
        is not modified.
    """
    def _is_meaningful(val):
        # Check the type first instead of testing membership in a set of
        # placeholder strings: set membership raises TypeError for unhashable
        # values (nested lists, dicts), and placeholders like 'Unknown' are
        # non-blank strings that pass the strip() test anyway.
        if isinstance(val, str):
            return val.strip() != ''
        return val is not None

    cleaned = {}
    for key, value in annotation.items():
        # Skip metadata fields (starting with underscore)
        if key.startswith('_'):
            continue

        if isinstance(value, list):
            kept = [item for item in value if _is_meaningful(item)]
            if kept:  # a list with nothing left is dropped entirely
                cleaned[key] = kept
        elif _is_meaningful(value):
            cleaned[key] = value

    return cleaned

def apply_annotations_to_entity(syn, entity_id, new_annotations, dry_run=False):
    """Write one annotation record onto a Synapse entity.

    The record is cleaned with ``clean_annotations_for_synapse`` before
    anything else happens, so the dry-run report and the real update are
    based on the same set of fields, and no entity is fetched when nothing
    would be written.

    Args:
        syn: Synapse client used for ``get`` and ``store``.
        entity_id: Identifier of the entity to annotate.
        new_annotations: Raw annotation record; may still contain metadata
            keys (leading underscore) and empty values.
        dry_run: When true, only report how many fields would be applied.

    Returns:
        True when the update succeeded, when this was a dry run, or when
        there was nothing to apply. False when any step raised; the error is
        printed rather than propagated.
    """
    try:
        # Clean annotations (remove metadata fields and empty values)
        clean_annotations = clean_annotations_for_synapse(new_annotations)

        if dry_run:
            # Report the cleaned count: that is what a real run would send.
            print(f"üîç DRY RUN: Would apply {len(clean_annotations)} annotations to {entity_id}")
            return True

        if not clean_annotations:
            print(f"‚ö†Ô∏è  No valid annotations to apply (all fields empty)")
            return True  # Not an error, just nothing to do

        entity = syn.get(entity_id, downloadFile=False)

        # NOTE(review): this assignment replaces the annotations carried by
        # the fetched entity; keys already on Synapse but absent from the
        # cleaned record are presumably dropped on store -- confirm intended.
        entity.annotations = clean_annotations
        # forceVersion=False is passed to store(); presumably avoids creating
        # a new entity version for an annotation-only change -- confirm
        # against the synapseclient documentation.
        syn.store(entity, forceVersion=False)
        return True

    except Exception as e:
        print(f"‚ùå Failed to apply annotations to {entity_id}: {e}")
        return False

def apply_all_annotations(syn, annotations_data, validation_results, dry_run=False):
    """Push every annotation record whose entity passed validation.

    Args:
        syn: Synapse client, handed through to ``apply_annotations_to_entity``.
        annotations_data: Mapping of entity id -> {file name -> annotation record}.
        validation_results: Mapping of entity id -> result dict. A record is
            applied only when its entity's result has a truthy 'is_valid';
            entities with no result are treated as failed.
        dry_run: Forwarded to ``apply_annotations_to_entity``.

    Returns:
        Tuple ``(success_count, failed_count, skipped_count)``.
    """
    applied = failed = skipped = 0

    for syn_id, file_data in annotations_data.items():
        for file_name, annotation in file_data.items():
            outcome = validation_results.get(syn_id, {})

            if outcome.get('is_valid', False):
                print(f"üîÑ Applying annotations to {file_name} ({syn_id})")

                # Preview how many fields survive cleaning
                field_total = len(clean_annotations_for_synapse(annotation))
                print(f"   üìã {field_total} non-empty fields to apply")

                if apply_annotations_to_entity(syn, syn_id, annotation, dry_run):
                    applied += 1
                    print(f"   ‚úÖ Success")
                else:
                    failed += 1
                    print(f"   ‚ùå Failed")
            else:
                # Validation did not pass (or was never recorded): leave untouched
                print(f"‚è≠Ô∏è  Skipping {file_name} (validation failed)")
                skipped += 1

    return applied, failed, skipped

# Execute annotation application (only if validation passed)
print("üöÄ ANNOTATION APPLICATION TO SYNAPSE")
print("=" * 35)

# Check if we have the required data from previous cells.
# globals() is used rather than locals(): before Python 3.12 a list
# comprehension runs in its own scope, so locals() inside it sees only the
# loop variable and every name would be reported as missing.
required_vars = ['syn', 'annotations_data', 'validation_results', 'total_errors']
missing_vars = [var for var in required_vars if var not in globals()]

if missing_vars:
    print(f"‚ùå Missing required data: {', '.join(missing_vars)}")
    print("üìù Please run all previous cells first")
elif total_errors > 0:
    # Validation is a hard gate: nothing is sent while any error remains.
    print(f"üõë Skipping annotation application due to {total_errors} validation errors")
    print(f"üìù Please fix errors in: {annotation_file_path}")
    print(f"üîÑ Then re-run the validation cell and this cell")
elif not syn:
    print("‚ùå No Synapse connection available")
elif not annotations_data:
    print("‚ùå No annotation data available")
else:
    print(f"üîç Ready to apply annotations to {len(annotations_data)} entities")
    print(f"üìù Dry run mode: {DRY_RUN}")
    
    # Apply annotations
    success_count, failed_count, skipped_count = apply_all_annotations(
        syn, annotations_data, validation_results, dry_run=DRY_RUN
    )
    
    print(f"\nüìä Application Results:")
    print(f"   ‚úÖ Success: {success_count}")
    print(f"   ‚ùå Failed: {failed_count}")
    print(f"   ‚è≠Ô∏è  Skipped: {skipped_count}")
    print(f"   üìã Total processed: {success_count + failed_count + skipped_count}")
    
    if DRY_RUN:
        print(f"\nüîç This was a DRY RUN - no actual changes made")
        print(f"üí° Set DRY_RUN = False in Cell 2 to apply changes")
    elif success_count > 0:
        print(f"\nüéâ Successfully applied annotations to {success_count} entities!")
        print(f"üîó Check your entities in Synapse to see the applied annotations")
    
    # Reported independently of the branch above so failures always surface
    if failed_count > 0:
        print(f"\n‚ö†Ô∏è  {failed_count} entities failed to update - check error messages above")