# Trehalose Biomarker Data Annotations Workflow

**Complete workflow for managing file annotations using the data model**

## Workflow Steps:
1. **Libraries & Configuration** - Load dependencies and configure paths
2. **File Enumeration** - Connect to Synapse and enumerate files/folders
3. **Annotation Management** - Create/update annotation templates dynamically
4. **Validation** - Validate annotations against data model schemas
5. **Application** - Apply validated annotations to Synapse entities

## Key Features:
- üîÑ **Dynamic annotation management**: Only adds new files, preserves existing annotations
- üìã **Schema-driven validation**: Validates against ClinicalFile and OmicFile schemas
- üéØ **Smart file type detection**: Automatically determines Clinical vs Omic data
- üíæ **Persistent storage**: Saves annotations to `./annotations/{folder_name}.json`
- ‚úÖ **Complete validation**: Blocks progression until all annotations are valid

In [2]:
# CELL 1: LIBRARIES & IMPORTS
import pandas as pd
import numpy as np
import json
import re
import os
import yaml
import glob
from pathlib import Path
from datetime import datetime
import synapseclient
from synapseclient.models import (
    Column, ColumnType, Dataset, EntityRef, File, Folder, Project, FacetType, DatasetCollection
)
from typing import Dict, List, Any, Set, Union
from synapseclient import Wiki
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

print("üìö Libraries loaded successfully")
print(f"üêç Python version: {pd.__version__} (pandas)")
print(f"üîó Synapse client version: {synapseclient.__version__}")

üìö Libraries loaded successfully
üêç Python version: 2.3.2 (pandas)
üîó Synapse client version: 4.8.0


In [None]:
# CELL 2: CONFIGURATION SETUP

# Synapse Configuration
STAGING_FOLDER_ID = "syn68927891"  # Trehalose Biomarker Data folder
PROJECT_ID = "syn68702804"
RELEASE_FOLDER_ID = "syn68885183"
DATASETS_COLLECTION_ID = "syn66496326"
DRY_RUN = False  # Set to False to actually apply changes

# Data Model Configuration
DATA_MODEL_PATH = "../modules"  # Path to YAML schema directory
DATA_MODEL_FILE = "../dist/ALS.yaml"  # Main compiled data model (optional)

# Annotation Management Configuration
ANNOTATIONS_DIR = "../annotations"  # Local directory for annotation files

# Authentication Token (replace with your token or use .synapseConfig)
print("‚öôÔ∏è  CONFIGURATION LOADED")
print("=" * 40)
print(f"üìÅ Staging Folder: {STAGING_FOLDER_ID}")
print(f"üèóÔ∏è  Project: {PROJECT_ID}")
print(f"üìÑ Data Model Path: {DATA_MODEL_PATH}")
print(f"üíæ Annotations Directory: {ANNOTATIONS_DIR}")
print(f"üîç Dry Run Mode: {DRY_RUN}")

# Create annotations directory if it doesn't exist
os.makedirs(ANNOTATIONS_DIR, exist_ok=True)
print(f"üìÇ Annotations directory ready: {ANNOTATIONS_DIR}")

‚öôÔ∏è  CONFIGURATION LOADED
üìÅ Staging Folder: syn68927891
üèóÔ∏è  Project: syn68702804
üìÑ Data Model Path: ../modules
üíæ Annotations Directory: ../annotations
üîç Dry Run Mode: False
üìÇ Annotations directory ready: ../annotations


In [8]:
# CELL 3: FILE ENUMERATION AND ANNOTATION CREATION

def connect_to_synapse():
    """Connect to Synapse."""
    try:
        syn = synapseclient.Synapse()
        syn.login(authToken=SYNPASE_AUTH_TOKEN)
        print("‚úÖ Connected to Synapse")
        return syn
    except Exception as e:
        print(f"‚ùå Failed to connect: {e}")
        return None

def enumerate_files_with_folders(syn, folder_id, include_folders=True, recursive=True):
    """Enumerate files and folders in a Synapse folder using modern API."""
    if not syn:
        return {}
    
    items = {}
    
    def _process_folder(folder_obj, path_prefix=""):
        """Process a folder object and its contents."""
        # Process files in this folder
        if folder_obj.files:
            for file in folder_obj.files:
                try:
                    file_entity = syn.get(file.id, downloadFile=False)
                    file_name = file_entity.name if hasattr(file_entity, 'name') else file.name
                    
                    # Extract base name (remove extension)
                    base_name = file_name
                    for ext in ['.csv', '.txt', '.json', '.xml', '.tsv', '.xlsx', '.pdf', '.docx', '.html', '.md', '.adat']:
                        if file_name.lower().endswith(ext.lower()):
                            base_name = file_name[:-(len(ext))]
                            break
                    
                    current_path = f"{path_prefix}/{file_name}" if path_prefix else file_name
                    items[file.id] = {
                        'name': file_name,
                        'base_name': base_name,
                        'id': file.id,
                        'type': 'file',
                        'path': current_path
                    }
                    print(f"   üìÑ {current_path} ({file.id})")
                except Exception as e:
                    print(f"   ‚ö†Ô∏è  Error getting file {file.id}: {e}")
        
        # Process subfolders
        if folder_obj.folders:
            for subfolder in folder_obj.folders:
                current_path = f"{path_prefix}/{subfolder.name}" if path_prefix else subfolder.name
                
                # Add folder metadata if requested
                if include_folders:
                    items[subfolder.id] = {
                        'name': subfolder.name,
                        'id': subfolder.id,
                        'type': 'folder',
                        'path': current_path
                    }
                    print(f"   üìÅ {current_path} ({subfolder.id})")
                
                # Recursively process subfolder if enabled
                if recursive:
                    print(f"   üîç Processing subfolder: {current_path}")
                    _process_folder(subfolder, current_path)
    
    try:
        print(f"üîç Starting enumeration of folder {folder_id}...")
        
        # Get the folder and sync from Synapse
        folder = Folder(id=folder_id)
        folder = folder.sync_from_synapse(download_file=False, recursive=recursive)
        
        # Process the folder and all its contents
        _process_folder(folder)
        
        file_count = sum(1 for item in items.values() if item['type'] == 'file')
        folder_count = sum(1 for item in items.values() if item['type'] == 'folder')
        
        print(f"üìä Found {file_count} files and {folder_count} folders")
        return items
        
    except Exception as e:
        print(f"‚ùå Error enumerating files: {e}")
        import traceback
        traceback.print_exc()
        return {}

def get_staging_folder_name(syn, folder_id):
    """Extract folder name for annotation file naming."""
    try:
        folder = syn.get(folder_id, downloadFile=False)
        return folder.name
    except Exception as e:
        print(f"‚ùå Error getting folder name: {e}")
        return "unknown_folder"

def create_annotations_file_path(staging_folder_name):
    """Create path: ./annotations/{folder_name}_annotations.json"""
    clean_name = staging_folder_name.lower().replace(' ', '_').replace('-', '_')
    clean_name = re.sub(r'[^a-z0-9_]', '', clean_name)  # Remove special chars
    return f"{ANNOTATIONS_DIR}/{clean_name}_annotations.json"

def load_existing_annotations(file_path):
    """Load existing annotations if file exists."""
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            return json.load(f)
    return {}

def save_annotations(annotations, file_path):
    """Save annotations to JSON file."""
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'w') as f:
        json.dump(annotations, f, indent=2)

# Execute file enumeration
print("üîó SYNAPSE CONNECTION & FILE ENUMERATION")
print("=" * 45)

syn = connect_to_synapse()
if syn:
    files_folders = enumerate_files_with_folders(syn, STAGING_FOLDER_ID, include_folders=True, recursive=True)
    staging_folder_name = get_staging_folder_name(syn, STAGING_FOLDER_ID)
    annotation_file_path = create_annotations_file_path(staging_folder_name)
    
    print(f"\nüìã Enumeration Complete:")
    print(f"   üìÅ Folder: {staging_folder_name}")
    print(f"   üìä Found: {len(files_folders)} files/folders")
    print(f"   üíæ Annotation file: {annotation_file_path}")
else:
    print("‚ùå Could not connect to Synapse - using simulated data")
    files_folders = {}
    staging_folder_name = "trehalose_biomarker_data"
    annotation_file_path = create_annotations_file_path(staging_folder_name)

üîó SYNAPSE CONNECTION & FILE ENUMERATION

UPGRADE AVAILABLE

A more recent version of the Synapse Client (4.10.0) is available. Your version (4.8.0) can be upgraded by typing:
   pip install --upgrade synapseclient

Python Synapse Client version 4.10.0 release notes

https://python-docs.synapse.org/news/


Welcome, ram.ayyala!

‚úÖ Connected to Synapse
üîç Starting enumeration of folder syn68927891...


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927891:Trehalose Biomarker Data]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927979:Answer Clinical Data]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927984:Somalogic]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927983:Protavio]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927980:ICON]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:00<?, ?B/s]

[syn68927982:Metabolomics]: Syncing Folder from Synapse.


Syncing from Synapse:   0%|          | 0.00/1.00 [00:01<?, ?B/s]


   üìÅ Answer Clinical Data (syn68927979)
   üîç Processing subfolder: Answer Clinical Data
   üìÑ Answer Clinical Data/2024.T.15_Answer_ClinicalData_External.xlsx (syn68929229)
   üìÅ ICON (syn68927980)
   üîç Processing subfolder: ICON
   üìÑ ICON/0869-0004-B_pT181_SAMPLE RESULTS_FINAL_09May2025.xlsx (syn68929235)
   üìÑ ICON/0869-0004-C_Total Tau_SAMPLE RESULTS_FINAL_24Apr2025 (1).xlsx (syn68929236)
   üìÑ ICON/0869-0004-D_miR-206_SAMPLE RESULTS_FINAL_10JUL2025.xlsx (syn68929234)
   üìÑ ICON/Total File_0869-0004-A.xlsx (syn68929233)
   üìÅ Metabolomics (syn68927982)
   üîç Processing subfolder: Metabolomics
   üìÑ Metabolomics/25_0805_Trehalose_EAP_C18-neg.xlsx (syn68929241)
   üìÑ Metabolomics/25_0805_Trehalose_EAP_C8-pos.xlsx (syn68929242)
   üìÑ Metabolomics/25_0805_Trehalose_EAP_HILIC-neg.xlsx (syn68929239)
   üìÑ Metabolomics/25_0805_Trehalose_EAP_HILIC-pos.xlsx (syn68929240)
   üìÅ Protavio (syn68927983)
   üîç Processing subfolder: Protavio
   üìÑ Protavio/P




In [9]:
# CELL 4: ENHANCED SCHEMA LOADING FUNCTIONS

def get_all_schemas(schema_base_path=None):
    """Load all schema classes from YAML files in the modules directory."""
    if schema_base_path is None:
        # Get the notebook's directory and find modules relative to it
        notebook_dir = Path.cwd()
        # Look for modules in the current directory or parent directories
        schema_base_path = None
        for parent in [notebook_dir] + list(notebook_dir.parents):
            potential_path = parent / 'modules'
            if potential_path.exists():
                schema_base_path = potential_path
                break
        
        if schema_base_path is None:
            # Fallback to relative path
            schema_base_path = Path('modules')
    
    schema_path = str(schema_base_path / '**' / '*.yaml')
    all_schemas = {}
    
    print(f"Looking for schemas in: {schema_path}")
    
    for schema_file in glob.glob(schema_path, recursive=True):
        print(f"Processing: {schema_file}")
        with open(schema_file, 'r') as f:
            try:
                schema = yaml.safe_load(f)
                if schema and 'classes' in schema:
                    for class_name, class_def in schema['classes'].items():
                        all_schemas[class_name] = class_def
                        print(f"  Found class: {class_name}")
            except yaml.YAMLError as e:
                print(f"Error parsing {schema_file}: {e}")
    
    print(f"Total classes loaded: {len(all_schemas)}")
    return all_schemas

def get_full_schema(class_name, all_schemas, visited=None):
    """Recursively build complete schema including inheritance and mixins."""
    if visited is None:
        visited = set()
    
    # Prevent infinite recursion
    if class_name in visited:
        return {}
    visited.add(class_name)
    
    if class_name not in all_schemas:
        print(f"Warning: Class '{class_name}' not found in schemas")
        return {}
    
    class_def = all_schemas.get(class_name, {})
    if not class_def:
        return {}
        
    # Start with this class's attributes
    attributes = class_def.get('attributes', {}).copy()
    
    # Add parent class attributes (is_a relationship)
    if 'is_a' in class_def:
        parent_name = class_def['is_a']
        print(f"  {class_name} inherits from {parent_name}")
        parent_attributes = get_full_schema(parent_name, all_schemas, visited.copy())
        # Parent attributes come first, then are overridden by child attributes
        attributes = {**parent_attributes, **attributes}
        
    # Add mixin attributes
    if 'mixins' in class_def:
        for mixin in class_def['mixins']:
            print(f"  {class_name} uses mixin {mixin}")
            mixin_attributes = get_full_schema(mixin, all_schemas, visited.copy())
            # Mixins come first, then are overridden by class attributes
            attributes = {**mixin_attributes, **attributes}
            
    return attributes

def detect_file_type(file_info):
    """Detect whether a file should use ClinicalFile or OmicFile schema."""
    name = file_info.get('name', '').lower()
    path = file_info.get('path', '').lower()
    
    # Omic data indicators
    omic_indicators = [
        'metabolomics', 'proteomics', 'genomics', 'transcriptomics',
        'somalogic', 'protavio', 'sequencing', 'omics',
        '.adat', '.fastq', '.bam', '.vcf', '.bed'
    ]
    
    # Clinical data indicators  
    clinical_indicators = [
        'clinical', 'assessment', 'medical', 'treatment', 'visit',
        'demographic', 'alsfrs', 'vital', 'neurological', 'answer'
    ]
    
    # Check for omic indicators
    for indicator in omic_indicators:
        if indicator in name or indicator in path:
            return 'OmicFile'
    
    # Check for clinical indicators
    for indicator in clinical_indicators:
        if indicator in name or indicator in path:
            return 'ClinicalFile'
    
    # Default to ClinicalFile for unknown types
    return 'ClinicalFile'

# Load all schemas from the data model
print("üîç LOADING DATA MODEL SCHEMAS")
print("=" * 30)

all_schemas = get_all_schemas()

if not all_schemas:
    print("‚ùå No schemas found! Check the path to modules directory.")
else:
    print(f"\n‚úÖ Loaded {len(all_schemas)} schema classes")
    
    # Test schema loading for key classes
    for class_name in ['ClinicalFile', 'OmicFile', 'BaseFile']:
        if class_name in all_schemas:
            print(f"‚úÖ Found {class_name}")
        else:
            print(f"‚ùå Missing {class_name}")
    
    # Test schema inheritance
    clinical_schema = get_full_schema('ClinicalFile', all_schemas)
    omic_schema = get_full_schema('OmicFile', all_schemas)
    
    print(f"\nüìä Schema Analysis:")
    print(f"   ClinicalFile attributes: {len(clinical_schema)}")
    print(f"   OmicFile attributes: {len(omic_schema)}")

üîç LOADING DATA MODEL SCHEMAS
Looking for schemas in: /home/ramayyala/Documents/data-model/modules/**/*.yaml
Processing: /home/ramayyala/Documents/data-model/modules/base/BaseDataset.yaml
  Found class: BaseDataset
Processing: /home/ramayyala/Documents/data-model/modules/base/BaseFile.yaml
  Found class: BaseFile
Processing: /home/ramayyala/Documents/data-model/modules/clinical/data-management.yaml
  Found class: DataQuality
  Found class: AssessmentAdministration
Processing: /home/ramayyala/Documents/data-model/modules/clinical/data-types.yaml
Processing: /home/ramayyala/Documents/data-model/modules/clinical/domains.yaml
Processing: /home/ramayyala/Documents/data-model/modules/clinical/genetic-profile.yaml
  Found class: GeneticProfile
  Found class: GeneticVariant
  Found class: FamilyHistory
Processing: /home/ramayyala/Documents/data-model/modules/clinical/laboratory.yaml
  Found class: LaboratoryCollection
  Found class: LaboratoryResult
  Found class: ChemistryPanel
  Found clas

In [11]:
# CELL 5: DYNAMIC ANNOTATION STRUCTURE GENERATION

def create_annotation_template(file_info, all_schemas):
    """Create blank annotation template based on file type detection."""
    file_type = detect_file_type(file_info)
    
    # Choose appropriate schema
    if file_type == 'OmicFile':
        schema_attributes = get_full_schema('OmicFile', all_schemas)
    else:
        schema_attributes = get_full_schema('ClinicalFile', all_schemas)
    
    # Build template from schema
    template = {}
    for attr_name, attr_def in schema_attributes.items():
        # Handle multivalued attributes
        if isinstance(attr_def, dict) and attr_def.get('multivalued', False):
            template[attr_name] = ['']
        else:
            template[attr_name] = ''
    
    # Add metadata about detected file type
    template['_file_type'] = file_type
    template['_schema_source'] = 'data-model'
    template['_created_timestamp'] = datetime.now().isoformat()
    
    return template

def merge_annotations_smartly(existing_annotations, new_files_folders, all_schemas):
    """
    Smart merge that:
    1. Keeps existing annotations intact
    2. Adds templates for new files/folders not in existing annotations
    3. Does not overwrite any existing values
    """
    merged = existing_annotations.copy()
    new_count = 0
    existing_count = 0
    
    for syn_id, file_info in new_files_folders.items():
        if syn_id not in merged:
            # New file/folder - create template
            template = create_annotation_template(file_info, all_schemas)
            merged[syn_id] = {
                file_info['name']: template
            }
            print(f"‚ûï Added template for: {file_info['name']} (detected as {template['_file_type']})")
            new_count += 1
        else:
            print(f"‚úÖ Existing annotations kept for: {file_info['name']}")
            existing_count += 1
    
    print(f"\nüìä Merge Summary:")
    print(f"   ‚ûï New entries: {new_count}")
    print(f"   ‚úÖ Existing entries: {existing_count}")
    print(f"   üìã Total entries: {len(merged)}")
    
    return merged

# Execute annotation creation/merging
print("üèóÔ∏è  ANNOTATION STRUCTURE GENERATION")
print("=" * 35)

if all_schemas and files_folders:
    # Load existing annotations
    existing_annotations = load_existing_annotations(annotation_file_path)
    print(f"üìÇ Existing annotations: {len(existing_annotations)} entries")
    
    # Smart merge with new files/folders
    updated_annotations = merge_annotations_smartly(existing_annotations, files_folders, all_schemas)
    
    # Save updated annotations
    save_annotations(updated_annotations, annotation_file_path)
    print(f"\nüíæ Annotations saved to: {annotation_file_path}")
    
    # Show sample annotation structure
    if updated_annotations:
        first_entry_id = list(updated_annotations.keys())[0]
        first_entry = updated_annotations[first_entry_id]
        first_file_name = list(first_entry.keys())[0]
        sample_annotation = first_entry[first_file_name]
        
        print(f"\nüìã Sample annotation structure for '{first_file_name}':")
        # Show first 10 attributes
        sample_attrs = list(sample_annotation.items())[:10]
        for key, value in sample_attrs:
            if isinstance(value, list) and len(value) > 3:
                display_value = f"{value[:3]}... (+{len(value)-3} more)"
            else:
                display_value = str(value)
            print(f"   {key}: {display_value}")
        
        if len(sample_annotation) > 10:
            print(f"   ... and {len(sample_annotation) - 10} more attributes")
        
        print(f"\nüéØ File type detected: {sample_annotation.get('_file_type', 'Unknown')}")
        
else:
    print("‚ö†Ô∏è  Cannot create annotations: missing schemas or files")
    updated_annotations = {}

üèóÔ∏è  ANNOTATION STRUCTURE GENERATION
üìÇ Existing annotations: 23 entries
‚úÖ Existing annotations kept for: Answer Clinical Data
‚úÖ Existing annotations kept for: 2024.T.15_Answer_ClinicalData_External.xlsx
‚úÖ Existing annotations kept for: ICON
‚úÖ Existing annotations kept for: 0869-0004-B_pT181_SAMPLE RESULTS_FINAL_09May2025.xlsx
‚úÖ Existing annotations kept for: 0869-0004-C_Total Tau_SAMPLE RESULTS_FINAL_24Apr2025 (1).xlsx
‚úÖ Existing annotations kept for: 0869-0004-D_miR-206_SAMPLE RESULTS_FINAL_10JUL2025.xlsx
‚úÖ Existing annotations kept for: Total File_0869-0004-A.xlsx
‚úÖ Existing annotations kept for: Metabolomics
‚úÖ Existing annotations kept for: 25_0805_Trehalose_EAP_C18-neg.xlsx
‚úÖ Existing annotations kept for: 25_0805_Trehalose_EAP_C8-pos.xlsx
‚úÖ Existing annotations kept for: 25_0805_Trehalose_EAP_HILIC-neg.xlsx
‚úÖ Existing annotations kept for: 25_0805_Trehalose_EAP_HILIC-pos.xlsx
‚úÖ Existing annotations kept for: Protavio
‚úÖ Existing annotations kept f

In [12]:
# CELL 6: LOAD AND VALIDATE ANNOTATIONS

def validate_annotation_against_schema(annotation, file_type, all_schemas):
    """
    Validate individual annotation against its schema.
    Returns (is_valid, errors_list)
    """
    errors = []
    warnings = []
    
    # Get expected schema
    if file_type == 'OmicFile':
        expected_schema = get_full_schema('OmicFile', all_schemas)
    else:
        expected_schema = get_full_schema('ClinicalFile', all_schemas)
    
    # Check for required fields
    for attr_name, attr_def in expected_schema.items():
        if isinstance(attr_def, dict) and attr_def.get('required', False):
            if attr_name not in annotation:
                errors.append(f"Missing required field: {attr_name}")
            elif not annotation[attr_name] or annotation[attr_name] == '' or annotation[attr_name] == ['']:
                errors.append(f"Required field '{attr_name}' is empty")
    
    # Check multivalued field constraints
    for attr_name, value in annotation.items():
        if attr_name.startswith('_'):  # Skip metadata fields
            continue
            
        if attr_name in expected_schema:
            attr_def = expected_schema[attr_name]
            if isinstance(attr_def, dict):
                is_multivalued = attr_def.get('multivalued', False)
                
                if is_multivalued and not isinstance(value, list):
                    errors.append(f"Field '{attr_name}' should be a list (multivalued)")
                elif not is_multivalued and isinstance(value, list):
                    warnings.append(f"Field '{attr_name}' is a list but should be single value")
        else:
            warnings.append(f"Field '{attr_name}' not found in schema (may be deprecated)")
    
    # Check for completely empty annotations
    non_metadata_fields = {k: v for k, v in annotation.items() if not k.startswith('_')}
    filled_fields = {
        k: v for k, v in non_metadata_fields.items() 
        if v and v != '' and v != [''] and v != []
    }
    
    if len(filled_fields) == 0:
        warnings.append("No fields have been filled out yet")
    
    return len(errors) == 0, errors, warnings

def validate_all_annotations(annotations_data, all_schemas):
    """Validate all annotations and report issues."""
    validation_results = {}
    total_errors = 0
    total_warnings = 0
    valid_count = 0
    
    for syn_id, file_data in annotations_data.items():
        for file_name, annotation in file_data.items():
            file_type = annotation.get('_file_type', 'ClinicalFile')
            is_valid, errors, warnings = validate_annotation_against_schema(annotation, file_type, all_schemas)
            
            validation_results[syn_id] = {
                'file_name': file_name,
                'is_valid': is_valid,
                'errors': errors,
                'warnings': warnings,
                'file_type': file_type
            }
            
            total_errors += len(errors)
            total_warnings += len(warnings)
            
            if is_valid:
                valid_count += 1
                if len(warnings) > 0:
                    print(f"‚ö†Ô∏è  {file_name}: Valid with {len(warnings)} warnings")
                else:
                    print(f"‚úÖ {file_name}: Valid")
            else:
                print(f"‚ùå {file_name}: {len(errors)} validation errors")
                for error in errors[:3]:  # Show first 3 errors
                    print(f"   ‚Ä¢ {error}")
                if len(errors) > 3:
                    print(f"   ... and {len(errors) - 3} more errors")
    
    return validation_results, total_errors, total_warnings, valid_count


def preview_annotation_cleaning(annotations_data, sample_limit=3):
    """Preview what the cleaning will do to annotations."""
    from collections import defaultdict
    
    # Import the cleaning function for testing
    def is_meaningful_value(val):
        preserved_values = {'Unknown', 'N/A', 'unknown', 'n/a', 'NA', 'na'}
        if val in preserved_values:
            return True
        if isinstance(val, str):
            return val.strip() != ''
        return val is not None
    
    stats = {
        'total_fields': 0,
        'cleaned_fields': 0,
        'removed_fields': 0,
        'preserved_meaningful': 0
    }
    
    examples = []
    count = 0
    
    for syn_id, file_data in annotations_data.items():
        for file_name, annotation in file_data.items():
            original_count = 0
            cleaned_count = 0
            preserved_examples = []
            removed_examples = []
            
            for key, value in annotation.items():
                if not key.startswith('_'):
                    original_count += 1
                    stats['total_fields'] += 1
                    
                    # Simulate cleaning logic
                    will_keep = False
                    if isinstance(value, list):
                        meaningful_items = [v for v in value if is_meaningful_value(v)]
                        if meaningful_items:
                            will_keep = True
                            if any(item in {'Unknown', 'N/A'} for item in meaningful_items):
                                stats['preserved_meaningful'] += 1
                    elif is_meaningful_value(value):
                        will_keep = True
                        if value in {'Unknown', 'N/A'}:
                            stats['preserved_meaningful'] += 1
                    
                    if will_keep:
                        cleaned_count += 1
                        stats['cleaned_fields'] += 1
                        if len(preserved_examples) < 2:
                            preserved_examples.append(f'{key}: {str(value)[:30]}')
                    else:
                        stats['removed_fields'] += 1
                        if len(removed_examples) < 2:
                            removed_examples.append(f'{key}: {str(value)[:30]}')
            
            if count < sample_limit:
                examples.append({
                    'file_name': file_name,
                    'original': original_count,
                    'cleaned': cleaned_count,
                    'removed': original_count - cleaned_count,
                    'preserved_examples': preserved_examples,
                    'removed_examples': removed_examples
                })
                count += 1
    
    return stats, examples

# Execute validation
print("üîç ANNOTATION VALIDATION")
print("=" * 25)

if os.path.exists(annotation_file_path):
    # Load annotations for validation
    annotations_data = load_existing_annotations(annotation_file_path)
    
    # Preview cleaning effects
    cleaning_stats, cleaning_examples = preview_annotation_cleaning(annotations_data)
    print(f"üßπ Cleaning Preview:")
    print(f"   üìä Total fields: {cleaning_stats['total_fields']}")
    print(f"   ‚úÖ Will keep: {cleaning_stats['cleaned_fields']}")
    print(f"   üóëÔ∏è Will remove: {cleaning_stats['removed_fields']}")
    print(f"   üéØ Preserved meaningful: {cleaning_stats['preserved_meaningful']}")
    
    for example in cleaning_examples:
        print(f"üìÑ {example['file_name'][:50]}...")
        print(f"   Original: {example['original']} ‚Üí Cleaned: {example['cleaned']} ({example['removed']} removed)")
        if example['preserved_examples']:
            print(f"   ‚úÖ Keeping: {example['preserved_examples'][0]}")
        if example['removed_examples']:
            print(f"   üóëÔ∏è Removing: {example['removed_examples'][0]}")
    print(f"üìã Loaded {len(annotations_data)} annotation entries from: {annotation_file_path}")
    
    if all_schemas and annotations_data:
        # Run validation
        validation_results, total_errors, total_warnings, valid_count = validate_all_annotations(annotations_data, all_schemas)
        
        print(f"\nüìä Validation Summary:")
        print(f"   ‚úÖ Valid: {valid_count}")
        print(f"   ‚ùå Invalid: {len(validation_results) - valid_count}")
        print(f"   üî¢ Total errors: {total_errors}")
        print(f"   ‚ö†Ô∏è  Total warnings: {total_warnings}")
        
        if total_errors > 0:
            print(f"\nüõë Please fix validation errors before proceeding to annotation application")
            print(f"üìù Edit the annotation file: {annotation_file_path}")
            print(f"üîÑ Re-run this cell after making changes")
        else:
            print(f"\nüöÄ All annotations are valid! Ready to apply to Synapse entities")
            if total_warnings > 0:
                print(f"üìù Note: {total_warnings} warnings found (non-blocking)")
    else:
        print("‚ùå Cannot validate: missing schemas or annotation data")
        validation_results = {}
        total_errors = 1  # Block progression
else:
    print(f"‚ùå Annotation file not found: {annotation_file_path}")
    print("üìù Please run the previous cells to create annotations first")
    validation_results = {}
    total_errors = 1  # Block progression

üîç ANNOTATION VALIDATION
üßπ Cleaning Preview:
   üìä Total fields: 731
   ‚úÖ Will keep: 0
   üóëÔ∏è Will remove: 731
   üéØ Preserved meaningful: 0
üìÑ Answer Clinical Data...
   Original: 29 ‚Üí Cleaned: 0 (29 removed)
   üóëÔ∏è Removing: clinicalDomain: ['']
üìÑ 2024.T.15_Answer_ClinicalData_External.xlsx...
   Original: 29 ‚Üí Cleaned: 0 (29 removed)
   üóëÔ∏è Removing: clinicalDomain: ['']
üìÑ ICON...
   Original: 29 ‚Üí Cleaned: 0 (29 removed)
   üóëÔ∏è Removing: clinicalDomain: ['']
üìã Loaded 23 annotation entries from: ../annotations/trehalose_biomarker_data_annotations.json
  ClinicalFile inherits from BaseFile
  ClinicalFile uses mixin ClinicalFileMixin
‚ùå Answer Clinical Data: 5 validation errors
   ‚Ä¢ Required field 'title' is empty
   ‚Ä¢ Required field 'creator' is empty
   ‚Ä¢ Required field 'keywords' is empty
   ... and 2 more errors
  ClinicalFile inherits from BaseFile
  ClinicalFile uses mixin ClinicalFileMixin
‚ùå 2024.T.15_Answer_ClinicalData_Exte

In [13]:
  # CELL 6.5: ENSURE VARIABLES ARE SET FOR APPLICATION
  print("üîß PREPARING VARIABLES FOR APPLICATION")
  print("=" * 35)

  # Ensure all required variables exist for Cell 7
  if 'syn' not in locals() or syn is None:
      print("‚ö†Ô∏è syn variable missing - reconnecting...")
      syn = connect_to_synapse()

  if 'annotations_data' not in locals():
      print("‚ö†Ô∏è annotations_data missing - reloading...")
      if 'annotation_file_path' in locals() and os.path.exists(annotation_file_path):
          annotations_data = load_existing_annotations(annotation_file_path)
      else:
          annotations_data = {}

  if 'validation_results' not in locals():
      print("‚ö†Ô∏è validation_results missing - setting empty...")
      validation_results = {}

  if 'total_errors' not in locals():
      print("‚ö†Ô∏è total_errors missing - setting to 0...")
      total_errors = 0

  # Quick validation check
  if annotations_data and all_schemas:
      validation_results, total_errors, total_warnings, valid_count = validate_all_annotations(annotations_data, all_schemas)

  print(f"‚úÖ Variables ready for application")
  print(f"   syn: {'‚úÖ' if syn else '‚ùå'}")
  print(f"   annotations_data: {len(annotations_data) if annotations_data else 0} entries")
  print(f"   validation_results: {len(validation_results)} results")
  print(f"   total_errors: {total_errors}")

üîß PREPARING VARIABLES FOR APPLICATION
  ClinicalFile inherits from BaseFile
  ClinicalFile uses mixin ClinicalFileMixin
‚ùå Answer Clinical Data: 5 validation errors
   ‚Ä¢ Required field 'title' is empty
   ‚Ä¢ Required field 'creator' is empty
   ‚Ä¢ Required field 'keywords' is empty
   ... and 2 more errors
  ClinicalFile inherits from BaseFile
  ClinicalFile uses mixin ClinicalFileMixin
‚ùå 2024.T.15_Answer_ClinicalData_External.xlsx: 5 validation errors
   ‚Ä¢ Required field 'title' is empty
   ‚Ä¢ Required field 'creator' is empty
   ‚Ä¢ Required field 'keywords' is empty
   ... and 2 more errors
  ClinicalFile inherits from BaseFile
  ClinicalFile uses mixin ClinicalFileMixin
‚ùå ICON: 5 validation errors
   ‚Ä¢ Required field 'title' is empty
   ‚Ä¢ Required field 'creator' is empty
   ‚Ä¢ Required field 'keywords' is empty
   ... and 2 more errors
  ClinicalFile inherits from BaseFile
  ClinicalFile uses mixin ClinicalFileMixin
‚ùå 0869-0004-B_pT181_SAMPLE RESULTS_FINAL_09

In [8]:
# CELL 7: APPLY ANNOTATIONS TO SYNAPSE ENTITIES

def get_existing_synapse_annotations(syn, entity_id):
    """Get existing annotations from Synapse entity."""
    try:
        entity = syn.get(entity_id, downloadFile=False)
        return dict(entity.annotations) if hasattr(entity, 'annotations') and entity.annotations else {}
    except Exception as e:
        print(f"‚ùå Error getting annotations for {entity_id}: {e}")
        return {}

def clean_annotations_for_synapse(annotation):
    """Clean annotations by removing metadata fields and truly empty values.
    
    Preserves meaningful values like 'Unknown', 'N/A', but removes:
    - Empty strings ('')
    - Lists containing only empty strings (['''])
    - Empty lists ([])
    - None/null values
    """
    cleaned = {}
    
    # Values to preserve even if they might seem "empty"
    preserved_values = {'Unknown', 'N/A', 'unknown', 'n/a', 'NA', 'na'}
    
    def is_meaningful_value(val):
        """Check if a value is meaningful (not truly empty)."""
        if val in preserved_values:
            return True
        if isinstance(val, str):
            return val.strip() != ''
        return val is not None
    
    for key, value in annotation.items():
        # Skip metadata fields (starting with underscore)
        if key.startswith('_'):
            continue
        
        # Handle list values
        if isinstance(value, list):
            # Keep only meaningful values in the list
            cleaned_list = [v for v in value if is_meaningful_value(v)]
            if cleaned_list:  # Only include non-empty lists
                cleaned[key] = cleaned_list
            # Skip completely empty lists
        
        # Handle single values
        elif is_meaningful_value(value):
            cleaned[key] = value
        
        # Skip truly empty values (None, '', etc.)
    
    return cleaned

def apply_annotations_to_entity(syn, entity_id, new_annotations, dry_run=False):
    """Apply annotations to Synapse entity."""
    try:
        if dry_run:
            print(f"üîç DRY RUN: Would apply {len(new_annotations)} annotations to {entity_id}")
            return True
        
        entity = syn.get(entity_id, downloadFile=False)
        
        # Clean annotations (remove metadata fields and empty values)
        clean_annotations = clean_annotations_for_synapse(new_annotations)
        
        if not clean_annotations:
            print(f"‚ö†Ô∏è  No valid annotations to apply (all fields empty)")
            return True  # Not an error, just nothing to do
        
        entity.annotations = clean_annotations
        syn.store(entity, forceVersion=False)
        return True
        
    except Exception as e:
        print(f"‚ùå Failed to apply annotations to {entity_id}: {e}")
        return False

def apply_all_annotations(syn, annotations_data, validation_results, dry_run=False):
    """Apply annotations to all validated entities."""
    success_count = 0
    failed_count = 0
    skipped_count = 0
    
    for syn_id, file_data in annotations_data.items():
        for file_name, annotation in file_data.items():
            # Only apply if validation passed
            validation_result = validation_results.get(syn_id, {})
            if not validation_result.get('is_valid', False):
                print(f"‚è≠Ô∏è  Skipping {file_name} (validation failed)")
                skipped_count += 1
                continue
            
            print(f"üîÑ Applying annotations to {file_name} ({syn_id})")
            
            # Show what will be applied
            clean_annotations = clean_annotations_for_synapse(annotation)
            print(f"   üìã {len(clean_annotations)} non-empty fields to apply")
            
            success = apply_annotations_to_entity(syn, syn_id, annotation, dry_run)
            
            if success:
                success_count += 1
                print(f"   ‚úÖ Success")
            else:
                failed_count += 1
                print(f"   ‚ùå Failed")
    
    return success_count, failed_count, skipped_count

# Execute annotation application (only if validation passed)
print("üöÄ ANNOTATION APPLICATION TO SYNAPSE")
print("=" * 35)

# Re-establish required variables in this cell's scope
try:
    # Re-connect to Synapse if needed
    if 'syn' not in globals() or syn is None:
        syn = connect_to_synapse()
    
    # Re-load annotations if needed
    if 'annotations_data' not in globals() or not annotations_data:
        if 'annotation_file_path' in globals() and os.path.exists(annotation_file_path):
            annotations_data = load_existing_annotations(annotation_file_path)
        else:
            annotations_data = {}
    
    # Re-run validation if needed
    if 'validation_results' not in globals() or not validation_results:
        if annotations_data and 'all_schemas' in globals() and all_schemas:
            validation_results, total_errors, total_warnings, valid_count = validate_all_annotations(annotations_data, all_schemas)
        else:
            validation_results = {}
            total_errors = 1
    
    # Ensure total_errors exists
    if 'total_errors' not in globals():
        total_errors = 0
        
except NameError as e:
    print(f"‚ùå Missing dependencies: {e}")
    print("üìù Please run all previous cells first")
    syn = None
    annotations_data = {}
    validation_results = {}
    total_errors = 1

# Check if we have the required data
if not syn:
    print("‚ùå No Synapse connection available")
    print("üìù Please run Cell 3 (file enumeration) first")
elif not annotations_data:
    print("‚ùå No annotation data available") 
    print("üìù Please run Cell 5 (annotation management) first")
elif total_errors > 0:
    print(f"üõë Skipping annotation application due to {total_errors} validation errors")
    print(f"üìù Please fix errors and re-run Cell 6 (validation)")
else:
    print(f"üîç Ready to apply annotations to {len(annotations_data)} entities")
    print(f"üìù Dry run mode: {DRY_RUN}")
    
    # Apply annotations
    success_count, failed_count, skipped_count = apply_all_annotations(
        syn, annotations_data, validation_results, dry_run=DRY_RUN
    )
    
    print(f"\nüìä Application Results:")
    print(f"   ‚úÖ Success: {success_count}")
    print(f"   ‚ùå Failed: {failed_count}")
    print(f"   ‚è≠Ô∏è  Skipped: {skipped_count}")
    print(f"   üìã Total processed: {success_count + failed_count + skipped_count}")
    
    if DRY_RUN:
        print(f"\nüîç This was a DRY RUN - no actual changes made")
        print(f"üí° Set DRY_RUN = False in Cell 2 to apply changes")
    elif success_count > 0:
        print(f"\nüéâ Successfully applied annotations to {success_count} entities!")
        print(f"üîó Check your entities in Synapse to see the applied annotations")
    
    if failed_count > 0:
        print(f"\n‚ö†Ô∏è  {failed_count} entities failed to update - check error messages above")

üöÄ ANNOTATION APPLICATION TO SYNAPSE
üîç Ready to apply annotations to 23 entities
üìù Dry run mode: False
üîÑ Applying annotations to Answer Clinical Data (syn68927979)
   üìã 5 non-empty fields to apply
   ‚úÖ Success
üîÑ Applying annotations to 2024.T.15_Answer_ClinicalData_External.xlsx (syn68929229)
   üìã 5 non-empty fields to apply
   ‚úÖ Success
üîÑ Applying annotations to ICON (syn68927980)
   üìã 6 non-empty fields to apply
   ‚úÖ Success
üîÑ Applying annotations to 0869-0004-B_pT181_SAMPLE RESULTS_FINAL_09May2025.xlsx (syn68929235)
   üìã 6 non-empty fields to apply
   ‚úÖ Success
üîÑ Applying annotations to 0869-0004-C_Total Tau_SAMPLE RESULTS_FINAL_24Apr2025 (1).xlsx (syn68929236)
   üìã 6 non-empty fields to apply
   ‚úÖ Success
üîÑ Applying annotations to 0869-0004-D_miR-206_SAMPLE RESULTS_FINAL_10JUL2025.xlsx (syn68929234)
   üìã 6 non-empty fields to apply
   ‚úÖ Success
üîÑ Applying annotations to Total File_0869-0004-A.xlsx (syn68929233)
   üìã 6 no

In [10]:
# CELL 8: CREATE ENTITY VIEW FOR STAGING FOLDER WITH MINIMAL ANNOTATION COLUMNS

from synapseclient.models import EntityView, ViewTypeMask, Column, ColumnType

def extract_minimal_annotation_columns(annotations_data):
    """Extract only the most essential annotation columns that have data."""
    
    print("üîç Extracting minimal annotation columns with data...")
    
    # Manually specify the most important columns that we know have data
    # Based on the annotations file, these are the fields with actual values
    essential_columns = [
        'clinicalDomain',  # Has values like ['subject_management', 'disease_progression'] 
        'studyPhase',      # Has values like 'longitudinal'
        'dataType',
               # Has values like 'clinical' or 'omics'
    ]
    
    columns = []
    
    # Check which essential columns actually have data
    columns_with_data = set()
    for syn_id, file_data in annotations_data.items():
        for file_name, annotation in file_data.items():
            for col in essential_columns:
                if col in annotation:
                    value = annotation[col]
                    has_data = False
                    
                    if isinstance(value, list):
                        # List field - check if has non-empty values
                        meaningful_values = [v for v in value if v and v.strip() != '']
                        has_data = len(meaningful_values) > 0
                    elif isinstance(value, str):
                        # String field - check if non-empty
                        has_data = value.strip() != ''
                    
                    if has_data:
                        columns_with_data.add(col)
    
    print(f"   üìã Found {len(columns_with_data)} essential columns with data")
    
    # Create columns for the ones that have data
    for col in essential_columns:
        if col in columns_with_data:
            # Determine if it's a list type by checking the data
            is_list = False
            for syn_id, file_data in annotations_data.items():
                for file_name, annotation in file_data.items():
                    if col in annotation and isinstance(annotation[col], list):
                        # Check if it's a meaningful list (not just [''])
                        meaningful_values = [v for v in annotation[col] if v and v.strip() != '']
                        if len(meaningful_values) > 0:
                            is_list = True
                            break
                if is_list:
                    break
            
            column_type = ColumnType.STRING_LIST if is_list else ColumnType.STRING
            columns.append(Column(name=col, column_type=column_type))
            print(f"   ‚ûï {col}: {column_type.value}")
    
    print(f"‚úÖ Created {len(columns)} essential annotation columns")
    return columns

def get_or_create_minimal_entity_view(syn, project_id, staging_folder_id, annotations_data, view_name=None):
    """Create a minimal Entity View for the staging folder with only essential annotation columns."""
    
    if view_name is None:
        # Get folder name for view naming
        try:
            folder = syn.get(staging_folder_id, downloadFile=False)
            folder_name = folder.name.replace(' ', '_').replace('-', '_')
            view_name = f"{folder_name}_Entity_View"
        except Exception as e:
            print(f"‚ö†Ô∏è Could not get folder name: {e}")
            view_name = "Staging_Folder_Entity_View"
    
    print(f"üîç Creating minimal Entity View: {view_name}")
    print(f"   üìÅ Scope: {staging_folder_id}")
    print(f"   üèóÔ∏è Project: {project_id}")
    
    try:
        # Check if view already exists and delete it
        existing_views = syn.getChildren(project_id, includeTypes=['entityview'])
        for view_info in existing_views:
            if view_info['name'] == view_name:
                print(f"‚ö†Ô∏è Found existing Entity View: {view_name} ({view_info['id']})")
                print(f"   üóëÔ∏è Deleting existing view to recreate...")
                syn.delete(view_info['id'])
                break
        
        # Extract minimal annotation columns
        annotation_columns = extract_minimal_annotation_columns(annotations_data)
        
        # Create name column first, then annotation columns
        all_columns = []
        
        # Add name column first to ensure it's the first column
        all_columns.append(Column(name="name", column_type=ColumnType.STRING))
        
        # Add annotation columns after name
        all_columns.extend(annotation_columns)
        
        # Create new Entity View with minimal columns
        entity_view = EntityView(
            name=view_name,
            parent_id=project_id,
            scope_ids=[staging_folder_id],
            view_type_mask=ViewTypeMask.FILE | ViewTypeMask.FOLDER,
            columns=all_columns  # Name column first, then annotation columns
        )
        
        # Create the view in Synapse
        created_view = entity_view.store()
        view_id = created_view.id
        
        print(f"‚úÖ Created Entity View successfully!")
        print(f"   üìã View ID: {view_id}")
        print(f"   üè∑Ô∏è Total columns: {len(all_columns)} (name + {len(annotation_columns)} annotations)")
        print(f"   üîó View URL: https://www.synapse.org/#!Synapse:{view_id}")
        
        return view_id
        
    except Exception as e:
        print(f"‚ùå Error creating Entity View: {e}")
        import traceback
        traceback.print_exc()
        return None

# Execute minimal Entity View creation
print("üìä MINIMAL ENTITY VIEW CREATION")
print("=" * 35)

# Re-establish variables if needed
try:
    if 'syn' not in globals() or syn is None:
        print("‚ö†Ô∏è Reconnecting to Synapse...")
        syn = connect_to_synapse()
    
    if 'PROJECT_ID' not in globals():
        PROJECT_ID = "syn68702804"
    
    if 'STAGING_FOLDER_ID' not in globals():
        STAGING_FOLDER_ID = "syn68927891"
    
    # Load annotations data if not available
    if 'annotations_data' not in globals() or not annotations_data:
        if 'annotation_file_path' in globals() and os.path.exists(annotation_file_path):
            print("üìÇ Loading annotations data...")
            annotations_data = load_existing_annotations(annotation_file_path)
        else:
            annotations_data = {}
        
except NameError as e:
    print(f"‚ùå Missing configuration: {e}")
    print("üìù Please run previous cells first")

# Create minimal Entity View
if syn and PROJECT_ID and STAGING_FOLDER_ID and annotations_data:
    view_id = get_or_create_minimal_entity_view(
        syn, PROJECT_ID, STAGING_FOLDER_ID, annotations_data
    )
    
    if view_id:
        print(f"\n‚úÖ Entity View created successfully!")
        print(f"   üîó View in Synapse web: https://www.synapse.org/#!Synapse:{view_id}")
        print(f"   üìä Query programmatically: syn.query('SELECT * FROM {view_id}')")
        
        # Store view_id for potential use in other cells
        ENTITY_VIEW_ID = view_id
        print(f"\n‚úÖ Entity View ID stored in variable: ENTITY_VIEW_ID = '{view_id}'")
    else:
        print("‚ùå Could not create Entity View")
else:
    missing = []
    if not syn: missing.append("syn")
    if not PROJECT_ID: missing.append("PROJECT_ID") 
    if not STAGING_FOLDER_ID: missing.append("STAGING_FOLDER_ID")
    if not annotations_data: missing.append("annotations_data")
    
    print(f"‚ùå Missing required variables: {', '.join(missing)}")
    print("üìù Please run previous cells to set up these variables")

/entity/syn71975076/table/transaction/async:   0%|          | 0.00/1.00 [00:10<?, ?it/s]


üìä MINIMAL ENTITY VIEW CREATION
üîç Creating minimal Entity View: Trehalose_Biomarker_Data_Entity_View
   üìÅ Scope: syn68927891
   üèóÔ∏è Project: syn68702804
‚ö†Ô∏è Found existing Entity View: Trehalose_Biomarker_Data_Entity_View (syn71975076)
   üóëÔ∏è Deleting existing view to recreate...
üîç Extracting minimal annotation columns with data...
   üìã Found 3 essential columns with data
   ‚ûï clinicalDomain: STRING_LIST
   ‚ûï studyPhase: STRING
   ‚ûï dataType: STRING_LIST
‚úÖ Created 3 essential annotation columns




[syn71975077:Trehalose_Biomarker_Data_Entity_View:Column_name (Add)]: Column(id='81722', name='name', column_type=STRING, facet_type=None, default_value=None, maximum_size=256, maximum_list_length=None, enum_values=None, json_sub_columns=None)
[syn71975077:Trehalose_Biomarker_Data_Entity_View:Column_clinicalDomain (Add)]: Column(id=None, name='clinicalDomain', column_type=STRING_LIST, facet_type=None, default_value=None, maximum_size=None, maximum_list_length=None, enum_values=None, json_sub_columns=None)
[syn71975077:Trehalose_Biomarker_Data_Entity_View:Column_studyPhase (Add)]: Column(id=None, name='studyPhase', column_type=STRING, facet_type=None, default_value=None, maximum_size=None, maximum_list_length=None, enum_values=None, json_sub_columns=None)
[syn71975077:Trehalose_Biomarker_Data_Entity_View:Column_dataType (Add)]: Column(id=None, name='dataType', column_type=STRING_LIST, facet_type=None, default_value=None, maximum_size=None, maximum_list_length=None, enum_values=None, jso

/entity/syn71975077/table/transaction/async: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.00/1.00 [00:01<00:00, 1.19s/it]

‚úÖ Created Entity View successfully!
   üìã View ID: syn71975077
   üè∑Ô∏è Total columns: 4 (name + 3 annotations)
   üîó View URL: https://www.synapse.org/#!Synapse:syn71975077

‚úÖ Entity View created successfully!
   üîó View in Synapse web: https://www.synapse.org/#!Synapse:syn71975077
   üìä Query programmatically: syn.query('SELECT * FROM syn71975077')

‚úÖ Entity View ID stored in variable: ENTITY_VIEW_ID = 'syn71975077'





In [14]:
# CELL 9: DATASET ANNOTATION MANAGEMENT

from synapseclient.models import Dataset

def create_dataset_annotation_template(dataset_type, all_schemas):
    """Create blank dataset annotation template based on dataset type."""
    
    # Choose appropriate dataset schema
    if dataset_type.lower() == 'omic':
        schema_attributes = get_full_schema('OmicDataset', all_schemas)
        dataset_class = 'OmicDataset'
    else:
        schema_attributes = get_full_schema('ClinicalDataset', all_schemas)  
        dataset_class = 'ClinicalDataset'
    
    # Build template from schema
    template = {}
    for attr_name, attr_def in schema_attributes.items():
        # Handle multivalued attributes
        if isinstance(attr_def, dict) and attr_def.get('multivalued', False):
            template[attr_name] = ['']
        else:
            template[attr_name] = ''
    
    # Add metadata about detected dataset type
    template['_dataset_type'] = dataset_class
    template['_schema_source'] = 'data-model'
    template['_created_timestamp'] = datetime.now().isoformat()
    
    return template

def create_dataset_annotations_file_path(staging_folder_name):
    """Create path: ../annotations/{folder_name}_dataset_annotations.json"""
    clean_name = staging_folder_name.lower().replace(' ', '_').replace('-', '_')
    clean_name = re.sub(r'[^a-z0-9_]', '', clean_name)  # Remove special chars
    return f"{ANNOTATIONS_DIR}/{clean_name}_dataset_annotations.json"

def smart_merge_dataset_annotations(existing_annotations, new_template):
    """
    Smart merge that:
    1. Keeps existing filled annotations intact
    2. Only adds new attributes if they don't exist
    3. Does not overwrite any existing filled values
    """
    if not existing_annotations:
        return new_template
    
    merged = existing_annotations.copy()
    added_count = 0
    
    for key, value in new_template.items():
        if key not in merged:
            # New field - add it
            merged[key] = value
            added_count += 1
        else:
            # Field exists - only update if current value is empty
            current_value = merged[key]
            is_empty = False
            
            if isinstance(current_value, list):
                is_empty = len([v for v in current_value if v and v.strip() != '']) == 0
            elif isinstance(current_value, str):
                is_empty = current_value.strip() == ''
            else:
                is_empty = current_value is None
            
            if is_empty and value:
                merged[key] = value
                added_count += 1
    
    print(f"üìä Dataset annotation merge: {added_count} new/updated fields")
    return merged

def validate_dataset_annotation(annotation, all_schemas):
    """Validate dataset annotation against its schema."""
    errors = []
    warnings = []
    
    dataset_type = annotation.get('_dataset_type', 'ClinicalDataset')
    
    # Get expected schema
    if dataset_type == 'OmicDataset':
        expected_schema = get_full_schema('OmicDataset', all_schemas)
    else:
        expected_schema = get_full_schema('ClinicalDataset', all_schemas)
    
    # Check for required fields
    for attr_name, attr_def in expected_schema.items():
        if isinstance(attr_def, dict) and attr_def.get('required', False):
            if attr_name not in annotation:
                errors.append(f"Missing required field: {attr_name}")
            elif not annotation[attr_name] or annotation[attr_name] == '' or annotation[attr_name] == ['']:
                errors.append(f"Required field '{attr_name}' is empty")
    
    # Check multivalued field constraints
    for attr_name, value in annotation.items():
        if attr_name.startswith('_'):  # Skip metadata fields
            continue
            
        if attr_name in expected_schema:
            attr_def = expected_schema[attr_name]
            if isinstance(attr_def, dict):
                is_multivalued = attr_def.get('multivalued', False)
                
                if is_multivalued and not isinstance(value, list):
                    errors.append(f"Field '{attr_name}' should be a list (multivalued)")
                elif not is_multivalued and isinstance(value, list):
                    warnings.append(f"Field '{attr_name}' is a list but should be single value")
        else:
            warnings.append(f"Field '{attr_name}' not found in schema (may be deprecated)")
    
    # Check for completely empty annotations
    non_metadata_fields = {k: v for k, v in annotation.items() if not k.startswith('_')}
    filled_fields = {
        k: v for k, v in non_metadata_fields.items() 
        if v and v != '' and v != [''] and v != []
    }
    
    if len(filled_fields) < 3:  # At least title, description, dataType should be filled
        warnings.append("Very few fields have been filled out - consider adding more metadata")
    
    return len(errors) == 0, errors, warnings

def create_dataset_entity(syn, project_id, dataset_name, dataset_annotations):
    """Create a Dataset entity in Synapse with annotations."""
    try:
        # Clean annotations for Synapse (remove metadata fields)
        clean_annotations = {}
        for key, value in dataset_annotations.items():
            if not key.startswith('_'):  # Skip metadata fields
                # Only include non-empty values
                if isinstance(value, list):
                    cleaned_list = [v for v in value if v and v.strip() != '']
                    if cleaned_list:
                        clean_annotations[key] = cleaned_list
                elif isinstance(value, str) and value.strip() != '':
                    clean_annotations[key] = value
                elif value is not None:
                    clean_annotations[key] = value
        
        # Create Dataset entity
        dataset = Dataset(
            name=dataset_name,
            parent_id=project_id,
            dataset_items=[],  # Start empty, files can be added later
            annotations=clean_annotations
        )
        
        # Store the dataset
        created_dataset = dataset.store()
        
        print(f"‚úÖ Created Dataset entity successfully!")
        print(f"   üìã Dataset ID: {created_dataset.id}")
        print(f"   üìõ Dataset Name: {dataset_name}")
        print(f"   üè∑Ô∏è Annotations Applied: {len(clean_annotations)}")
        print(f"   üîó Dataset URL: https://www.synapse.org/#!Synapse:{created_dataset.id}")
        
        return created_dataset.id
        
    except Exception as e:
        print(f"‚ùå Error creating Dataset entity: {e}")
        import traceback
        traceback.print_exc()
        return None

# Execute dataset annotation management
print("üìä DATASET ANNOTATION MANAGEMENT")
print("=" * 30)

# Re-establish variables if needed
try:
    if 'staging_folder_name' not in globals():
        staging_folder_name = "trehalose_biomarker_data"  # fallback
    
    if 'all_schemas' not in globals() or not all_schemas:
        print("üîç Loading schemas...")
        all_schemas = get_all_schemas()
        
except NameError as e:
    print(f"‚ùå Missing dependencies: {e}")
    print("üìù Please run previous cells first")

if all_schemas:
    # Create dataset annotations file path
    dataset_annotation_file_path = create_dataset_annotations_file_path(staging_folder_name)
    print(f"üìÑ Dataset annotation file: {dataset_annotation_file_path}")
    
    # Load existing dataset annotations if they exist
    existing_dataset_annotations = load_existing_annotations(dataset_annotation_file_path)
    
    if existing_dataset_annotations:
        print(f"üìÇ Found existing dataset annotations with {len(existing_dataset_annotations)} fields")
        print(f"   üìù Edit file to update values: {dataset_annotation_file_path}")
        
        # Detect dataset type from existing annotations
        dataset_type = existing_dataset_annotations.get('_dataset_type', 'ClinicalDataset')
        if 'Omic' in dataset_type:
            template_type = 'omic'
        else:
            template_type = 'clinical'
            
        # Create template and merge with existing
        new_template = create_dataset_annotation_template(template_type, all_schemas)
        updated_dataset_annotations = smart_merge_dataset_annotations(existing_dataset_annotations, new_template)
        
    else:
        print("üìã No existing dataset annotations found")
        print("üîç Detecting dataset type from staging folder content...")
        
        # Try to detect dataset type from file annotations
        dataset_type = 'clinical'  # default
        if 'annotations_data' in globals() and annotations_data:
            # Check file types to infer dataset type
            omic_files = 0
            clinical_files = 0
            
            for syn_id, file_data in annotations_data.items():
                for file_name, annotation in file_data.items():
                    file_type = annotation.get('_file_type', '')
                    if 'Omic' in file_type:
                        omic_files += 1
                    else:
                        clinical_files += 1
            
            if omic_files > clinical_files:
                dataset_type = 'omic'
                print(f"   üìä Detected: Omic dataset ({omic_files} omic files vs {clinical_files} clinical files)")
            else:
                dataset_type = 'clinical'
                print(f"   üìä Detected: Clinical dataset ({clinical_files} clinical files vs {omic_files} omic files)")
        
        # Create new template
        updated_dataset_annotations = create_dataset_annotation_template(dataset_type, all_schemas)
        print(f"üìù Created new {dataset_type} dataset annotation template")
    
    # Save updated annotations
    save_annotations(updated_dataset_annotations, dataset_annotation_file_path)
    print(f"üíæ Dataset annotations saved to: {dataset_annotation_file_path}")
    
    # Validate the dataset annotations
    print(f"\nüîç Validating dataset annotations...")
    is_valid, errors, warnings = validate_dataset_annotation(updated_dataset_annotations, all_schemas)
    
    if errors:
        print(f"‚ùå Dataset annotation validation failed:")
        for error in errors:
            print(f"   ‚Ä¢ {error}")
        print(f"\nüìù Please fill out required fields in: {dataset_annotation_file_path}")
        print(f"üîÑ Re-run this cell after making changes")
    else:
        print(f"‚úÖ Dataset annotation validation passed!")
        if warnings:
            print(f"‚ö†Ô∏è  {len(warnings)} warnings:")
            for warning in warnings[:3]:  # Show first 3
                print(f"   ‚Ä¢ {warning}")
        
        # Check if dataset name is filled out for entity creation
        dataset_name = updated_dataset_annotations.get('title', '').strip()
        if not dataset_name:
            print(f"\nüìù To create Dataset entity, please add a 'title' field to the annotations file")
        else:
            print(f"\nüéØ Dataset is ready for entity creation!")
            print(f"   üìõ Dataset Name: {dataset_name}")
            print(f"   üîß To create Dataset entity, run a separate cell with:")
            print(f"      create_dataset_entity(syn, PROJECT_ID, '{dataset_name}', updated_dataset_annotations)")

else:
    print("‚ùå Cannot create dataset annotations: missing schemas")
    print("üìù Please run Cell 4 (schema loading) first")

üìä DATASET ANNOTATION MANAGEMENT
üìÑ Dataset annotation file: ../annotations/trehalose_biomarker_data_dataset_annotations.json
üìã No existing dataset annotations found
üîç Detecting dataset type from staging folder content...
   üìä Detected: Omic dataset (16 omic files vs 7 clinical files)
  OmicDataset inherits from BaseDataset
üìù Created new omic dataset annotation template
üíæ Dataset annotations saved to: ../annotations/trehalose_biomarker_data_dataset_annotations.json

üîç Validating dataset annotations...
  OmicDataset inherits from BaseDataset
‚ùå Dataset annotation validation failed:
   ‚Ä¢ Required field 'title' is empty
   ‚Ä¢ Required field 'creator' is empty
   ‚Ä¢ Required field 'keywords' is empty
   ‚Ä¢ Required field 'source' is empty
   ‚Ä¢ Required field 'url' is empty

üìù Please fill out required fields in: ../annotations/trehalose_biomarker_data_dataset_annotations.json
üîÑ Re-run this cell after making changes


In [None]:
# CELL 10: CREATE DATASET ENTITY

def create_dataset_entity_with_validation(syn, project_id):
    """Create Dataset entity after validating annotations and getting user input."""
    
    # Re-establish variables if needed
    if 'dataset_annotation_file_path' not in globals():
        if 'staging_folder_name' in globals():
            dataset_annotation_file_path = create_dataset_annotations_file_path(staging_folder_name)
        else:
            print("‚ùå Missing staging folder name - please run previous cells")
            return None
    
    # Load dataset annotations
    try:
        dataset_annotations = load_existing_annotations(dataset_annotation_file_path)
        if not dataset_annotations:
            print(f"‚ùå No dataset annotations found at: {dataset_annotation_file_path}")
            print("üìù Please run Cell 9 (Dataset Annotation Management) first")
            return None
    except Exception as e:
        print(f"‚ùå Error loading dataset annotations: {e}")
        return None
    
    # Validate dataset annotations
    print("üîç Validating dataset annotations before entity creation...")
    if 'all_schemas' not in globals():
        print("üîç Loading schemas...")
        all_schemas = get_all_schemas()
    
    is_valid, errors, warnings = validate_dataset_annotation(dataset_annotations, all_schemas)
    
    if errors:
        print(f"‚ùå Dataset annotation validation failed:")
        for error in errors:
            print(f"   ‚Ä¢ {error}")
        print(f"\nüìù Please fix errors in: {dataset_annotation_file_path}")
        return None
    
    print("‚úÖ Dataset annotations are valid!")
    if warnings:
        print(f"‚ö†Ô∏è  {len(warnings)} warnings (non-blocking)")
    
    # Get dataset name from annotations
    dataset_name = dataset_annotations.get('title', '').strip()
    if not dataset_name:
        print("‚ùå Dataset 'title' field is required for entity creation")
        print(f"üìù Please add a title in: {dataset_annotation_file_path}")
        return None
    
    # Check if dataset already exists
    print(f"\nüîç Checking for existing datasets with name: '{dataset_name}'")
    try:
        existing_datasets = syn.getChildren(project_id, includeTypes=['dataset'])
        for dataset_info in existing_datasets:
            if dataset_info['name'] == dataset_name:
                print(f"‚ö†Ô∏è  Dataset already exists: {dataset_name} ({dataset_info['id']})")
                print(f"   üîó URL: https://www.synapse.org/#!Synapse:{dataset_info['id']}")
                
                # Ask user what to do
                print(f"\n‚ùì What would you like to do?")
                print(f"   1. Skip creation (use existing dataset)")
                print(f"   2. Update existing dataset annotations")
                print(f"   3. Create new dataset with different name")
                
                # For notebook use, we'll skip creation and show the existing dataset
                print(f"üìù Skipping creation - existing dataset found")
                return dataset_info['id']
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not check for existing datasets: {e}")
    
    # Create new Dataset entity
    print(f"\nüîÑ Creating Dataset entity: '{dataset_name}'")
    dataset_id = create_dataset_entity(syn, project_id, dataset_name, dataset_annotations)
    
    if dataset_id:
        # Store dataset info for future use
        dataset_info = {
            'id': dataset_id,
            'name': dataset_name,
            'annotation_file': dataset_annotation_file_path,
            'created_timestamp': datetime.now().isoformat()
        }
        
        # Save dataset info to a file for tracking
        dataset_info_file = f"{ANNOTATIONS_DIR}/dataset_info.json"
        try:
            with open(dataset_info_file, 'w') as f:
                json.dump(dataset_info, f, indent=2)
            print(f"üíæ Dataset info saved to: {dataset_info_file}")
        except Exception as e:
            print(f"‚ö†Ô∏è  Could not save dataset info: {e}")
        
        return dataset_id
    else:
        return None

# Execute Dataset entity creation
print("üèóÔ∏è  DATASET ENTITY CREATION")
print("=" * 25)

# Re-establish variables if needed
try:
    if 'syn' not in globals() or syn is None:
        print("‚ö†Ô∏è Reconnecting to Synapse...")
        syn = connect_to_synapse()
    
    if 'PROJECT_ID' not in globals():
        PROJECT_ID = "syn68702804"
        
except NameError as e:
    print(f"‚ùå Missing configuration: {e}")
    print("üìù Please run previous cells first")

# Create Dataset entity
if syn and PROJECT_ID:
    dataset_id = create_dataset_entity_with_validation(syn, PROJECT_ID)
    
    if dataset_id:
        print(f"\nüéâ Dataset entity ready!")
        print(f"   üìã Dataset ID: {dataset_id}")
        print(f"   üîó View Dataset: https://www.synapse.org/#!Synapse:{dataset_id}")
        
        # Store for use in other cells
        DATASET_ID = dataset_id
        print(f"\n‚úÖ Dataset ID stored in variable: DATASET_ID = '{dataset_id}'")
        
        print(f"\nüí° Next steps:")
        print(f"   üìÅ Add files to dataset using dataset.dataset_items")
        print(f"   üè∑Ô∏è  Update annotations as needed")
        print(f"   üìä Create additional Entity Views if needed")
    else:
        print("‚ùå Dataset entity creation failed")
else:
    print("‚ùå Missing required variables: syn, PROJECT_ID")
    print("üìù Please run previous cells to set up these variables")

In [None]:
# CELL 13: ADD FILES TO DATASET ENTITY

from synapseclient.models import Folder

def add_staging_folder_to_dataset(syn, dataset_id, staging_folder_id):
    """Add all files from staging folder to the dataset entity."""
    try:
        print(f"üìÇ Adding staging folder contents to dataset...")
        print(f"   üìã Dataset ID: {dataset_id}")
        print(f"   üìÅ Staging Folder ID: {staging_folder_id}")
        
        # Get the dataset entity
        dataset=Dataset(id=dataset_id).get() 
        
        # Get initial item count
        initial_count = len(dataset.dataset_items) if hasattr(dataset, 'dataset_items') else 0
        print(f"   üìä Current dataset items: {initial_count}")
        
        # Add the entire staging folder (this recursively adds all child files)
        print(f"\nüîÑ Adding folder to dataset...")
        dataset.add_item(Folder(id=staging_folder_id))
        
        # Store the changes to Synapse
        print(f"üíæ Saving changes to Synapse...")
        updated_dataset = dataset.store()
        
        # Get updated item count
        final_count = len(updated_dataset.dataset_items) if hasattr(updated_dataset, 'dataset_items') else 0
        added_count = final_count - initial_count
        
        print(f"\n‚úÖ Successfully added staging folder to dataset!")
        print(f"   üìä Total dataset items: {final_count}")
        print(f"   ‚ûï Files added: {added_count}")
        print(f"   üîó Dataset URL: https://www.synapse.org/#!Synapse:{dataset_id}")
        
        # List some of the items
        if final_count > 0:
            print(f"\nüìã Dataset items preview:")
            for idx, item in enumerate(updated_dataset.dataset_items[:5]):
                entity_id = item.entity_id if hasattr(item, 'entity_id') else 'unknown'
                version = item.version_number if hasattr(item, 'version_number') else 'latest'
                print(f"   {idx+1}. {entity_id} (v{version})")
            
            if final_count > 5:
                print(f"   ... and {final_count - 5} more items")
        
        return updated_dataset
        
    except Exception as e:
        print(f"‚ùå Error adding staging folder to dataset: {e}")
        import traceback
        traceback.print_exc()
        return None

def verify_dataset_contents(syn, dataset_id):
    """Verify the dataset contents and annotations."""
    try:
        print(f"\nüîç DATASET VERIFICATION")
        print("=" * 35)
        
        # Get the dataset
        dataset=Dataset(id=dataset_id).get()
        
        print(f"üìã Dataset: {dataset.name}")
        print(f"   ID: {dataset.id}")
        print(f"   Parent: {dataset.parent_id}")
        
        # Check items
        item_count = len(dataset.dataset_items) if hasattr(dataset, 'dataset_items') else 0
        print(f"\nüìä Dataset Items: {item_count}")
        
        # Check annotations
        annotations = dataset.annotations if hasattr(dataset, 'annotations') else {}
        print(f"üè∑Ô∏è  Annotations: {len(annotations)}")
        
        if annotations:
            print(f"\n   Key annotations:")
            # Show some important annotations
            important_keys = ['title', 'description', 'dataType', 'studyPhase', 'disease', 'dataFormat']
            for key in important_keys:
                if key in annotations:
                    value = annotations[key]
                    if isinstance(value, list):
                        display_value = ', '.join([str(v) for v in value[:3]])
                        if len(value) > 3:
                            display_value += f" (+{len(value)-3} more)"
                    else:
                        display_value = str(value)[:60]
                    print(f"   ‚Ä¢ {key}: {display_value}")
        
        print(f"\n‚úÖ Dataset verification complete!")
        return True
        
    except Exception as e:
        print(f"‚ùå Error verifying dataset: {e}")
        return False

# Execute: Add files to dataset
print("üèóÔ∏è  ADD FILES TO DATASET")
print("=" * 30)

# Re-establish variables if needed
try:
    if 'syn' not in globals() or syn is None:
        print("‚ö†Ô∏è Reconnecting to Synapse...")
        syn = connect_to_synapse()
    
    if 'STAGING_FOLDER_ID' not in globals():
        STAGING_FOLDER_ID = "syn68927891"  # fallback
    
    # Check if we have a dataset_id from the previous cell
    if 'dataset_id' not in globals() or dataset_id is None:
        print("‚ùå No dataset_id found from previous cell")
        print("üìù Please run Cell 12 (Dataset Entity Creation) first")
    else:
        print(f"üìã Using dataset from previous cell: {dataset_id}")
        
        # Add staging folder to dataset
        updated_dataset = add_staging_folder_to_dataset(
            syn, 
            dataset_id, 
            STAGING_FOLDER_ID
        )
        
        if updated_dataset:
            # Verify the dataset
            verify_dataset_contents(syn, dataset_id)
        else:
            print("‚ùå Could not add files to dataset")
            
except NameError as e:
    print(f"‚ùå Missing configuration: {e}")
    print("üìù Please run previous cells first")
except Exception as e:
    print(f"‚ùå Error: {e}")
    import traceback
    traceback.print_exc()


In [None]:
# CELL 14: ADD DATASET COLUMNS FOR FACETED SEARCH

from synapseclient.models import Column, ColumnType, FacetType, Dataset

def get_dataset_column_schema(dataset_type):
    """Get column schema based on dataset type (Clinical or Omic)."""
    
    # Shared columns for both clinical and omic datasets
    # Note: Set maximum_size and maximum_list_length to stay under 64KB row limit
    shared_columns = [
        {"name": "dataType", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 100, "desc": "Data type"},
        {"name": "fileFormat", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 50, "desc": "File format"},
        {"name": "species", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 100, "desc": "Species"},
        {"name": "disease", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 100, "desc": "Disease"},
        {"name": "studyType", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 100, "desc": "Study type"},
        {"name": "dataFormat", "type": ColumnType.STRING_LIST, "facet": FacetType.ENUMERATION, "max_list_len": 10, "desc": "Data format(s)"},
    ]
    
    # Clinical-specific columns
    clinical_columns = [
        {"name": "studyPhase", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 100, "desc": "Phase of study"},
        {"name": "keyMeasures", "type": ColumnType.STRING_LIST, "facet": FacetType.ENUMERATION, "max_list_len": 20, "desc": "Key measurements"},
        {"name": "assessmentType", "type": ColumnType.STRING_LIST, "facet": FacetType.ENUMERATION, "max_list_len": 15, "desc": "Type of assessment"},
        {"name": "clinicalDomain", "type": ColumnType.STRING_LIST, "facet": FacetType.ENUMERATION, "max_list_len": 15, "desc": "Clinical domain"},
        {"name": "hasLongitudinalData", "type": ColumnType.BOOLEAN, "facet": FacetType.ENUMERATION, "desc": "Contains longitudinal data"},
        {"name": "studyDesign", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 150, "desc": "Study design type"},
        {"name": "primaryOutcome", "type": ColumnType.STRING, "facet": None, "max_size": 250, "desc": "Primary outcome measure"},
    ]
    
    # Omic-specific columns
    omic_columns = [
        {"name": "assay", "type": ColumnType.STRING_LIST, "facet": FacetType.ENUMERATION, "max_list_len": 10, "desc": "Assay type(s)"},
        {"name": "platform", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 150, "desc": "Sequencing/analysis platform"},
        {"name": "libraryStrategy", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 100, "desc": "Library strategy"},
        {"name": "libraryLayout", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 50, "desc": "Library layout"},
        {"name": "cellType", "type": ColumnType.STRING_LIST, "facet": FacetType.ENUMERATION, "max_list_len": 10, "desc": "Cell type(s)"},
        {"name": "biospecimenType", "type": ColumnType.STRING_LIST, "facet": FacetType.ENUMERATION, "max_list_len": 10, "desc": "Biospecimen type(s)"},
        {"name": "processingLevel", "type": ColumnType.STRING, "facet": FacetType.ENUMERATION, "max_size": 100, "desc": "Data processing level"},
    ]
    
    # Combine columns based on dataset type
    if dataset_type and 'omic' in dataset_type.lower():
        return shared_columns + omic_columns
    else:
        return shared_columns + clinical_columns

def add_dataset_columns(syn, dataset_id, dataset_type=None):
    """Add annotation columns to dataset for faceted search."""
    try:
        print(f"üîß Adding columns to dataset for faceted search...")
        print(f"   üìã Dataset ID: {dataset_id}")
        print(f"   üìä Dataset Type: {dataset_type or 'Auto-detect'}")
        
        # Get the dataset using Dataset model (not syn.get)
        print(f"   üîÑ Loading dataset...")
        dataset = Dataset(id=dataset_id).get()
        
        # Auto-detect dataset type if not provided
        if not dataset_type:
            annotations = dataset.annotations if hasattr(dataset, 'annotations') else {}
            dataset_type = annotations.get('_dataset_type', 'ClinicalDataset')
            print(f"   üîç Auto-detected type: {dataset_type}")
        
        # Get column schema for this dataset type
        columns_to_add = get_dataset_column_schema(dataset_type)
        
        # Get existing columns
        existing_columns = []
        if hasattr(dataset, 'columns_to_store') and dataset.columns_to_store:
            existing_columns = [col.name for col in dataset.columns_to_store]
        
        print(f"   üìä Existing columns: {len(existing_columns)}")
        print(f"   ‚ûï Columns to add: {len(columns_to_add)}")
        
        # Add new columns
        new_columns = []
        for col_info in columns_to_add:
            if col_info['name'] not in existing_columns:
                new_columns.append(col_info)
        
        if new_columns:
            print(f"\nüîÑ Adding {len(new_columns)} new columns...")
            for col_info in new_columns:
                try:
                    # Build column with size constraints
                    col_kwargs = {
                        'name': col_info['name'],
                        'column_type': col_info['type'],
                        'facet_type': col_info.get('facet')
                    }
                    
                    # Add size constraints based on column type
                    if col_info['type'] == ColumnType.STRING and 'max_size' in col_info:
                        col_kwargs['maximum_size'] = col_info['max_size']
                    elif col_info['type'] == ColumnType.STRING_LIST and 'max_list_len' in col_info:
                        col_kwargs['maximum_list_length'] = col_info['max_list_len']
                    
                    col = Column(**col_kwargs)
                    dataset.add_column(column=col)
                    
                    # Show size info
                    size_info = ''
                    if 'max_size' in col_info:
                        size_info = f" (max: {col_info['max_size']})"
                    elif 'max_list_len' in col_info:
                        size_info = f" (max list: {col_info['max_list_len']})"
                    
                    print(f"   ‚úì {col_info['name']}: {col_info['type'].value}{size_info}")
                except Exception as e:
                    print(f"   ‚úó {col_info['name']}: {e}")
            
            # Store the dataset to persist columns using Dataset model's store method
            print(f"\nüíæ Saving columns to dataset...")
            updated_dataset = dataset.store()
            
            # Verify columns were added
            final_column_count = len(updated_dataset.columns_to_store) if hasattr(updated_dataset, 'columns_to_store') else 0
            
            print(f"\n‚úÖ Successfully added columns to dataset!")
            print(f"   üìä Total columns: {final_column_count}")
            print(f"   ‚ûï New columns: {len(new_columns)}")
            print(f"   üîó Dataset URL: https://www.synapse.org/#!Synapse:{dataset_id}")
            
            return updated_dataset
        else:
            print(f"\n‚úÖ All required columns already exist!")
            print(f"   üìä Total columns: {len(existing_columns)}")
            return dataset
        
    except Exception as e:
        print(f"‚ùå Error adding columns to dataset: {e}")
        import traceback
        traceback.print_exc()
        return None

def verify_dataset_columns(syn, dataset_id):
    """Verify dataset columns were added correctly."""
    try:
        print(f"\nüîç COLUMN VERIFICATION")
        print("=" * 35)
        
        # Get the dataset using Dataset model with include_columns=True
        dataset = Dataset(id=dataset_id).get(include_columns=True)
        
        if hasattr(dataset, 'columns_to_store') and dataset.columns_to_store:
            columns = dataset.columns_to_store
            print(f"üìä Total columns: {len(columns)}")
            
            # Group columns by facet type
            faceted_columns = [c for c in columns if c.facet_type]
            non_faceted_columns = [c for c in columns if not c.facet_type]
            
            print(f"\n   Faceted columns (searchable): {len(faceted_columns)}")
            print(f"   Non-faceted columns: {len(non_faceted_columns)}")
            
            # Show sample of faceted columns
            print(f"\n   Sample faceted columns:")
            for col in faceted_columns[:10]:
                facet_display = col.facet_type.value if col.facet_type else 'None'
                # Show size constraints
                size_info = ''
                if col.maximum_size:
                    size_info = f" (max: {col.maximum_size})"
                elif col.maximum_list_length:
                    size_info = f" (max list: {col.maximum_list_length})"
                print(f"   ‚Ä¢ {col.name}: {col.column_type.value}{size_info}")
            
            if len(faceted_columns) > 10:
                print(f"   ... and {len(faceted_columns) - 10} more faceted columns")
        else:
            print(f"‚ö†Ô∏è  No columns found on dataset")
        
        print(f"\n‚úÖ Column verification complete!")
        return True
        
    except Exception as e:
        print(f"‚ùå Error verifying columns: {e}")
        import traceback
        traceback.print_exc()
        return False

# Execute: Add columns to dataset
print("üèóÔ∏è  ADD DATASET COLUMNS")
print("=" * 30)

# Re-establish variables if needed
try:
    if 'syn' not in globals() or syn is None:
        print("‚ö†Ô∏è Reconnecting to Synapse...")
        syn = connect_to_synapse()
    
    # Check if we have a dataset_id from previous cells
    if 'dataset_id' not in globals() or dataset_id is None:
        print("‚ùå No dataset_id found from previous cells")
        print("üìù Please run Cell 12 (Dataset Entity Creation) first")
    else:
        print(f"üìã Using dataset from previous cell: {dataset_id}")
        
        # Load dataset annotations to get type
        if 'dataset_annotation_file_path' in globals() and os.path.exists(dataset_annotation_file_path):
            dataset_annotations = load_existing_annotations(dataset_annotation_file_path)
            dataset_type = dataset_annotations.get('_dataset_type', 'ClinicalDataset')
        else:
            dataset_type = None  # Will auto-detect
        
        # Add columns to dataset
        updated_dataset = add_dataset_columns(
            syn, 
            dataset_id,
            dataset_type=dataset_type
        )
        
        if updated_dataset:
            # Verify the columns
            verify_dataset_columns(syn, dataset_id)
        else:
            print("‚ùå Could not add columns to dataset")
            
except NameError as e:
    print(f"‚ùå Missing configuration: {e}")
    print("üìù Please run previous cells first")
except Exception as e:
    print(f"‚ùå Error: {e}")
    import traceback
    traceback.print_exc()


In [None]:
# CELL 11: CREATE DATASET WIKI
def set_wiki_content(wiki_content,dataset_id) -> str: 
    wiki = syn.store(Wiki(title="Dataset Documentation", markdown=wiki_content, owner=dataset_id))
    print(f"Wiki created successfully with ID: {wiki.id}")
    return wiki.id
wiki_content="""
**Summary:** The study describes the first large-scale, NIH-funded multicenter Expanded Access Protocol (EAP) for ALS in the United States, offering investigational intravenous trehalose (SLS-005) to individuals with ALS who were ineligible for concurrent randomized clinical trials (RCTs). Over 24 weeks, 70 participants enrolled at 20 sites received trehalose infusions and provided detailed clinical and biomarker data. No significant benefit was found for neurofilament light levels, functional status, or survival compared to historical controls. Biomarker and clinical data, as well as additional banked serum samples, are now available to the research community for further study.

**Overall Design:**
- Multicenter, open-label, non-randomized EAP for ALS patients ineligible for RCTs.
- Conducted at 20 sites across the US, involving up to 24 weeks of weekly intravenous trehalose infusion.
- Two participant cohorts: (1) ALS patients na√Øve to trehalose and trial-ineligible and (2) previous trial participants needing continued drug access.
- Outcomes included safety/tolerability, neurofilament light chain (NfL) biomarker, ALS Functional Rating Scale-Revised (ALSFRS-R), slow vital capacity, survival, and quality of life.
- Data and serum samples shared in public repositories for further research; protocol included patient and caregiver involvement, and remote/home infusions when possible.

<details>

<summary>Show More</summary>
<b>Contact:</b>

- Sabrina Paganoni, MD, PhD; spaganoni@mgh.harvard.edu.
- Sean M. Healey & AMG Center for ALS and Neurological Clinical Research Institute, Massachusetts General Hospital, Harvard Medical School, Boston, MA, USA  

<b>Contributors:</b>  Senda Ajroud-Driss, Suma Babu, James D. Berry, Cynthia Bodkin, Namita A. Goyal, Kelly Gwathmey, Daragh Heitzman, Shafeeq Ladha, Courtney E. McIlduff, Sabrina Paganoni, Laura Rosow, Mr. Alexander V. Sherman, David Walk, Jackie Whitesell, Eufrosina Young, Warren Wasiewski

<b>Publication:</b>

- Krivickas B, Scirocco E, Giacomelli E, Sharma S, et al. Multicenter Expanded Access Protocol for Research Through Access to Trehalose in People With Amyotrophic Lateral Sclerosis. *Muscle & Nerve*. 2025;0:1‚Äì9. doi: 10.1002/mus.70011.
[**DOI:** 10.1002/mus.70011](https://doi.org/10.1002/mus.70011)

</details>
"""
set_wiki_content(wiki_content,"syn72016774")


Wiki created successfully with ID: 636540


'636540'

In [22]:
# CELL 12: DATASET COLUMN REORDERING 
def reorder_dataset_columns(syn, dataset_id, desired_column_order):
    dataset=Dataset(id=dataset_id).get(include_columns=True)
    current_columns = list(dataset.columns.keys())
    final_order=[]
    for col in desired_column_order:
        if col in current_columns:
            final_order.append(col)
    remaining_cols = [col for col in current_columns if col not in final_order]
    final_order.extend(remaining_cols)
    for target_index, col_name in enumerate(final_order):
        dataset.reorder_column(name=col_name, index=target_index)
    dataset.store()

# Template-based column ordering (adapted for SRA dataset)
# Based on syn68808453 but modified for SRA-specific fields
template_column_order = [
        # === SYSTEM COLUMNS (keep at front) ===
        'id',
        'name',

        # === KEY FILE ANNOTATIONS (high priority) ===
        'fileFormat',
        'studyType',
        'dataType',
        'keyMeasures',
        'assessmentType',
        'ClinicalDomain',
        'hasLongitudinalData',
        'disease',
        # === DEFAULT SYNAPSE COLUMNS (maintain original order) ===
        'description',
        'createdOn',
        'createdBy',
        'etag',
        'modifiedOn',
        'modifiedBy',
        'path',
        'type',
        'currentVersion',
        'parentId',
        'benefactorId',
        'projectId',
        'dataFileHandleId',
        'dataFileName',
        'dataFileSizeBytes',
        'dataFileMD5Hex',
        'dataFileConcreteType',
        'dataFileBucket',
        'dataFileKey'
]
reorder_dataset_columns(syn, "syn72016774", template_column_order)

[syn72016774:Trehalose Biomarker Dataset]: (Column Order): ['id', 'name', 'fileFormat', 'studyType', 'dataType', 'disease', 'description', 'createdOn', 'createdBy', 'etag', 'modifiedOn', 'modifiedBy', 'path', 'type', 'currentVersion', 'parentId', 'benefactorId', 'projectId', 'dataFileHandleId', 'dataFileName', 'dataFileSizeBytes', 'dataFileMD5Hex', 'dataFileConcreteType', 'dataFileBucket', 'dataFileKey', 'species', 'dataFormat', 'assay', 'platform', 'libraryStrategy', 'libraryLayout', 'cellType', 'biospecimenType', 'processingLevel']


/entity/syn72016774/table/transaction/async: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.00/1.00 [00:01<00:00, 1.13s/it]


In [None]:
# CELL 15: REORDER DATASET COLUMNS

from synapseclient.models import Dataset

def get_column_order_template(dataset_type):
    """Get column order template based on dataset type."""
    
    # System columns - always first
    system_columns = [
        'id',
        'name',
    ]
    
    # Shared high-priority annotation columns
    shared_priority = [
        'dataType',
        'fileFormat',
        'studyType',
        'species',
        'disease',
        'dataFormat',
    ]
    
    # Clinical-specific priority columns
    clinical_priority = [
        'studyPhase',
        'assessmentType',
        'clinicalDomain',
        'keyMeasures',
        'hasLongitudinalData',
        'studyDesign',
        'primaryOutcome',
    ]
    
    # Omic-specific priority columns
    omic_priority = [
        'assay',
        'platform',
        'libraryStrategy',
        'libraryLayout',
        'cellType',
        'biospecimenType',
        'processingLevel',
    ]
    
    # Standard Synapse columns - keep after annotations
    synapse_columns = [
        'description',
        'createdOn',
        'createdBy',
        'etag',
        'modifiedOn',
        'modifiedBy',
        'path',
        'type',
        'currentVersion',
        'parentId',
        'benefactorId',
        'projectId',
        'dataFileHandleId',
        'dataFileName',
        'dataFileSizeBytes',
        'dataFileMD5Hex',
        'dataFileConcreteType',
        'dataFileBucket',
        'dataFileKey',
    ]
    
    # Build complete order based on dataset type
    if dataset_type and 'omic' in dataset_type.lower():
        return system_columns + shared_priority + omic_priority + synapse_columns
    else:
        return system_columns + shared_priority + clinical_priority + synapse_columns

def reorder_dataset_columns(syn, dataset_id, dataset_type=None):
    """Reorder dataset columns based on template."""
    try:
        print(f"üîÑ Reordering dataset columns...")
        print(f"   üìã Dataset ID: {dataset_id}")
        print(f"   üìä Dataset Type: {dataset_type or 'Auto-detect'}")
        
        # Get the dataset with columns
        print(f"   üîÑ Loading dataset with columns...")
        dataset = Dataset(id=dataset_id).get(include_columns=True)
        
        # Auto-detect dataset type if not provided
        if not dataset_type:
            annotations = dataset.annotations if hasattr(dataset, 'annotations') else {}
            dataset_type = annotations.get('_dataset_type', 'ClinicalDataset')
            print(f"   üîç Auto-detected type: {dataset_type}")
        
        # Get column order template
        template_order = get_column_order_template(dataset_type)
        
        # Get current columns
        current_columns = list(dataset.columns.keys())
        print(f"   üìä Current columns: {len(current_columns)}")
        print(f"   üìê Template positions: {len(template_order)}")
        
        # Build final order: template columns first, then any remaining
        final_order = []
        
        # Add columns from template (if they exist)
        for col in template_order:
            if col in current_columns:
                final_order.append(col)
        
        # Add any remaining columns not in template
        remaining_columns = [col for col in current_columns if col not in final_order]
        final_order.extend(remaining_columns)
        
        print(f"\nüìä Reordering plan:")
        print(f"   Template-ordered: {len(final_order) - len(remaining_columns)}")
        print(f"   Additional columns: {len(remaining_columns)}")
        if remaining_columns:
            preview = ', '.join(remaining_columns[:5])
            if len(remaining_columns) > 5:
                preview += f" + {len(remaining_columns) - 5} more"
            print(f"   Additional: {preview}")
        
        # Execute reordering
        print(f"\nüîÑ Executing column reordering...")
        reorder_count = 0
        failed_count = 0
        
        for target_index, col_name in enumerate(final_order):
            try:
                current_position = list(dataset.columns.keys()).index(col_name)
                
                if current_position != target_index:
                    dataset.reorder_column(name=col_name, index=target_index)
                    # Show details for first 15 moves
                    if reorder_count < 15:
                        print(f"   üîÑ '{col_name}': position {current_position} ‚Üí {target_index}")
                    reorder_count += 1
                    
            except Exception as e:
                print(f"   ‚ùå Failed to reorder '{col_name}': {e}")
                failed_count += 1
        
        if reorder_count > 15:
            print(f"   ... and {reorder_count - 15} more reorderings")
        
        # Store the dataset if changes were made
        if reorder_count > 0:
            print(f"\nüíæ Storing dataset with new column order...")
            updated_dataset = dataset.store()
            
            print(f"\n‚úÖ Successfully reordered columns!")
            print(f"   ‚úì Reordered: {reorder_count} columns")
            if failed_count > 0:
                print(f"   ‚úó Failed: {failed_count} columns")
            
            # Show new column order (first 20)
            final_columns = list(updated_dataset.columns.keys())
            print(f"\nüìã NEW COLUMN ORDER (first 20 of {len(final_columns)}):")
            
            for i, col in enumerate(final_columns[:20]):
                # Mark template vs additional columns
                marker = "üéØ" if col in template_order else "‚ûï"
                print(f"   {i+1:2d}. {marker} {col}")
            
            if len(final_columns) > 20:
                print(f"   ... and {len(final_columns) - 20} more columns")
            
            # Template compliance
            template_cols_in_front = sum(1 for col in final_columns[:len(template_order)] if col in template_order)
            print(f"\nüéØ Template Compliance:")
            print(f"   Template columns positioned correctly: {template_cols_in_front}")
            
            print(f"\nüîó Dataset URL: https://www.synapse.org/#!Synapse:{dataset_id}")
            
            return updated_dataset
        else:
            print(f"\n‚úÖ Columns already in correct order!")
            return dataset
        
    except Exception as e:
        print(f"‚ùå Error reordering columns: {e}")
        import traceback
        traceback.print_exc()
        return None

# Execute: Reorder dataset columns
print("üèóÔ∏è  REORDER DATASET COLUMNS")
print("=" * 30)

# Re-establish variables if needed
try:
    if 'syn' not in globals() or syn is None:
        print("‚ö†Ô∏è Reconnecting to Synapse...")
        syn = connect_to_synapse()
    
    # Check if we have a dataset_id from previous cells
    if 'dataset_id' not in globals() or dataset_id is None:
        print("‚ùå No dataset_id found from previous cells")
        print("üìù Please run Cell 12 (Dataset Entity Creation) first")
    else:
        print(f"üìã Using dataset from previous cell: {dataset_id}")
        
        # Load dataset annotations to get type
        if 'dataset_annotation_file_path' in globals() and os.path.exists(dataset_annotation_file_path):
            dataset_annotations = load_existing_annotations(dataset_annotation_file_path)
            dataset_type = dataset_annotations.get('_dataset_type', 'ClinicalDataset')
        else:
            dataset_type = None  # Will auto-detect
        
        # Reorder columns
        updated_dataset = reorder_dataset_columns(
            syn, 
            dataset_id,
            dataset_type=dataset_type
        )
        
        if updated_dataset:
            print(f"\nüéØ BENEFITS OF COLUMN ORDERING:")
            print(f"   üìã Consistent layout across datasets")
            print(f"   üë• Familiar structure for researchers")
            print(f"   üîç Key annotations prominently positioned")
            print(f"   üìä Technical metadata logically grouped")
        else:
            print("‚ùå Could not reorder columns")
            
except NameError as e:
    print(f"‚ùå Missing configuration: {e}")
    print("üìù Please run previous cells first")
except Exception as e:
    print(f"‚ùå Error: {e}")
    import traceback
    traceback.print_exc()


In [28]:
# CELL 13: Acknowledgement Statement & description
def set_acknowledgement_statement(syn, dataset_id, statement, description=""):
    dataset=syn.get(dataset_id, downloadFile=False)
    dataset_annotations=dataset.annotations
    dataset_annotations['acknowledgementStatement'] = statement
    dataset.annotations = dataset_annotations
    dataset.description = description
    syn.store(dataset, forceVersion=False)
    return dataset_annotations
set_acknowledgement_statement(syn, "syn72016774", "syn64892175/wiki/633969")


{'url': ['https://www.synapse.org/Synapse:syn68927891'],
 'assay': ['ELISA',
  'quantitative PCR',
  'miRNA-seq',
  'liquid chromatography/tandem mass spectrometry',
  'proximity extension assay',
  'immunoassay',
  'Protein target assay'],
 'title': ['Trehalose Biomarker Dataset'],
 'sameAs': [''],
 'source': ['ALL ALS Clinical Trials'],
 'creator': ['Sabrina Paganoni, MD, PhD'],
 'disease': [''],
 'license': [''],
 'species': ['Homo sapiens'],
 'dataType': ['clinical',
  'biomarker',
  'metabolomics',
  'proteomics',
  'SomaScan'],
 'keywords': ['Amyotrophic Lateral Sclerosis',
  'trehalose',
  'biomarker',
  'metabolomics',
  'proteomics',
  'SomaScan'],
 'platform': [''],
 'publisher': [''],
 'dataFormat': ['excel', 'csv', 'pdf'],
 'contributor': ['Sabrina Paganoni, MD, PhD'],
 'description': [''],
 'alternateName': [''],
 'curationLevel': [''],
 'datePublished': [''],
 'FACSPopulation': [''],
 'GEOSuperSeries': [''],
 'individualCount': ['70'],
 'libraryStrategy': [''],
 'processi

In [29]:
# CELL 13: SET DATASET PERMISSIONS
def set_dataset_permissions(syn, dataset_id, principal_id, access_type, modify_benefactor=False, overwrite=True):
    dataset = Dataset(id=dataset_id).get(include_columns=True)
    dataset.get_permissions()
    main_folder_permissions = ["READ", "DOWNLOAD"]
    dataset.set_permissions(
        principal_id=principal_id,
        access_type=main_folder_permissions,
        modify_benefactor=modify_benefactor,  # Create local ACL for this folder
        overwrite=overwrite,
    )
set_dataset_permissions(syn, "syn72016774", 273948, ["READ", "DOWNLOAD"], modify_benefactor=False, overwrite=True)

In [None]:
# CELL 14: SNAPSHOT
def snapshot_dataset(syn, dataset_id, comment="Dataset snapshot", label="v1.0"):
    dataset = Dataset(id=dataset_id).get(include_columns=True)
    snapshot = dataset.snapshot(comment=comment, label=label)
    print(f"‚úÖ Created snapshot successfully!")
    #print(f"   üìã Snapshot ID: {snapshot.id}")
    print(f"   üè∑Ô∏è Name: {snapshot.name}")
    print(f"   üîó URL: https://www.synapse.org/#!Synapse:{snapshot.id}")
    return snapshot.id
snapshot_dataset(syn, "syn72016774", comment="Trehalose Biomarker 2026.1 release", label="2026.1")

[syn72016774:Trehalose Biomarker Dataset]: Creating a snapshot of the <class 'synapseclient.models.dataset.Dataset'>.


/entity/syn72016774/table/transaction/async: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.00/1.00 [00:01<00:00, 1.12s/it]


‚úÖ Created snapshot successfully!


AttributeError: 'TableUpdateTransaction' object has no attribute 'id'

In [34]:
# CELL 15: Add Dataset to Prod Collection 
def add_dataset_to_collection(syn, dataset_id, collection_id):
    try:

        dataset_collection = DatasetCollection(id=collection_id).get()
        dataset = Dataset(id=dataset_id).get(include_columns=True)
        dataset_collection.add_item(dataset)
        dataset_collection.store()
        print(f"‚úÖ Added Dataset {dataset_id} to Collection {collection_id} successfully!")

    except Exception as e:
        print(f"‚ùå Error adding Dataset to Collection: {e}")
add_dataset_to_collection(syn, "syn72016774", DATASETS_COLLECTION_ID)

[ERROR] Error occurred while running store_async on <class 'synapseclient.models.dataset.DatasetCollection'>.
Traceback (most recent call last):
  File "/home/ramayyala/.local/share/mamba/envs/amp-als/lib/python3.10/site-packages/synapseclient/core/async_utils.py", line 133, in newmethod
    return loop.run_until_complete(wrapper(*args, **kwargs))
  File "/home/ramayyala/.local/share/mamba/envs/amp-als/lib/python3.10/site-packages/nest_asyncio.py", line 98, in run_until_complete
    return f.result()
  File "/home/ramayyala/.local/share/mamba/envs/amp-als/lib/python3.10/asyncio/futures.py", line 201, in result
    raise self._exception
  File "/home/ramayyala/.local/share/mamba/envs/amp-als/lib/python3.10/asyncio/tasks.py", line 232, in __step
    result = coro.send(None)
  File "/home/ramayyala/.local/share/mamba/envs/amp-als/lib/python3.10/site-packages/synapseclient/core/async_utils.py", line 122, in wrapper
    return await getattr(self, async_method_name)(*args, **kwargs)
  File "

‚ùå Error adding Dataset to Collection: 400 Client Error: Each dataset collection item must have a unique entity ID.  Duplicate: syn72016774


In [33]:
# CELL 15: Move to release
def move_folder(syn, folder_id, new_parent_id):
    folder = Folder(id=folder_id).get()
    folder.parent_id = new_parent_id
    folder = folder.store()
    print(f"Moved folder to: {folder.parent_id}")
move_folder(syn, STAGING_FOLDER_ID, RELEASE_FOLDER_ID)

Moved folder to: syn68885183
