In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datafolio import DataFolio
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Start from a clean slate: remove bundles left behind by a previous run
import shutil

stale_bundles = Path('demo_bundles')
if stale_bundles.exists():
    shutil.rmtree(stale_bundles)

# Demo banner: title framed by two 60-character rules
rule = "=" * 60
for banner_line in (rule, "DataFolio Demo - Protein Analysis Workflow", rule):
    print(banner_line)


DataFolio Demo - Protein Analysis Workflow


In [3]:
print("\n📊 STEP 1: Creating simulated external datasets...")

# Fixed seed so a fresh "Restart & Run All" regenerates identical proteins,
# and therefore an identical model and sample prediction further down.
SEED = 42
rng = np.random.default_rng(SEED)
LOCALIZATION_LABELS = ['membrane', 'cytoplasmic', 'nuclear']


def simulate_proteins(first_id, n_proteins, rng):
    """Build a synthetic protein table.

    Parameters
    ----------
    first_id : int
        Numeric suffix of the first ``protein_id``; IDs run consecutively.
    n_proteins : int
        Number of rows to generate.
    rng : numpy.random.Generator
        Source of randomness; draws are consumed in column order.

    Returns
    -------
    pandas.DataFrame
        Columns ``protein_id``, ``sequence_length`` (integers in [50, 500)),
        ``hydrophobicity`` and ``charge`` (standard-normal draws), and
        ``label`` (uniform choice among ``LOCALIZATION_LABELS``).
    """
    return pd.DataFrame({
        'protein_id': [f'PROT_{i:05d}' for i in range(first_id, first_id + n_proteins)],
        'sequence_length': rng.integers(50, 500, n_proteins),
        'hydrophobicity': rng.standard_normal(n_proteins),
        'charge': rng.standard_normal(n_proteins),
        'label': rng.choice(LOCALIZATION_LABELS, n_proteins),
    })


# Create temp directory for external data
external_data_dir = Path('temp_external_data')
external_data_dir.mkdir(exist_ok=True)

# Large training dataset (simulated datalake data)
training_data = simulate_proteins(0, 1000, rng)
training_file = external_data_dir / 'training_proteins.parquet'
training_data.to_parquet(training_file, index=False)
print(f"  ✓ Created external training data: {len(training_data)} proteins")

# Validation dataset; IDs continue where the training set stops
validation_data = simulate_proteins(1000, 200, rng)
validation_file = external_data_dir / 'validation_proteins.parquet'
validation_data.to_parquet(validation_file, index=False)
print(f"  ✓ Created external validation data: {len(validation_data)} proteins")



📊 STEP 1: Creating simulated external datasets...
  ✓ Created external training data: 1000 proteins
  ✓ Created external validation data: 200 proteins


In [5]:
print("\n🗂️  STEP 2: Creating DataFolio bundle...")

# Experiment-level metadata stored alongside the bundle contents
folio = DataFolio(
    path='demo_bundles',
    prefix='protein-analysis',
    metadata={
        'experiment': 'protein_localization_v2',
        'date': '2024-01-15',
        'scientist': 'Dr. Smith',
        'model_type': 'random_forest',
        'description': 'Protein subcellular localization prediction',
        'parameters': {
            'n_estimators': 100,
            'max_depth': 10,
            'random_state': 42
        }
    }
)

# NOTE(review): `_bundle_dir` is a private attribute; prefer a public
# accessor if DataFolio exposes one.
print(f"  ✓ Bundle created: {folio._bundle_dir}")
print(f"  ✓ Initial metadata keys: {list(folio.metadata.keys())}")



🗂️  STEP 2: Creating DataFolio bundle...
  ✓ Bundle created: demo_bundles/protein-analysis-mindful-indigo-viper
  ✓ Initial metadata keys: ['experiment', 'date', 'scientist', 'model_type', 'description', 'parameters', 'created_at', 'updated_at']


In [6]:
print("\n🔗 STEP 3: Referencing external datasets...")

# Register the parquet files written in step 1 by path, recording their
# row counts and the snippet that produced them.
folio.reference_table(
    'training_data',
    path=str(training_file),
    table_format='parquet',
    num_rows=len(training_data),
    description='Large training dataset from protein database',
    code='training_data.to_parquet(training_file)'
)

folio.reference_table(
    'validation_data',
    path=str(validation_file),
    table_format='parquet',
    num_rows=len(validation_data),
    description='Validation dataset for model evaluation',
    code='validation_data.to_parquet(validation_file)'
)

print(f"  ✓ Referenced training data ({len(training_data)} rows)")
print(f"  ✓ Referenced validation data ({len(validation_data)} rows)")




🔗 STEP 3: Referencing external datasets...
  ✓ Referenced training data (1000 rows)
  ✓ Referenced validation data (200 rows)


In [7]:
print("\n📈 STEP 4: Adding analysis results...")

# Model performance metrics (illustrative, hard-coded values)
metrics_df = pd.DataFrame({
    'metric': ['accuracy', 'precision', 'recall', 'f1_score'],
    'train': [0.945, 0.932, 0.928, 0.930],
    'validation': [0.912, 0.901, 0.898, 0.899]
})
folio.add_table(
    'performance_metrics',
    metrics_df,
    description='Model performance summary',
    inputs=['training_data', 'validation_data'],
    models=['rf_classifier'],
    code='evaluate_model(model, X_train, y_train, X_val, y_val)'
)
print("  ✓ Added performance metrics")

# Confusion matrix (rows: true class, columns: predicted class)
confusion_matrix = pd.DataFrame(
    [[150, 10, 5],
     [8, 140, 12],
     [7, 15, 138]],
    index=['membrane', 'cytoplasmic', 'nuclear'],
    columns=['pred_membrane', 'pred_cytoplasmic', 'pred_nuclear']
)
folio.add_table(
    'confusion_matrix',
    confusion_matrix,
    description='Validation confusion matrix',
    inputs=['validation_data'],
    models=['rf_classifier'],
    code='confusion_matrix(y_true, y_pred)'
)
print("  ✓ Added confusion matrix")

# Feature importance, most important feature first
feature_importance = pd.DataFrame({
    'feature': ['sequence_length', 'hydrophobicity', 'charge'],
    'importance': [0.45, 0.35, 0.20]
}).sort_values('importance', ascending=False)
folio.add_table(
    'feature_importance',
    feature_importance,
    description='RF feature importance',
    models=['rf_classifier'],
    code='pd.DataFrame(model.feature_importances_)'
)
print("  ✓ Added feature importance")



📈 STEP 4: Adding analysis results...
  ✓ Added performance metrics
  ✓ Added confusion matrix
  ✓ Added feature importance


In [8]:
print("\n🤖 STEP 5: Adding trained model...")

# Single source of truth for the forest's settings: the same values feed the
# estimator and the hyperparameters recorded in the bundle, so they cannot drift.
rf_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}

# Prepare features
X_train = training_data[['sequence_length', 'hydrophobicity', 'charge']].values
y_train = training_data['label'].values

# Standardize the features, then fit the classifier on the scaled matrix
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = RandomForestClassifier(**rf_params)
model.fit(X_train_scaled, y_train)

# Add model to bundle
folio.add_model(
    'rf_classifier',
    model,
    description='Random forest classifier for protein localization',
    inputs=['training_data'],
    hyperparameters=dict(rf_params),  # pass a copy so the bundle never shares our dict
    code='model = RandomForestClassifier(...).fit(X_train_scaled, y_train)'
)
print("  ✓ Model trained and added to bundle")

# Also save the scaler: predictions need the same transform that training used
folio.add_model(
    'scaler',
    scaler,
    description='Feature scaler (StandardScaler)',
    inputs=['training_data'],
    code='scaler = StandardScaler().fit(X_train)'
)
print("  ✓ Scaler added to bundle")



🤖 STEP 5: Adding trained model...
  ✓ Model trained and added to bundle
  ✓ Scaler added to bundle


In [9]:
print("\n🎨 STEP 6: Adding artifacts...")

# Feature importance plot, drawn through the explicit figure/axes objects
fig, ax = plt.subplots(figsize=(8, 5))
feature_importance.plot(x='feature', y='importance', kind='barh', ax=ax, legend=False)
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
fig.tight_layout()

# The PNG is only a staging file for add_artifact; the finally block
# guarantees it (and the figure) is released even if adding the artifact fails.
plot_file = Path('temp_plot.png')
try:
    fig.savefig(plot_file)
    folio.add_artifact('feature_importance_plot', plot_file, category='plots', description='Feature importance visualization')
finally:
    plt.close(fig)
    if plot_file.exists():
        plot_file.unlink()  # Clean up temp file
print("  ✓ Added feature importance plot")



🎨 STEP 6: Adding artifacts...
  ✓ Added feature importance plot


In [10]:
print("\n✏️  STEP 7: Updating metadata...")

# Record dataset sizes and headline results on the bundle's metadata mapping
folio.metadata['training_samples'] = len(training_data)
folio.metadata['validation_samples'] = len(validation_data)
folio.metadata['final_accuracy'] = 0.912
folio.metadata['notes'] = 'Initial model performs well, consider adding more features'

print("  ✓ Metadata updated (auto-saved!)")
print(f"  ✓ Total metadata keys: {len(folio.metadata)}")



✏️  STEP 7: Updating metadata...
  ✓ Metadata updated (auto-saved!)
  ✓ Total metadata keys: 12


In [11]:
print("\n📂 STEP 9: Loading existing bundle (simulating new session)...")

# Get the bundle directory path
# NOTE(review): `_bundle_dir` is a private attribute; prefer a public
# accessor if DataFolio exposes one.
bundle_path = folio._bundle_dir

# Re-open the bundle from disk using only its path
loaded_folio = DataFolio(path=bundle_path)

print(f"  ✓ Loaded bundle from: {bundle_path}")
print(f"  ✓ Experiment: {loaded_folio.metadata['experiment']}")
print(f"  ✓ Final accuracy: {loaded_folio.metadata['final_accuracy']}")



📂 STEP 9: Loading existing bundle (simulating new session)...
  ✓ Loaded bundle from: demo_bundles/protein-analysis-mindful-indigo-viper
  ✓ Experiment: protein_localization_v2
  ✓ Final accuracy: 0.912


In [14]:
# Text summary of everything stored in the reloaded bundle
bundle_summary = loaded_folio.describe()
print(bundle_summary)

DataFolio: demo_bundles/protein-analysis-mindful-indigo-viper

Created: 2025-10-13T16:18:39.741317+00:00
Updated: 2025-10-13T16:19:30.600211+00:00

Referenced Tables (2):
  • training_data [referenced_table]: Large training dataset from protein database
  • validation_data [referenced_table]: Validation dataset for model evaluation

Included Tables (3):
  • performance_metrics [included_table]: Model performance summary
    ↳ inputs: training_data, validation_data
    ↳ models: rf_classifier
  • confusion_matrix [included_table]: Validation confusion matrix
    ↳ inputs: validation_data
    ↳ models: rf_classifier
  • feature_importance [included_table]: RF feature importance
    ↳ models: rf_classifier

Models (2):
  • rf_classifier [model]: Random forest classifier for protein localization
    ↳ inputs: training_data
    ↳ hyperparameters: n_estimators=100, max_depth=10, random_state=42
  • scaler [model]: Feature scaler (StandardScaler)
    ↳ inputs: tr

In [20]:
# Read referenced table (from external file)
print("\n  ✓ Reading referenced training data...")
training = loaded_folio.get_table('training_data')
print(f"    Shape: {training.shape}")
print("    First 3 rows:")
# Bare last expression so the notebook actually renders the preview
# announced by the heading above.
training.head(3)



  ✓ Reading referenced training data...
    Shape: (1000, 5)
    First 3 rows:


In [21]:
# Pull both fitted estimators back out of the reloaded bundle
loaded_model, loaded_scaler = (
    loaded_folio.get_model(item_name) for item_name in ('rf_classifier', 'scaler')
)


In [22]:
# Score the first validation protein with the reloaded scaler + classifier
feature_columns = ['sequence_length', 'hydrophobicity', 'charge']
first_protein = validation_data.iloc[0:1]
sample = first_protein[feature_columns].values
sample_scaled = loaded_scaler.transform(sample)
prediction = loaded_model.predict(sample_scaled)

predicted_label = prediction[0]
actual_label = validation_data.iloc[0]['label']
print(f"    Sample prediction: {predicted_label}")
print(f"    Actual label: {actual_label}")


    Sample prediction: nuclear
    Actual label: cytoplasmic


In [23]:
print("\n🔗 STEP 11: Exploring lineage tracking...")

# View bundle description with lineage
print("\n" + folio.describe())

# Query lineage relationships
print("\n  Lineage queries:")
print(f"    performance_metrics inputs: {folio.get_inputs('performance_metrics')}")
print(f"    training_data dependents: {folio.get_dependents('training_data')}")
print(f"    rf_classifier dependents: {folio.get_dependents('rf_classifier')}")

# View full lineage graph; items with no recorded inputs are skipped
graph = folio.get_lineage_graph()
print("\n  Full dependency graph:")
for item, inputs in graph.items():
    if inputs:
        print(f"    {item} ← {inputs}")



🔗 STEP 11: Exploring lineage tracking...

DataFolio: demo_bundles/protein-analysis-mindful-indigo-viper

Created: 2025-10-13T16:18:39.741317+00:00
Updated: 2025-10-13T16:19:30.600211+00:00

Referenced Tables (2):
  • training_data [referenced_table]: Large training dataset from protein database
  • validation_data [referenced_table]: Validation dataset for model evaluation

Included Tables (3):
  • performance_metrics [included_table]: Model performance summary
    ↳ inputs: training_data, validation_data
    ↳ models: rf_classifier
  • confusion_matrix [included_table]: Validation confusion matrix
    ↳ inputs: validation_data
    ↳ models: rf_classifier
  • feature_importance [included_table]: RF feature importance
    ↳ models: rf_classifier

Models (2):
  • rf_classifier [model]: Random forest classifier for protein localization
    ↳ inputs: training_data
    ↳ hyperparameters: n_estimators=100, max_depth=10, random_state=42
  • scaler [model]: Feat

In [24]:
# Metadata overrides for the fork; the parent experiment name is carried along
tuned_metadata = {
    'experiment': 'protein_localization_v2_tuned',
    'parent_experiment': folio.metadata['experiment'],
    'changes': 'Hyperparameter tuning variant',
}
# Result tables that are left out of the copy
stale_results = ['performance_metrics', 'confusion_matrix', 'feature_importance']

folio_tuned = folio.copy(
    new_path='demo_bundles',
    new_prefix='protein-analysis-tuned',
    metadata_updates=tuned_metadata,
    exclude_items=stale_results,
)


In [25]:
# NOTE(review): `_bundle_dir` and `_items` are private attributes; prefer
# public accessors if DataFolio exposes them.
print(f"  ✓ Created tuned variant: {folio_tuned._bundle_dir}")
print(f"    Items: {list(folio_tuned._items.keys())}")
print("    Note: Excluded metrics - will regenerate after retraining")



  ✓ Created tuned variant: demo_bundles/protein-analysis-tuned-keen-indigo-ibis
    Items: ['training_data', 'validation_data', 'feature_importance_plot', 'scaler', 'rf_classifier']
    Note: Excluded metrics - will regenerate after retraining
