# Stage 05: Data Storage Infrastructure

This notebook sets up the project's data storage infrastructure: an organized directory structure, timestamped file saving, and a simple manifest-based data versioning scheme.

## Objectives
- Create organized directory structure
- Implement data versioning
- Set up backup storage (this notebook only creates the `data/backup` directory; no automated backup job is configured here)
- Test storage operations

In [None]:
import sys
import os
# Extend the module search path so modules under ../src can be imported.
# NOTE(review): the relative path resolves against the kernel's working
# directory — presumably the notebook's own folder; confirm before running
# this notebook from elsewhere.
sys.path.append('../src')

import pandas as pd
import numpy as np
from datetime import datetime
# Project-local helper module (provides save_with_timestamp, used below);
# presumably found via the ../src path entry added above — TODO confirm.
import utils
import json
from pathlib import Path

# Banner marking the start of the notebook's output.
print("💾 Data Storage Infrastructure Setup")

## 1. Directory Structure Setup

In [None]:
# Storage layout for the project, listed area by area. All paths are relative
# to the directory the notebook is run from.
directories = [
    # raw inputs
    '../data/raw/financial_apis',
    # processed artefacts
    '../data/processed/cleaned',
    '../data/processed/features',
    '../data/processed/models',
    # backups
    '../data/backup',
    # model files
    '../models/trained',
    '../models/checkpoints',
    # reports
    '../reports/daily',
    '../reports/monthly'
]

# Creating a folder that already exists is a no-op (exist_ok=True), so this
# cell can be re-run safely; missing parent folders are created on the way.
for directory in directories:
    target = Path(directory)
    target.mkdir(parents=True, exist_ok=True)
    print(f"✅ Created: {directory}")

print("\n📁 Directory structure created successfully")

## 2. Test Data Storage Functions

In [None]:
# Create sample data for testing.
# A seeded generator keeps the sample (and therefore the saved test file)
# identical on every Restart & Run All instead of changing per run.
sample_rows = 10
sample_rng = np.random.default_rng(42)

sample_data = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=sample_rows),
    'symbol': ['TEST'] * sample_rows,
    'price': sample_rng.uniform(100, 200, sample_rows),
    # integers() excludes the upper bound, matching np.random.randint
    'volume': sample_rng.integers(1000, 10000, sample_rows)
})

print("Sample data created:")
print(sample_data.head())

# Test saving with timestamp.
# NOTE(review): save_with_timestamp is a project helper not shown here; its
# return value is treated as the path of the written file by the cells below —
# confirm against utils.
saved_path = utils.save_with_timestamp(
    df=sample_data,
    prefix="test_storage",
    source="testing",
    ext="csv"
)

print(f"\n✅ Test data saved to: {saved_path}")

## 3. Data Versioning System

In [None]:
def create_data_manifest(data_path: str, description: str) -> dict:
    """Write a JSON manifest describing a data file and return it.

    The manifest is saved next to the data file as ``<root>_manifest.json``,
    where ``<root>`` is ``data_path`` without its final extension. For a
    ``.csv`` file this is the same name as before; for any other extension
    (or none) the manifest now gets its own file instead of overwriting the
    data file. Files that differ only by extension share one manifest name.

    Args:
        data_path: Path of the data file to describe (``str`` or path-like).
            The file does not have to exist; its size is then recorded as 0.
        description: Free-text note stored in the manifest.

    Returns:
        The manifest dict with keys ``timestamp``, ``file_path``,
        ``description``, ``file_size_mb`` and ``version``.

    Raises:
        OSError: If the manifest file cannot be written.
    """
    # Accept pathlib.Path as well as str; json.dump needs a plain string.
    data_path = os.fspath(data_path)

    # Take the clock reading once so 'timestamp' and 'version' always agree.
    created_at = datetime.now()

    if os.path.exists(data_path):
        file_size_mb = os.path.getsize(data_path) / (1024 * 1024)
    else:
        file_size_mb = 0

    manifest = {
        'timestamp': created_at.isoformat(),
        'file_path': data_path,
        'description': description,
        'file_size_mb': file_size_mb,
        'version': created_at.strftime('%Y%m%d_%H%M%S')
    }

    # Strip only the final extension of the file name. A plain
    # str.replace('.csv', ...) would leave non-CSV paths unchanged (so the
    # manifest would clobber the data file) and would also rewrite '.csv'
    # occurring inside directory names.
    path_root, _ = os.path.splitext(data_path)
    manifest_path = f"{path_root}_manifest.json"

    # Save manifest
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2)

    print(f"📋 Manifest created: {manifest_path}")
    return manifest

# Version the test file written by the storage test, but only when that file
# is actually present on disk.
if os.path.exists(saved_path):
    manifest = create_data_manifest(
        data_path=saved_path,
        description="Test storage functionality",
    )
    print(f"Version: {manifest['version']}")

## 4. Storage Validation

In [None]:
def validate_storage_structure(required_dirs=None) -> bool:
    """Check that every required storage directory exists.

    Prints one status line per path. A path that exists but is not a
    directory (for example a regular file with the same name) is reported as
    missing, since it cannot be used as a storage folder.

    Args:
        required_dirs: Optional iterable of directory paths to check. When
            omitted, the project's four top-level storage folders are checked
            (paths relative to the working directory).

    Returns:
        True if all paths are existing directories (also for an empty
        iterable), False otherwise.
    """
    if required_dirs is None:
        required_dirs = [
            '../data/raw',
            '../data/processed',
            '../models',
            '../reports'
        ]

    all_exist = True
    for directory in required_dirs:
        # isdir rather than exists: a plain file at this path must not pass.
        if os.path.isdir(directory):
            print(f"✅ {directory} exists")
        else:
            print(f"❌ {directory} missing")
            all_exist = False

    return all_exist

print("🔍 Validating storage structure:")
storage_valid = validate_storage_structure()

# Report the overall verdict with a single print.
print(
    "\n✅ Storage infrastructure validated successfully"
    if storage_valid
    else "\n❌ Storage validation failed"
)

## 5. Summary

In [None]:
# Recap of what this stage covered, followed by the stages that come next.
completed_items = [
    "✅ Directory structure created",
    "✅ Storage functions tested",
    "✅ Data versioning implemented",
    "✅ Storage validation completed",
]
upcoming_stages = [
    "- Stage 06: Create preprocessing pipeline",
    "- Stage 07: Build risk analysis models",
    "- Stage 08: Create portfolio optimization",
]

print("\n🎯 Stage 05 Summary:")
print("\n".join(completed_items))

print("\n📋 Next Steps:")
print("\n".join(upcoming_stages))