In [1]:
# Interactive Development Notebook - Algoritmo GD Project
# Load real project data and keep DataFrames in memory for development

import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Tuple
import warnings

# Make the project root importable so that `from src...` imports resolve.
project_root = Path.cwd()
# Guard on the exact string that gets inserted. The previous check looked for
# the literal 'src', which is not what is added to sys.path, so re-running this
# cell kept prepending duplicate entries.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print("🚀 Interactive Development Environment - Algoritmo GD Project")
print("=" * 70)

# =============================================================================
# 1. IMPORT PROJECT MODULES AND CONFIGURATION
# =============================================================================

# Pull in the project configuration, data model and base_data_project helpers.
# A failed import is reported with a hint and then re-raised, so the rest of
# the notebook never runs against a half-imported project.
try:
    from src.config import CONFIG, PROJECT_NAME
    from src.models import DescansosDataModel
    from base_data_project.utils import create_components
    from base_data_project.log_config import setup_logger
    from base_data_project.storage.containers import CSVDataContainer, DBDataContainer
except ImportError as import_error:
    print(f"❌ Error importing project modules: {import_error}")
    print("Make sure you're running this notebook from the project root directory")
    raise
else:
    print("✅ Project modules imported successfully")
    print(f"📁 Project: {PROJECT_NAME}")
    print(f"🗂️  Root directory: {project_root}")

# Project logger at INFO level; reused by the error handlers in later cells.
logger = setup_logger(PROJECT_NAME, log_level=logging.INFO)

# =============================================================================
# 2. CONFIGURATION AND EXTERNAL DATA SETUP
# =============================================================================

print("\n📋 Setting up configuration and external data...")

# Data source switch: True reads from the database, False from CSV files.
use_db = True

# Fallback parameters, used only when CONFIG has no 'external_call_data' entry.
_fallback_external_call_data = {
    'current_process_id': 249652,
    'api_proc_id': 999,
    'wfm_proc_id': 249652,
    'wfm_user': 'WFM',
    'start_date': '2025-01-01',
    'end_date': '2025-12-31',
    'wfm_proc_colab': None,
}
external_call_data = CONFIG.get('external_call_data', _fallback_external_call_data)

# Echo the effective settings so the cell output documents what was loaded.
_data_source_label = 'Database' if use_db else 'CSV files'
print(f"📊 Data source: {_data_source_label}")
print(f"📅 Date range: {external_call_data['start_date']} to {external_call_data['end_date']}")
print(f"🔢 Process ID: {external_call_data['current_process_id']}")

# =============================================================================
# 3. INITIALIZE DATA MANAGER AND COMPONENTS
# =============================================================================

print("\n🔧 Initializing data manager and components...")

# Build the data/process managers through the project's factory function.
# Any failure is reported and re-raised because every later cell needs them.
try:
    _component_options = dict(
        use_db=use_db,
        no_tracking=True,           # tracking is disabled for development
        config=CONFIG,
        project_name=PROJECT_NAME,  # project name is passed explicitly
    )
    data_manager, process_manager = create_components(**_component_options)
    print("✅ Data manager created successfully")
except Exception as creation_error:
    print(f"❌ Error creating data manager: {creation_error}")
    raise

# =============================================================================
# 4. LOAD PROJECT DATA INTO MEMORY
# =============================================================================

print("\n📊 Loading project data into memory...")

# Choose the container class that matches the configured data source; both
# classes are constructed with the same keyword arguments.
# NOTE(review): the logged output of this step shows the DB container writing
# its full connection URL (credentials included) at INFO level - clear the
# cell outputs before sharing this notebook.
_container_class = DBDataContainer if use_db else CSVDataContainer
data_container = _container_class(project_name=PROJECT_NAME, config=CONFIG)

# The data model receives the external call parameters and the container.
data_model = DescansosDataModel(
    project_name=PROJECT_NAME,
    external_data=external_call_data,
    data_container=data_container,
)

print("✅ Data model initialized")

# Context manager for data manager connection
with data_manager:

    # =============================================================================
    # 4.1 LOAD PROCESS DATA (Stage 1)
    # =============================================================================

    print("\n🔄 Stage 1: Loading process data...")

    try:
        # Entities to load come from the project configuration.
        entities_dict = CONFIG.get('available_entities_processing', {})

        success = data_model.load_process_data(data_manager, entities_dict)

        if success:
            print("✅ Process data loaded successfully")
            print(f"   📋 Valid employees: {len(data_model.auxiliary_data.get('valid_emp', []))} records")
            print(f"   🏢 Unit ID: {data_model.auxiliary_data.get('unit_id')}")
            print(f"   🏭 Section ID: {data_model.auxiliary_data.get('secao_id')}")
            print(f"   👤 Position IDs: {data_model.auxiliary_data.get('posto_id_list')}")
        else:
            print("❌ Failed to load process data")

    except Exception as e:
        # Best effort: report and log, then let Stage 2 decide what it can do.
        print(f"❌ Error in Stage 1: {e}")
        logger.error(f"Stage 1 error: {e}", exc_info=True)

    # =============================================================================
    # 4.2 LOAD DETAILED DATA FOR EACH POSITION (Stage 2)
    # =============================================================================

    print("\n🔄 Stage 2: Loading detailed data for positions...")

    posto_id_list = data_model.auxiliary_data.get('posto_id_list', [])

    if posto_id_list:
        # Only one position is processed, as an example. The notebook uses the
        # entry at index 1 (the second one); the index is clamped so a list
        # with a single position no longer raises IndexError outside the try.
        posto_index = min(1, len(posto_id_list) - 1)
        posto_id = posto_id_list[posto_index]
        print(f"📍 Processing position ID: {posto_id}")

        try:
            # Load colaborador info
            success = data_model.load_colaborador_info(data_manager, posto_id)
            if success:
                print("   ✅ Colaborador info loaded")
                df_colaborador = data_model.raw_data.get('df_colaborador')
                if df_colaborador is not None:
                    print(f"      📊 {len(df_colaborador)} employee records")
            else:
                print("   ❌ Failed to load colaborador info")

            # Load estimativas info for the configured date range
            success = data_model.load_estimativas_info(
                data_manager,
                posto_id,
                external_call_data['start_date'],
                external_call_data['end_date']
            )
            if success:
                print("   ✅ Estimativas info loaded")
                df_estimativas = data_model.raw_data.get('df_estimativas')
                if df_estimativas is not None:
                    print(f"      📈 {len(df_estimativas)} estimate records")
            else:
                print("   ❌ Failed to load estimativas info")

            # Load calendario info for the current process and date range
            success = data_model.load_calendario_info(
                data_manager,
                external_call_data['current_process_id'],
                posto_id,
                external_call_data['start_date'],
                external_call_data['end_date']
            )
            if success:
                print("   ✅ Calendario info loaded")
                df_calendario = data_model.raw_data.get('df_calendario')
                if df_calendario is not None:
                    print(f"      📅 Calendar matrix: {df_calendario.shape}")
            else:
                print("   ❌ Failed to load calendario info")

        except Exception as e:
            print(f"   ❌ Error loading data for position {posto_id}: {e}")
            logger.error(f"Position {posto_id} error: {e}", exc_info=True)
    else:
        # Previously silent: make it obvious why nothing was loaded.
        print("   ⚠️ No position IDs available - skipping Stage 2")


🚀 Interactive Development Environment - Algoritmo GD Project
2025-06-26 14:37:28,374 |     INFO | Logger initialized for algoritmo_GD
✅ Project modules imported successfully
📁 Project: algoritmo_GD
🗂️  Root directory: c:\ALCAMPO\python-algorithms\algortimo-gd

📋 Setting up configuration and external data...
📊 Data source: Database
📅 Date range: 2025-01-01 to 2025-12-31
🔢 Process ID: 249730

🔧 Initializing data manager and components...
Creating components for project: algoritmo_GD
2025-06-26 14:37:30,713 |     INFO | Data manager for 'db' not registered, trying built-in managers
2025-06-26 14:37:30,715 |     INFO | Initialized BaseDataManager
✅ Data manager created successfully

📊 Loading project data into memory...
2025-06-26 14:37:30,716 |     INFO | Initialized DBDataContainer
2025-06-26 14:37:30,718 |     INFO | Initializing database data container with URL: oracle+cx_oracle://ANTONIO_ALVES:***REDACTED***@10.175.28.20:1523/?service_name=WFM_ALCAMPO_TST01
2025-06-26 14:37:37,697 |   

In [2]:
with data_manager:
    # =============================================================================
    # 4.3 PERFORM DATA TRANSFORMATIONS (Stage 3)
    # =============================================================================

    print("\n🔄 Stage 3: Performing data transformations...")

    # Tracks whether every step succeeded so the closing message is truthful.
    stage3_ok = True

    try:
        # Run the three transformations in their original order. A step that
        # reports failure is flagged but does not stop the following steps,
        # matching the previous flow (assumes the loaders return a truthy
        # value on success, as the `if success` checks already did).
        transformation_steps = (
            ("Estimativas", data_model.load_estimativas_transformations),
            ("Colaborador", data_model.load_colaborador_transformations),
            ("Calendario", data_model.load_calendario_transformations),
        )
        for step_label, run_step in transformation_steps:
            success = run_step()
            if success:
                print(f"   ✅ {step_label} transformations completed")
            else:
                stage3_ok = False
                print(f"   ❌ {step_label} transformations failed")

        # Store matriz2_bk before func_inicializa
        data_model.medium_data['matriz2_bk'] = data_model.raw_data['df_calendario'].copy()

        # Debug: Print matriz2_bk info
        matriz2_bk = data_model.medium_data['matriz2_bk']
        print("\n🔍 Debug matriz2_bk before func_inicializa:")
        print(f"   Shape: {matriz2_bk.shape}")
        print(f"   First few rows:\n{matriz2_bk.head()}")
        print(f"   Columns: {matriz2_bk.columns.tolist()}")

        # Debug: Print raw_data['df_estimativas'] before func_inicializa
        print("\n🔍 Debug raw_data['df_estimativas'] before func_inicializa:")
        df_est = data_model.raw_data['df_estimativas']
        print(f"   Shape: {df_est.shape}")
        print(f"   Columns: {df_est.columns.tolist()}")
        print(f"   First few rows:\n{df_est.head()}")
    except Exception as e:
        # Best effort for interactive use: report and log instead of aborting.
        stage3_ok = False
        print(f"   ❌ Error in transformations: {e}")
        logger.error(f"Transformation error: {e}", exc_info=True)

# Only claim success when nothing above failed.
if stage3_ok:
    print("\n🎉 Data loading completed!")
else:
    print("\n⚠️ Data loading finished with errors - see the messages above")

2025-06-26 14:38:40,891 |     INFO | Connected to database: oracle+cx_oracle://ANTONIO_ALVES:***REDACTED***@10.175.28.20:1523/?service_name=WFM_ALCAMPO_TST01

🔄 Stage 3: Performing data transformations...
2025-06-26 14:38:40,988 |     INFO | df_feriados_filtered columns: ['fk_unidade', 'fk_pais', 'fk_estado', 'fk_cidade', 'database', 'descricao', 'tipo', 'feriado_fixo']
   ✅ Estimativas transformations completed
2025-06-26 14:38:53,002 |     INFO | Starting load_ma_bd processing
2025-06-26 14:38:53,099 |     INFO | teste entre novo convenio


  output_final = output_final.fillna(0)
  matriz_ma[non_date_columns] = matriz_ma[non_date_columns].fillna(0)


2025-06-26 14:38:53,161 |     INFO | teste entre novo convenio
2025-06-26 14:38:53,331 |    ERROR | Empleado 5039237 sin suficiente LQ para fines de semana de calidad
2025-06-26 14:38:53,333 |     INFO | columnes matriz a: ['fk_colaborador', 'unidade', 'secao', 'posto', 'convenio', 'nome', 'matricula', 'min_dia_trab', 'max_dia_trab', 'tipo_turno', 'seq_turno', 't_total', 'l_total', 'dyf_max_t', 'q', 'c2d', 'c3d', 'cxx', 'semana_1', 'out', 'ciclo', 'data_admissao', 'data_demissao', 'fk_tipo_posto', 'h_tm_in', 'h_tm_out', 'h_tt_in', 'h_tt_out', 'h_seg_in', 'h_seg_out', 'h_ter_in', 'h_ter_out', 'h_qua_in', 'h_qua_out', 'h_qui_in', 'h_qui_out', 'h_sex_in', 'h_sex_out', 'h_sab_in', 'h_sab_out', 'h_dom_in', 'h_dom_out', 'h_fer_in', 'h_fer_out', 'limite_superior_manha', 'limite_inferior_tarde', 'emp', 'lq', 'min', 'max', 'tipo_contrato']
2025-06-26 14:38:53,374 |     INFO | load_ma_bd completed successfully. Processed 13 employees.
   ✅ Colaborador transformations completed
2025-06-26 14:38:5

In [3]:
with data_manager:
    # Tracks the outcome so the closing message is truthful.
    inicializa_ok = False

    try:
        # Perform func_inicializa
        # NOTE(review): .get() passes None silently when a key is missing, and
        # the commented debug prints of the Stage 3 cell refer to
        # 'df_feriados' - confirm 'df_festivos' is the intended key.
        success = data_model.func_inicializa(
            start_date=external_call_data['start_date'],
            end_date=external_call_data['end_date'],
            fer=data_model.auxiliary_data.get('df_festivos'),
            closed_days=data_model.auxiliary_data.get('df_closed_days')
        )
        if success:
            inicializa_ok = True
            print("   ✅ func_inicializa completed")

            # Debug: Print medium_data['df_estimativas'] after func_inicializa
            print("\n🔍 Debug medium_data['df_estimativas'] after func_inicializa:")
            df_est = data_model.medium_data['df_estimativas']
            print(f"   Shape: {df_est.shape}")
            print(f"   Columns: {df_est.columns.tolist()}")
            print(f"   First few rows:\n{df_est.head()}")
        else:
            # Previously a falsy result was ignored without any message.
            print("   ❌ func_inicializa failed")

    except Exception as e:
        # The message now names the step that actually failed.
        inicializa_ok = False
        print(f"   ❌ Error in func_inicializa: {e}")
        logger.error(f"func_inicializa error: {e}", exc_info=True)

# Only claim success when func_inicializa really completed.
if inicializa_ok:
    print("\n🎉 Data loading completed!")
else:
    print("\n⚠️ Data loading finished with errors - see the messages above")


2025-06-26 14:39:04,042 |     INFO | Connected to database: oracle+cx_oracle://ANTONIO_ALVES:***REDACTED***@10.175.28.20:1523/?service_name=WFM_ALCAMPO_TST01
2025-06-26 14:39:04,044 |     INFO | === Debug matrizB_og (df_estimativas) ===
2025-06-26 14:39:04,047 |     INFO | Shape: (730, 8)
2025-06-26 14:39:04,049 |     INFO | Columns: ['data', 'media_turno', 'max_turno', 'min_turno', 'sd_turno', 'turno', 'fk_tipo_posto', 'data_turno']
2025-06-26 14:39:04,058 |     INFO | First few rows:
        data  media_turno  max_turno  min_turno  sd_turno turno fk_tipo_posto  \
0 2025-01-01          0.0        0.0        0.0       0.0     M          None   
1 2025-01-02          0.0        0.0        0.0       0.0     M          None   
2 2025-01-03          0.0        0.0        0.0       0.0     M          None   
3 2025-01-04          0.0        0.0        0.0       0.0     M          None   
4 2025-01-05          0.0        0.0        0.0       0.0     M          None   

     data_turno  
0  20

  matrizB_ini.loc[matrizB_ini['data'].isin(special_dates), 'min_turno'] = matrizB_ini['max_turno']
  mask_friday = (matrizB_ini['data'].isin(friday_dates)) & (matrizB_ini['turno'] == 'M')


2025-06-26 14:39:04,774 |     INFO | Columns in matrizA_og after processing: ['fk_colaborador', 'unidade', 'secao', 'posto', 'convenio', 'nome', 'matricula', 'min_dia_trab', 'max_dia_trab', 'tipo_turno', 'seq_turno', 't_total', 'l_total', 'dyf_max_t', 'q', 'c2d', 'c3d', 'cxx', 'semana_1', 'out', 'ciclo', 'data_admissao', 'data_demissao', 'fk_tipo_posto', 'h_tm_in', 'h_tm_out', 'h_tt_in', 'h_tt_out', 'h_seg_in', 'h_seg_out', 'h_ter_in', 'h_ter_out', 'h_qua_in', 'h_qua_out', 'h_qui_in', 'h_qui_out', 'h_sex_in', 'h_sex_out', 'h_sab_in', 'h_sab_out', 'h_dom_in', 'h_dom_out', 'h_fer_in', 'h_fer_out', 'limite_superior_manha', 'limite_inferior_tarde', 'emp', 'lq', 'min', 'max', 'tipo_contrato', 'ld', 'l_dom', 'lq_og', 'total_dom_fes', 'total_fes', 'total_holidays', 'descansos_atrb', 'COLABORADOR', 'LD_at', 'LQ_at', 'LRES_at', 'CXX_at', 'C2D_at', 'C3D_at']
2025-06-26 14:39:20,093 |     INFO | matrizB_m:            data  media_turno  max_turno  min_turno  sd_turno turno  \
0    2025-01-01      

In [None]:
# NOTE(review): the Stage 3 cell stores 'matriz2_bk' in data_model.medium_data;
# nothing visible in this notebook writes it to auxiliary_data, so this lookup
# raises KeyError unless project code (e.g. func_inicializa) populates it - confirm.
df_matriz2_bk = data_model.auxiliary_data['matriz2_bk']
# Calendar DataFrame as currently held in raw_data.
df_matriz_og = data_model.raw_data['df_calendario']
# Copy of raw_data['df_calendario'] taken by the Stage 3 cell before func_inicializa.
df_cal2 = data_model.medium_data['matriz2_bk']

In [None]:

# =============================================================================
# 5. ORGANIZE DATAFRAMES FOR EASY ACCESS
# =============================================================================

print("\n📊 Organizing DataFrames for interactive access...")

# One dict per data-model stage; each keeps only the entries that are DataFrames.
auxiliary_dataframes = {}
raw_dataframes = {}
medium_dataframes = {}
rare_dataframes = {}
formatted_dataframes = {}

# (target dict, data_model attribute) pairs, walked in stage order.
_stage_buckets = (
    (auxiliary_dataframes, 'auxiliary_data'),   # auxiliary data
    (raw_dataframes, 'raw_data'),               # raw data
    (medium_dataframes, 'medium_data'),         # medium data (transformed)
    (rare_dataframes, 'rare_data'),             # rare data (algorithm results)
    (formatted_dataframes, 'formatted_data'),   # formatted data (final output)
)
for _bucket, _stage_attr in _stage_buckets:
    for key, value in getattr(data_model, _stage_attr).items():
        if isinstance(value, pd.DataFrame):
            _bucket[key] = value

# =============================================================================
# 6. DISPLAY AVAILABLE DATAFRAMES
# =============================================================================

print("\n📋 AVAILABLE DATAFRAMES")
print("=" * 70)

# Display label -> collected DataFrames, in the order they are printed.
all_dataframes = {
    "🗂️ AUXILIARY": auxiliary_dataframes,
    "📁 RAW": raw_dataframes,
    "⚙️ MEDIUM (Transformed)": medium_dataframes,
    "💎 RARE (Algorithm Results)": rare_dataframes,
    "📊 FORMATTED (Final)": formatted_dataframes,
}

for category, dataframes in all_dataframes.items():
    # Empty categories get a one-line placeholder instead of a listing.
    if not dataframes:
        print(f"\n{category}: (no DataFrames yet)")
        continue
    print(f"\n{category}:")
    for name, df in dataframes.items():
        print(f"   📋 {name:<25} → {df.shape[0]:>6} rows × {df.shape[1]:>3} columns")

# =============================================================================
# 7. QUICK ACCESS VARIABLES AND UTILITY FUNCTIONS
# =============================================================================

print(f"\n🔗 QUICK ACCESS VARIABLES")
print("=" * 70)

# Bind the most frequently used DataFrames to short notebook-level names.
# Each name is (re)bound only when its key exists in the matching dict, so a
# DataFrame that has not been loaded leaves any earlier binding untouched.
# The membership checks already guard the look-ups; the broad except is only
# a safety net that keeps the cell running if reading a shape fails.
try:
    # Auxiliary data
    if 'valid_emp' in auxiliary_dataframes:
        valid_emp = auxiliary_dataframes['valid_emp']
        print(f"✅ valid_emp           → {valid_emp.shape}")
    
    # Raw data
    if 'df_colaborador' in raw_dataframes:
        df_colaborador = raw_dataframes['df_colaborador']
        print(f"✅ df_colaborador      → {df_colaborador.shape}")
    
    if 'df_estimativas' in raw_dataframes:
        df_estimativas = raw_dataframes['df_estimativas']
        print(f"✅ df_estimativas      → {df_estimativas.shape}")
    
    if 'df_calendario' in raw_dataframes:
        df_calendario = raw_dataframes['df_calendario']
        print(f"✅ df_calendario       → {df_calendario.shape}")
    
    # Medium (transformed) data
    if 'matrizA_bk' in medium_dataframes:
        matrizA_bk = medium_dataframes['matrizA_bk']
        print(f"✅ matrizA_bk          → {matrizA_bk.shape}")
    
    if 'matriz2_bk' in medium_dataframes:
        matriz2_bk = medium_dataframes['matriz2_bk']
        print(f"✅ matriz2_bk          → {matriz2_bk.shape}")
    
    if 'matrizB_bk' in medium_dataframes:
        matrizB_bk = medium_dataframes['matrizB_bk']
        print(f"✅ matrizB_bk          → {matrizB_bk.shape}")
        
except Exception as e:
    print(f"⚠️ Some DataFrames may not be available yet: {e}")

# =============================================================================
# 8. UTILITY FUNCTIONS FOR DATA EXPLORATION
# =============================================================================

def explore_df(df, name="DataFrame"):
    """Print an overview of a DataFrame and return it unchanged.

    The report contains the shape, deep memory usage, one line per column
    (dtype and null count), the first three rows and, when numeric columns
    exist, their ``describe()`` summary.

    Args:
        df: The pandas DataFrame to inspect.
        name: Label shown in the report header.

    Returns:
        The same ``df`` object, so the call can be chained.
    """
    print(f"\n🔍 EXPLORING: {name}")
    print("=" * 60)
    print(f"📏 Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print(f"💾 Memory usage: {df.memory_usage(deep=True).sum() / 1024:.1f} KB")

    print(f"\n📋 Columns ({len(df.columns)}):")
    # Work positionally rather than through df[col]: with duplicate column
    # names df[col] is a DataFrame (no .dtype), which made the old loop fail.
    null_counts = df.isnull().sum()
    for position, (col, dtype) in enumerate(zip(df.columns, df.dtypes)):
        # str() keeps the width format valid for non-string labels such as
        # tuples (MultiIndex columns), which reject the '<20' format spec.
        print(f"   {position + 1:2d}. {str(col):<20} ({dtype}) - {null_counts.iloc[position]} nulls")

    print(f"\n📊 First 3 rows:")
    print(df.head(3).to_string())

    # Numeric summary; describe() runs on the selected frame itself so it also
    # works when column names are duplicated.
    numeric_part = df.select_dtypes(include=[np.number])
    if len(numeric_part.columns) > 0:
        print(f"\n📈 Numeric columns summary:")
        print(numeric_part.describe())

    return df

def compare_dfs(*dataframes, names=None):
    """Print the shape of each DataFrame and the columns they all share.

    Args:
        *dataframes: DataFrames to compare.
        names: Optional labels, one per DataFrame; defaults to
            ``DataFrame_1``, ``DataFrame_2``, ...
    """
    labels = names
    if labels is None:
        labels = [f"DataFrame_{number}" for number in range(1, len(dataframes) + 1)]

    print(f"\n🔄 COMPARING DATAFRAMES")
    print("=" * 60)

    for label, frame in zip(labels, dataframes):
        print(f"📋 {label:<20} → {frame.shape[0]:>6} rows × {frame.shape[1]:>3} columns")

    # Shared columns only make sense with at least two DataFrames.
    if len(dataframes) <= 1:
        return

    column_sets = [set(frame.columns) for frame in dataframes]
    shared = column_sets[0]
    for other in column_sets[1:]:
        shared = shared & other

    print(f"\n🔗 Common columns ({len(shared)}):")
    for col in sorted(shared):
        print(f"   • {col}")

def show_sample_data(df_dict, category_name, n_rows=3):
    """Print the first rows of every DataFrame in a category.

    Args:
        df_dict: Mapping of DataFrame name to DataFrame.
        category_name: Label shown in the header.
        n_rows: Number of leading rows to print per DataFrame.
    """
    print(f"\n📖 SAMPLE DATA: {category_name}")
    print("=" * 60)

    for frame_name, frame in df_dict.items():
        total_rows = len(frame)
        print(f"\n🔹 {frame_name} (showing {min(n_rows, total_rows)} rows):")
        # Empty frames get a placeholder instead of an empty table dump.
        preview = frame.head(n_rows).to_string() if total_rows > 0 else "   (empty DataFrame)"
        print(preview)

def search_columns(pattern, df_dict=None):
    """Print every column whose name contains ``pattern`` (case-insensitive).

    Args:
        pattern: Substring to look for in the column names.
        df_dict: Optional mapping of DataFrame name to DataFrame. When omitted,
            all notebook-level collections are searched (auxiliary, raw,
            medium, rare and formatted) and each hit is labelled
            ``<category>.<name>``.
    """
    if df_dict is None:
        # Walk each category separately instead of merging the dicts: a merge
        # silently drops same-named frames (for example 'df_estimativas' is
        # stored in both raw_data and medium_data by the loading cells).
        categories = (
            ("auxiliary", auxiliary_dataframes),
            ("raw", raw_dataframes),
            ("medium", medium_dataframes),
            ("rare", rare_dataframes),
            ("formatted", formatted_dataframes),
        )
        candidates = [
            (f"{category}.{df_name}", df)
            for category, frames in categories
            for df_name, df in frames.items()
        ]
    else:
        candidates = list(df_dict.items())

    print(f"\n🔍 SEARCHING COLUMNS: '{pattern}'")
    print("=" * 60)

    needle = str(pattern).lower()
    found = False
    for df_name, df in candidates:
        # str() lets non-string labels (ints, dates, tuples) be searched
        # instead of raising AttributeError on .lower().
        matching_cols = [col for col in df.columns if needle in str(col).lower()]
        if matching_cols:
            found = True
            print(f"\n📋 {df_name}:")
            for col in matching_cols:
                print(f"   • {col}")

    if not found:
        print(f"❌ No columns found matching '{pattern}'")

def df_info():
    """Print shape and deep memory usage of every collected DataFrame.

    Reads the notebook-level category dicts; categories without any
    DataFrame are omitted from the report.
    """
    print(f"\n📊 ALL DATAFRAMES INFO")
    print("=" * 70)

    bytes_per_mb = 1024 * 1024
    report_sections = (
        ("🗂️ AUXILIARY", auxiliary_dataframes),
        ("📁 RAW", raw_dataframes),
        ("⚙️ MEDIUM", medium_dataframes),
        ("💎 RARE", rare_dataframes),
        ("📊 FORMATTED", formatted_dataframes),
    )

    for section_title, frames in report_sections:
        if not frames:
            continue
        print(f"\n{section_title}:")
        for frame_name, frame in frames.items():
            memory_mb = frame.memory_usage(deep=True).sum() / bytes_per_mb
            print(f"   📋 {frame_name:<25} → {frame.shape[0]:>6} rows × {frame.shape[1]:>3} cols ({memory_mb:.1f} MB)")

# =============================================================================
# 9. INSTRUCTIONS AND EXAMPLES
# =============================================================================

# Static help text for the notebook user. Each section is emitted with a
# single print call; the output is line-for-line the same as printing every
# entry separately.
print(f"\n🛠️ UTILITY FUNCTIONS AVAILABLE:")
print("=" * 70)
print("\n".join([
    "🔍 explore_df(dataframe, 'name')              → Detailed DataFrame exploration",
    "🔄 compare_dfs(df1, df2, names=['A', 'B'])    → Compare multiple DataFrames",
    "📖 show_sample_data(df_dict, 'category', 5)   → Show sample data from category",
    "🔍 search_columns('pattern')                  → Find columns matching pattern",
    "📊 df_info()                                  → Show all DataFrames info",
]))

print(f"\n💡 EXAMPLE USAGE:")
print("=" * 70)
print("\n".join([
    "# Explore specific DataFrames",
    "explore_df(valid_emp, 'Valid Employees')",
    "explore_df(df_colaborador, 'Employee Details')",
    "",
    "# Compare DataFrames",
    "compare_dfs(df_colaborador, matrizA_bk, names=['Raw', 'Processed'])",
    "",
    "# Show sample data",
    "show_sample_data(raw_dataframes, 'Raw Data', 3)",
    "",
    "# Search for specific columns",
    "search_columns('matricula')",
    "search_columns('data')",
    "",
    "# Access DataFrames directly",
    "valid_emp.head()",
    "df_colaborador.describe()",
    "matrizA_bk.columns",
]))

print(f"\n🎯 DIRECT ACCESS TO PROJECT DATA:")
print("=" * 70)
print("\n".join([
    "📊 data_model.auxiliary_data    → Dictionary with auxiliary data",
    "📁 data_model.raw_data          → Dictionary with raw DataFrames",
    "⚙️ data_model.medium_data       → Dictionary with transformed DataFrames",
    "💎 data_model.rare_data         → Dictionary with algorithm results",
    "📋 data_model.formatted_data    → Dictionary with final formatted data",
    "",
    "📊 auxiliary_dataframes         → Easy access to auxiliary DataFrames",
    "📁 raw_dataframes              → Easy access to raw DataFrames",
    "⚙️ medium_dataframes           → Easy access to medium DataFrames",
]))

print(f"\n✨ READY FOR INTERACTIVE DEVELOPMENT!")
print("\n".join([
    "🔧 All project DataFrames are loaded and available in memory",
    "📝 Use the utility functions above to explore and analyze the data",
    "🚀 Start developing your data transformations!",
]))

In [None]:
    # =============================================================================
    # 4.3 PERFORM DATA TRANSFORMATIONS (Stage 3)
    # =============================================================================
with data_manager:
    print("\n🔄 Stage 3: Performing data transformations...")

    # Tracks whether every step succeeded so the closing message is truthful.
    stage3_ok = True

    try:
        # Run the three transformations in their original order. A step that
        # reports failure is flagged but does not stop the following steps,
        # matching the previous flow (assumes the loaders return a truthy
        # value on success, as the `if success` checks already did).
        transformation_steps = (
            ("Estimativas", data_model.load_estimativas_transformations),
            ("Colaborador", data_model.load_colaborador_transformations),
            ("Calendario", data_model.load_calendario_transformations),
        )
        for step_label, run_step in transformation_steps:
            success = run_step()
            if success:
                print(f"   ✅ {step_label} transformations completed")
            else:
                stage3_ok = False
                print(f"   ❌ {step_label} transformations failed")

        # Store matriz2_bk before func_inicializa
        data_model.medium_data['matriz2_bk'] = data_model.raw_data['df_calendario'].copy()

        # Debug: Print matriz2_bk info
        matriz2_bk = data_model.medium_data['matriz2_bk']
        print("\n🔍 Debug matriz2_bk before func_inicializa:")
        print(f"   Shape: {matriz2_bk.shape}")
        print(f"   First few rows:\n{matriz2_bk.head()}")
        print(f"   Columns: {matriz2_bk.columns.tolist()}")

        # Debug: Print raw_data['df_estimativas'] before func_inicializa
        print("\n🔍 Debug raw_data['df_estimativas'] before func_inicializa:")
        df_est = data_model.raw_data['df_estimativas']
        print(f"   Shape: {df_est.shape}")
        print(f"   Columns: {df_est.columns.tolist()}")
        print(f"   First few rows:\n{df_est.head()}")

        # Perform func_inicializa
        # NOTE(review): .get() passes None silently when a key is missing -
        # confirm 'df_festivos' and 'df_closed_days' are the keys the loaders set.
        success = data_model.func_inicializa(
            start_date=external_call_data['start_date'],
            end_date=external_call_data['end_date'],
            fer=data_model.auxiliary_data.get('df_festivos'),
            closed_days=data_model.auxiliary_data.get('df_closed_days')
        )
        if success:
            print("   ✅ func_inicializa completed")

            # Debug: Print medium_data['df_estimativas'] after func_inicializa
            print("\n🔍 Debug medium_data['df_estimativas'] after func_inicializa:")
            df_est = data_model.medium_data['df_estimativas']
            print(f"   Shape: {df_est.shape}")
            print(f"   Columns: {df_est.columns.tolist()}")
            print(f"   First few rows:\n{df_est.head()}")
        else:
            # Previously a falsy result was ignored without any message.
            stage3_ok = False
            print("   ❌ func_inicializa failed")

    except Exception as e:
        # Best effort for interactive use: report and log instead of aborting.
        stage3_ok = False
        print(f"   ❌ Error in transformations: {e}")
        logger.error(f"Transformation error: {e}", exc_info=True)

# Only claim success when nothing above failed.
if stage3_ok:
    print("\n🎉 Data loading completed!")
else:
    print("\n⚠️ Data loading finished with errors - see the messages above")

# =============================================================================
# 5. ORGANIZE DATAFRAMES FOR EASY ACCESS
# =============================================================================

print("\n📊 Organizing DataFrames for interactive access...")

# One dict per data-model stage; each keeps only the entries that are DataFrames.
auxiliary_dataframes = {}
raw_dataframes = {}
medium_dataframes = {}
rare_dataframes = {}
formatted_dataframes = {}

# (target dict, data_model attribute) pairs, walked in stage order.
_stage_buckets = (
    (auxiliary_dataframes, 'auxiliary_data'),   # auxiliary data
    (raw_dataframes, 'raw_data'),               # raw data
    (medium_dataframes, 'medium_data'),         # medium data (transformed)
    (rare_dataframes, 'rare_data'),             # rare data (algorithm results)
    (formatted_dataframes, 'formatted_data'),   # formatted data (final output)
)
for _bucket, _stage_attr in _stage_buckets:
    for key, value in getattr(data_model, _stage_attr).items():
        if isinstance(value, pd.DataFrame):
            _bucket[key] = value

# =============================================================================
# 6. DISPLAY AVAILABLE DATAFRAMES
# =============================================================================

print("\n📋 AVAILABLE DATAFRAMES")
print("=" * 70)

# Display label -> collected DataFrames, in the order they are printed.
all_dataframes = {
    "🗂️ AUXILIARY": auxiliary_dataframes,
    "📁 RAW": raw_dataframes,
    "⚙️ MEDIUM (Transformed)": medium_dataframes,
    "💎 RARE (Algorithm Results)": rare_dataframes,
    "📊 FORMATTED (Final)": formatted_dataframes,
}

for category, dataframes in all_dataframes.items():
    # Empty categories get a one-line placeholder instead of a listing.
    if not dataframes:
        print(f"\n{category}: (no DataFrames yet)")
        continue
    print(f"\n{category}:")
    for name, df in dataframes.items():
        print(f"   📋 {name:<25} → {df.shape[0]:>6} rows × {df.shape[1]:>3} columns")

# =============================================================================
# 7. QUICK ACCESS VARIABLES AND UTILITY FUNCTIONS
# =============================================================================

print(f"\n🔗 QUICK ACCESS VARIABLES")
print("=" * 70)

# Bind the most frequently used DataFrames to short notebook-level names.
# Each name is (re)bound only when its key exists in the matching dict, so a
# DataFrame that has not been loaded leaves any earlier binding untouched.
# The membership checks already guard the look-ups; the broad except is only
# a safety net that keeps the cell running if reading a shape fails.
try:
    # Auxiliary data
    if 'valid_emp' in auxiliary_dataframes:
        valid_emp = auxiliary_dataframes['valid_emp']
        print(f"✅ valid_emp           → {valid_emp.shape}")
    
    # Raw data
    if 'df_colaborador' in raw_dataframes:
        df_colaborador = raw_dataframes['df_colaborador']
        print(f"✅ df_colaborador      → {df_colaborador.shape}")
    
    if 'df_estimativas' in raw_dataframes:
        df_estimativas = raw_dataframes['df_estimativas']
        print(f"✅ df_estimativas      → {df_estimativas.shape}")
    
    if 'df_calendario' in raw_dataframes:
        df_calendario = raw_dataframes['df_calendario']
        print(f"✅ df_calendario       → {df_calendario.shape}")
    
    # Medium (transformed) data
    if 'matrizA_bk' in medium_dataframes:
        matrizA_bk = medium_dataframes['matrizA_bk']
        print(f"✅ matrizA_bk          → {matrizA_bk.shape}")
    
    if 'matriz2_bk' in medium_dataframes:
        matriz2_bk = medium_dataframes['matriz2_bk']
        print(f"✅ matriz2_bk          → {matriz2_bk.shape}")
    
    if 'matrizB_bk' in medium_dataframes:
        matrizB_bk = medium_dataframes['matrizB_bk']
        print(f"✅ matrizB_bk          → {matrizB_bk.shape}")
        
except Exception as e:
    print(f"⚠️ Some DataFrames may not be available yet: {e}")

# =============================================================================
# 8. UTILITY FUNCTIONS FOR DATA EXPLORATION
# =============================================================================

def explore_df(df, name="DataFrame"):
    """Print a detailed overview of a DataFrame and return it unchanged.

    The report shows the shape, the deep memory usage, one line per column
    (dtype and null count), the first three rows and, when numeric columns
    exist, their ``describe()`` summary.

    Args:
        df: The pandas DataFrame to inspect.
        name: Label shown in the report header.

    Returns:
        The same ``df`` object, so the call can be chained.
    """
    print(f"\n🔍 EXPLORING: {name}")
    print("=" * 60)
    print(f"📏 Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print(f"💾 Memory usage: {df.memory_usage(deep=True).sum() / 1024:.1f} KB")

    print(f"\n📋 Columns ({len(df.columns)}):")
    # dtypes / null counts are taken positionally so duplicated labels do not
    # break the listing (df[col] would return a DataFrame in that case).
    per_column = zip(df.columns, df.dtypes, df.isnull().sum())
    for position, (col, dtype, null_count) in enumerate(per_column, start=1):
        # str(col): a Timestamp label would otherwise treat "<20" as a
        # strftime pattern, and a tuple label would raise TypeError.
        print(f"   {position:2d}. {str(col):<20} ({dtype}) - {null_count} nulls")

    print(f"\n📊 First 3 rows:")
    print(df.head(3).to_string())

    # Numeric summary
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] > 0:
        print(f"\n📈 Numeric columns summary:")
        print(numeric.describe())

    return df

def compare_dfs(*dataframes, names=None):
    """Print the shape of each DataFrame and the columns they all share.

    Args:
        *dataframes: The pandas DataFrames to compare.
        names: Optional labels, one per DataFrame. When fewer labels than
            DataFrames are given, the remaining ones get ``DataFrame_<n>``.

    Returns:
        None. The comparison is printed.
    """
    labels = [] if names is None else list(names)
    # zip() stops at the shorter input, so a short `names` list used to hide
    # the trailing DataFrames; pad with positional default labels instead.
    labels += [f"DataFrame_{i+1}" for i in range(len(labels), len(dataframes))]

    print(f"\n🔄 COMPARING DATAFRAMES")
    print("=" * 60)

    for label, df in zip(labels, dataframes):
        print(f"📋 {str(label):<20} → {df.shape[0]:>6} rows × {df.shape[1]:>3} columns")

    # Check for common columns
    if len(dataframes) > 1:
        common_cols = set.intersection(*(set(df.columns) for df in dataframes))
        try:
            ordered = sorted(common_cols)
        except TypeError:
            # Mixed label types (e.g. int and str) are not mutually orderable.
            ordered = sorted(common_cols, key=str)

        print(f"\n🔗 Common columns ({len(common_cols)}):")
        for col in ordered:
            print(f"   • {col}")

def show_sample_data(df_dict, category_name, n_rows=3):
    """Print the first rows of every DataFrame in a category.

    Args:
        df_dict: Mapping of DataFrame name to DataFrame.
        category_name: Label printed in the section header.
        n_rows: Maximum number of rows shown per DataFrame.
    """
    print(f"\n📖 SAMPLE DATA: {category_name}")
    print("=" * 60)

    for label, frame in df_dict.items():
        total_rows = len(frame)
        print(f"\n🔹 {label} (showing {min(n_rows, total_rows)} rows):")
        preview = frame.head(n_rows).to_string() if total_rows > 0 else "   (empty DataFrame)"
        print(preview)

def search_columns(pattern, df_dict=None):
    """Print every column whose label contains ``pattern`` (case-insensitive).

    Args:
        pattern: Substring to look for in the column labels.
        df_dict: Mapping of DataFrame name to DataFrame. When omitted, the
            module-level ``auxiliary_dataframes``, ``raw_dataframes`` and
            ``medium_dataframes`` dicts are merged and searched.

    Returns:
        None. Matches are printed, grouped by DataFrame.
    """
    if df_dict is None:
        df_dict = {**auxiliary_dataframes, **raw_dataframes, **medium_dataframes}

    print(f"\n🔍 SEARCHING COLUMNS: '{pattern}'")
    print("=" * 60)

    needle = str(pattern).lower()
    found = False
    for df_name, df in df_dict.items():
        # Labels are not always strings (ints, Timestamps, ...); compare on
        # their text form instead of calling .lower() on the raw label.
        matching_cols = [col for col in df.columns if needle in str(col).lower()]
        if matching_cols:
            found = True
            print(f"\n📋 {df_name}:")
            for col in matching_cols:
                print(f"   • {col}")

    if not found:
        print(f"❌ No columns found matching '{pattern}'")

def df_info():
    """Print size and memory usage for every DataFrame in the notebook dicts.

    Reads the module-level ``auxiliary_dataframes``, ``raw_dataframes``,
    ``medium_dataframes``, ``rare_dataframes`` and ``formatted_dataframes``
    dicts; categories that are empty are skipped.
    """
    print(f"\n📊 ALL DATAFRAMES INFO")
    print("=" * 70)

    bytes_per_mb = 1024 * 1024
    stages = (
        ("🗂️ AUXILIARY", auxiliary_dataframes),
        ("📁 RAW", raw_dataframes),
        ("⚙️ MEDIUM", medium_dataframes),
        ("💎 RARE", rare_dataframes),
        ("📊 FORMATTED", formatted_dataframes),
    )

    for title, frames in stages:
        if not frames:
            continue
        print(f"\n{title}:")
        for label, frame in frames.items():
            memory_mb = frame.memory_usage(deep=True).sum() / bytes_per_mb
            print(f"   📋 {label:<25} → {frame.shape[0]:>6} rows × {frame.shape[1]:>3} cols ({memory_mb:.1f} MB)")

# =============================================================================
# 9. INSTRUCTIONS AND EXAMPLES
# =============================================================================

# Each section is emitted with a single print call; sep="\n" yields the same
# output as one print per line.
print(
    "\n🛠️ UTILITY FUNCTIONS AVAILABLE:",
    "=" * 70,
    "🔍 explore_df(dataframe, 'name')              → Detailed DataFrame exploration",
    "🔄 compare_dfs(df1, df2, names=['A', 'B'])    → Compare multiple DataFrames",
    "📖 show_sample_data(df_dict, 'category', 5)   → Show sample data from category",
    "🔍 search_columns('pattern')                  → Find columns matching pattern",
    "📊 df_info()                                  → Show all DataFrames info",
    sep="\n",
)

print(
    "\n💡 EXAMPLE USAGE:",
    "=" * 70,
    "# Explore specific DataFrames",
    "explore_df(valid_emp, 'Valid Employees')",
    "explore_df(df_colaborador, 'Employee Details')",
    "",
    "# Compare DataFrames",
    "compare_dfs(df_colaborador, matrizA_bk, names=['Raw', 'Processed'])",
    "",
    "# Show sample data",
    "show_sample_data(raw_dataframes, 'Raw Data', 3)",
    "",
    "# Search for specific columns",
    "search_columns('matricula')",
    "search_columns('data')",
    "",
    "# Access DataFrames directly",
    "valid_emp.head()",
    "df_colaborador.describe()",
    "matrizA_bk.columns",
    sep="\n",
)

print(
    "\n🎯 DIRECT ACCESS TO PROJECT DATA:",
    "=" * 70,
    "📊 data_model.auxiliary_data    → Dictionary with auxiliary data",
    "📁 data_model.raw_data          → Dictionary with raw DataFrames",
    "⚙️ data_model.medium_data       → Dictionary with transformed DataFrames",
    "💎 data_model.rare_data         → Dictionary with algorithm results",
    "📋 data_model.formatted_data    → Dictionary with final formatted data",
    "",
    "📊 auxiliary_dataframes         → Easy access to auxiliary DataFrames",
    "📁 raw_dataframes              → Easy access to raw DataFrames",
    "⚙️ medium_dataframes           → Easy access to medium DataFrames",
    sep="\n",
)

print(
    "\n✨ READY FOR INTERACTIVE DEVELOPMENT!",
    "🔧 All project DataFrames are loaded and available in memory",
    "📝 Use the utility functions above to explore and analyze the data",
    "🚀 Start developing your data transformations!",
    sep="\n",
)

In [None]:
    # NOTE(review): this cell starts at a 4-space indent with no enclosing
    # statement visible in the cell; presumably pasted from inside a larger
    # block. As written it would raise IndentationError — confirm and dedent.
    # =============================================================================
    # 4.3 PERFORM DATA TRANSFORMATIONS (Stage 3)
    # =============================================================================
    
    print("\n🔄 Stage 3: Performing data transformations...")
    
    # Any exception raised by the steps below is reported and logged, not re-raised.
    try:
        # Load estimativas transformations
        success = data_model.load_estimativas_transformations()
        if success:
            print("   ✅ Estimativas transformations completed")
        
        # Load colaborador transformations  
        success = data_model.load_colaborador_transformations()
        if success:
            print("   ✅ Colaborador transformations completed")
        
        # Load calendario transformations
        success = data_model.load_calendario_transformations()
        if success:
            print("   ✅ Calendario transformations completed")
        
        # Store matriz2_bk before func_inicializa
        # (a copy of raw_data['df_calendario'], saved under medium_data)
        data_model.medium_data['matriz2_bk'] = data_model.raw_data['df_calendario'].copy()
        
        # Debug: Print matriz2_bk info
        matriz2_bk = data_model.medium_data['matriz2_bk']
        print(f"\n🔍 Debug matriz2_bk before func_inicializa:")
        print(f"   Shape: {matriz2_bk.shape}")
        print(f"   First few rows:\n{matriz2_bk.head()}")
        print(f"   Columns: {matriz2_bk.columns.tolist()}")

        # Debug: Print raw_data['df_estimativas'] before func_inicializa
        print("\n🔍 Debug raw_data['df_estimativas'] before func_inicializa:")
        df_est = data_model.raw_data['df_estimativas']
        print(f"   Shape: {df_est.shape}")
        print(f"   Columns: {df_est.columns.tolist()}")
        print(f"   First few rows:\n{df_est.head()}")

        # Perform func_inicializa
        # Dates come from external_call_data; the festivos / closed-days
        # inputs are passed as None when missing from auxiliary_data.
        success = data_model.func_inicializa(
            start_date=external_call_data['start_date'],
            end_date=external_call_data['end_date'],
            fer=data_model.auxiliary_data.get('df_festivos'),
            closed_days=data_model.auxiliary_data.get('df_closed_days')
        )
        if success:
            print("   ✅ func_inicializa completed")
            
            # Debug: Print medium_data['df_estimativas'] after func_inicializa
            # (df_est is rebound here to the medium_data version)
            print("\n🔍 Debug medium_data['df_estimativas'] after func_inicializa:")
            df_est = data_model.medium_data['df_estimativas']
            print(f"   Shape: {df_est.shape}")
            print(f"   Columns: {df_est.columns.tolist()}")
            print(f"   First few rows:\n{df_est.head()}")
            
    except Exception as e:
        print(f"   ❌ Error in transformations: {e}")
        logger.error(f"Transformation error: {e}", exc_info=True)

print("\n🎉 Data loading completed!")

# =============================================================================
# 5. ORGANIZE DATAFRAMES FOR EASY ACCESS
# =============================================================================

print("\n📊 Organizing DataFrames for interactive access...")

# One dict per pipeline stage, all created up front so every name exists
# before any of them is filled.
auxiliary_dataframes = {}
raw_dataframes = {}
medium_dataframes = {}
rare_dataframes = {}
formatted_dataframes = {}

# Keep only the entries that are real DataFrames; other values are skipped.

# Auxiliary data
auxiliary_dataframes.update(
    (key, value) for key, value in data_model.auxiliary_data.items()
    if isinstance(value, pd.DataFrame)
)

# Raw data
raw_dataframes.update(
    (key, value) for key, value in data_model.raw_data.items()
    if isinstance(value, pd.DataFrame)
)

# Medium data (transformed)
medium_dataframes.update(
    (key, value) for key, value in data_model.medium_data.items()
    if isinstance(value, pd.DataFrame)
)

# Rare data (algorithm results)
rare_dataframes.update(
    (key, value) for key, value in data_model.rare_data.items()
    if isinstance(value, pd.DataFrame)
)

# Formatted data (final output)
formatted_dataframes.update(
    (key, value) for key, value in data_model.formatted_data.items()
    if isinstance(value, pd.DataFrame)
)

# =============================================================================
# 6. DISPLAY AVAILABLE DATAFRAMES
# =============================================================================

print("\n📋 AVAILABLE DATAFRAMES")
print("=" * 70)

all_dataframes = {
    "🗂️ AUXILIARY": auxiliary_dataframes,
    "📁 RAW": raw_dataframes,
    "⚙️ MEDIUM (Transformed)": medium_dataframes,
    "💎 RARE (Algorithm Results)": rare_dataframes,
    "📊 FORMATTED (Final)": formatted_dataframes,
}

for category, dataframes in all_dataframes.items():
    if not dataframes:
        print(f"\n{category}: (no DataFrames yet)")
        continue
    print(f"\n{category}:")
    for name, df in dataframes.items():
        print(f"   📋 {name:<25} → {df.shape[0]:>6} rows × {df.shape[1]:>3} columns")

# =============================================================================
# 7. QUICK ACCESS VARIABLES AND UTILITY FUNCTIONS
# =============================================================================

print("\n🔗 QUICK ACCESS VARIABLES")
print("=" * 70)

# Bind the key DataFrames to short notebook-level names. The groups are
# walked in the same order as before, so the printed lines and the names
# that end up defined are unchanged; a name is only bound when its key exists.
try:
    for _name, _label in (
        ("valid_emp", "✅ valid_emp           → "),
    ):
        if _name in auxiliary_dataframes:
            globals()[_name] = auxiliary_dataframes[_name]
            print(f"{_label}{auxiliary_dataframes[_name].shape}")

    for _name, _label in (
        ("df_colaborador", "✅ df_colaborador      → "),
        ("df_estimativas", "✅ df_estimativas      → "),
        ("df_calendario", "✅ df_calendario       → "),
    ):
        if _name in raw_dataframes:
            globals()[_name] = raw_dataframes[_name]
            print(f"{_label}{raw_dataframes[_name].shape}")

    for _name, _label in (
        ("matrizA_bk", "✅ matrizA_bk          → "),
        ("matriz2_bk", "✅ matriz2_bk          → "),
        ("matrizB_bk", "✅ matrizB_bk          → "),
    ):
        if _name in medium_dataframes:
            globals()[_name] = medium_dataframes[_name]
            print(f"{_label}{medium_dataframes[_name].shape}")

except Exception as e:
    print(f"⚠️ Some DataFrames may not be available yet: {e}")

# =============================================================================
# 8. UTILITY FUNCTIONS FOR DATA EXPLORATION
# =============================================================================

def explore_df(df, name="DataFrame"):
    """Print a detailed overview of a DataFrame and return it unchanged.

    The report shows the shape, the deep memory usage, one line per column
    (dtype and null count), the first three rows and, when numeric columns
    exist, their ``describe()`` summary.

    Args:
        df: The pandas DataFrame to inspect.
        name: Label shown in the report header.

    Returns:
        The same ``df`` object, so the call can be chained.
    """
    print(f"\n🔍 EXPLORING: {name}")
    print("=" * 60)
    print(f"📏 Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print(f"💾 Memory usage: {df.memory_usage(deep=True).sum() / 1024:.1f} KB")

    print(f"\n📋 Columns ({len(df.columns)}):")
    # dtypes / null counts are taken positionally so duplicated labels do not
    # break the listing (df[col] would return a DataFrame in that case).
    per_column = zip(df.columns, df.dtypes, df.isnull().sum())
    for position, (col, dtype, null_count) in enumerate(per_column, start=1):
        # str(col): a Timestamp label would otherwise treat "<20" as a
        # strftime pattern, and a tuple label would raise TypeError.
        print(f"   {position:2d}. {str(col):<20} ({dtype}) - {null_count} nulls")

    print(f"\n📊 First 3 rows:")
    print(df.head(3).to_string())

    # Numeric summary
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] > 0:
        print(f"\n📈 Numeric columns summary:")
        print(numeric.describe())

    return df

def compare_dfs(*dataframes, names=None):
    """Print the shape of each DataFrame and the columns they all share.

    Args:
        *dataframes: The pandas DataFrames to compare.
        names: Optional labels, one per DataFrame. When fewer labels than
            DataFrames are given, the remaining ones get ``DataFrame_<n>``.

    Returns:
        None. The comparison is printed.
    """
    labels = [] if names is None else list(names)
    # zip() stops at the shorter input, so a short `names` list used to hide
    # the trailing DataFrames; pad with positional default labels instead.
    labels += [f"DataFrame_{i+1}" for i in range(len(labels), len(dataframes))]

    print(f"\n🔄 COMPARING DATAFRAMES")
    print("=" * 60)

    for label, df in zip(labels, dataframes):
        print(f"📋 {str(label):<20} → {df.shape[0]:>6} rows × {df.shape[1]:>3} columns")

    # Check for common columns
    if len(dataframes) > 1:
        common_cols = set.intersection(*(set(df.columns) for df in dataframes))
        try:
            ordered = sorted(common_cols)
        except TypeError:
            # Mixed label types (e.g. int and str) are not mutually orderable.
            ordered = sorted(common_cols, key=str)

        print(f"\n🔗 Common columns ({len(common_cols)}):")
        for col in ordered:
            print(f"   • {col}")

def show_sample_data(df_dict, category_name, n_rows=3):
    """Print the first rows of every DataFrame in a category.

    Args:
        df_dict: Mapping of DataFrame name to DataFrame.
        category_name: Label printed in the section header.
        n_rows: Maximum number of rows shown per DataFrame.
    """
    print(f"\n📖 SAMPLE DATA: {category_name}")
    print("=" * 60)

    for label, frame in df_dict.items():
        total_rows = len(frame)
        print(f"\n🔹 {label} (showing {min(n_rows, total_rows)} rows):")
        preview = frame.head(n_rows).to_string() if total_rows > 0 else "   (empty DataFrame)"
        print(preview)

def search_columns(pattern, df_dict=None):
    """Print every column whose label contains ``pattern`` (case-insensitive).

    Args:
        pattern: Substring to look for in the column labels.
        df_dict: Mapping of DataFrame name to DataFrame. When omitted, the
            module-level ``auxiliary_dataframes``, ``raw_dataframes`` and
            ``medium_dataframes`` dicts are merged and searched.

    Returns:
        None. Matches are printed, grouped by DataFrame.
    """
    if df_dict is None:
        df_dict = {**auxiliary_dataframes, **raw_dataframes, **medium_dataframes}

    print(f"\n🔍 SEARCHING COLUMNS: '{pattern}'")
    print("=" * 60)

    needle = str(pattern).lower()
    found = False
    for df_name, df in df_dict.items():
        # Labels are not always strings (ints, Timestamps, ...); compare on
        # their text form instead of calling .lower() on the raw label.
        matching_cols = [col for col in df.columns if needle in str(col).lower()]
        if matching_cols:
            found = True
            print(f"\n📋 {df_name}:")
            for col in matching_cols:
                print(f"   • {col}")

    if not found:
        print(f"❌ No columns found matching '{pattern}'")

def df_info():
    """Print size and memory usage for every DataFrame in the notebook dicts.

    Reads the module-level ``auxiliary_dataframes``, ``raw_dataframes``,
    ``medium_dataframes``, ``rare_dataframes`` and ``formatted_dataframes``
    dicts; categories that are empty are skipped.
    """
    print(f"\n📊 ALL DATAFRAMES INFO")
    print("=" * 70)

    bytes_per_mb = 1024 * 1024
    stages = (
        ("🗂️ AUXILIARY", auxiliary_dataframes),
        ("📁 RAW", raw_dataframes),
        ("⚙️ MEDIUM", medium_dataframes),
        ("💎 RARE", rare_dataframes),
        ("📊 FORMATTED", formatted_dataframes),
    )

    for title, frames in stages:
        if not frames:
            continue
        print(f"\n{title}:")
        for label, frame in frames.items():
            memory_mb = frame.memory_usage(deep=True).sum() / bytes_per_mb
            print(f"   📋 {label:<25} → {frame.shape[0]:>6} rows × {frame.shape[1]:>3} cols ({memory_mb:.1f} MB)")

# =============================================================================
# 9. INSTRUCTIONS AND EXAMPLES
# =============================================================================

# Each section is emitted with a single print call; sep="\n" yields the same
# output as one print per line.
print(
    "\n🛠️ UTILITY FUNCTIONS AVAILABLE:",
    "=" * 70,
    "🔍 explore_df(dataframe, 'name')              → Detailed DataFrame exploration",
    "🔄 compare_dfs(df1, df2, names=['A', 'B'])    → Compare multiple DataFrames",
    "📖 show_sample_data(df_dict, 'category', 5)   → Show sample data from category",
    "🔍 search_columns('pattern')                  → Find columns matching pattern",
    "📊 df_info()                                  → Show all DataFrames info",
    sep="\n",
)

print(
    "\n💡 EXAMPLE USAGE:",
    "=" * 70,
    "# Explore specific DataFrames",
    "explore_df(valid_emp, 'Valid Employees')",
    "explore_df(df_colaborador, 'Employee Details')",
    "",
    "# Compare DataFrames",
    "compare_dfs(df_colaborador, matrizA_bk, names=['Raw', 'Processed'])",
    "",
    "# Show sample data",
    "show_sample_data(raw_dataframes, 'Raw Data', 3)",
    "",
    "# Search for specific columns",
    "search_columns('matricula')",
    "search_columns('data')",
    "",
    "# Access DataFrames directly",
    "valid_emp.head()",
    "df_colaborador.describe()",
    "matrizA_bk.columns",
    sep="\n",
)

print(
    "\n🎯 DIRECT ACCESS TO PROJECT DATA:",
    "=" * 70,
    "📊 data_model.auxiliary_data    → Dictionary with auxiliary data",
    "📁 data_model.raw_data          → Dictionary with raw DataFrames",
    "⚙️ data_model.medium_data       → Dictionary with transformed DataFrames",
    "💎 data_model.rare_data         → Dictionary with algorithm results",
    "📋 data_model.formatted_data    → Dictionary with final formatted data",
    "",
    "📊 auxiliary_dataframes         → Easy access to auxiliary DataFrames",
    "📁 raw_dataframes              → Easy access to raw DataFrames",
    "⚙️ medium_dataframes           → Easy access to medium DataFrames",
    sep="\n",
)

print(
    "\n✨ READY FOR INTERACTIVE DEVELOPMENT!",
    "🔧 All project DataFrames are loaded and available in memory",
    "📝 Use the utility functions above to explore and analyze the data",
    "🚀 Start developing your data transformations!",
    sep="\n",
)

In [None]:
# Debug func_inicializa MatrizB Processing
# This focuses on the specific part where df_estimativas gets processed

def debug_func_inicializa_matrizb(data_model):
    """Walk through the MatrizB steps of func_inicializa, printing diagnostics.

    Works on a copy of ``data_model.raw_data['df_estimativas']`` and only
    reads ``data_model.medium_data['matriz2_bk']``. Every precondition is
    checked before the column is used, so a missing column is reported
    instead of raising ``KeyError``.

    Args:
        data_model: Object exposing dict-like ``raw_data`` and ``medium_data``
            attributes.

    Returns:
        The estimativas copy with the end-of-year ``min_turno`` adjustments
        applied, or ``None`` when a precondition fails (the reason is printed).
    """
    print("🔍 DEBUGGING func_inicializa MatrizB Processing")
    print("=" * 60)

    # Get the starting data
    matrizB_og = data_model.raw_data.get('df_estimativas', pd.DataFrame()).copy()
    matriz2_bk = data_model.medium_data.get('matriz2_bk', pd.DataFrame())

    print(f"📊 Starting data:")
    print(f"   matrizB_og (df_estimativas): {matrizB_og.shape}")
    print(f"   matriz2_bk: {matriz2_bk.shape}")

    if len(matrizB_og) == 0:
        print("❌ matrizB_og is empty - this is the source of the problem!")
        return None

    print(f"\n📋 matrizB_og columns: {list(matrizB_og.columns)}")
    print(f"📊 matrizB_og sample:")
    print(matrizB_og.head(3))

    if 'data' not in matrizB_og.columns:
        print("❌ 'data' column not found in matrizB_og")
        return None

    # Get year from matrizB_og
    ano = pd.to_datetime(matrizB_og['data'].min()).year
    print(f"\n📅 Year from data: {ano}")

    # Adjust minTurno for specific dates (per the original note, this
    # mirrors the R code)
    special_dates = [f'{ano}-12-23', f'{ano}-12-24', f'{ano}-12-30', f'{ano}-12-31']
    friday_dates = [f'{ano}-12-22', f'{ano}-12-29']

    matrizB_ini = matrizB_og.copy()

    # 'turno' is required as well: it is read by the Friday mask below and
    # by the merge test further down.
    required_cols = ['min_turno', 'max_turno', 'turno']
    missing_cols = [col for col in required_cols if col not in matrizB_ini.columns]

    if missing_cols:
        print(f"❌ Missing required columns: {missing_cols}")
        print(f"   Available columns: {list(matrizB_ini.columns)}")
        return None

    # Apply the special date logic: on these rows min_turno takes max_turno
    mask_special = matrizB_ini['data'].isin(special_dates)
    matrizB_ini.loc[mask_special, 'min_turno'] = matrizB_ini.loc[mask_special, 'max_turno']
    mask_friday = (matrizB_ini['data'].isin(friday_dates)) & (matrizB_ini['turno'] == 'M')
    matrizB_ini.loc[mask_friday, 'min_turno'] = matrizB_ini.loc[mask_friday, 'max_turno']

    print(f"✅ Applied special date adjustments")
    print(f"   matrizB_ini shape after adjustments: {matrizB_ini.shape}")

    # Now the critical part - creating the +H column
    print(f"\n🔄 Creating +H column from matriz2_bk...")

    if len(matriz2_bk) == 0:
        print("❌ matriz2_bk is empty - cannot create +H column!")
        return None

    print(f"📋 matriz2_bk columns: {list(matriz2_bk.columns)}")

    # Check the logic for calculating +H for morning shifts
    print(f"\n🌅 Processing morning shifts...")

    # Never filled by this debug walk-through; a synthetic row is used for
    # the merge test below.
    trab_manha_data = []
    unique_dates = matriz2_bk['DATA'].unique() if 'DATA' in matriz2_bk.columns else []

    print(f"   Found {len(unique_dates)} unique dates in matriz2_bk")

    if len(unique_dates) == 0:
        print("❌ No dates found in matriz2_bk DATA column")
        return None

    if 'COLABORADOR' not in matriz2_bk.columns:
        print("❌ 'COLABORADOR' column not found in matriz2_bk")
        return None

    # Sample a few dates to check the logic (slicing already copes with
    # fewer than three entries)
    sample_dates = unique_dates[:3]

    # Loop invariants: the employee-only rows and the column availability
    employee_rows = matriz2_bk[matriz2_bk['COLABORADOR'] != 'TIPO_DIA']
    has_shift_cols = 'TIPO_TURNO' in matriz2_bk.columns and 'HORARIO' in matriz2_bk.columns

    for date in sample_dates:
        # Only string values can be the TIPO_DIA marker; the isinstance
        # guard avoids comparing datetime-like values with a string.
        if isinstance(date, str) and date == 'TIPO_DIA':
            continue

        day_data = employee_rows[employee_rows['DATA'] == date].copy()

        print(f"   📅 Date {date}: {len(day_data)} employee records")

        if len(day_data) == 0:
            print(f"      ⚠️ No employee data for date {date}")
            continue

        # Check the TIPO_TURNO and HORARIO columns
        if has_shift_cols:
            morning_workers = day_data[
                (day_data['TIPO_TURNO'] == 'M') &
                (day_data['HORARIO'].str.contains('H|NL', case=False, na=False))
            ]
            print(f"      🌅 Morning workers: {len(morning_workers)}")
        else:
            print(f"      ❌ Missing TIPO_TURNO or HORARIO columns")

    # The issue might be in the merge logic
    print(f"\n🔗 Checking merge logic...")

    # Check if matrizB_ini has the expected columns for merging
    merge_cols = ['data', 'turno']
    available_merge_cols = [col for col in merge_cols if col in matrizB_ini.columns]

    print(f"   Required merge columns: {merge_cols}")
    print(f"   Available in matrizB_ini: {available_merge_cols}")

    if len(available_merge_cols) != len(merge_cols):
        print(f"❌ Cannot merge - missing columns in matrizB_ini")
        return None

    # Test the merge for morning data
    if not trab_manha_data:
        # Create at least one sample to test
        trab_manha_data = [{
            'DATA': sample_dates[0] if len(sample_dates) > 0 else '2025-01-01',
            'TURNO': 'M',
            '+H': 5.0
        }]

    trab_manha = pd.DataFrame(trab_manha_data)
    print(f"   trab_manha sample: {trab_manha.shape}")
    print(f"   trab_manha columns: {list(trab_manha.columns)}")

    # Test merge
    matrizB_m = matrizB_ini[matrizB_ini['turno'] == 'M'].copy()
    print(f"   matrizB morning records: {len(matrizB_m)}")

    if len(matrizB_m) > 0:
        # Best-effort probe: a failing merge (e.g. mismatched key dtypes)
        # is itself a diagnostic result, so it is reported, not raised.
        try:
            merged = matrizB_m.merge(trab_manha, left_on=['data', 'turno'],
                                     right_on=['DATA', 'TURNO'], how='left')
            print(f"   ✅ Merge successful: {merged.shape}")
            print(f"   +H column created: {'+H' in merged.columns}")
        except Exception as e:
            print(f"   ❌ Merge failed: {e}")

    # Summary
    print(f"\n📋 SUMMARY:")
    print(f"   🔸 matrizB_og (input): {matrizB_og.shape}")
    print(f"   🔸 Expected output should have +H column")
    print(f"   🔸 Issue likely in +H calculation or merge logic")

    return matrizB_ini

# Function to test the exact transformation
def test_matrizb_transformation(data_model):
    """Run the basic MatrizB calculation steps on a copy of df_estimativas.

    The copy gets its numeric columns coerced, a placeholder ``+H`` column
    of zeros, and the derived ``aux``, ``pess_obj`` and ``diff`` columns. On
    success a copy of the result is stored under
    ``data_model.medium_data['test_df_estimativas']``.

    Returns:
        The transformed DataFrame, or ``None`` when the input is empty or
        any step raises (the error and its traceback are printed).
    """
    print("\n🧪 TESTING MatrizB Transformation")
    print("=" * 50)

    # Work on a copy so the raw data stays untouched
    estimativas = data_model.raw_data.get('df_estimativas', pd.DataFrame()).copy()

    if not len(estimativas):
        print("❌ Cannot test - matrizB_og is empty")
        return

    print(f"📊 Starting with: {estimativas.shape}")

    try:
        # Step 1: coerce the numeric columns that are present
        for column in ('max_turno', 'min_turno', 'media_turno', 'sd_turno'):
            if column not in estimativas.columns:
                print(f"   ⚠️ Column {column} not found")
                continue
            estimativas[column] = pd.to_numeric(estimativas[column], errors='coerce')
            print(f"   ✅ Converted {column} to numeric")

        # Step 2: placeholder +H column
        estimativas['+H'] = 0
        print(f"   ✅ Added +H column")

        # Step 3: sd/mean ratio decides between ceiling and rounding the mean
        threshold = 0.5
        mean_shift = estimativas['media_turno']
        ratio = estimativas['sd_turno'] / mean_shift
        estimativas['aux'] = np.where(mean_shift != 0, ratio, 0)

        rounded_up = np.ceil(estimativas['media_turno'])
        rounded = np.round(estimativas['media_turno'])
        estimativas['pess_obj'] = np.where(estimativas['aux'] >= threshold, rounded_up, rounded)

        estimativas['diff'] = estimativas['+H'] - estimativas['pess_obj']

        print(f"   ✅ Applied calculations")
        print(f"   📊 Final shape: {estimativas.shape}")

        # Store in medium_data to test
        data_model.medium_data['test_df_estimativas'] = estimativas.copy()

        print(f"   ✅ Test successful - stored in medium_data['test_df_estimativas']")

        return estimativas

    except Exception as e:
        print(f"   ❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the debugging (only when the main notebook has already created data_model)
if 'data_model' not in locals():
    print("❌ data_model not found. Run the main notebook first.")
else:
    result = debug_func_inicializa_matrizb(data_model)
    test_result = test_matrizb_transformation(data_model)

    # A non-None result means the standalone transformation steps all ran
    if test_result is not None:
        print("\n✅ The transformation CAN work!")
        print("   The issue is likely in the +H calculation logic in func_inicializa")
        print("   Check the matriz2_bk processing section")