In [9]:
# Quick check - add this to your existing code
def quick_diagnosis_209(processed_data):
    work_209 = processed_data
    print(f"Punch Code 209 Summary:")
    print(f"Total records: {len(work_209)}")
    print(f"NoOfMan range: {work_209['NoOfMan'].min()} to {work_209['NoOfMan'].max()}")
    print(f"NoOfMan mean: {work_209['NoOfMan'].mean():.2f}")
    print(f"Days with 0 workers: {(work_209['NoOfMan'] == 0).sum()}")
    print(f"Days with >5 workers: {(work_209['NoOfMan'] > 5).sum()}")
    
    # Check recent data
    recent = work_209.nlargest(10, 'Date')
    print(f"Recent 10 days:")
    print(recent[['Date', 'NoOfMan']].to_string())

In [10]:
import pandas as pd

df = pd.read_excel('209.xlsx')
print(df)

           Date  WorkType   Hours   NoOfMan  SystemHours  Quantity  \
0    2019-12-14       209   97.69  12.21125     0.163055      2204   
1    2019-12-17       209  239.29  29.91125   115.320277     37657   
2    2019-08-21       209   83.45  10.43125    56.451666     17539   
3    2019-08-23       209   72.95   9.11875    37.546388     22999   
4    2019-11-15       209  105.89  13.23625    59.047777     20015   
...         ...       ...     ...       ...          ...       ...   
1521 2021-04-07       209   67.98   8.49750    45.684722     21730   
1522 2020-11-13       209   75.74   9.46750    45.157500     17356   
1523 2024-11-28       209  139.81  17.47625    79.968611     23877   
1524 2022-10-13       209   63.24   7.90500    42.677500     19435   
1525 2021-11-19       209  131.74  16.46750    91.498888     27580   

      ResourceKPI     SystemKPI  
0       22.561163  13516.911471  
1      157.369719    326.542746  
2      210.173757    310.690564  
3      315.270733    61

In [11]:
quick_diagnosis_209(df)

Punch Code 209 Summary:
Total records: 1526
NoOfMan range: 0.0 to 39.02625
NoOfMan mean: 8.56
Days with 0 workers: 41
Days with >5 workers: 1276
Recent 10 days:
           Date   NoOfMan
75   2025-05-21   6.86000
79   2025-05-20   9.66250
80   2025-05-19   9.01250
78   2025-05-16   6.55625
77   2025-05-15   7.05125
103  2025-05-14   9.53000
76   2025-05-13  11.18875
1273 2025-05-12  11.36875
801  2025-05-09   7.08875
410  2025-05-08   5.12625


In [16]:
"""
Debug script to identify why punch code 209 predictions are near zero
despite having substantial actual values during training
"""
import pandas as pd
import numpy as np
import pickle
import logging
from datetime import datetime, timedelta

def debug_prediction_pipeline_209(df, models, work_type='209'):
    """
    Comprehensive debugging of prediction pipeline for punch code 209
    """
    print(f"=== PREDICTION PIPELINE DEBUG FOR WORK TYPE {work_type} ===\n")
    
    # 1. Verify model exists and basic info
    if work_type not in models:
        print(f"ERROR: No model found for work type {work_type}")
        return
    
    model = models[work_type]
    print(f"1. MODEL VERIFICATION:")
    print(f"   Model type: {type(model)}")
    print(f"   Is Pipeline: {hasattr(model, 'steps')}")
    
    if hasattr(model, 'steps'):
        print(f"   Pipeline steps: {[step[0] for step in model.steps]}")
    
    # 2. Check what features the model expects vs what we're providing
    print(f"\n2. FEATURE COMPARISON:")
    
    # Get expected features from model training
    try:
        if hasattr(model, 'feature_names_in_'):
            expected_features = list(model.feature_names_in_)
        elif hasattr(model.named_steps['model'], 'feature_names_in_'):
            expected_features = list(model.named_steps['model'].feature_names_in_)
        else:
            expected_features = None
        
        print(f"   Expected features from model: {len(expected_features) if expected_features else 'Unknown'}")
        if expected_features:
            print(f"   First 10 expected: {expected_features[:10]}")
    except:
        expected_features = None
        print("   Could not extract expected features from model")
    
    # Get features we're actually providing
    from utils.feature_engineering import get_feature_lists
    numeric_features, categorical_features = get_feature_lists(
        include_advanced_features=True, 
        include_productivity_metrics=True
    )
    provided_features = numeric_features + categorical_features
    
    print(f"   Features we're providing: {len(provided_features)}")
    print(f"   Numeric: {len(numeric_features)}, Categorical: {len(categorical_features)}")
    
    # Check which features are missing in our data
    work_data = df[df['WorkType'] == work_type].copy()
    available_features = [f for f in provided_features if f in work_data.columns]
    missing_features = [f for f in provided_features if f not in work_data.columns]
    
    print(f"   Available in data: {len(available_features)}")
    print(f"   Missing from data: {len(missing_features)}")
    
    if missing_features:
        print(f"   MISSING FEATURES: {missing_features[:10]}...")  # Show first 10
    
    # 3. Examine actual feature values for recent data
    print(f"\n3. RECENT DATA FEATURE ANALYSIS:")
    
    # Get the most recent record
    work_data_sorted = work_data.sort_values('Date', ascending=False)
    recent_record = work_data_sorted.iloc[0]
    
    print(f"   Most recent date: {recent_record['Date']}")
    print(f"   Actual NoOfMan: {recent_record['NoOfMan']}")
    
    # Check lag features specifically
    lag_features = [f for f in available_features if 'lag' in f and 'NoOfMan' in f]
    print(f"\n   LAG FEATURES VALUES:")
    for feature in lag_features[:8]:  # Show first 8 lag features
        value = recent_record.get(feature, 'MISSING')
        print(f"   {feature}: {value}")
    
    # Check rolling features
    rolling_features = [f for f in available_features if 'rolling' in f and 'NoOfMan' in f]
    print(f"\n   ROLLING FEATURES VALUES:")
    for feature in rolling_features[:5]:  # Show first 5 rolling features
        value = recent_record.get(feature, 'MISSING')
        print(f"   {feature}: {value}")
    
    # 4. Make a prediction and trace through the process
    print(f"\n4. PREDICTION TRACE:")
    
    try:
        # Prepare features for prediction
        X_pred = work_data_sorted.iloc[[0]][available_features]
        
        # Add any missing columns with zeros
        for feature in provided_features:
            if feature not in X_pred.columns:
                X_pred[feature] = 0.0
        
        print(f"   Input shape: {X_pred.shape}")
        print(f"   Input columns: {len(X_pred.columns)}")
        
        # Make prediction
        prediction = model.predict(X_pred)[0]
        print(f"   PREDICTION: {prediction:.6f}")
        
        # If it's a pipeline, trace through steps
        if hasattr(model, 'steps'):
            print(f"\n   PIPELINE TRACE:")
            
            # Transform through preprocessor
            if 'preprocessor' in [step[0] for step in model.steps]:
                preprocessor = model.named_steps['preprocessor']
                X_transformed = preprocessor.transform(X_pred)
                print(f"   After preprocessing shape: {X_transformed.shape}")
                print(f"   Preprocessed sample values: {X_transformed[0][:10]}")  # First 10 values
                
                # Check if all values are zero or very small
                if isinstance(X_transformed, np.ndarray):
                    non_zero_count = np.count_nonzero(X_transformed)
                    total_count = X_transformed.size
                    print(f"   Non-zero features: {non_zero_count}/{total_count} ({non_zero_count/total_count*100:.1f}%)")
                    print(f"   Max absolute value: {np.abs(X_transformed).max():.6f}")
    
    except Exception as e:
        print(f"   ERROR in prediction: {str(e)}")
        import traceback
        traceback.print_exc()
    
    # 5. Compare with training data patterns
    print(f"\n5. TRAINING DATA COMPARISON:")
    
    # Get some high-value training examples
    high_value_examples = work_data[work_data['NoOfMan'] > 10].sort_values('NoOfMan', ascending=False).head(3)
    
    print(f"   High-value training examples:")
    for idx, row in high_value_examples.iterrows():
        print(f"   Date: {row['Date']}, NoOfMan: {row['NoOfMan']:.2f}")
        
        # Show key lag features for these examples
        key_lags = ['NoOfMan_lag_1', 'NoOfMan_lag_7', 'NoOfMan_rolling_mean_7']
        lag_values = []
        for lag_feature in key_lags:
            if lag_feature in row:
                lag_values.append(f"{lag_feature}: {row[lag_feature]:.2f}")
        print(f"     Key lags: {', '.join(lag_values)}")
    
    return recent_record, available_features, missing_features

def debug_feature_generation_209(df, work_type='209'):
    """
    Debug the feature generation process specifically
    """
    print(f"\n=== FEATURE GENERATION DEBUG FOR WORK TYPE {work_type} ===\n")
    
    work_data = df[df['WorkType'] == work_type].copy()
    work_data = work_data.sort_values('Date')
    
    print(f"1. RAW DATA CHECK:")
    print(f"   Total records: {len(work_data)}")
    print(f"   Date range: {work_data['Date'].min()} to {work_data['Date'].max()}")
    
    # Check the last few days of raw data
    print(f"\n   Last 5 days raw data:")
    last_5 = work_data.tail(5)[['Date', 'NoOfMan']]
    for idx, row in last_5.iterrows():
        print(f"   {row['Date'].date()}: {row['NoOfMan']}")
    
    # 2. Check lag feature calculation manually
    print(f"\n2. MANUAL LAG CALCULATION:")
    
    latest_date = work_data['Date'].max()
    print(f"   Latest date: {latest_date}")
    
    # Calculate what lag features SHOULD be
    for lag_days in [1, 7]:
        lag_date = latest_date - timedelta(days=lag_days)
        lag_records = work_data[work_data['Date'] == lag_date]
        lag_value = lag_records['NoOfMan'].sum() if not lag_records.empty else 0
        print(f"   {lag_days}-day lag ({lag_date.date()}): {lag_value}")
    
    # Calculate 7-day rolling mean manually
    last_7_days = work_data[work_data['Date'] > latest_date - timedelta(days=7)]
    rolling_mean = last_7_days['NoOfMan'].mean()
    print(f"   7-day rolling mean: {rolling_mean:.2f}")
    
    # 3. Check if feature engineering is working correctly
    print(f"\n3. FEATURE ENGINEERING VERIFICATION:")
    
    # Check if we have the required columns after feature engineering
    required_columns = ['NoOfMan_lag_1', 'NoOfMan_lag_7', 'NoOfMan_rolling_mean_7', 
                       'DayOfWeek_feat', 'Month_feat', 'IsWeekend_feat']
    
    for col in required_columns:
        if col in work_data.columns:
            latest_value = work_data[col].iloc[-1] if len(work_data) > 0 else "N/A"
            print(f"   {col}: {latest_value}")
        else:
            print(f"   {col}: MISSING")
    
    return work_data

def test_prediction_with_known_good_data(df, models, work_type='209'):
    """
    Test prediction using data that should give good results
    """
    print(f"\n=== TESTING WITH KNOWN GOOD DATA ===\n")
    
    work_data = df[df['WorkType'] == work_type].copy()
    work_data = work_data.sort_values('Date')
    
    # Find a period where we had consistently high values
    high_periods = work_data[work_data['NoOfMan'] > 10]
    
    if len(high_periods) > 0:
        print(f"1. TESTING WITH HIGH-VALUE PERIOD:")
        
        # Take a record from a high-value period
        test_record = high_periods.iloc[-1]  # Last high-value record
        print(f"   Test date: {test_record['Date']}")
        print(f"   Actual NoOfMan: {test_record['NoOfMan']}")
        
        # Get available features
        from utils.feature_engineering import get_feature_lists
        numeric_features, categorical_features = get_feature_lists(
            include_advanced_features=True, 
            include_productivity_metrics=True
        )
        available_features = [f for f in numeric_features + categorical_features if f in work_data.columns]
        
        # Prepare for prediction
        X_test = pd.DataFrame([test_record])[available_features]
        
        # Add missing features with zeros
        all_features = numeric_features + categorical_features
        for feature in all_features:
            if feature not in X_test.columns:
                X_test[feature] = 0.0
        
        # Make prediction
        model = models[work_type]
        prediction = model.predict(X_test)[0]
        
        print(f"   Predicted NoOfMan: {prediction:.6f}")
        print(f"   Prediction error: {abs(test_record['NoOfMan'] - prediction):.6f}")
        
        # Show key feature values for this record
        key_features = ['NoOfMan_lag_1', 'NoOfMan_lag_7', 'NoOfMan_rolling_mean_7']
        print(f"   Key feature values:")
        for feature in key_features:
            if feature in test_record:
                print(f"     {feature}: {test_record[feature]:.2f}")
    
    else:
        print("   No high-value periods found for testing")

def comprehensive_debug_209(data_path, models_path):
    """
    Run all debugging functions
    """
    try:
        # Load data
        print("Loading data...")
        if data_path.endswith('.csv'):
            df = pd.read_csv(data_path)
        else:
            df = pd.read_excel(data_path)
        
        # Basic preprocessing
        df['Date'] = pd.to_datetime(df['Date'])
        if 'PunchCode' in df.columns:
            df = df.rename(columns={'PunchCode': 'WorkType'})
        df['WorkType'] = df['WorkType'].astype(str)
        
        # Load models
        print("Loading models...")
        with open(models_path, 'rb') as f:
            models = pickle.load(f)
        
        print(f"Loaded {len(models)} models")
        
        if '209' not in models:
            print("ERROR: No model found for punch code 209!")
            return
        
        # Run all debug functions
        recent_record, available_features, missing_features = debug_prediction_pipeline_209(df, models, '209')
        work_data = debug_feature_generation_209(df, '209')
        test_prediction_with_known_good_data(df, models, '209')
        
        # Final recommendations
        print(f"\n=== DIAGNOSIS SUMMARY ===")
        print(f"1. Missing features: {len(missing_features)} out of expected features")
        print(f"2. Available features: {len(available_features)}")
        
        if missing_features:
            print(f"3. CRITICAL: Missing features may be causing zero predictions")
            print(f"   Key missing features to investigate: {missing_features[:5]}")
        
        print(f"\n=== RECOMMENDATIONS ===")
        print(f"1. Check if feature engineering is running correctly in prediction pipeline")
        print(f"2. Verify that lag features are being calculated with actual historical data") 
        print(f"3. Ensure the same preprocessing is applied during prediction as during training")
        print(f"4. Check if productivity features (Quantity, KPIs) are available during prediction")
        
        return True
        
    except Exception as e:
        print(f"Error in comprehensive debug: {e}")
        import traceback
        traceback.print_exc()
        return False

# Quick fix to test if the issue is missing features
def quick_test_with_manual_features(models, work_type='209'):
    """
    Test prediction with manually created features based on the training data patterns
    """
    print(f"\n=== QUICK TEST WITH MANUAL FEATURES ===\n")
    
    if work_type not in models:
        print(f"No model for {work_type}")
        return
    
    model = models[work_type]
    
    # Create a test case with reasonable lag values based on the training data
    # (mean was 8.56, so let's use values around that)
    test_features = {
        'NoOfMan_lag_1': 8.0,
        'NoOfMan_lag_2': 7.5,
        'NoOfMan_lag_3': 9.0,
        'NoOfMan_lag_7': 8.5,
        'NoOfMan_lag_14': 8.0,
        'NoOfMan_lag_30': 7.8,
        'NoOfMan_rolling_mean_7': 8.2,
        'NoOfMan_rolling_max_7': 12.0,
        'NoOfMan_rolling_min_7': 5.0,
        'NoOfMan_rolling_std_7': 2.0,
        'NoOfMan_same_dow_lag': 8.0,
        'NoOfMan_7day_trend': 0.5,
        'NoOfMan_1day_trend': 0.2,
        'DayOfWeek_feat': 1,  # Tuesday
        'Month_feat': 5,      # May
        'IsWeekend_feat': 0,  # Not weekend
        'Year_feat': 2025,
        'Quarter': 2
    }
    
    # Add productivity features with reasonable values
    productivity_features = {
        'Quantity_lag_1': 1000,
        'Quantity_lag_7': 950,
        'Quantity_rolling_mean_7': 980,
        'Quantity_per_Worker': 120,
        'Combined_KPI_lag_1': 0.85,
        'ResourceKPI_lag_1': 0.82,
        'SystemKPI_lag_1': 0.88,
        'Hours_SystemHours_Ratio_lag_1': 1.1,
        'Workers_Predicted_from_Quantity': 8.3
    }
    
    test_features.update(productivity_features)
    
    # Get all expected features and set missing ones to 0
    from utils.feature_engineering import get_feature_lists
    numeric_features, categorical_features = get_feature_lists(
        include_advanced_features=True, 
        include_productivity_metrics=True
    )
    
    all_expected_features = numeric_features + categorical_features
    
    for feature in all_expected_features:
        if feature not in test_features:
            test_features[feature] = 0.0
    
    # Create DataFrame and predict
    X_test = pd.DataFrame([test_features])
    
    try:
        prediction = model.predict(X_test)[0]
        print(f"Test prediction with manual features: {prediction:.2f}")
        print(f"Expected range based on training data: 5-15 workers")
        
        if prediction < 1.0:
            print("PROBLEM: Still getting very low prediction even with reasonable feature values!")
            print("This suggests an issue with the model itself or feature preprocessing.")
        else:
            print("SUCCESS: Manual features produce reasonable prediction.")
            print("Issue is likely in feature generation during actual prediction.")
            
    except Exception as e:
        print(f"Error in manual test: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    # Replace with your actual paths
    data_path = "C:/forlogssystems/work_utilization_app/209.xlsx" 
    models_path = "C:/forlogssystems/Models/work_utilization_models.pkl" 
    
    # Run comprehensive debugging
    success = comprehensive_debug_209(data_path, models_path)
    
    if success:
        # Load models for quick test
        with open(models_path, 'rb') as f:
            models = pickle.load(f)
        
        # Test with manual features
        quick_test_with_manual_features(models, '209')

Loading data...
Loading models...
Loaded 9 models
=== PREDICTION PIPELINE DEBUG FOR WORK TYPE 209 ===

1. MODEL VERIFICATION:
   Model type: <class 'sklearn.pipeline.Pipeline'>
   Is Pipeline: True
   Pipeline steps: ['preprocessor', 'model']

2. FEATURE COMPARISON:
   Expected features from model: 32
   First 10 expected: ['NoOfMan_lag_1', 'NoOfMan_lag_2', 'NoOfMan_lag_3', 'NoOfMan_lag_7', 'NoOfMan_rolling_mean_7', 'IsWeekend_feat', 'NoOfMan_rolling_max_7', 'NoOfMan_rolling_min_7', 'NoOfMan_rolling_std_7', 'NoOfMan_same_dow_lag']




   Features we're providing: 32
   Numeric: 28, Categorical: 4
   Available in data: 0
   Missing from data: 32
   MISSING FEATURES: ['NoOfMan_lag_1', 'NoOfMan_lag_2', 'NoOfMan_lag_3', 'NoOfMan_lag_7', 'NoOfMan_rolling_mean_7', 'IsWeekend_feat', 'NoOfMan_rolling_max_7', 'NoOfMan_rolling_min_7', 'NoOfMan_rolling_std_7', 'NoOfMan_same_dow_lag']...

3. RECENT DATA FEATURE ANALYSIS:
   Most recent date: 2025-05-21 00:00:00
   Actual NoOfMan: 6.86

   LAG FEATURES VALUES:

   ROLLING FEATURES VALUES:

4. PREDICTION TRACE:
   Input shape: (1, 32)
   Input columns: 32
   PREDICTION: 0.155619

   PIPELINE TRACE:
   After preprocessing shape: (1, 57)
   Preprocessed sample values: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
   Non-zero features: 1/57 (1.8%)
   Max absolute value: 1.000000

5. TRAINING DATA COMPARISON:
   High-value training examples:
   Date: 2019-12-12 00:00:00, NoOfMan: 39.03
     Key lags: 
   Date: 2023-12-18 00:00:00, NoOfMan: 31.73
     Key lags: 
   Date: 2019-12-11 00:00:00, NoOfMa