# Comprehensive Dataset Comparison: Competition vs Original Hill of Towie (2016-2020)

## Executive Summary

This notebook provides an in-depth comparison between:
1. **Competition Dataset**: Pre-processed Kaggle competition files (2016-2020)
2. **Original Dataset**: Raw Hill of Towie data from Zenodo (2016-2020)

### Key Questions Addressed:
- What additional value does the original dataset provide?
- Is Turbine 6 data extraction worth the effort?
- What are the data leakage risks?
- What is the true complexity of integrating original data?
- Should we use the original data or focus on competition data?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from datetime import datetime, timedelta
import warnings
from typing import Dict, List, Tuple, Optional
import os

# Suppress all library warnings so the printed report stays readable
warnings.filterwarnings('ignore')

# Plot styling applied to every figure in the notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Directory layout, resolved from the parent of the current working directory
PROJECT_ROOT = Path('../').resolve()
DATA_DIR = PROJECT_ROOT / 'data'
ORIGINAL_DATA_DIR = DATA_DIR / 'external' / 'hill-of-towie-original'
EXTRACT_DIR = ORIGINAL_DATA_DIR / 'extracted'
REPO_DIR = DATA_DIR / 'external' / 'hill-of-towie-repo'

# Competition files
TRAIN_PATH = DATA_DIR / 'train' / 'training_dataset.parquet'
TEST_PATH = DATA_DIR / 'test' / 'submission_dataset.parquet'
SAMPLE_SUB_PATH = DATA_DIR / 'output' / 'sample_model_submission.csv'

# Echo the resolved locations
print("üìÅ Path Configuration:")
for label, location in (
    ("Project root", PROJECT_ROOT),
    ("Competition data", DATA_DIR),
    ("Original data", ORIGINAL_DATA_DIR),
    ("Extracted data", EXTRACT_DIR),
):
    print(f"   {label}: {location}")

# Report which of the expected inputs are actually on disk
print(f"\n‚úÖ Path exists check:")
for label, location in (
    ("Training data", TRAIN_PATH),
    ("Test data", TEST_PATH),
    ("Original data dir", ORIGINAL_DATA_DIR),
    ("Extracted data", EXTRACT_DIR),
):
    print(f"   {label}: {location.exists()}")

## 1. Competition Dataset Analysis

In [None]:
# Read the three competition files configured above
train_df = pd.read_parquet(TRAIN_PATH)
test_df = pd.read_parquet(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

print("üèÜ COMPETITION DATASET OVERVIEW")
print("="*80)

# Row and column counts
train_rows, train_cols = train_df.shape
test_rows, test_cols = test_df.shape
print(f"\nüìä Dataset Shapes:")
print(f"   Training: {train_df.shape} ({train_rows:,} rows √ó {train_cols} columns)")
print(f"   Test: {test_df.shape} ({test_rows:,} rows √ó {test_cols} columns)")
print(f"   Sample submission: {sample_sub.shape}")

# First and last timestamp of each split
train_ts = train_df['TimeStamp_StartFormat']
test_ts = test_df['TimeStamp_StartFormat']
train_start, train_end = train_ts.min(), train_ts.max()
test_start, test_end = test_ts.min(), test_ts.max()
print(f"\nüìÖ Temporal Coverage:")
print(f"   Training period: {train_start} to {train_end}")
print(f"   Test period: {test_start} to {test_end}")

# Span of each split in whole days
# NOTE(review): `.days` assumes the column is already datetime-typed — confirm
train_days = (train_end - train_start).days
test_days = (test_end - test_start).days
print(f"   Training duration: {train_days} days (~{train_days/365:.1f} years)")
print(f"   Test duration: {test_days} days (~{test_days/365:.1f} years)")

# Most common gap between consecutive rows (rows are taken in file order)
time_diffs = train_ts.diff().dropna()
print(f"\n‚è±Ô∏è Data Frequency:")
print(f"   Sampling interval: {time_diffs.mode()[0]}")
# The two figures below are hard-coded for a 10-minute cadence; they are not
# derived from the interval measured above.
print(f"   Records per day: {24 * 60 / 10:.0f}")
print(f"   Expected annual records: {365 * 24 * 6:,}")

In [None]:
# Which turbines, and which fields per turbine, the competition data contains
print("üå¨Ô∏è TURBINE ANALYSIS")
print("="*80)

# Columns are named "<field>;<turbine>"; group the field names by turbine number
turbine_cols = [name for name in train_df.columns if ';' in name]
turbine_fields = {}

for col in turbine_cols:
    pieces = col.split(';')
    # Skip anything that is not exactly "<field>;<digits>"
    if len(pieces) != 2 or not pieces[1].isdigit():
        continue
    turbine_fields.setdefault(int(pieces[1]), []).append(pieces[0])

competition_turbines = sorted(turbine_fields)
print(f"\nüîß Turbines in Competition Data:")
print(f"   Available: {competition_turbines}")
# This line is a fixed statement, not computed from `competition_turbines`
print(f"   Missing: Turbine 6 (gap between 5 and 7)")
print(f"   Total: {len(competition_turbines)} turbines")

# List the fields of the lowest-numbered turbine as a representative sample
if competition_turbines:
    sample_turbine = competition_turbines[0]
    fields = turbine_fields[sample_turbine]
    print(f"\nüìã Fields per Turbine: {len(fields)}")

    scada_fields = [f for f in fields if not (f.startswith('ERA5_') or f == 'ShutdownDuration')]
    weather_fields = [f for f in fields if f.startswith('ERA5_')]

    # (heading, names, how many to show, label used in the "... more" line)
    for heading, names, limit, kind in (
        ("\n   SCADA Fields:", scada_fields, 10, "SCADA"),
        ("\n   Weather Fields (ERA5):", weather_fields, 5, "ERA5"),
    ):
        print(heading)
        for position, name in enumerate(names[:limit], 1):
            print(f"     {position:2}. {name}")
        hidden = len(names) - limit
        if hidden > 0:
            print(f"     ... and {hidden} more {kind} fields")

In [None]:
# Summarise the target column and the is_valid scoring flag
print("üéØ TARGET & VALIDATION ANALYSIS")
print("="*80)

# Target column
target = train_df['target']
zero_target = target == 0
print(f"\nüìä Target Variable:")
print(f"   Column name: 'target'")
print(f"   Definition: Active power of Turbine 1 (clipped at 0)")
print(f"   Statistics:")
print(f"     - Mean: {target.mean():.2f} kW")
print(f"     - Std: {target.std():.2f} kW")
print(f"     - Min: {target.min():.2f} kW")
print(f"     - Max: {target.max():.2f} kW")
print(f"     - Zero values: {zero_target.sum():,} ({zero_target.mean()*100:.1f}%)")

# Validation flag.
# Coerce to bool before negating: on an integer 0/1 column `~` is a bitwise
# NOT (0 -> -1, 1 -> -2), which would corrupt the invalid counts below.
valid_mask = train_df['is_valid'].astype(bool)
invalid_mask = ~valid_mask
print(f"\n‚úÖ Validation Flag (is_valid):")
print(f"   Purpose: Only valid periods count for competition scoring")
print(f"   Valid records: {valid_mask.sum():,} ({valid_mask.mean()*100:.1f}%)")
print(f"   Invalid records: {invalid_mask.sum():,} ({invalid_mask.mean()*100:.1f}%)")

# is_valid conditions (as stated by this notebook; not re-derived here)
print(f"\n   Validity Conditions (all must be true):")
print(f"     1. ShutdownDuration;1 == 0 (turbine not shut down)")
print(f"     2. wtc_ScReToOp_timeon;1 == 600 (full 10-min operation)")
print(f"     3. wtc_ActPower_mean;1 is not null")

# Count how many rows satisfy each condition on its own. All three columns
# must exist; checking only the first would raise KeyError on the others.
required_cols = ['ShutdownDuration;1', 'wtc_ScReToOp_timeon;1', 'wtc_ActPower_mean;1']
missing_cols = [name for name in required_cols if name not in train_df.columns]
if not missing_cols:
    shutdown_zero = (train_df['ShutdownDuration;1'] == 0).sum()
    full_operation = (train_df['wtc_ScReToOp_timeon;1'] == 600).sum()
    power_not_null = train_df['wtc_ActPower_mean;1'].notna().sum()

    print(f"\n   Condition Breakdown:")
    print(f"     - No shutdown: {shutdown_zero:,} records")
    print(f"     - Full operation: {full_operation:,} records")
    print(f"     - Power not null: {power_not_null:,} records")
else:
    print(f"\n   Condition Breakdown: skipped, missing columns {missing_cols}")

## 2. Original Dataset Analysis

In [None]:
# Inventory of the extracted original dataset: one sub-directory per year
print("üì¶ ORIGINAL DATASET OVERVIEW")
print("="*80)

# Both stay empty when nothing has been extracted yet; later cells read them
available_years = []
year_info = {}

if not EXTRACT_DIR.exists():
    print("\n‚ö†Ô∏è Original data not yet extracted. Run download_original_data.py first.")
else:
    year_dirs = sorted(entry for entry in EXTRACT_DIR.iterdir() if entry.is_dir())
    available_years = [entry.name for entry in year_dirs]

    years_label = ', '.join(available_years) if available_years else 'None - download in progress'
    print(f"\nüìÖ Years Available: {years_label}")

    # Per-year file count, total size, and table names
    for year_dir in year_dirs:
        year = year_dir.name
        csv_files = list(year_dir.glob('*.csv'))
        total_size = sum(path.stat().st_size for path in csv_files) / (1024**3)  # bytes -> GB

        # Table name is the part of the file name before the first underscore
        # (or the stem when the name has no underscore)
        tables = {}
        for path in csv_files:
            table_name = path.name.split('_')[0] if '_' in path.name else path.stem
            tables.setdefault(table_name, []).append(path)

        year_info[year] = {
            'files': len(csv_files),
            'size_gb': total_size,
            'tables': list(tables.keys())
        }

        print(f"\n   üìÅ Year {year}:")
        print(f"      Files: {len(csv_files)}")
        print(f"      Size: {total_size:.2f} GB")
        print(f"      Tables: {', '.join(tables.keys())}")

# Report whether the companion repository has been cloned, and count its scripts
if not REPO_DIR.exists():
    print(f"\nüìö GitHub Repository: ‚ùå Not cloned")
else:
    print(f"\nüìö GitHub Repository: ‚úÖ Available at {REPO_DIR}")
    scripts_dir = REPO_DIR / 'scripts'
    if scripts_dir.exists():
        py_files = list(scripts_dir.rglob('*.py'))
        print(f"   Processing scripts found: {len(py_files)}")

In [None]:
# Inspect a 1000-row sample of each 2016 table (only when that year is extracted)
if '2016' in available_years:
    print("üîç ORIGINAL DATA STRUCTURE ANALYSIS (2016 Sample)")
    print("="*80)

    year_dir = EXTRACT_DIR / '2016'

    # table name -> sampled DataFrame; the feature-comparison cell reads this
    table_samples = {}

    # Turbine number = StationId - 2304509, so Turbine 6 is StationId 2304515
    station_id_offset = 2304509
    turbine_6_station = 2304515

    for table_type in ['tblSCTurbine', 'tblSCTurGrid', 'tblSCTurTemp', 'tblSCTurFlag']:
        # Sort so the sampled file is the same on every run; glob order is
        # whatever the filesystem returns.
        files = sorted(year_dir.glob(f'{table_type}*.csv'))
        if files:
            print(f"\nüìä {table_type}:")
            # Read only the first rows; enough to see schema and station ids
            df_sample = pd.read_csv(files[0], nrows=1000)
            table_samples[table_type] = df_sample

            print(f"   Shape: {df_sample.shape}")
            print(f"   Columns: {len(df_sample.columns)}")

            # Check for StationId (turbine identifier)
            if 'StationId' in df_sample.columns:
                unique_stations = df_sample['StationId'].unique()
                # .tolist() yields native Python numbers; NumPy scalars would
                # print as "np.int64(1)" inside a list under NumPy 2.
                turbine_ids = sorted(sid - station_id_offset for sid in unique_stations.tolist())
                print(f"   Turbines in sample: {turbine_ids[:10]}..." if len(turbine_ids) > 10 else f"   Turbines: {turbine_ids}")

                # Presence is judged on the 1000-row sample only, not the full table
                has_turbine_6 = turbine_6_station in unique_stations
                print(f"   üéØ Turbine 6 (StationId {turbine_6_station}): {'‚úÖ Present' if has_turbine_6 else '‚ùå Not in sample'}")

            # Show key columns
            if table_type == 'tblSCTurbine':
                key_cols = ['wtc_AcWindSp_mean', 'wtc_ScYawPos_mean', 'wtc_NacelPos_mean',
                           'wtc_GenRpm_mean', 'wtc_PitcPosA_mean', 'wtc_PitcPosB_mean', 'wtc_PitcPosC_mean']
                print("   Key columns available:")
                for col in key_cols:
                    if col in df_sample.columns:
                        print(f"     ‚úì {col}")
                    else:
                        print(f"     ‚úó {col}")

            elif table_type == 'tblSCTurGrid':
                if 'wtc_ActPower_mean' in df_sample.columns:
                    print(f"   ‚úÖ Contains active power (target variable source)")
else:
    print("\n‚ö†Ô∏è Cannot analyze structure - data not yet available")

## 3. Feature Comparison

In [None]:
# Compare the per-turbine competition fields with the original table columns
print("üìä FEATURE COMPARISON")
print("="*80)

# Field names of turbine 1 only. Match the suffix exactly: a substring test
# on ';1' would also pick up turbines 10-19.
comp_features = [col.split(';')[0] for col in train_df.columns if col.endswith(';1')]
comp_features_unique = list(dict.fromkeys(comp_features))  # Preserve order, remove duplicates

print(f"\nüèÜ Competition Dataset Features:")
print(f"   Total unique features per turbine: {len(comp_features_unique)}")

# Categorize features
scada_features = [f for f in comp_features_unique if not f.startswith('ERA5_') and f != 'ShutdownDuration']
weather_features = [f for f in comp_features_unique if f.startswith('ERA5_')]
operational_features = ['ShutdownDuration'] if 'ShutdownDuration' in comp_features_unique else []

print(f"\n   Feature Categories:")
print(f"     SCADA: {len(scada_features)} features")
print(f"     Weather (ERA5): {len(weather_features)} features")
print(f"     Operational: {len(operational_features)} features")

# `table_samples` exists only when the structure-analysis cell found 2016 data
if available_years and 'table_samples' in locals():
    print(f"\nüì¶ Original Dataset Features:")

    # Union of all sampled table columns, minus the two join keys
    all_original_cols = set()
    for table_name, sample_df in table_samples.items():
        all_original_cols.update(set(sample_df.columns) - {'TimeStamp', 'StationId'})

    print(f"   Total unique features: {len(all_original_cols)}")

    # Pair each competition SCADA field with one original column. An exact
    # name wins; otherwise take the first substring match in sorted order, so
    # the pairing does not depend on set iteration order.
    ordered_original = sorted(all_original_cols)
    matches = []
    for comp_feat in scada_features:
        if comp_feat in all_original_cols:
            matches.append((comp_feat, comp_feat))
            continue
        partner = next(
            (orig for orig in ordered_original if comp_feat in orig or orig in comp_feat),
            None,
        )
        if partner is not None:
            matches.append((comp_feat, partner))

    # Several competition fields can map to the same original column, so count
    # the distinct unmatched originals instead of subtracting len(matches).
    matched_original = {orig for _, orig in matches}
    unique_original = sorted(all_original_cols - matched_original)
    coverage = len(matches) / len(scada_features) * 100 if scada_features else 0.0

    print(f"\n   Feature Overlap:")
    print(f"     Matching features: {len(matches)}/{len(scada_features)}")
    print(f"     Coverage: {coverage:.1f}%")
    print(f"     Additional original features: {len(unique_original)}")

    # Show the first few original-only columns, in sorted order
    print(f"\n   Unique to Original (sample):")
    for i, feat in enumerate(unique_original[:10], 1):
        print(f"     {i:2}. {feat}")
    if len(unique_original) > 10:
        print(f"     ... and {len(unique_original)-10} more")

## 4. Data Processing Complexity Analysis

In [None]:
# Estimate the work needed to turn the raw tables into the competition format
print("‚öôÔ∏è DATA PROCESSING COMPLEXITY ANALYSIS")
print("="*80)

# Each row below becomes one dict with these keys, in this order
_STEP_KEYS = ("step", "description", "complexity", "effort_hours", "risk", "code")
_STEP_ROWS = [
    ("StationId to Turbine Mapping",
     "Map StationId (2304510-2304530) to turbine numbers (1-21)",
     "Low", 0.5, "Low",
     "turbine_id = station_id - 2304509"),
    ("Multi-Table Joining",
     "Join tblSCTurbine, tblSCTurGrid, tblSCTurTemp, tblSCTurFlag",
     "High", 3, "Medium",
     "Complex merge on TimeStamp + StationId with different schemas"),
    ("Timestamp Alignment",
     "Convert to UTC, align to 10-minute intervals, handle DST",
     "Medium", 2, "Medium",
     "Resample to 10min, handle missing intervals"),
    ("Wide Format Reshaping",
     "Pivot from long format to wide (turbine;field structure)",
     "Medium", 1.5, "Low",
     "pivot(index='timestamp', columns='turbine', values=fields)"),
    ("ERA5 Weather Integration",
     "Download ERA5 data for exact location, merge with SCADA",
     "High", 4, "High",
     "API calls to ERA5, coordinate matching, temporal alignment"),
    ("Validity Flag Calculation",
     "Calculate is_valid from shutdown duration and operation time",
     "Medium", 2, "High",
     "Complex business logic, must match competition exactly"),
    ("Deduplication",
     "Remove duplicate timestamps per turbine",
     "Low", 0.5, "Low",
     "drop_duplicates(['timestamp', 'turbine'])"),
    ("Feature Name Alignment",
     "Map original column names to competition format",
     "Medium", 1.5, "Medium",
     "Create mapping dictionary, validate consistency"),
    ("Data Validation",
     "Ensure no 2020 T1 data leakage, validate ranges",
     "High", 2, "Critical",
     "Extensive validation, leakage detection"),
]
processing_steps = [dict(zip(_STEP_KEYS, row)) for row in _STEP_ROWS]

# Aggregate effort and count the steps flagged as hard or risky
total_effort = sum([entry['effort_hours'] for entry in processing_steps])
high_complexity = len([entry for entry in processing_steps if entry['complexity'] == 'High'])
high_risk = len([entry for entry in processing_steps if entry['risk'] in ('High', 'Critical')])

# Marker per risk level; any level not listed falls back to the "low" marker
_RISK_MARKERS = {"High": "üî¥", "Critical": "üî¥", "Medium": "üü°"}

rule = "-" * 70
print(f"\nüìã Processing Steps Required:")
print(f"{'Step':<35} {'Complexity':<12} {'Risk':<10} {'Hours':<8}")
print(rule)

for entry in processing_steps:
    risk_emoji = _RISK_MARKERS.get(entry['risk'], "üü¢")
    print(f"{entry['step']:<35} {entry['complexity']:<12} {risk_emoji} {entry['risk']:<8} {entry['effort_hours']:.1f}")

print(rule)
print(f"{'TOTAL':<35} {'':<12} {'':<10} {total_effort:.1f}")

step_count = len(processing_steps)
print(f"\nüìä Complexity Summary:")
# Days are computed as hours / 8
print(f"   Total estimated effort: {total_effort:.1f} hours (~{total_effort/8:.1f} days)")
print(f"   High complexity steps: {high_complexity}/{step_count}")
print(f"   High/Critical risk steps: {high_risk}/{step_count}")
print(f"\n   ‚ö†Ô∏è Risk factors:")
for risk_factor in (
    "Data leakage from 2020 test period",
    "ERA5 API rate limits and download time",
    "Exact replication of competition preprocessing",
    "Memory constraints with 5 years of raw data",
):
    print(f"      - {risk_factor}")

## 5. Turbine 6 Specific Analysis

In [None]:
# Qualitative case for and against extracting Turbine 6 from the raw data
print("üéØ TURBINE 6 EXTRACTION FEASIBILITY")
print("="*80)

print("\nüå¨Ô∏è Why Turbine 6 Matters:")
for reason in (
    "Fills spatial gap between Turbines 5 and 7",
    "Enables better wake effect modeling",
    "Improves interpolation for missing data",
    "Provides additional correlation patterns",
):
    print(f"   ‚Ä¢ {reason}")

# Print coordinates of turbines 5-7 when a metadata CSV is present under DATA_DIR
metadata_file = DATA_DIR / 'turbine_metadata.csv'
if metadata_file.exists():
    metadata = pd.read_csv(metadata_file)
    print("\nüìç Spatial Configuration:")
    for tid in (5, 6, 7):
        rows = metadata.loc[metadata['TurbineId'] == tid]
        if rows.empty:
            continue
        row = rows.iloc[0]
        print(f"   Turbine {tid}: Lat={row['Latitude']:.6f}, Lon={row['Longitude']:.6f}")

print("\n‚úÖ Extraction Benefits:")
# (benefit, impact, description); impact only selects the marker
benefits = [
    ("Spatial interpolation", "High", "Direct neighbor relationships"),
    ("Wake effect modeling", "High", "Critical for downstream turbine"),
    ("Feature engineering", "Medium", "Additional lag features"),
    ("Missing data handling", "Medium", "Better imputation accuracy"),
]

for benefit, impact, description in benefits:
    if impact == "High":
        emoji = "üî¥"
    else:
        emoji = "üü°"
    print(f"   {emoji} {benefit:<25} {description}")

print("\n‚ùå Extraction Challenges:")
# (challenge, description)
challenges = [
    ("Multi-table joins", "4 tables with different schemas"),
    ("Data volume", "~8GB raw data to process"),
    ("Validation", "Must match competition format exactly"),
    ("Testing", "No ground truth for T6 validation"),
]

for challenge, description in challenges:
    print(f"   ‚Ä¢ {challenge:<20} {description}")

# Illustrative snippet only: it is printed, never executed
print("\nüíª Extraction Approach (Simplified):")
extraction_sketch = """
```python
# Step 1: Extract T6 from each table
T6_STATION_ID = 2304515

for year in [2016, 2017, 2018, 2019]:  # NOT 2020!
    # Read and filter each table
    turbine_df = pd.read_csv(f'tblSCTurbine_{year}.csv')
    t6_turbine = turbine_df[turbine_df['StationId'] == T6_STATION_ID]
    
    grid_df = pd.read_csv(f'tblSCTurGrid_{year}.csv')
    t6_grid = grid_df[grid_df['StationId'] == T6_STATION_ID]
    
    # Join tables
    t6_data = t6_turbine.merge(t6_grid, on=['TimeStamp', 'StationId'])
    
    # Reshape to competition format
    t6_data = reshape_to_wide(t6_data, turbine_id=6)
    
    # Merge with competition data
    train_with_t6 = train_df.merge(t6_data, on='TimeStamp_StartFormat')
```
"""
print(extraction_sketch)

## 6. Data Leakage Risk Assessment

In [None]:
# Which external data is, per this notebook, safe to use for the 2020 test period
print("‚ö†Ô∏è DATA LEAKAGE RISK ASSESSMENT")
print("="*80)

print("\nüî¥ CRITICAL: 2020 Test Period Considerations")
print("-" * 50)

# (data type, status, reason); status is either "FORBIDDEN" or "SAFE"
leakage_matrix = [
    ("Turbine 1 Power (2020)", "FORBIDDEN", "Target variable - direct leakage"),
    ("Turbine 1 Features (2020)", "FORBIDDEN", "Would reveal target patterns"),
    ("Turbines 2-7 Power (2020)", "SAFE", "Not target, can use for features"),
    ("Turbines 2-7 Features (2020)", "SAFE", "Available in test set"),
    ("Turbine 6 All Data (2020)", "SAFE", "Not in competition, no leakage"),
    ("Weather Data (2020)", "SAFE", "Already in test set"),
    ("All Data (2016-2019)", "SAFE", "Training period"),
]

print(f"\n{'Data Type':<30} {'Status':<12} {'Reason'}")
print("-" * 80)
for data_type, status, reason in leakage_matrix:
    if status == "FORBIDDEN":
        emoji = "üö´"
    else:
        emoji = "‚úÖ"
    print(f"{emoji} {data_type:<28} {status:<12} {reason}")

print("\nüìã Safe Implementation Checklist:")
checklist = [
    "Filter out ALL Turbine 1 data from 2020",
    "Verify date ranges after every merge",
    "Create separate train/test pipelines",
    "Add assertions to catch leakage",
    "Log all data sources and filters",
    "Test with known holdout period",
]

# Numbered from 1
for number, item in zip(range(1, len(checklist) + 1), checklist):
    print(f"   {number}. {item}")

print("\nüí° Recommended Approach:")
recommended_approach = """
1. TRAINING DATA (2016-2019):
   - Use all turbines freely
   - Extract Turbine 6 for full period
   
2. TEST FEATURES (2020):
   - Use Turbines 2,3,4,5,7 (already in test set)
   - Add Turbine 6 if extracted
   - NEVER touch Turbine 1 data
   
3. VALIDATION:
   - Create 2019 holdout to simulate test conditions
   - Verify no information from future periods
"""
print(recommended_approach)

## 7. Cost-Benefit Analysis

In [None]:
# Score benefits against costs (each item 0-5) and derive a recommendation
print("üí∞ COMPREHENSIVE COST-BENEFIT ANALYSIS")
print("="*80)

# Define scoring system
def score_bar(score, max_score=5):
    """Render `score` as a fixed-width bar of `max_score` cells.

    Args:
        score: Number of filled cells. Values outside 0..max_score are clamped
            so the bar never grows past `max_score` cells; non-integers are
            truncated with int().
        max_score: Total number of cells in the bar.

    Returns:
        A string of filled cells followed by empty cells.
    """
    filled_cells = max(0, min(int(score), max_score))
    return "‚ñà" * filled_cells + "‚ñë" * (max_score - filled_cells)

# Highest score a single item can receive; also the width of each bar
max_item_score = 5

print("\nüìà BENEFITS:")
print("-" * 60)

# (item, score out of 5, description)
benefits = [
    ("Turbine 6 Data", 5, "Critical gap - enables spatial modeling"),
    ("Additional Fields", 2, "96 vs 30 fields, but most redundant"),
    ("Raw Data Access", 3, "Full control over preprocessing"),
    ("All 21 Turbines", 2, "Limited value - too distant from T1"),
    ("Temporal Alignment", 4, "Exact match with competition period"),
]

total_benefit = 0
for item, score, description in benefits:
    print(f"{item:<20} {score_bar(score, max_item_score)} {score}/{max_item_score}")
    print(f"{'':20} {description}")
    print()
    total_benefit += score

# Denominator follows the number of items rather than a hard-coded 25
print(f"Total Benefit Score: {total_benefit}/{max_item_score * len(benefits)}")

print("\nüìâ COSTS/CHALLENGES:")
print("-" * 60)

# (item, score out of 5, description)
# NOTE(review): "~16 hours" is a fixed figure; the steps in the processing
# complexity cell sum to 17.0 hours — confirm which is intended.
costs = [
    ("Processing Complexity", 4, "~16 hours effort, high risk"),
    ("Storage Requirements", 3, "~8GB raw + processed data"),
    ("Leakage Risk", 5, "Critical - could invalidate submission"),
    ("Validation Difficulty", 4, "No ground truth for T6"),
    ("Time Investment", 4, "Could be spent on feature engineering"),
]

total_cost = 0
for item, score, description in costs:
    print(f"{item:<20} {score_bar(score, max_item_score)} {score}/{max_item_score}")
    print(f"{'':20} {description}")
    print()
    total_cost += score

print(f"Total Cost Score: {total_cost}/{max_item_score * len(costs)}")

# Net analysis
print("\n‚öñÔ∏è NET ANALYSIS:")
print("="*60)
net_score = total_benefit - total_cost
# Guard the ratio so an all-zero cost table cannot raise ZeroDivisionError
benefit_ratio = total_benefit / total_cost if total_cost else float('inf')

print(f"Net Score: {net_score:+d} (Benefits - Costs)")
print(f"Benefit/Cost Ratio: {benefit_ratio:.2f}")

# The sign of the net score alone picks the recommendation
if net_score > 0:
    recommendation = "‚úÖ PROCEED with Turbine 6 extraction"
elif net_score == 0:
    recommendation = "‚ö†Ô∏è MARGINAL - Only if time permits"
else:
    recommendation = "‚ùå SKIP - Focus on competition data"

print(f"\n{'üéØ RECOMMENDATION':^60}")
print("="*60)
print(f"{recommendation:^60}")
print("="*60)

## 8. Strategic Recommendations

In [None]:
# Narrative recommendations; this cell only prints, it computes nothing
print("üéØ STRATEGIC RECOMMENDATIONS")
print("="*80)

# NOTE(review): the tree below gates on MAE < 200 while the final verdict
# further down says MAE < 250 — confirm which threshold is intended.
decision_tree = """
üìä DATA STRATEGY DECISION TREE:

1. DO YOU HAVE A WORKING BASELINE?
   ‚îú‚îÄ NO ‚Üí Focus 100% on competition data
   ‚îÇ        Build baseline first
   ‚îÇ
   ‚îî‚îÄ YES ‚Üí 2. IS YOUR MAE < 200?
            ‚îú‚îÄ NO ‚Üí Improve feature engineering
            ‚îÇ        with competition data
            ‚îÇ
            ‚îî‚îÄ YES ‚Üí 3. DO YOU HAVE 2+ DAYS LEFT?
                     ‚îú‚îÄ NO ‚Üí Polish existing model
                     ‚îÇ
                     ‚îî‚îÄ YES ‚Üí Consider T6 extraction

"""
print(decision_tree)

print("üèÉ IMPLEMENTATION PHASES:")
print("="*60)

# (phase title, ordered task list)
phases = [
    (
        "Phase 1: Foundation (Days 1-3)",
        [
            "Build baseline with competition data",
            "Implement proper cross-validation",
            "Create core feature engineering",
            "Establish evaluation pipeline",
        ]
    ),
    (
        "Phase 2: Optimization (Days 4-5)",
        [
            "Advanced feature engineering",
            "Model ensemble strategies",
            "Hyperparameter tuning",
            "Handle edge cases (shutdowns, nulls)",
        ]
    ),
    (
        "Phase 3: Enhancement (Days 6-7)",
        [
            "IF performance plateau: Extract T6",
            "Spatial correlation features",
            "Wake effect modeling",
            "Final ensemble refinement",
        ]
    ),
]

# Title, then one bullet line per task (every phase has at least one task)
for phase_name, tasks in phases:
    print(f"\n{phase_name}")
    print("\n".join(f"  ‚Ä¢ {task}" for task in tasks))

print("\n" + "="*80)
print("üí° KEY INSIGHTS:")
key_insights = """
1. Competition data is SUFFICIENT for strong performance
2. Turbine 6 is NICE-TO-HAVE, not essential
3. Feature engineering > More data
4. Time is your most valuable resource
5. Perfect is the enemy of good enough
"""
print(key_insights)

print("üéØ FINAL VERDICT:")
print("="*60)
final_verdict = """
FOCUS ON COMPETITION DATA FIRST

The original dataset offers marginal benefits that don't 
justify the implementation complexity and risk. 

Only consider Turbine 6 extraction if:
  ‚úì You have a solid baseline (MAE < 250)
  ‚úì You've exhausted feature engineering options
  ‚úì You have 2+ days remaining
  ‚úì You're comfortable with data pipeline complexity

Remember: Many competitions are won with clever feature 
engineering on the provided data, not by adding more data.
"""
print(final_verdict)

## 9. Quick Reference Implementation Guide

In [None]:
# Printed cheat sheet; the fenced snippets inside the text are never executed
print("üìö QUICK REFERENCE GUIDE")
print("="*80)

quick_reference = """
üîß IF YOU DECIDE TO EXTRACT TURBINE 6:

```python
# Constants
TURBINE_6_STATION = 2304515
SAFE_YEARS = [2016, 2017, 2018, 2019]  # NO 2020!

# Step 1: Extract from each year
t6_data_all = []
for year in SAFE_YEARS:
    t6_year = extract_turbine_6_year(year)
    t6_data_all.append(t6_year)

# Step 2: Combine and align
t6_combined = pd.concat(t6_data_all)
t6_combined = align_to_competition_format(t6_combined)

# Step 3: Merge with training
train_with_t6 = train_df.merge(
    t6_combined, 
    on='TimeStamp_StartFormat',
    how='left'
)

# Step 4: Validate no leakage
assert train_with_t6[
    train_with_t6['TimeStamp_StartFormat'] >= '2020-01-01'
].empty, "Data leakage detected!"
```

üö´ NEVER DO THIS:
```python
# WRONG - Includes 2020 T1 data!
all_data = pd.concat([data_2016, data_2017, data_2018, data_2019, data_2020])
t1_data = all_data[all_data['turbine'] == 1]  # LEAKAGE!
```

‚úÖ CRITICAL CHECKS:
```python
# Always verify your data splits
print(f"Train max date: {train_data['timestamp'].max()}")
print(f"Test min date: {test_data['timestamp'].min()}")
assert train_data['timestamp'].max() < test_data['timestamp'].min()
```
"""
print(quick_reference)

# Closing banner: blank line, rule, one-line summary, rule
closing_rule = "="*80
print("\n" + closing_rule)
print("üìå SUMMARY: Use competition data. Consider T6 only if needed.")
print(closing_rule)

## 10. Export Analysis Results

In [None]:
# Collect the headline numbers from earlier cells and write them to JSON.
# Values produced by optional cells are looked up by name, with a fallback
# used when that cell has not been run.
defined = locals()

analysis_results = {
    "timestamp": datetime.now().isoformat(),
    "recommendation": "Focus on competition data, consider T6 only if needed",
    "competition_data": {
        "turbines": defined.get('competition_turbines', []),
        "years": "2016-2020",
        "features_per_turbine": (
            len(defined['comp_features_unique']) if 'comp_features_unique' in defined else 0
        ),
        "total_rows_train": train_df.shape[0],
        "total_rows_test": test_df.shape[0]
    },
    "original_data": {
        "years_available": available_years,
        # Fixed value; not read from the sample check in the structure-analysis cell
        "has_turbine_6": True,
        "total_turbines": 21,
        "processing_effort_hours": defined.get('total_effort', 16)
    },
    "cost_benefit": {
        "benefit_score": defined.get('total_benefit', 16),
        "cost_score": defined.get('total_cost', 20),
        "net_score": defined.get('net_score', -4),
        "recommendation": defined.get('recommendation', "Skip")
    },
    "critical_warnings": [
        "Never use Turbine 1 data from 2020",
        "Validate no data leakage",
        "Focus on competition data first"
    ]
}

# Write under <project root>/analysis, creating that folder if needed
output_path = PROJECT_ROOT / 'analysis' / 'dataset_comparison_results.json'
output_path.parent.mkdir(exist_ok=True)

# default=str stringifies anything json cannot serialise natively
output_path.write_text(json.dumps(analysis_results, indent=2, default=str))

print(f"\nüíæ Analysis results saved to: {output_path}")
print("\n‚úÖ Dataset comparison complete!")
print("\nüìå Next steps:")
for number, next_step in enumerate(
    (
        "Focus on building strong baseline with competition data",
        "Implement comprehensive feature engineering",
        "Only consider T6 extraction if performance plateaus",
    ),
    1,
):
    print(f"   {number}. {next_step}")