# V5 vs V7 Final - Ensemble Prediction Comparison

Detailed comparison of predictions on test well 15_9-F-4


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('config')
from optimization_metrics import custom_sand_weighted_f1
from sklearn.metrics import f1_score

# Read each model's saved ensemble predictions for test well 15_9-F-4.
v5, v7 = (
    pd.read_csv(predictions_csv)
    for predictions_csv in (
        'v5_full_progressive/predictions/15_9-F-4/ensemble_predictions.csv',
        'v7_final_output/predictions/15_9-F-4/ensemble_predictions.csv',
    )
)

# Echo the row count of each frame so a size mismatch is visible right away.
print(f"V5 samples: {len(v5)}")
print(f"V7 samples: {len(v7)}")


V5 samples: 1138
V7 samples: 1138


## 1. Test Well Performance

**Important Note:**
- Both models were optimized for the **custom_sand** metric (a reservoir-focused weighted F1).
- Raw accuracy is shown for reference only.
- The fair comparison is therefore the custom_sand score on the test well.


In [6]:
# Compare the V5 and V7 ensemble predictions on the test well with three metrics.
#
# Ground-truth labels are read from the V5 file and reused for V7, so first
# confirm that the V7 file carries the same labels in the same row order.
# Without this check a misaligned file would silently produce wrong V7 scores.
y_true = v5['Facies_True'].values
if not np.array_equal(y_true, v7['Facies_True'].values):
    raise ValueError(
        "V5 and V7 prediction files are not row-aligned: "
        "'Facies_True' differs between them"
    )
y_pred_v5 = v5['Facies_Predicted'].values
y_pred_v7 = v7['Facies_Predicted'].values


def _verdict(score_v5, score_v7):
    """Return 'V7 wins' or 'V5 wins' for the higher score, or 'Tie' if equal."""
    if score_v7 > score_v5:
        return 'V7 wins'
    if score_v5 > score_v7:
        return 'V5 wins'
    return 'Tie'


# 1. Raw accuracy (reference): fraction of samples predicted correctly.
acc_v5 = (y_pred_v5 == y_true).mean()
acc_v7 = (y_pred_v7 == y_true).mean()

# 2. Custom Sand F1 (PRIMARY METRIC - what models were optimized for!)
#    Computed by the project helper imported from optimization_metrics.
custom_sand_v5 = custom_sand_weighted_f1(y_true, y_pred_v5)
custom_sand_v7 = custom_sand_weighted_f1(y_true, y_pred_v7)

# 3. F1 Weighted (SECONDARY METRIC); zero_division=0 scores classes with no
#    predictions as 0 instead of emitting a warning.
f1_v5 = f1_score(y_true, y_pred_v5, average='weighted', zero_division=0)
f1_v7 = f1_score(y_true, y_pred_v7, average='weighted', zero_division=0)

print("="*70)
print("TEST WELL PERFORMANCE (15_9-F-4)")
print("="*70)

# Differences are V7 minus V5, scaled by 100 (i.e. percentage points).
print("\n1. RAW ACCURACY (reference only):")
print(f"   V5: {acc_v5:.4f} ({acc_v5*100:.2f}%)")
print(f"   V7: {acc_v7:.4f} ({acc_v7*100:.2f}%)")
print(f"   Difference: {(acc_v7-acc_v5)*100:+.2f}% → {_verdict(acc_v5, acc_v7)}")

print("\n2. CUSTOM SAND F1 (PRIMARY - what V7 optimized for!):")
print(f"   V5: {custom_sand_v5:.4f}")
print(f"   V7: {custom_sand_v7:.4f}")
print(f"   Difference: {(custom_sand_v7-custom_sand_v5)*100:+.2f}% → {_verdict(custom_sand_v5, custom_sand_v7)}")

print("\n3. F1 WEIGHTED (SECONDARY - balanced metric):")
print(f"   V5: {f1_v5:.4f}")
print(f"   V7: {f1_v7:.4f}")
print(f"   Difference: {(f1_v7-f1_v5)*100:+.2f}% → {_verdict(f1_v5, f1_v7)}")

# The overall winner is decided on the primary metric only.
print("\n" + "="*70)
print("WINNER ON TEST WELL:")
if custom_sand_v7 > custom_sand_v5:
    print(f"🏆 V7 Final (custom_sand: {custom_sand_v7:.4f} vs {custom_sand_v5:.4f})")
elif custom_sand_v5 > custom_sand_v7:
    print(f"🏆 V5 Full Progressive (custom_sand: {custom_sand_v5:.4f} vs {custom_sand_v7:.4f})")
else:
    print(f"Tie (custom_sand: {custom_sand_v5:.4f} for both)")
print("="*70)


TEST WELL PERFORMANCE (15_9-F-4)

1. RAW ACCURACY (reference only):
   V5: 0.4895 (48.95%)
   V7: 0.4534 (45.34%)
   Difference: -3.60% → V5 wins

2. CUSTOM SAND F1 (PRIMARY - what V7 optimized for!):
   V5: 0.4771
   V7: 0.4411
   Difference: -3.59% → V5 wins

3. F1 WEIGHTED (SECONDARY - balanced metric):
   V5: 0.4509
   V7: 0.4222
   Difference: -2.87% → V5 wins

WINNER ON TEST WELL:
🏆 V5 Full Progressive (custom_sand: 0.4771 vs 0.4411)


## 2. Training vs Test Performance Gap

**The gap between the training CV score and the test-well score indicates how well each model generalizes.**


In [7]:
# Training CV scores (from model rankings). These are copied in by hand, not
# recomputed here, so they must be updated if the rankings change.
# NOTE(review): the labels (FC_13, V7_Rank_04) look like individual models,
# while the test scores come from the ensemble prediction files - confirm the
# comparison is like-for-like.
v5_best_cv = 0.3917  # FC_13 custom_sand
v7_best_cv = 0.4902  # V7_Rank_04 custom_sand

# Gap = test-well score minus training CV score. A negative gap means the
# model scored lower on the test well than in cross-validation.
v5_gap = custom_sand_v5 - v5_best_cv
v7_gap = custom_sand_v7 - v7_best_cv

print("CUSTOM SAND METRIC (Primary Optimization Target):")
print("-"*70)
print(f"{'Model':<15} {'Train CV':<12} {'Test Score':<12} {'Gap':<12} {'Generalization'}")
print("-"*70)

# Rows use the same field widths as the header so the columns line up.
for _name, _cv, _test, _gap in (
    ('V5', v5_best_cv, custom_sand_v5, v5_gap),
    ('V7 Final', v7_best_cv, custom_sand_v7, v7_gap),
):
    _label = 'Good' if _gap >= 0 else 'Overfit'
    print(f"{_name:<15} {_cv:<12.4f} {_test:<12.4f} {_gap:<+12.4f} {_label}")

print("\n" + "="*70)
if custom_sand_v7 > custom_sand_v5:
    print(f"WINNER: V7 Final by {(custom_sand_v7-custom_sand_v5):.4f}")
elif custom_sand_v5 > custom_sand_v7:
    print(f"WINNER: V5 by {(custom_sand_v5-custom_sand_v7):.4f}")
else:
    # Equal test scores: do not declare a winner by a margin of 0.0000.
    print("WINNER: Tie")
print("="*70)


CUSTOM SAND METRIC (Primary Optimization Target):
----------------------------------------------------------------------
Model           Train CV     Test Score   Gap          Generalization
----------------------------------------------------------------------
V5              0.3917      0.4771      +0.0854      Good
V7 Final        0.4902      0.4411      -0.0491      Overfit

WINNER: V5 by 0.0359


## 3. Uncertainty Metrics


In [8]:
# Mean of each per-sample uncertainty column, per model.
entropy_v5 = v5['Uncertainty_Entropy'].mean()
entropy_v7 = v7['Uncertainty_Entropy'].mean()
agreement_v5 = v5['Uncertainty_Agreement'].mean()
agreement_v7 = v7['Uncertainty_Agreement'].mean()
margin_v5 = v5['Uncertainty_Margin'].mean()
margin_v7 = v7['Uncertainty_Margin'].mean()

print("Uncertainty Metrics:")
# One row per metric: (label, V5 mean, V7 mean, whether a lower value is better).
# Labels are padded to a common width so the value columns line up.
for _label, _v5_mean, _v7_mean, _lower_is_better in (
    ('Entropy (lower=better):', entropy_v5, entropy_v7, True),
    ('Agreement (higher=better):', agreement_v5, agreement_v7, False),
    ('Margin (higher=better):', margin_v5, margin_v7, False),
):
    if _v5_mean == _v7_mean:
        _outcome = 'Tie'
    elif (_v7_mean < _v5_mean) if _lower_is_better else (_v7_mean > _v5_mean):
        _outcome = 'V7 wins'
    else:
        _outcome = 'V5 wins'
    print(f"  {_label:<27}V5={_v5_mean:.4f}, V7={_v7_mean:.4f} → {_outcome}")


Uncertainty Metrics:
  Entropy (lower=better):   V5=1.1126, V7=1.1116 → V7 wins
  Agreement (higher=better): V5=0.7852, V7=0.8511 → V7 wins
  Margin (higher=better):    V5=0.3616, V7=0.3344 → V5 wins


## 4. Agreement Analysis


In [9]:
# The 2x2 breakdown below only makes sense when both files describe the same
# samples, so refuse to continue if their row counts differ.
n_samples = len(v5)
if len(v7) != n_samples:
    raise ValueError(
        f"V5 and V7 have different sample counts ({n_samples} vs {len(v7)})"
    )

# Per-sample correctness of each model, judged against the labels in its own
# file. Computed once and reused for all four counts.
v5_correct = v5['Facies_Predicted'] == v5['Facies_True']
v7_correct = v7['Facies_Predicted'] == v7['Facies_True']

both_correct = (v5_correct & v7_correct).sum()
both_wrong = (~v5_correct & ~v7_correct).sum()
v5_only = (v5_correct & ~v7_correct).sum()
v7_only = (~v5_correct & v7_correct).sum()

# All four rows share the same denominator.
print("Prediction Agreement:")
print(f"  Both correct: {both_correct}/{n_samples} ({both_correct/n_samples*100:.1f}%)")
print(f"  Both wrong: {both_wrong}/{n_samples} ({both_wrong/n_samples*100:.1f}%)")
print(f"  Only V5 correct: {v5_only}/{n_samples} ({v5_only/n_samples*100:.1f}%)")
print(f"  Only V7 correct: {v7_only}/{n_samples} ({v7_only/n_samples*100:.1f}%)")
# Positive means V7 fixes more samples than it breaks relative to V5.
print(f"\nNet improvement: {v7_only - v5_only:+d} samples")


Prediction Agreement:
  Both correct: 428/1138 (37.6%)
  Both wrong: 493/1138 (43.3%)
  Only V5 correct: 129/1138 (11.3%)
  Only V7 correct: 88/1138 (7.7%)

Net improvement: -41 samples
