# Análise de Modelos ML - Neural Hive Mind

Notebook para análise exploratória de modelos de especialistas treinados com MLflow.

**Funcionalidades:**
- Status atual de todos os especialistas
- Comparação de versões (Production vs Staging)
- Evolução temporal de métricas
- Feature importance
- Confusion matrix
- Recomendações automatizadas

## 1. Setup e Imports

In [None]:
# Imports
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Silence every warning for the rest of the session to keep cell output readable.
warnings.filterwarnings('ignore')

# Plot styling shared by all figures in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 10})

# Constants
MLFLOW_URI = 'http://mlflow.mlflow:5000'
SPECIALISTS = [
    'technical',
    'business',
    'behavior',
    'evolution',
    'architecture',
]
# Minimum acceptable scores per metric, plus the 'improvement' ratio.
THRESHOLDS = dict(
    precision=0.75,
    recall=0.70,
    f1=0.72,
    improvement=0.05,
)

# Point the MLflow client at the tracking server
mlflow.set_tracking_uri(MLFLOW_URI)

print("MLflow Tracking URI:", MLFLOW_URI)
print("Specialists:", ', '.join(SPECIALISTS))
print("Thresholds:", THRESHOLDS)

## 2. Funções Auxiliares

In [None]:
def get_model_versions(specialist_type):
    """Return every registered MLflow model version for one specialist.

    The registry name is derived as ``"<specialist_type>-evaluator"``.
    If the registry search raises, the error is printed and an empty list
    is returned instead.
    """
    registry = mlflow.tracking.MlflowClient()
    model_name = f"{specialist_type}-evaluator"

    try:
        return registry.search_model_versions(f"name='{model_name}'")
    except Exception as err:
        print(f"Erro ao buscar vers√µes de {model_name}: {err}")
        return []

def get_model_metrics(run_id):
    """Fetch the metrics logged on a single MLflow run.

    Returns the run's metrics mapping. If the lookup raises, the error is
    printed and an empty dict is returned instead.
    """
    tracking = mlflow.tracking.MlflowClient()

    try:
        return tracking.get_run(run_id).data.metrics
    except Exception as err:
        print(f"Erro ao buscar m√©tricas do run {run_id}: {err}")
        return {}

def load_model_from_mlflow(model_name, version):
    """Load one registered model version through the MLflow sklearn flavor.

    The model is addressed as ``models:/<model_name>/<version>``. Returns
    the loaded model, or None (after printing the error) if loading fails
    for any reason.
    """
    try:
        return mlflow.sklearn.load_model(f"models:/{model_name}/{version}")
    except Exception as err:
        print(f"Erro ao carregar modelo {model_name} v{version}: {err}")
        return None

def compare_models(specialist_type, version1, version2):
    """Look up the run metrics of two versions of a specialist's model.

    Versions are matched by comparing each registered version string with
    ``str(version1)`` / ``str(version2)``.

    Returns:
        tuple: ``(metrics_of_version1, metrics_of_version2)``; an entry is
        None when the corresponding version is not registered.
    """
    wanted_first, wanted_second = str(version1), str(version2)
    first_metrics = second_metrics = None

    for entry in get_model_versions(specialist_type):
        if entry.version == wanted_first:
            first_metrics = get_model_metrics(entry.run_id)
        if entry.version == wanted_second:
            second_metrics = get_model_metrics(entry.run_id)

    return first_metrics, second_metrics

def get_all_specialists_status():
    """Build a status table with one row per entry in SPECIALISTS.

    Each row carries the Production and Staging version numbers, the
    Production metrics (precision, recall, f1, accuracy; 0 when missing)
    and the Production model's last-update time. Missing versions and a
    missing update time are reported as the string 'N/A'.

    Returns:
        pandas.DataFrame: the status table.
    """

    def summarize(specialist):
        # Collect the stage information of a single specialist as a row dict.
        production = staging = updated_at = None
        metrics = {}

        for entry in get_model_versions(specialist):
            stage = entry.current_stage
            if stage == 'Production':
                production = entry.version
                metrics = get_model_metrics(entry.run_id)
                # MLflow timestamps are in milliseconds
                updated_at = datetime.fromtimestamp(entry.last_updated_timestamp / 1000)
            elif stage == 'Staging':
                staging = entry.version

        return {
            'Specialist': specialist,
            'Production Version': production or 'N/A',
            'Staging Version': staging or 'N/A',
            'Precision': metrics.get('precision', 0),
            'Recall': metrics.get('recall', 0),
            'F1': metrics.get('f1', 0),
            'Accuracy': metrics.get('accuracy', 0),
            'Last Updated': updated_at or 'N/A'
        }

    return pd.DataFrame([summarize(name) for name in SPECIALISTS])

# Confirmation message printed once the helper definitions in this cell have executed.
print("Fun√ß√µes auxiliares carregadas com sucesso.")

## 3. Análise de Status Atual

In [None]:
# Fetch the status of every specialist (one row per specialist)
status_df = get_all_specialists_status()

# Colour the metric cells according to the configured thresholds
def highlight_metrics(row):
    """Return one CSS style string per column of ``row``.

    The 'Precision', 'Recall' and 'F1' cells are light green when the value
    reaches the matching entry in THRESHOLDS and light yellow otherwise;
    every other column gets an empty style.
    """
    threshold_key = {'Precision': 'precision', 'Recall': 'recall', 'F1': 'f1'}

    def style_for(col):
        key = threshold_key.get(col)
        if key is None:
            return ''
        if row[col] >= THRESHOLDS[key]:
            return 'background-color: lightgreen'
        return 'background-color: lightyellow'

    return [style_for(col) for col in row.index]

# Render the status table with threshold-based cell colouring
display(status_df.style.apply(highlight_metrics, axis=1))

# Flag specialists whose Production model has not been updated recently.
# 'Last Updated' holds the string 'N/A' when there is no Production model.
print("\n=== Modelos Desatualizados (> 30 dias) ===")
for idx, row in status_df.iterrows():
    if row['Last Updated'] != 'N/A':
        days_old = (datetime.now() - row['Last Updated']).days
        if days_old > 30:
            print(f"‚ö†Ô∏è  {row['Specialist']}: {days_old} dias desde √∫ltima atualiza√ß√£o")

# Flag specialists whose Production metrics are below the thresholds
print("\n=== Modelos Abaixo dos Thresholds ===")
for idx, row in status_df.iterrows():
    if row['Production Version'] == 'N/A':
        # Without a Production model the metric columns only hold placeholder
        # zeros; report the real problem instead of "0.000 < threshold".
        print(f"‚ö†Ô∏è  {row['Specialist']}: Nenhum modelo em Production")
        continue

    issues = []
    if row['Precision'] < THRESHOLDS['precision']:
        issues.append(f"Precision ({row['Precision']:.3f} < {THRESHOLDS['precision']})")
    if row['Recall'] < THRESHOLDS['recall']:
        issues.append(f"Recall ({row['Recall']:.3f} < {THRESHOLDS['recall']})")
    if row['F1'] < THRESHOLDS['f1']:
        issues.append(f"F1 ({row['F1']:.3f} < {THRESHOLDS['f1']})")

    if issues:
        print(f"‚ö†Ô∏è  {row['Specialist']}: {', '.join(issues)}")

## 4. Comparação de Versões (Production vs Staging)

In [None]:
# Compare Production vs Staging for each specialist

# Metric keys as read from the MLflow run, in the same order as the row labels.
metric_keys = ['precision', 'recall', 'f1', 'accuracy']
metric_labels = ['Precision', 'Recall', 'F1', 'Accuracy']

# Relative F1 change (in percent) that triggers a recommendation. Derived from
# THRESHOLDS so this cell stays in sync with the notebook configuration.
improvement_pct = THRESHOLDS['improvement'] * 100

for specialist in SPECIALISTS:
    versions = get_model_versions(specialist)

    prod_version = None
    staging_version = None
    prod_metrics = None
    staging_metrics = None

    for v in versions:
        if v.current_stage == 'Production':
            prod_version = v.version
            prod_metrics = get_model_metrics(v.run_id)
        elif v.current_stage == 'Staging':
            staging_version = v.version
            staging_metrics = get_model_metrics(v.run_id)

    if prod_metrics and staging_metrics:
        print(f"\n=== {specialist.upper()} - Production v{prod_version} vs Staging v{staging_version} ===")

        # Build the comparison dataframe
        comparison_df = pd.DataFrame({
            'Production': [prod_metrics.get(m, 0) for m in metric_keys],
            'Staging': [staging_metrics.get(m, 0) for m in metric_keys]
        }, index=metric_labels)

        # Compute deltas. A Production score of 0 has no meaningful relative
        # change, so the percentage is NaN instead of the inf/NaN produced by
        # dividing by zero (inf would trigger a bogus recommendation below).
        comparison_df['Delta'] = comparison_df['Staging'] - comparison_df['Production']
        comparison_df['Delta %'] = [
            round(delta / prod * 100, 2) if prod else float('nan')
            for delta, prod in zip(comparison_df['Delta'], comparison_df['Production'])
        ]

        display(comparison_df)

        # Bar chart
        fig, ax = plt.subplots(figsize=(10, 5))
        comparison_df[['Production', 'Staging']].plot(kind='bar', ax=ax)
        ax.set_title(f'{specialist.capitalize()} - Production vs Staging')
        ax.set_ylabel('Score')
        ax.set_ylim([0, 1])
        ax.legend()
        plt.xticks(rotation=0)
        plt.tight_layout()
        plt.show()

        # Recommendation based on the relative F1 change
        f1_improvement = comparison_df.loc['F1', 'Delta %']
        if pd.isna(f1_improvement):
            print("F1 de Production igual a 0 - Delta % indefinido")
        elif f1_improvement > improvement_pct:
            print(f"‚úÖ RECOMENDA√á√ÉO: Promover Staging para Production (+{f1_improvement:.1f}% F1)")
        elif f1_improvement < -improvement_pct:
            print(f"‚ö†Ô∏è  ATEN√á√ÉO: Staging √© PIOR que Production ({f1_improvement:.1f}% F1)")
    elif prod_metrics and not staging_metrics:
        print(f"\n=== {specialist.upper()} - Apenas Production dispon√≠vel ===")
    elif not prod_metrics:
        print(f"\n=== {specialist.upper()} - Nenhum modelo em Production ===")

## 5. Evolução Temporal de Métricas

In [None]:
# For each specialist, plot how the metrics evolved across model versions
for specialist in SPECIALISTS:
    versions = get_model_versions(specialist)

    if not versions:
        print(f"Nenhuma vers√£o encontrada para {specialist}")
        continue

    # Sort numerically by version (MLflow exposes the version as a string)
    versions_sorted = sorted(versions, key=lambda x: int(x.version))

    # Collect the metrics of every version; missing metrics are recorded as 0
    data = []
    for v in versions_sorted:
        metrics = get_model_metrics(v.run_id)
        data.append({
            'Version': int(v.version),
            'Precision': metrics.get('precision', 0),
            'Recall': metrics.get('recall', 0),
            'F1': metrics.get('f1', 0),
            'Accuracy': metrics.get('accuracy', 0),
            'Stage': v.current_stage
        })

    df = pd.DataFrame(data)

    # Plot
    fig, ax = plt.subplots(figsize=(12, 6))

    ax.plot(df['Version'], df['Precision'], marker='o', label='Precision', linewidth=2)
    ax.plot(df['Version'], df['Recall'], marker='s', label='Recall', linewidth=2)
    ax.plot(df['Version'], df['F1'], marker='^', label='F1', linewidth=2)
    ax.plot(df['Version'], df['Accuracy'], marker='d', label='Accuracy', linewidth=2)

    # Mark the versions currently in Production
    prod_versions = df[df['Stage'] == 'Production']
    for idx, row in prod_versions.iterrows():
        ax.axvline(x=row['Version'], color='green', linestyle='--', alpha=0.3)
        ax.text(row['Version'], 0.95, f"v{row['Version']}\n(Prod)",
                ha='center', va='top', fontsize=8, color='green')

    # Thresholds
    ax.axhline(y=THRESHOLDS['precision'], color='red', linestyle=':', alpha=0.5, label='Precision Threshold')
    ax.axhline(y=THRESHOLDS['recall'], color='orange', linestyle=':', alpha=0.5, label='Recall Threshold')
    ax.axhline(y=THRESHOLDS['f1'], color='purple', linestyle=':', alpha=0.5, label='F1 Threshold')

    ax.set_xlabel('Version')
    ax.set_ylabel('Score')
    ax.set_title(f'{specialist.capitalize()} - Evolu√ß√£o de M√©tricas')
    ax.set_ylim([0, 1])
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Trend analysis: mean F1 of the oldest versions vs. the newest versions.
    if len(df) >= 3:
        # The two windows must not overlap. With fewer than six versions a
        # fixed window of 3 shares rows between "older" and "recent" (and is
        # the very same rows for exactly three versions), which hides the
        # trend. Use at most 3 versions per side, never more than half.
        window = min(3, len(df) // 2)
        recent_f1 = df.tail(window)['F1'].mean()
        older_f1 = df.head(window)['F1'].mean()
        trend = (recent_f1 - older_f1) / older_f1 * 100 if older_f1 > 0 else 0

        if trend > 5:
            print(f"üìà {specialist}: Tend√™ncia de MELHORIA (+{trend:.1f}% F1)")
        elif trend < -5:
            print(f"üìâ {specialist}: Tend√™ncia de DEGRADA√á√ÉO ({trend:.1f}% F1)")
        else:
            print(f"‚û°Ô∏è  {specialist}: Tend√™ncia EST√ÅVEL")

    print()

## 6. Análise de Feature Importance

In [None]:
# Load the Production models and extract their feature importances

# Fallback feature names (the default 26 features), used only when the model
# itself does not record the names it was fitted with.
default_feature_names = [
    'complexity_score', 'technical_debt', 'code_quality', 'test_coverage',
    'performance_impact', 'security_risk', 'integration_complexity',
    'technical_feasibility', 'tech_stack_alignment',
    'business_value', 'roi_score', 'strategic_alignment', 'market_demand',
    'competitive_advantage', 'revenue_impact', 'cost_efficiency', 'customer_satisfaction',
    'user_experience', 'accessibility', 'usability_score', 'user_engagement', 'adoption_rate',
    'scalability', 'maintainability', 'extensibility', 'future_proof'
]

for specialist in SPECIALISTS:
    model_name = f"{specialist}-evaluator"

    # Find the version in Production
    versions = get_model_versions(specialist)
    prod_version = None

    for v in versions:
        if v.current_stage == 'Production':
            prod_version = v.version
            break

    if not prod_version:
        print(f"Nenhum modelo em Production para {specialist}")
        continue

    # Load the model
    model = load_model_from_mlflow(model_name, prod_version)

    if model is None:
        print(f"Falha ao carregar modelo {model_name} v{prod_version}")
        continue

    # Only some estimators expose feature_importances_
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_

        # Prefer the column names scikit-learn stored at fit time: they are
        # guaranteed to line up with the importances, whereas the default list
        # is only an assumption about column order.
        fitted_names = getattr(model, 'feature_names_in_', None)
        if fitted_names is not None and len(fitted_names) == len(importances):
            feature_names = list(fitted_names)
        else:
            # Trim the defaults to the model's feature count, and pad with
            # generic names when the model has more features than the default
            # list, so both DataFrame columns always have the same length.
            feature_names = default_feature_names[:len(importances)]
            feature_names += [
                f'feature_{i}' for i in range(len(feature_names), len(importances))
            ]

        # Build the dataframe
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values('Importance', ascending=False)

        # Top 15
        top_15 = importance_df.head(15)

        # Plot
        fig, ax = plt.subplots(figsize=(10, 8))
        ax.barh(top_15['Feature'], top_15['Importance'], color='steelblue')
        ax.set_xlabel('Importance')
        ax.set_title(f'{specialist.capitalize()} - Top 15 Features')
        ax.invert_yaxis()
        plt.tight_layout()
        plt.show()

        print(f"\nTop 5 features para {specialist}:")
        for idx, row in top_15.head(5).iterrows():
            print(f"  {row['Feature']}: {row['Importance']:.4f}")
    else:
        print(f"Modelo {specialist} n√£o suporta feature_importances_ (pode ser neural network)")

    print()

## 11. Recomendações Automatizadas

In [None]:
def generate_recommendations():
    """Build the prioritized list of automated model recommendations.

    For every specialist in SPECIALISTS the registered versions are
    inspected and a recommendation is emitted when:

    * no version is in the Production stage (critical);
    * a Production version exists but no metrics could be read for its
      run (critical);
    * Production precision, recall or F1 is below THRESHOLDS (high);
    * the Production version was last updated more than 45 days ago
      (medium);
    * Staging F1 is more than 8% better (high) or more than 5% worse
      (medium) than Production F1, relative to Production.

    Returns:
        list[dict]: items with the keys 'severity', 'specialist' and
        'message', ordered critical, high, medium, low.
    """
    recommendations = []

    def add(severity, specialist, message):
        # Append one recommendation entry in the shape consumers expect.
        recommendations.append({
            'severity': severity,
            'specialist': specialist,
            'message': message
        })

    # (metric key, label used in the message) for the threshold checks
    threshold_checks = (('precision', 'precision'), ('recall', 'recall'), ('f1', 'F1'))

    for specialist in SPECIALISTS:
        versions = get_model_versions(specialist)

        prod_version = None
        staging_metrics = None
        prod_metrics = None
        prod_timestamp = None

        for v in versions:
            if v.current_stage == 'Production':
                prod_version = v.version
                prod_metrics = get_model_metrics(v.run_id)
                # MLflow timestamps are in milliseconds
                prod_timestamp = datetime.fromtimestamp(v.last_updated_timestamp / 1000)
            elif v.current_stage == 'Staging':
                staging_metrics = get_model_metrics(v.run_id)

        # No version in the Production stage at all
        if prod_version is None:
            add('critical', specialist,
                f"CRITICAL: Nenhum modelo em Production para {specialist}")
            continue

        # A Production version exists but its metrics are empty or could not
        # be fetched. This is a different problem from "no model", and the
        # threshold checks below would only compare placeholder zeros.
        if not prod_metrics:
            add('critical', specialist,
                f"CRITICAL: No metrics found for {specialist} Production v{prod_version}")
            continue

        # Threshold checks on the Production metrics
        for key, label in threshold_checks:
            value = prod_metrics.get(key, 0)
            if value < THRESHOLDS[key]:
                add('high', specialist,
                    f"Retrain {specialist}: {label} abaixo do threshold ({value:.3f} < {THRESHOLDS[key]})")

        # Stale Production model
        if prod_timestamp:
            days_old = (datetime.now() - prod_timestamp).days
            if days_old > 45:
                add('medium', specialist,
                    f"Update {specialist}: modelo desatualizado ({days_old} dias desde √∫ltima atualiza√ß√£o)")

        # Compare Staging against Production (relative F1 change in percent)
        if staging_metrics:
            staging_f1 = staging_metrics.get('f1', 0)
            prod_f1 = prod_metrics.get('f1', 0)

            # Skip the comparison when Production F1 is 0: the relative
            # change is undefined.
            if prod_f1 > 0:
                improvement = (staging_f1 - prod_f1) / prod_f1 * 100

                if improvement > 8:
                    add('high', specialist,
                        f"Promote {specialist} Staging to Production: +{improvement:.1f}% improvement in F1")
                elif improvement < -5:
                    add('medium', specialist,
                        f"Rollback {specialist}: Staging √© PIOR que Production ({improvement:.1f}% F1)")

    # Most severe first; list.sort is stable, so per-severity order is kept
    severity_order = {'critical': 0, 'high': 1, 'medium': 2, 'low': 3}
    recommendations.sort(key=lambda x: severity_order[x['severity']])

    return recommendations

# Generate and display the recommendations
recommendations = generate_recommendations()

print("\n=== RECOMENDA√á√ïES AUTOMATIZADAS ===")
print(f"Total: {len(recommendations)} recomenda√ß√µes\n")

# Marker shown in front of each severity level
severity_emoji = {
    'critical': 'üî¥',
    'high': 'üü†',
    'medium': 'üü°',
    'low': 'üîµ'
}

for rec in recommendations:
    level = rec['severity']
    print(f"{severity_emoji[level]} [{level.upper()}] {rec['message']}")

if len(recommendations) == 0:
    print("‚úÖ Nenhuma recomenda√ß√£o - todos os modelos est√£o OK!")

In [None]:
## 12. Export de Relatório HTML

## 10. Análise de Feedback Humano (se disponível)

In [None]:
# Aggregate the Production metrics of every specialist for a comparative analysis
metrics_data = []

for specialist in SPECIALISTS:
    versions = get_model_versions(specialist)

    prod_version = None
    prod_metrics = {}

    for v in versions:
        if v.current_stage == 'Production':
            prod_version = v.version
            prod_metrics = get_model_metrics(v.run_id)
            break

    # Specialists without Production metrics are left out of the table
    if prod_metrics:
        metrics_data.append({
            'Specialist': specialist,
            'Precision': prod_metrics.get('precision', 0),
            'Recall': prod_metrics.get('recall', 0),
            'F1': prod_metrics.get('f1', 0),
            'Accuracy': prod_metrics.get('accuracy', 0)
        })

# Build the aggregated DataFrame
cross_df = pd.DataFrame(metrics_data)

if not cross_df.empty:
    print("=== M√©tricas Agregadas de Todos os Especialistas ===")
    display(cross_df)

    # One boxplot per metric
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    metrics_to_plot = ['Precision', 'Recall', 'F1', 'Accuracy']
    colors = ['steelblue', 'coral', 'lightgreen', 'gold']

    # Threshold line per metric. Taken from THRESHOLDS so the plot cannot
    # drift from the notebook configuration; THRESHOLDS has no accuracy
    # entry, so that value stays local to this cell.
    threshold_map = {
        'Precision': THRESHOLDS['precision'],
        'Recall': THRESHOLDS['recall'],
        'F1': THRESHOLDS['f1'],
        'Accuracy': 0.70
    }

    for idx, metric in enumerate(metrics_to_plot):
        ax = axes[idx // 2, idx % 2]

        # Boxplot
        bp = ax.boxplot([cross_df[metric]], labels=[metric], patch_artist=True)
        bp['boxes'][0].set_facecolor(colors[idx])

        # Scatter of the individual points (all at x=1, the box position)
        y_values = cross_df[metric].values
        x_values = np.ones(len(y_values))
        ax.scatter(x_values, y_values, color='darkblue', s=100, zorder=3, alpha=0.6)

        # Label each point with its specialist
        for i, name in enumerate(cross_df['Specialist']):
            ax.text(1.05, y_values[i], name, fontsize=9, va='center')

        # Threshold line
        if metric in threshold_map:
            ax.axhline(y=threshold_map[metric], color='red', linestyle='--',
                       linewidth=2, label=f'Threshold ({threshold_map[metric]})')
            ax.legend()

        ax.set_ylabel('Score')
        ax.set_title(f'Distribui√ß√£o de {metric} entre Especialistas')
        ax.set_ylim([0, 1])
        ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Correlation heatmap between specialists: the frame is transposed so each
    # specialist becomes a column and the four metrics are the observations.
    corr_matrix = cross_df[['Precision', 'Recall', 'F1', 'Accuracy']].T.corr()

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                xticklabels=cross_df['Specialist'],
                yticklabels=cross_df['Specialist'],
                vmin=-1, vmax=1, center=0, ax=ax)
    ax.set_title('Correla√ß√£o de M√©tricas entre Especialistas')
    plt.tight_layout()
    plt.show()

    # Descriptive statistics
    print("\n=== Estat√≠sticas Descritivas ===")
    print(cross_df[['Precision', 'Recall', 'F1', 'Accuracy']].describe())

    # Rank the specialists and report the best and the worst by F1
    print("\n=== Ranking por F1 Score ===")
    ranked = cross_df.sort_values('F1', ascending=False)
    for idx, row in ranked.iterrows():
        print(f"{row['Specialist']}: F1={row['F1']:.3f}")

    best_specialist = ranked.iloc[0]['Specialist']
    worst_specialist = ranked.iloc[-1]['Specialist']
    print(f"\nMelhor: {best_specialist} (F1={ranked.iloc[0]['F1']:.3f})")
    print(f"Pior: {worst_specialist} (F1={ranked.iloc[-1]['F1']:.3f})")

else:
    print("Nenhuma m√©trica dispon√≠vel para an√°lise cross-specialist")

## 9. Análise Cross-Specialist

In [None]:
# Analyze the distribution of confidence scores and risk scores for the
# Production model of each specialist. Specialists without a Production
# version, a loadable model, the dataset file or the label column are
# skipped silently.
for specialist in SPECIALISTS:
    model_name = f"{specialist}-evaluator"
    
    # Find the version in Production (first match wins)
    versions = get_model_versions(specialist)
    prod_version = None
    
    for v in versions:
        if v.current_stage == 'Production':
            prod_version = v.version
            break
    
    if not prod_version:
        continue
    
    # Load the model
    model = load_model_from_mlflow(model_name, prod_version)
    if model is None:
        continue
    
    # Try to load the dataset
    # NOTE(review): the path points at /data/training/..._base.parquet; confirm
    # this is the intended evaluation data and not the data the model was fit on.
    dataset_path = f"/data/training/specialist_{specialist}_base.parquet"
    try:
        import os
        if not os.path.exists(dataset_path):
            continue
        
        df = pd.read_parquet(dataset_path)
        
        if 'recommendation' not in df.columns:
            continue
        
        # Separate the features: every column except the label is fed to the model
        label_col = 'recommendation'
        feature_cols = [col for col in df.columns if col != label_col]
        X = df[feature_cols]
        
        # Get class probabilities (only if the model supports it)
        if hasattr(model, 'predict_proba'):
            probas = model.predict_proba(X)
            
            # Confidence score = highest class probability of each row
            confidence_scores = np.max(probas, axis=1)
            
            # Risk score = 1 - confidence (inversely proportional)
            risk_scores = 1 - confidence_scores
            
            # Create the 2x2 grid of subplots
            fig, axes = plt.subplots(2, 2, figsize=(14, 10))
            
            # Histogram of confidence, with the mean marked
            axes[0, 0].hist(confidence_scores, bins=50, color='steelblue', edgecolor='black')
            axes[0, 0].set_xlabel('Confidence Score')
            axes[0, 0].set_ylabel('Frequ√™ncia')
            axes[0, 0].set_title(f'{specialist.capitalize()} - Distribui√ß√£o de Confidence')
            axes[0, 0].axvline(confidence_scores.mean(), color='red', linestyle='--', 
                               label=f'M√©dia: {confidence_scores.mean():.3f}')
            axes[0, 0].legend()
            
            # Histogram of risk, with the mean marked
            axes[0, 1].hist(risk_scores, bins=50, color='coral', edgecolor='black')
            axes[0, 1].set_xlabel('Risk Score')
            axes[0, 1].set_ylabel('Frequ√™ncia')
            axes[0, 1].set_title(f'{specialist.capitalize()} - Distribui√ß√£o de Risk')
            axes[0, 1].axvline(risk_scores.mean(), color='red', linestyle='--',
                               label=f'M√©dia: {risk_scores.mean():.3f}')
            axes[0, 1].legend()
            
            # Scatter: confidence vs risk. Because risk is computed above as
            # 1 - confidence, every point lies on the dashed reference line.
            axes[1, 0].scatter(confidence_scores, risk_scores, alpha=0.5, s=10)
            axes[1, 0].set_xlabel('Confidence Score')
            axes[1, 0].set_ylabel('Risk Score')
            axes[1, 0].set_title(f'{specialist.capitalize()} - Confidence vs Risk')
            axes[1, 0].plot([0, 1], [1, 0], 'r--', label='Te√≥rico (Risk = 1 - Confidence)')
            axes[1, 0].legend()
            
            # Boxplot of confidence grouped by predicted class
            predictions = model.predict(X)
            df_plot = pd.DataFrame({
                'prediction': predictions,
                'confidence': confidence_scores
            })
            
            # Order the classes by severity. Predictions outside class_order
            # become NaN in the categorical column.
            class_order = ['approve', 'approve_with_conditions', 'review_required', 'reject']
            df_plot['prediction'] = pd.Categorical(df_plot['prediction'], categories=class_order, ordered=True)
            
            df_plot.boxplot(column='confidence', by='prediction', ax=axes[1, 1])
            axes[1, 1].set_xlabel('Predi√ß√£o')
            axes[1, 1].set_ylabel('Confidence Score')
            axes[1, 1].set_title(f'{specialist.capitalize()} - Confidence por Classe')
            # Make this axes current so plt.xticks applies to it
            plt.sca(axes[1, 1])
            plt.xticks(rotation=15, ha='right')
            
            plt.suptitle('')  # Remove the automatic title added by the boxplot
            plt.tight_layout()
            plt.show()
            
            # Summary statistics
            print(f"\n=== {specialist.upper()} - Estat√≠sticas de Scores ===")
            print(f"Confidence - M√©dia: {confidence_scores.mean():.3f}, Std: {confidence_scores.std():.3f}")
            print(f"Confidence - Min: {confidence_scores.min():.3f}, Max: {confidence_scores.max():.3f}")
            print(f"Risk - M√©dia: {risk_scores.mean():.3f}, Std: {risk_scores.std():.3f}")
            
            # Count high-uncertainty predictions (confidence below 0.6)
            low_confidence_count = (confidence_scores < 0.6).sum()
            print(f"Predi√ß√µes com baixa confidence (<0.6): {low_confidence_count} ({low_confidence_count/len(confidence_scores)*100:.1f}%)")
            
        else:
            print(f"{specialist}: Modelo n√£o suporta predict_proba, pulando distribui√ß√£o de scores")
    
    # Any failure while reading data, predicting or plotting is reported and
    # the loop moves on to the next specialist.
    except Exception as e:
        print(f"Erro ao processar {specialist}: {e}")
        continue
    
    print()

## 8. Distribuição de Scores de Predição

In [None]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Load the dataset of each specialist and compute the confusion matrix of its
# Production model. Every reason for skipping a specialist is printed.
for specialist in SPECIALISTS:
    model_name = f"{specialist}-evaluator"
    
    # Find the version in Production (first match wins)
    versions = get_model_versions(specialist)
    prod_version = None
    
    for v in versions:
        if v.current_stage == 'Production':
            prod_version = v.version
            break
    
    if not prod_version:
        print(f"Nenhum modelo em Production para {specialist}")
        continue
    
    # Load the model
    model = load_model_from_mlflow(model_name, prod_version)
    
    if model is None:
        print(f"Falha ao carregar modelo {model_name} v{prod_version}")
        continue
    
    # Try to load the dataset
    # NOTE(review): the path points at /data/training/..._base.parquet; confirm
    # this is a held-out validation set and not the data the model was fit on.
    dataset_path = f"/data/training/specialist_{specialist}_base.parquet"
    try:
        import os
        if not os.path.exists(dataset_path):
            print(f"Dataset n√£o encontrado: {dataset_path}")
            print(f"Pulando confusion matrix para {specialist}")
            continue
        
        df = pd.read_parquet(dataset_path)
        
        # The labels are expected in the 'recommendation' column
        if 'recommendation' not in df.columns:
            print(f"Coluna 'recommendation' n√£o encontrada em dataset de {specialist}")
            continue
        
        # Separate features and labels: every other column is fed to the model
        label_col = 'recommendation'
        feature_cols = [col for col in df.columns if col != label_col]
        
        X = df[feature_cols]
        y_true = df[label_col]
        
        # Make predictions
        y_pred = model.predict(X)
        
        # Compute the confusion matrix with a fixed label order
        labels = ['approve', 'approve_with_conditions', 'review_required', 'reject']
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        
        # Plot the confusion matrix (rows labelled 'Real', columns 'Predição')
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=labels, yticklabels=labels, ax=ax)
        ax.set_xlabel('Predi√ß√£o')
        ax.set_ylabel('Real')
        ax.set_title(f'{specialist.capitalize()} - Confusion Matrix (v{prod_version})')
        plt.tight_layout()
        plt.show()
        
        # Per-class score: diagonal entry divided by its row total (0 for an
        # empty row).
        # NOTE(review): with rows as true labels this is per-class recall,
        # although the output calls it accuracy.
        print(f"\nAccuracy por classe para {specialist}:")
        for i, label in enumerate(labels):
            class_accuracy = cm[i, i] / cm[i, :].sum() if cm[i, :].sum() > 0 else 0
            print(f"  {label}: {class_accuracy:.2%}")
        
    # Any failure while reading data, predicting or plotting is reported and
    # the loop moves on to the next specialist.
    except Exception as e:
        print(f"Erro ao processar {specialist}: {e}")
        continue
    
    print()

## 7. Confusion Matrix por Especialista

## 8. Export de Relatório HTML

In [None]:
import html
import os
from datetime import datetime

# Create the reports directory if it does not exist. The report is written
# to a 'reports' folder under the current working directory.
reports_dir = os.path.join(os.getcwd(), 'reports')
os.makedirs(reports_dir, exist_ok=True)

# One timestamp for both the file name and the "Generated" line
generated_at = datetime.now()
timestamp = generated_at.strftime('%Y%m%d_%H%M%S')
report_file = os.path.join(reports_dir, f'model_analysis_{timestamp}.html')

# Recommendation list items. The text is escaped because the messages contain
# characters such as '<' (e.g. "0.700 < 0.75") that are markup in HTML.
recommendation_items = ''.join(
    f'<li class="{html.escape(rec["severity"])}">'
    f'[{html.escape(rec["severity"].upper())}] {html.escape(rec["message"])}</li>\n'
    for rec in recommendations
)

# Build the HTML. The charset is declared so browsers decode the non-ASCII
# text in the messages correctly; the threshold list is rendered from
# THRESHOLDS so the report always matches the values used by the analysis.
html_content = f"""
<html>
<head>
    <meta charset="utf-8">
    <title>Model Analysis Report - {timestamp}</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; }}
        h1 {{ color: #2c3e50; }}
        h2 {{ color: #34495e; margin-top: 30px; }}
        table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        th {{ background-color: #3498db; color: white; }}
        .critical {{ color: #e74c3c; font-weight: bold; }}
        .high {{ color: #e67e22; font-weight: bold; }}
        .medium {{ color: #f39c12; }}
        .low {{ color: #3498db; }}
    </style>
</head>
<body>
    <h1>Neural Hive Mind - Model Analysis Report</h1>
    <p><strong>Generated:</strong> {generated_at.strftime('%Y-%m-%d %H:%M:%S')}</p>
    <p><strong>MLflow URI:</strong> {html.escape(MLFLOW_URI)}</p>
    
    <h2>Status Summary</h2>
    {status_df.to_html(index=False)}
    
    <h2>Recommendations</h2>
    <ul>
{recommendation_items}
    </ul>
    
    <h2>Thresholds</h2>
    <ul>
        <li>Precision: >= {THRESHOLDS['precision']:.2f}</li>
        <li>Recall: >= {THRESHOLDS['recall']:.2f}</li>
        <li>F1 Score: >= {THRESHOLDS['f1']:.2f}</li>
        <li>Improvement: >= {THRESHOLDS['improvement']:.0%}</li>
    </ul>
</body>
</html>
"""

# Save the report. The encoding is explicit so the output does not depend on
# the platform default, which may be unable to encode the message text.
with open(report_file, 'w', encoding='utf-8') as f:
    f.write(html_content)

print(f"\n‚úÖ Relat√≥rio HTML salvo em: {report_file}")
print(f"Para visualizar: open {report_file}")