# In [None]:
"""
Week 4: RAG Classifier Accuracy Testing and Optimization
Test the classifier against known decisions and optimize parameters
"""

import sys
sys.path.append('../..')

import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time
import json
from pathlib import Path

from config.settings import *
from src.rag_engine.rag_classifier import RAGPublishabilityClassifier
from src.rag_engine.vector_store import NewsVectorStore

class AccuracyTester:
    """Measure the RAG publishability classifier against known editorial decisions.

    Typical flow: ``load_test_data`` -> ``test_classifier_accuracy`` ->
    ``analyze_errors`` and, if accuracy is unsatisfactory,
    ``optimize_retrieval_parameters``.

    Attributes:
        classifier: The ``RAGPublishabilityClassifier`` under test.
        results: Empty list until ``test_classifier_accuracy`` produces at
            least one valid prediction; afterwards a DataFrame with one row
            per tested article.
    """

    # Upper bound on the number of articles drawn per class (approved /
    # rejected), regardless of the requested sample_size.
    MAX_PER_CLASS = 50

    def __init__(self):
        self.classifier = RAGPublishabilityClassifier()
        self.results = []

    def load_test_data(self, sample_size=100, random_state=None):
        """Load a balanced test set of approved/rejected articles.

        Reads ``MAIN_DATASET`` with ``pd.read_excel`` and delegates the
        labelling and sampling to ``_build_test_set``.

        Args:
            sample_size: Total number of articles requested; half are drawn
                from each class, capped at ``MAX_PER_CLASS`` per class.
            random_state: Optional seed forwarded to ``DataFrame.sample`` so
                the test set can be reproduced. ``None`` keeps the original
                non-deterministic sampling.

        Returns:
            DataFrame with the sampled rows plus ``is_approved`` and
            ``expected_decision`` columns.
        """
        df = pd.read_excel(MAIN_DATASET)
        return self._build_test_set(df, sample_size, random_state=random_state)

    def _build_test_set(self, df, sample_size, random_state=None):
        """Label *df* by approval status and draw an equal number of rows per class."""
        labelled = df.copy()
        # A row counts as approved when TrackId is present and is not the
        # literal string 'NULL'.
        labelled['is_approved'] = (
            labelled['TrackId'].notna() & (labelled['TrackId'] != 'NULL')
        )

        approved_pool = labelled[labelled['is_approved']]
        rejected_pool = labelled[~labelled['is_approved']]

        requested = min(max(sample_size // 2, 0), self.MAX_PER_CLASS)
        # DataFrame.sample raises when n exceeds the population, so shrink
        # both classes to the scarcer one and keep the set balanced.
        per_class = min(requested, len(approved_pool), len(rejected_pool))
        if per_class < requested:
            print(
                f"Warning: only {per_class} articles per class available "
                f"(requested {requested}); using a smaller balanced set."
            )

        def draw(pool):
            if per_class == 0:
                return pool.iloc[:0]
            return pool.sample(n=per_class, random_state=random_state)

        test_data = pd.concat(
            [draw(approved_pool), draw(rejected_pool)]
        ).reset_index(drop=True)
        # Expected labels use the same strings the classifier's 'decision'
        # field is compared against.
        test_data['expected_decision'] = test_data['is_approved'].map(
            {True: 'موافق', False: 'مرفوض'}
        )

        return test_data

    def _evaluate_article(self, row):
        """Classify one test row and return its result record.

        Any exception raised while classifying or reading the classifier's
        result is reported and recorded as a row whose
        ``predicted_decision`` is ``'ERROR'``, so one bad article does not
        abort the whole run.
        """
        start_time = time.perf_counter()

        try:
            result = self.classifier.classify_article(row['Story'])
            processing_time = time.perf_counter() - start_time

            return {
                'article_id': row['StoryId'],
                'expected_decision': row['expected_decision'],
                'predicted_decision': result['decision'],
                'confidence': result['confidence'],
                'processing_time': processing_time,
                'correct': result['decision'] == row['expected_decision'],
                'article_length': len(str(row['Story'])),
                # Keep only the first 200 characters of the reasoning.
                'reasoning': result['reasoning'][:200] if result['reasoning'] else ''
            }
        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            print(f"Error processing article {row['StoryId']}: {e}")
            return {
                'article_id': row['StoryId'],
                'expected_decision': row['expected_decision'],
                'predicted_decision': 'ERROR',
                'confidence': 0.0,
                'processing_time': 0.0,
                'correct': False,
                'article_length': len(str(row['Story'])),
                'reasoning': f'Error: {e}'
            }

    def test_classifier_accuracy(self, test_data, save_results=True):
        """Run the classifier on every row of *test_data* and report metrics.

        Args:
            test_data: DataFrame with ``Story``, ``StoryId`` and
                ``expected_decision`` columns.
            save_results: When true, write the per-article results to
                ``week4_accuracy_test.csv`` and the metrics to
                ``week4_metrics.json`` under ``ANALYTICS_DIR``.

        Returns:
            Dict of metrics (overall/approved/rejected accuracy, average
            confidence, average processing time and counts), or ``None``
            when there is not a single valid prediction. Rows recorded as
            ``'ERROR'`` are excluded from accuracy, confidence and timing
            but are counted in ``errors``.

        Side effects:
            On success ``self.results`` is replaced with the per-article
            results DataFrame.
        """
        print(f"Testing classifier accuracy on {len(test_data)} articles...")

        results = [
            self._evaluate_article(row)
            for _, row in tqdm(test_data.iterrows(), total=len(test_data))
        ]
        results_df = pd.DataFrame(results)

        # An empty frame has no columns, so it must be handled before the
        # 'predicted_decision' filter below.
        if results_df.empty:
            print("No valid predictions to analyze!")
            return None

        # Filter out errors for accuracy calculation
        valid_results = results_df[results_df['predicted_decision'] != 'ERROR']
        if valid_results.empty:
            print("No valid predictions to analyze!")
            return None

        accuracy = float(valid_results['correct'].mean())
        avg_confidence = float(valid_results['confidence'].mean())
        # Timing is averaged over valid predictions only, consistent with
        # the other metrics.
        avg_processing_time = float(valid_results['processing_time'].mean())

        # Separate metrics for approved/rejected
        approved_results = valid_results[valid_results['expected_decision'] == 'موافق']
        rejected_results = valid_results[valid_results['expected_decision'] == 'مرفوض']

        approved_accuracy = (
            float(approved_results['correct'].mean()) if len(approved_results) > 0 else 0.0
        )
        rejected_accuracy = (
            float(rejected_results['correct'].mean()) if len(rejected_results) > 0 else 0.0
        )

        metrics = {
            'overall_accuracy': accuracy,
            'approved_accuracy': approved_accuracy,
            'rejected_accuracy': rejected_accuracy,
            'avg_confidence': avg_confidence,
            'avg_processing_time': avg_processing_time,
            'total_tested': len(results_df),
            'valid_predictions': len(valid_results),
            'errors': len(results_df) - len(valid_results)
        }

        print("\n=== ACCURACY TEST RESULTS ===")
        print(f"Overall Accuracy: {accuracy:.2%}")
        print(f"Approved Articles Accuracy: {approved_accuracy:.2%}")
        print(f"Rejected Articles Accuracy: {rejected_accuracy:.2%}")
        print(f"Average Confidence: {avg_confidence:.2f}")
        print(f"Average Processing Time: {avg_processing_time:.2f}s")
        if avg_processing_time > 0:
            print(f"Articles per Hour: {3600/avg_processing_time:.0f}")
        else:
            print("Articles per Hour: N/A")
        print(f"Errors: {metrics['errors']}/{len(results_df)}")

        if save_results:
            output_dir = Path(ANALYTICS_DIR)
            # Create the directory up front so a fresh checkout can save.
            output_dir.mkdir(parents=True, exist_ok=True)

            results_df.to_csv(output_dir / 'week4_accuracy_test.csv', index=False)

            with open(output_dir / 'week4_metrics.json', 'w', encoding='utf-8') as f:
                json.dump(metrics, f, ensure_ascii=False, indent=2)

            print(f"\nResults saved to {ANALYTICS_DIR}")

        self.results = results_df
        return metrics

    def analyze_errors(self):
        """Print a breakdown of the incorrect predictions in ``self.results``.

        Rows recorded as ``'ERROR'`` have ``correct == False`` and a
        confidence of 0.0, so they are included in the breakdown and in the
        low-confidence count.

        Returns:
            DataFrame of the incorrect rows, or ``None`` when there are no
            results yet or every prediction was correct.
        """
        # len() works both for the initial empty list and for a DataFrame.
        if len(self.results) == 0:
            print("No results to analyze. Run test_classifier_accuracy first.")
            return None

        # Cast first: '~' on an object-dtype column would invert the values
        # as integers instead of negating them as booleans.
        is_correct = self.results['correct'].astype(bool)
        incorrect = self.results[~is_correct]

        if len(incorrect) == 0:
            print("Perfect accuracy! No errors to analyze.")
            return None

        print("\n=== ERROR ANALYSIS ===")
        print(f"Total errors: {len(incorrect)}/{len(self.results)}")

        # Analyze by expected vs predicted
        error_types = incorrect.groupby(['expected_decision', 'predicted_decision']).size()
        print("\nError breakdown:")
        for (expected, predicted), count in error_types.items():
            print(f"  Expected {expected}, Got {predicted}: {count}")

        # Analyze by confidence
        low_confidence_errors = incorrect[incorrect['confidence'] < 0.5]
        print(f"\nLow confidence errors (<0.5): {len(low_confidence_errors)}")

        # Analyze by article length
        correct_rows = self.results[is_correct]
        print(f"\nAverage length of incorrect predictions: {incorrect['article_length'].mean():.0f}")
        print(f"Average length of correct predictions: {correct_rows['article_length'].mean():.0f}")

        return incorrect

    def optimize_retrieval_parameters(self, test_data, k_values=(3, 5, 7, 10), sample_size=20):
        """Compare classifier accuracy for several ``retrieval_k`` settings.

        Each candidate is assigned to ``self.classifier.retrieval_k`` and
        evaluated with ``test_classifier_accuracy`` (without saving) on the
        same random sample of *test_data*, so differences between candidates
        are not caused by different articles.

        Args:
            test_data: DataFrame accepted by ``test_classifier_accuracy``.
            k_values: Candidate values for ``retrieval_k``.
            sample_size: Maximum number of articles evaluated per candidate.

        Returns:
            List of dicts with ``k``, ``accuracy``, ``avg_confidence`` and
            ``avg_time`` for every candidate that produced metrics, or
            ``None`` if none did.

        Side effects:
            ``self.classifier.retrieval_k`` and ``self.results`` are restored
            to their values from before the call, even if evaluation raises.
        """
        print("Optimizing RAG retrieval parameters...")

        results = []

        # One shared sample for all candidates keeps the comparison fair.
        sample_data = test_data.sample(n=min(sample_size, len(test_data)))

        original_k = self.classifier.retrieval_k
        # test_classifier_accuracy overwrites self.results; keep the results
        # of the main run so analyze_errors still refers to them afterwards.
        previous_results = self.results

        try:
            for k in k_values:
                print(f"Testing k={k}...")

                self.classifier.retrieval_k = k
                metrics = self.test_classifier_accuracy(sample_data, save_results=False)

                if metrics:
                    results.append({
                        'k': k,
                        'accuracy': metrics['overall_accuracy'],
                        'avg_confidence': metrics['avg_confidence'],
                        'avg_time': metrics['avg_processing_time']
                    })
        finally:
            self.classifier.retrieval_k = original_k
            self.results = previous_results

        if not results:
            return None

        best_result = max(results, key=lambda x: x['accuracy'])
        print(f"\nBest k value: {best_result['k']} (accuracy: {best_result['accuracy']:.2%})")

        return results

def run_week4_accuracy_tests():
    """Run the Week 4 workflow end to end.

    Builds an ``AccuracyTester``, loads a 100-article test set, measures
    accuracy, prints an error analysis and, when overall accuracy is below
    85%, runs the retrieval-parameter optimization.

    Returns:
        The metrics dict from ``test_classifier_accuracy``, or ``None`` when
        that call produced no metrics.
    """
    print("Starting Week 4 Accuracy Testing...")

    tester = AccuracyTester()

    articles = tester.load_test_data(sample_size=100)
    print(f"Loaded test dataset: {len(articles)} articles")

    metrics = tester.test_classifier_accuracy(articles)

    # Nothing to analyze or optimize without metrics.
    if not metrics:
        return None

    tester.analyze_errors()

    overall = metrics['overall_accuracy']
    target_met = not (overall < 0.85)
    if target_met:
        print(f"\nAccuracy target met: {overall:.2%} >= 85%")
    else:
        print(f"\nAccuracy {overall:.2%} below target 85%. Optimizing...")
        tester.optimize_retrieval_parameters(articles)

    return metrics

# Script entry point: run the full Week 4 workflow only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    # The returned metrics dict (or None) is kept in a module-level name.
    metrics = run_week4_accuracy_tests()