In [5]:
import torch
import clip
from PIL import Image
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple

class CLIPEvaluator:
    def __init__(self):
        # Initialize paths using your existing structure
        self.base_path = os.path.join('..', 'images')
        self.results_path = os.path.join('..', 'output_files')
        self.prompts_file = os.path.join(self.results_path, 'generated_prompts.json')
        
        # Create results directory for analysis outputs
        self.analysis_path = os.path.join(self.results_path, 'clip_analysis')
        os.makedirs(self.analysis_path, exist_ok=True)
        
        # Initialize CLIP model and attributes
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        model, preprocess = clip.load('ViT-L/14@336px', device=self.device)
        self.model = model
        self.preprocess = preprocess
        
        # Template names
        self.templates = [
            "Basic Object Focus",
            "Contextual Scene",
            "Educational Layout", 
            "Multi-Level Detail",
            "Grid Layout"
        ]
        
    def load_prompts(self) -> List[Dict]:
        """Load the generated prompts from JSON file"""
        try:
            with open(self.prompts_file, 'r') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading prompts file: {str(e)}")
            return []
        
        # Create results directory for analysis outputs
        self.analysis_path = os.path.join(self.results_path, 'clip_analysis')
        os.makedirs(self.analysis_path, exist_ok=True)
        
        # Create results directory for analysis outputs
        self.analysis_path = os.path.join(self.results_path, 'clip_analysis')
        os.makedirs(self.analysis_path, exist_ok=True)
        
        # Load CLIP model
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        model, preprocess = clip.load('ViT-L/14@336px', device=self.device)
        self.model = model
        self.preprocess = preprocess
        
        self.templates = [
            "Basic Object Focus",
            "Contextual Scene",
            "Educational Layout", 
            "Multi-Level Detail",
            "Grid Layout"
        ]

    def analyze_template_performance(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Comprehensive template analysis considering multiple metrics:
        1. Mean CLIP score - Overall performance
        2. Standard deviation - Consistency
        3. Success rate - Reliability
        4. Top performance count - How often each template is the best
        5. Bottom performance count - How often each template is the worst
        """
        # Calculate basic statistics
        stats = df.groupby('template')['clip_score'].agg([
            ('mean_score', 'mean'),
            ('std_dev', 'std'),
            ('count', 'count')
        ])
        
        # Calculate success rate (percentage of images above mean CLIP score)
        mean_clip = df['clip_score'].mean()
        success_rates = []
        
        for template in self.templates:
            template_scores = df[df['template'] == template]['clip_score']
            success_rate = (template_scores > mean_clip).mean() * 100
            success_rates.append(success_rate)
        
        stats['success_rate'] = success_rates
        
        # Count times each template performs best/worst per sample
        best_counts = []
        worst_counts = []
        
        for _, sample_group in df.groupby('sample_id'):
            best_template = sample_group.loc[sample_group['clip_score'].idxmax(), 'template']
            worst_template = sample_group.loc[sample_group['clip_score'].idxmin(), 'template']
            
            for template in self.templates:
                best_counts.append((template, template == best_template))
                worst_counts.append((template, template == worst_template))
        
        best_df = pd.DataFrame(best_counts, columns=['template', 'is_best'])
        worst_df = pd.DataFrame(worst_counts, columns=['template', 'is_worst'])
        
        stats['times_best'] = best_df.groupby('template')['is_best'].sum()
        stats['times_worst'] = worst_df.groupby('template')['is_worst'].sum()
        
        # Calculate a composite score (you can adjust weights based on importance)
        stats['composite_score'] = (
            0.4 * stats['mean_score'] +  # Overall performance
            0.2 * (1 / stats['std_dev']) +  # Consistency (inverse of std dev)
            0.2 * (stats['success_rate'] / 100) +  # Success rate
            0.1 * (stats['times_best'] / len(df['sample_id'].unique())) +  # Best performance rate
            0.1 * (1 - stats['times_worst'] / len(df['sample_id'].unique()))  # Inverse of worst performance rate
        )
        
        return stats.round(4)

    def visualize_results(self, df: pd.DataFrame, stats: pd.DataFrame):
        """Create comprehensive visualizations for the analysis"""
        # 1. Template Performance Distribution (Box Plot)
        plt.figure(figsize=(12, 6))
        sns.boxplot(x='template', y='clip_score', data=df)
        plt.xticks(rotation=45)
        plt.title('CLIP Score Distribution by Template')
        plt.tight_layout()
        plt.savefig(os.path.join(self.analysis_path, 'template_distribution.png'))
        plt.close()

        # 2. Success Rate Comparison (Bar Plot)
        plt.figure(figsize=(10, 6))
        stats['success_rate'].plot(kind='bar')
        plt.title('Template Success Rates')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(self.analysis_path, 'success_rates.png'))
        plt.close()

        # 3. Composite Score Visualization
        plt.figure(figsize=(10, 6))
        stats['composite_score'].plot(kind='bar')
        plt.title('Template Composite Scores')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(self.analysis_path, 'composite_scores.png'))
        plt.close()

    def get_clip_score(self, image_path: str, text: str) -> float:
        """Calculate CLIP score for a single image-text pair"""
        try:
            # Load and preprocess image
            image = Image.open(image_path)
            image_input = self.preprocess(image).unsqueeze(0).to(self.device)
            
            # Prepare text
            text_input = clip.tokenize([text]).to(self.device)
            
            # Generate embeddings
            with torch.no_grad():
                image_features = self.model.encode_image(image_input)
                text_features = self.model.encode_text(text_input)
            
            # Normalize and calculate similarity
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            
            return torch.matmul(image_features, text_features.T).item()
            
        except Exception as e:
            print(f"Error processing {image_path}: {str(e)}")
            return 0.0

    def evaluate_all_images(self):
        """Evaluate all images and perform comprehensive analysis"""
        results = []
        prompts_data = self.load_prompts()
        
        # Process each sample
        for sample in tqdm(prompts_data, desc="Evaluating samples"):
            sample_id = sample['id']
            simplified_text = sample['simplified_text']
            
            for template_prompt in sample['template_prompts']:
                template_name = template_prompt['template_name']
                
                # Check both image organizations
                image_name = f"{template_name.lower().replace(' ', '_')}.png"
                sample_path = os.path.join(self.base_path, 'by_sample', sample_id, image_name)
                
                if os.path.exists(sample_path):
                    clip_score = self.get_clip_score(sample_path, simplified_text)
                    results.append({
                        'sample_id': sample_id,
                        'template': template_name,
                        'clip_score': clip_score
                    })
        
        # Convert to DataFrame
        df = pd.DataFrame(results)
        
        # Perform comprehensive analysis
        stats = self.analyze_template_performance(df)
        
        # Create visualizations
        self.visualize_results(df, stats)
        
        # Save detailed results
        df.to_csv(os.path.join(self.analysis_path, 'clip_scores_detailed.csv'), index=False)
        stats.to_csv(os.path.join(self.analysis_path, 'clip_scores_analysis.csv'))
        
        # Print comprehensive report
        self.print_analysis_report(stats)
        
        return df, stats

    def print_analysis_report(self, stats: pd.DataFrame):
        """Print a detailed analysis report"""
        print("\n=== TEMPLATE EVALUATION REPORT ===\n")
        
        # Overall best template
        best_template = stats['composite_score'].idxmax()
        print(f"Best Overall Template: {best_template}")
        print(f"Composite Score: {stats.loc[best_template, 'composite_score']:.4f}\n")
        
        print("Detailed Metrics:")
        print("-----------------")
        for template in self.templates:
            print(f"\n{template}:")
            print(f"  Mean CLIP Score: {stats.loc[template, 'mean_score']:.4f}")
            print(f"  Standard Deviation: {stats.loc[template, 'std_dev']:.4f}")
            print(f"  Success Rate: {stats.loc[template, 'success_rate']:.1f}%")
            print(f"  Times Best: {stats.loc[template, 'times_best']}")
            print(f"  Times Worst: {stats.loc[template, 'times_worst']}")
        
        print("\nAnalysis files saved in:", self.analysis_path)

def main():
    """Main execution function"""
    try:
        evaluator = CLIPEvaluator()
        results_df, stats = evaluator.evaluate_all_images()
        print("\nEvaluation completed successfully!")
        
    except Exception as e:
        print(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()

Using device: cpu


Evaluating samples: 100%|██████████| 100/100 [16:00<00:00,  9.61s/it]



=== TEMPLATE EVALUATION REPORT ===

Best Overall Template: Basic Object Focus
Composite Score: 5.3112

Detailed Metrics:
-----------------

Basic Object Focus:
  Mean CLIP Score: 0.2108
  Standard Deviation: 0.0400
  Success Rate: 56.6%
  Times Best: 25
  Times Worst: 11

Contextual Scene:
  Mean CLIP Score: 0.2099
  Standard Deviation: 0.0439
  Success Rate: 52.0%
  Times Best: 27
  Times Worst: 15

Educational Layout:
  Mean CLIP Score: 0.2024
  Standard Deviation: 0.0497
  Success Rate: 46.5%
  Times Best: 17
  Times Worst: 28

Multi-Level Detail:
  Mean CLIP Score: 0.2005
  Standard Deviation: 0.0431
  Success Rate: 48.5%
  Times Best: 19
  Times Worst: 13

Grid Layout:
  Mean CLIP Score: 0.1988
  Standard Deviation: 0.0424
  Success Rate: 44.8%
  Times Best: 11
  Times Worst: 32

Analysis files saved in: ..\output_files\clip_analysis

Evaluation completed successfully!


In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Read the data
data = pd.DataFrame({
    'Template': ['Basic Object Focus', 'Contextual Scene', 'Educational Layout', 'Grid Layout', 'Multi-Level Detail'],
    'CLIP Score': [0.2108, 0.2099, 0.2024, 0.1988, 0.2005],
    'Std Dev': [0.04, 0.0439, 0.0497, 0.0424, 0.0431],
    'Success Rate': [56.5657, 52.0408, 46.4646, 44.7917, 48.4536],
    'Times Best': [25, 27, 17, 11, 19],
    'Times Worst': [11, 15, 28, 32, 13],
    'Composite Score': [5.3112, 4.8541, 4.2894, 4.9646, 4.9284]
})

# Set basic style parameters
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.labelsize'] = 12

# Figure 1: CLIP Scores with error bars
plt.figure(figsize=(10, 6))
plt.grid(True, linestyle='--', alpha=0.7)
plt.errorbar(data['Template'], data['CLIP Score'], 
            yerr=data['Std Dev'], 
            fmt='o', capsize=5, 
            capthick=1.5, elinewidth=1.5,
            color='royalblue',
            ecolor='darkblue')
plt.xticks(rotation=45, ha='right')
plt.ylabel('CLIP Score')
plt.title('CLIP Scores by Template with Standard Deviation')
plt.tight_layout()
plt.savefig('clip_scores.pdf', bbox_inches='tight', dpi=300)
plt.close()

# Figure 2: Success Rates
plt.figure(figsize=(10, 6))
plt.grid(True, linestyle='--', alpha=0.7)
bars = plt.bar(data['Template'], data['Success Rate'], color='royalblue', alpha=0.7)
plt.axhline(y=50, color='red', linestyle='--', alpha=0.5, label='50% threshold')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Success Rate (%)')
plt.title('Template Success Rates')
# Add value labels on top of bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.1f}%',
             ha='center', va='bottom')
plt.legend()
plt.tight_layout()
plt.savefig('success_rates.pdf', bbox_inches='tight', dpi=300)
plt.close()

# Figure 3: Best/Worst Analysis
plt.figure(figsize=(10, 6))
plt.grid(True, linestyle='--', alpha=0.7)
x = np.arange(len(data['Template']))
width = 0.35
plt.bar(x - width/2, data['Times Best'], width, label='Times Best', color='forestgreen', alpha=0.6)
plt.bar(x + width/2, data['Times Worst'], width, label='Times Worst', color='crimson', alpha=0.6)
plt.xticks(x, data['Template'], rotation=45, ha='right')
plt.ylabel('Count')
plt.title('Best vs Worst Performance by Template')
plt.legend()
plt.tight_layout()
plt.savefig('best_worst.pdf', bbox_inches='tight', dpi=300)
plt.close()

# Figure 4: Composite Scores
plt.figure(figsize=(10, 6))
plt.grid(True, linestyle='--', alpha=0.7)
bars = plt.bar(data['Template'], data['Composite Score'], color='royalblue', alpha=0.7)
plt.xticks(rotation=45, ha='right')
plt.ylabel('Composite Score')
plt.title('Overall Template Performance (Composite Score)')
# Add value labels on top of bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.2f}',
             ha='center', va='bottom')
plt.tight_layout()
plt.savefig('composite_scores.pdf', bbox_inches='tight', dpi=300)
plt.close()