<a href="https://colab.research.google.com/github/ashwin-yedte/visual-intelligence-travel-finance/blob/main/notebooks/Visual%20Intelligence%20Layer/theme_aggregation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

VLM INTELLIGENCE LAYER

STEP 3: THEME EXTRACTION AND AGGREGATION
Majority-vote theme extraction and smart re-ranking

Features:
- Analyze theme distribution from Step 2 matches
- Identify dominant theme via majority vote
- Re-rank destinations using multi-factor scoring
- Prioritize destinations that appear in multiple images
- Calculate confidence metrics

# =================================================================
 PREREQUISITES
# =================================================================

Run these before this step:
1. Step 1 cells (image analysis)
2. Step 2 cells (destination matching)
   These create step2_matches.json, the input that run_step3() reads (default path: /content/step2_matches.json)



# =================================================================
Step 1: CONFIGURATION
# =================================================================


In [None]:
# Framed title announcing the start of the Step 3 cells.
print(
    "=" * 80,
    "STEP 3: THEME EXTRACTION AND AGGREGATION - VLM INTELLIGENCE LAYER",
    "=" * 80,
    sep="\n",
)

class Step3Config:
    """Tunable constants for Step 3 scoring and output."""

    # --- Final-ranking weights ---------------------------------------
    # Multipliers applied to the four score components in
    # rerank_destinations; together they sum to 1.0.
    AVG_SIMILARITY_WEIGHT = 0.4   # mean similarity across a destination's matches
    MAX_SIMILARITY_WEIGHT = 0.3   # best single similarity
    FREQUENCY_WEIGHT = 0.2        # scaled appearance count
    THEME_MATCH_WEIGHT = 0.1      # agreement with the dominant theme

    # --- Bonus points (scaled by the weights above) ------------------
    THEME_MATCH_BONUS = 20.0
    FREQUENCY_BONUS_PER_APPEARANCE = 10.0

    # --- Output ------------------------------------------------------
    # File name written by save_step3_outputs, and how many ranked
    # destinations are kept in that file.
    OUTPUT_FILE = "step3_refined_ranking.json"
    TOP_N_RESULTS = 10

# Confirm that the configuration cell has been executed.
print("Configuration loaded", "=" * 80, sep="\n")


STEP 3: THEME EXTRACTION AND AGGREGATION - VLM INTELLIGENCE LAYER
Configuration loaded


# =================================================================
Step 2: IMPORT LIBRARIES
# =================================================================


In [None]:
import json
import numpy as np
from typing import Dict, List, Any
from collections import Counter, defaultdict

# Confirm that the import cell has been executed.
print("Imports complete", "=" * 80, sep="\n")



Imports complete


# =================================================================
Step 3: THEME ANALYSIS FUNCTIONS
# =================================================================


In [None]:
def analyze_theme_distribution(per_image_matches: Dict) -> Dict[str, Any]:
    """
    Tally the theme of every Step 2 match and pick the most common one.

    Args:
        per_image_matches: Mapping of image key to a dict whose 'matches'
            entry is a list of match dicts, each carrying a 'theme'.

    Returns:
        Dictionary with:
        - 'dominant_theme': most frequent theme ('Unknown' if no matches)
        - 'theme_confidence': dominant theme's share of all matches
        - 'theme_distribution': theme -> number of matches
        - 'total_matches': number of matches counted
    """

    rule = "=" * 80
    print("\n" + rule)
    print("ANALYZING THEME DISTRIBUTION")
    print(rule)

    # One vote per match; the image a match came from does not matter here.
    tally = Counter(
        match['theme']
        for image_data in per_image_matches.values()
        for match in image_data['matches']
    )

    # No matches at all: return a neutral result without further output.
    if not tally:
        return {
            'dominant_theme': 'Unknown',
            'theme_confidence': 0.0,
            'theme_distribution': {},
            'total_matches': 0
        }

    n_matches = sum(tally.values())
    winner, winner_votes = tally.most_common(1)[0]
    confidence = winner_votes / n_matches

    print(f"Total matches analyzed: {n_matches}")
    print("\nTheme distribution:")
    for theme, votes in tally.most_common():
        share = votes / n_matches * 100
        print("  " + theme + f": {votes} ({round(share, 1)}%)")

    print("\nDominant theme: " + winner)
    print(f"Confidence: {round(confidence * 100, 1)}%")
    print(rule)

    return {
        'dominant_theme': winner,
        'theme_confidence': confidence,
        'theme_distribution': dict(tally),
        'total_matches': n_matches
    }


# Confirm that the theme-analysis cell has been executed.
print("Theme analysis functions loaded", "=" * 80, sep="\n")


Theme analysis functions loaded


# =================================================================
Step 4: DESTINATION AGGREGATION FUNCTIONS
# =================================================================


In [None]:
def aggregate_destination_data(per_image_matches: Dict) -> Dict[str, Any]:
    """
    Collapse the per-image match lists into one record per destination.

    Each record holds:
    - 'scores': every raw_score seen for the destination
    - 'theme', 'name', 'state': values from the most recent match
    - 'appearances': how many matches referenced the destination
    - 'appeared_in_images': image key of each of those matches

    Args:
        per_image_matches: Mapping of image key to a dict whose 'matches'
            entry is a list of match dicts from Step 2

    Returns:
        Plain dict mapping destination_id to its aggregated record
    """

    rule = "=" * 80
    print("\n" + rule)
    print("AGGREGATING DESTINATION DATA")
    print(rule)

    records: Dict[str, Any] = {}

    for image_key, image_data in per_image_matches.items():
        for match in image_data['matches']:
            # Create the record on first sight, then fold this match in.
            record = records.setdefault(match['destination_id'], {
                'scores': [],
                'theme': None,
                'name': None,
                'state': None,
                'appearances': 0,
                'appeared_in_images': []
            })
            record['scores'].append(match['raw_score'])
            record['theme'] = match['theme']
            record['name'] = match['destination_name']
            record['state'] = match['state']
            record['appearances'] += 1
            record['appeared_in_images'].append(image_key)

    print(f"Total unique destinations: {len(records)}")

    # Preview the five most frequently matched destinations
    # (the sort is stable, so ties keep first-seen order).
    most_frequent = sorted(
        records.values(),
        key=lambda rec: rec['appearances'],
        reverse=True
    )[:5]

    print("\nMost frequently matched destinations:")
    for position, rec in enumerate(most_frequent, 1):
        print(f"  {position}. " + rec['name'] +
              f" - appeared in {rec['appearances']} image(s)")

    print(rule)

    return records


# Confirm that the aggregation cell has been executed.
print("Aggregation functions loaded", "=" * 80, sep="\n")

Aggregation functions loaded


# =================================================================
Step 5: RE-RANKING FUNCTION
# =================================================================


In [None]:
def rerank_destinations(destination_data: Dict, theme_analysis: Dict) -> List[Dict]:
    """
    Re-rank destinations using multi-factor weighted scoring.

    The final score is the sum of four terms, using the weights and bonus
    sizes defined on Step3Config:
    - average raw similarity * AVG_SIMILARITY_WEIGHT
    - maximum raw similarity * MAX_SIMILARITY_WEIGHT
    - appearances * FREQUENCY_BONUS_PER_APPEARANCE * FREQUENCY_WEIGHT
    - THEME_MATCH_BONUS * THEME_MATCH_WEIGHT, only when the destination's
      theme equals the dominant theme

    Destinations are ordered by the full-precision score; the value stored
    in 'final_score' is rounded to two decimals for display only, so that
    rounding can never turn two different scores into an arbitrary tie.

    Args:
        destination_data: Aggregated destination data (destination_id ->
            record with 'scores', 'theme', 'name', 'state', 'appearances',
            'appeared_in_images')
        theme_analysis: Theme analysis results; only 'dominant_theme' is read

    Returns:
        List of destination dicts sorted by final score, highest first
    """

    print("\n" + "="*80)
    print("RE-RANKING DESTINATIONS")
    print("="*80)

    dominant_theme = theme_analysis['dominant_theme']
    print("Dominant theme: " + dominant_theme)
    print("\nApplying weighted scoring...")

    # (full-precision score, result entry) pairs; sorted before unpacking.
    scored = []

    for dest_id, data in destination_data.items():
        # Calculate score components. float() keeps numpy scalar types out
        # of the returned (and later JSON-serialised) records.
        avg_score = float(np.mean(data['scores']))
        max_score = max(data['scores'])
        frequency = data['appearances']
        is_theme_match = data['theme'] == dominant_theme
        theme_match = 1.0 if is_theme_match else 0.0

        # NOTE(review): raw similarity looks like a 0-1 value (it is
        # multiplied by 100 for the percentage fields below) while the
        # bonuses are 10/20-point values, so the frequency and theme terms
        # may outweigh similarity. Confirm this balance is intended.
        final_score = (
            avg_score * Step3Config.AVG_SIMILARITY_WEIGHT +
            max_score * Step3Config.MAX_SIMILARITY_WEIGHT +
            (frequency * Step3Config.FREQUENCY_BONUS_PER_APPEARANCE) * Step3Config.FREQUENCY_WEIGHT +
            (theme_match * Step3Config.THEME_MATCH_BONUS) * Step3Config.THEME_MATCH_WEIGHT
        )

        scored.append((final_score, {
            'destination_id': dest_id,
            'destination_name': data['name'],
            'state': data['state'],
            'theme': data['theme'],
            'avg_similarity': round(avg_score * 100, 2),
            'max_similarity': round(max_score * 100, 2),
            'appearances': frequency,
            'appeared_in_images': data['appeared_in_images'],
            'theme_match': is_theme_match,
            'final_score': round(final_score, 2)
        }))

    # Sort on the unrounded score (stable: exact ties keep input order).
    scored.sort(key=lambda pair: pair[0], reverse=True)
    ranked_destinations = [entry for _, entry in scored]

    # Preview as many entries as will be saved, rather than a fixed 10.
    top_n = Step3Config.TOP_N_RESULTS
    print("\nTop " + str(top_n) + " ranked destinations:")
    for i, dest in enumerate(ranked_destinations[:top_n], 1):
        stars = "**" if dest['appearances'] > 1 else ""
        theme_indicator = " (THEME MATCH)" if dest['theme_match'] else ""
        print("  " + str(i) + ". " + dest['destination_name'] + stars +
              " - Score: " + str(dest['final_score']) + theme_indicator)
        print("     Avg: " + str(dest['avg_similarity']) + "%, " +
              "Max: " + str(dest['max_similarity']) + "%, " +
              "Appears: " + str(dest['appearances']) + "x")

    print("\n** = Appeared in multiple images")
    print("="*80)

    return ranked_destinations


# Confirm that the re-ranking cell has been executed.
print("Re-ranking function loaded", "=" * 80, sep="\n")


Re-ranking function loaded


# =================================================================
Step 6: SAVE STEP 3 OUTPUTS
# =================================================================


In [None]:
def save_step3_outputs(theme_analysis: Dict, ranked_destinations: List[Dict],
                       output_dir: str = '/content/') -> None:
    """
    Save Step 3 outputs for Step 4 (UI display).

    Writes one JSON file, named by Step3Config.OUTPUT_FILE, containing:
    - 'theme_analysis': dominant theme, confidence as a percentage
      (rounded to 2 decimals), theme distribution and total match count
    - 'ranked_destinations': the first Step3Config.TOP_N_RESULTS entries
    - 'total_destinations': length of the full ranking

    Args:
        theme_analysis: Result of analyze_theme_distribution
        ranked_destinations: Full ranking, best first
        output_dir: Directory to write into. Defaults to the Colab
            working directory '/content/', matching previous behaviour;
            pass another directory to run outside Colab.

    Raises:
        OSError: if the output file cannot be opened for writing
            (for example, when output_dir does not exist)
    """
    import os  # local import: the file's import cell does not provide os

    print("\n" + "="*80)
    print("SAVING STEP 3 OUTPUTS")
    print("="*80)

    output_data = {
        'theme_analysis': {
            'dominant_theme': theme_analysis['dominant_theme'],
            # Stored as a percentage, unlike the 0-1 value held in memory.
            'theme_confidence': round(theme_analysis['theme_confidence'] * 100, 2),
            'theme_distribution': theme_analysis['theme_distribution'],
            'total_matches': theme_analysis['total_matches']
        },
        'ranked_destinations': ranked_destinations[:Step3Config.TOP_N_RESULTS],
        'total_destinations': len(ranked_destinations)
    }

    # Save to JSON
    output_path = os.path.join(output_dir, Step3Config.OUTPUT_FILE)
    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=2)

    print("Saved to: " + output_path)
    print("Top " + str(Step3Config.TOP_N_RESULTS) + " destinations saved")
    print("="*80)
    print("\nReady for Step 4: Visual Gallery Display")
    print("="*80)


# Confirm that the save-function cell has been executed.
print("Save function loaded", "=" * 80, sep="\n")

Save function loaded


# =================================================================
Step 7: MAIN EXECUTION FUNCTION
# =================================================================


In [None]:
def run_step3(step2_output_path: str = '/content/step2_matches.json'):
    """
    Complete Step 3 execution.

    Loads the Step 2 results, analyzes the theme distribution, aggregates
    the matches per destination, re-ranks the destinations, saves the
    outcome via save_step3_outputs and prints a short summary.

    Args:
        step2_output_path: Path to Step 2 output file (JSON with a
            'per_image_matches' entry)

    Returns:
        Dictionary with 'theme_analysis' and 'ranked_destinations', or
        None when the Step 2 file does not exist
    """

    print("\n" + "="*80)
    print("EXECUTING STEP 3: THEME EXTRACTION AND AGGREGATION")
    print("="*80)

    # Load Step 2 results
    print("\nLoading Step 2 results...")
    try:
        with open(step2_output_path, 'r') as f:
            step2_data = json.load(f)
        print("Loaded Step 2 results")
    except FileNotFoundError:
        print("ERROR: Step 2 results not found at " + step2_output_path)
        print("Please run Step 2 first")
        return None

    per_image_matches = step2_data['per_image_matches']

    # Step 1: Analyze themes
    theme_analysis = analyze_theme_distribution(per_image_matches)

    # Step 2: Aggregate destination data
    destination_data = aggregate_destination_data(per_image_matches)

    # Step 3: Re-rank destinations
    ranked_destinations = rerank_destinations(destination_data, theme_analysis)

    # Step 4: Save outputs
    save_step3_outputs(theme_analysis, ranked_destinations)

    print("\n" + "="*80)
    print("STEP 3 COMPLETE")
    print("="*80)
    print("\nSummary:")
    print("  Dominant theme: " + theme_analysis['dominant_theme'])
    print("  Confidence: " + str(round(theme_analysis['theme_confidence'] * 100, 1)) + "%")
    # The ranking is empty when Step 2 produced no matches; indexing [0]
    # unconditionally would raise IndexError after the file was saved.
    if ranked_destinations:
        top_destination = ranked_destinations[0]
        print("  Top destination: " + top_destination['destination_name'])
        print("  Final score: " + str(top_destination['final_score']))
    else:
        print("  Top destination: none (no matches to rank)")
    print("="*80)

    return {
        'theme_analysis': theme_analysis,
        'ranked_destinations': ranked_destinations
    }


# Final status banner with the one-line usage hint.
print(
    "Main execution function loaded",
    "=" * 80,
    "\nSTEP 3 INITIALIZED - Ready to extract themes and rank destinations",
    "=" * 80,
    "\nTO RUN:",
    "  result = run_step3()",
    "=" * 80,
    sep="\n",
)

Main execution function loaded

STEP 3 INITIALIZED - Ready to extract themes and rank destinations

TO RUN:
  result = run_step3()
