In [84]:
from config import CATEGORY_PATH, GRANTS_FILE, LOGS_DIR, RESULTS_DIR, REFINED_CATEGORY_PATH
from inspect_ai.analysis import messages_df, evals_df
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import json

In [85]:
# Load refined categories data.
# JSON is UTF-8 encoded, so the encoding is passed explicitly rather than
# relying on the platform's locale default (which is not UTF-8 everywhere).
with open(REFINED_CATEGORY_PATH, 'r', encoding='utf-8') as f:
    refined_categories = json.load(f)

# Load detailed categories data (with keywords)
with open(CATEGORY_PATH, 'r', encoding='utf-8') as f:
    detailed_categories = json.load(f)

# Classification results are optional: when the file is absent the list stays
# empty and the rest of the notebook works from the category data alone.
classification_path = RESULTS_DIR / "classification.json"
classification_results = []

if classification_path.exists():
    with open(classification_path, 'r', encoding='utf-8') as f:
        classification_results = json.load(f)

    print(f"Found {len(classification_results)} grant classifications")
else:
    print("No classification results found - run classify task first")

print(f"Found {len(refined_categories)} refined categories")
print(f"Found {len(detailed_categories)} detailed categories")

# Lookup table: detailed category name -> its keyword list
# (an entry without a 'keywords' field maps to an empty list)
detailed_category_map = {
    entry['name']: entry.get('keywords', [])
    for entry in detailed_categories
}

# Group grants under every category name they were assigned to.
# A result that lacks one of the 'selected_*' fields contributes nothing
# to the corresponding mapping.
strategic_category_to_grants = {}  # keyed by names in 'selected_categories'
detailed_category_to_grants = {}   # keyed by names in 'selected_subcategories'

for result in classification_results:
    for category_name in result.get('selected_categories', []):
        # setdefault creates the bucket on first sight of a category name
        strategic_category_to_grants.setdefault(category_name, []).append(
            {'title': result['title'], 'grant_id': result['grant_id']}
        )

    for subcat_name in result.get('selected_subcategories', []):
        detailed_category_to_grants.setdefault(subcat_name, []).append(
            {'title': result['title'], 'grant_id': result['grant_id']}
        )

# Display summary
total_subcategories = sum(len(cat['subcategories']) for cat in refined_categories)
total_keywords = sum(len(cat.get('keywords', [])) for cat in detailed_categories)
total_grants = len(classification_results)
# These two count (grant, category) assignments, so a grant assigned to
# several categories is counted once per category.
strategic_grants = sum(len(grants) for grants in strategic_category_to_grants.values())
detailed_grants = sum(len(grants) for grants in detailed_category_to_grants.values())

# Guard the average: an empty detailed-category list would otherwise raise
# ZeroDivisionError and abort the cell before the mode is determined.
avg_keywords = total_keywords / len(detailed_categories) if detailed_categories else 0.0

print(f"Total keywords: {total_keywords}")
print(f"Total grants: {total_grants}")
print(f"Strategic level classifications: {strategic_grants}")
print(f"Detailed level classifications: {detailed_grants}")
print(f"Average keywords per detailed category: {avg_keywords:.1f}")

# Determine which visualization mode to use
has_detailed_classifications = detailed_grants > 0
print(f"Visualization mode: {'Detailed level' if has_detailed_classifications else 'Strategic level'}")

Found 10 grant classifications
Found 16 refined categories
Found 239 detailed categories
Total keywords: 2430
Total grants: 10
Strategic level classifications: 0
Detailed level classifications: 0
Average keywords per detailed category: 10.2
Visualization mode: Strategic level


In [86]:
# Flatten the domain -> category -> keyword hierarchy into one record per
# node. Each record carries its own id and its parent's id.
treemap_data = []

for refined_cat in refined_categories:
    # Resolve each subcategory's keyword list once; names that are not in
    # detailed_category_map resolve to an empty list.
    subcat_keywords = [
        (subcat_name, detailed_category_map.get(subcat_name, []))
        for subcat_name in refined_cat['subcategories']
    ]
    domain_name = refined_cat['name']

    # Level 1: strategic domain, sized by its total keyword count (minimum 1)
    domain_keyword_count = sum(len(kw_list) for _, kw_list in subcat_keywords)
    treemap_data.append({
        'id': domain_name,
        'parent': '',
        'name': domain_name,
        'value': max(1, domain_keyword_count),
        'level': 'Strategic Domain',
        'item_type': 'domain'
    })

    for subcat_name, keywords in subcat_keywords:
        category_id = f"{domain_name} >> {subcat_name}"

        # Level 2: detailed category, sized by its own keyword count (minimum 1)
        treemap_data.append({
            'id': category_id,
            'parent': domain_name,
            'name': subcat_name,
            'value': max(1, len(keywords)),
            'level': 'Detailed Category',
            'item_type': 'category'
        })

        # Level 3: one unit-sized leaf per keyword
        treemap_data.extend(
            {
                'id': f"{category_id} >> KW: {keyword}",
                'parent': category_id,
                'name': keyword,
                'value': 1,
                'level': 'Keyword',
                'item_type': 'keyword'
            }
            for keyword in keywords
        )

treemap_df = pd.DataFrame(treemap_data)

# One fixed colour per hierarchy level
color_map = dict([
    ('Strategic Domain', '#1f77b4'),   # Blue
    ('Detailed Category', '#ff7f0e'),  # Orange
    ('Keyword', '#2ca02c'),            # Green
])

title = 'Research Landscape: Domains → Categories → Keywords'

# Column bindings for the treemap: node ids / labels / parent ids / sizes,
# plus colouring by level and the extra columns shown on hover.
treemap_options = {
    'ids': 'id',
    'names': 'name',
    'parents': 'parent',
    'values': 'value',
    'title': title,
    'color': 'level',
    'color_discrete_map': color_map,
    'hover_data': ['level', 'value', 'item_type'],
}
fig = px.treemap(treemap_df, **treemap_options)

# Overall figure sizing and fonts
fig.update_layout(
    font_size=9,
    title_font_size=16,
    height=800,
    margin={'t': 60, 'l': 25, 'r': 25, 'b': 25},
)

# Show only the label on each tile, centred
fig.update_traces(textinfo="label", textfont_size=9, textposition="middle center")

# Print a text summary of the treemap contents, then render the figure.
keyword_count = len(treemap_df[treemap_df['level'] == 'Keyword'])

print(f"Treemap contains {len(treemap_df)} total elements:")
print(f"  - {len(refined_categories)} Strategic Domains (Blue)")
print(f"  - {total_subcategories} Detailed Categories (Orange)")
print(f"  - {keyword_count} Keywords (Green)")

print("\nClassification summary:")
if strategic_grants == 0 and detailed_grants == 0:
    # Without this branch the header above is followed by nothing, which
    # reads as truncated output rather than "no assignments".
    print("  No category assignments found in the classification results")
else:
    if strategic_grants > 0:
        print(f"Strategic level grants: {strategic_grants}")
    if detailed_grants > 0:
        print(f"Detailed level grants: {detailed_grants}")

    # Grants per category, skipping categories with an empty grant list
    strategic_dist = {k: len(v) for k, v in strategic_category_to_grants.items() if v}
    detailed_dist = {k: len(v) for k, v in detailed_category_to_grants.items() if v}

    # Only the three largest categories are listed at each level
    if strategic_dist:
        print("  Strategic level distribution:")
        for domain, count in sorted(strategic_dist.items(), key=lambda x: x[1], reverse=True)[:3]:
            print(f"    - {domain}: {count} grants")

    if detailed_dist:
        print("  Detailed level distribution:")
        for category, count in sorted(detailed_dist.items(), key=lambda x: x[1], reverse=True)[:3]:
            print(f"    - {category}: {count} grants")

fig.show()

Treemap contains 1709 total elements:
  - 16 Strategic Domains (Blue)
  - 152 Detailed Categories (Orange)
  - 1541 Keywords (Green)

Classification summary:
