# Research Keywords Analysis Demo

This notebook demonstrates how to analyze research grant keywords using the two-step harmonisation approach:

1. **Load extracted keywords** from evaluation results
2. **Load two-step harmonisation results** (latest approach) 
3. **Apply harmonisation mappings** to keywords
4. **Join with grants metadata** for comprehensive analysis

The two-step approach stays within model token limits by first identifying groups of similar keywords, then harmonising each group individually.

In [6]:
# Import required libraries
%load_ext autoreload
%autoreload 2
import json
import pandas as pd
from inspect_ai.analysis import messages_df, evals_df, samples_df
from tasks import (
    load_extracted_keywords, load_grants_data, 
    combine_two_step_harmonisation_results,
    LOGS_DIR, DATA_DIR, GRANTS_FILE
)

## Helper Functions

Define the core functions for loading and processing evaluation data.

In [2]:
def get_eval_ids(evals):
    """Map task names to eval IDs; currently only the ``extract`` task is looked up.

    Args:
        evals: DataFrame with ``task_name`` and ``eval_id`` columns.

    Returns:
        ``{'extract': <eval_id>}`` when exactly one ``extract`` eval is present,
        or an empty dict when there is none.

    Raises:
        ValueError: If more than one ``extract`` eval is present, since a single
            eval ID cannot be chosen unambiguously.
    """
    # Filter once and reuse the result for both the emptiness check and the lookup.
    extract_rows = evals[evals.task_name == "extract"]

    if extract_rows.empty:
        return {}

    if len(extract_rows) > 1:
        # `.item()` would raise ValueError here anyway, but with a message that
        # says nothing about the cause; keep the exception type, explain the problem.
        raise ValueError(
            f"Expected exactly one 'extract' eval but found {len(extract_rows)}: "
            f"{extract_rows.eval_id.tolist()}. Remove or filter the extra runs."
        )

    return {'extract': extract_rows.eval_id.item()}

In [3]:
def get_messages_and_samples(logs_dir):
    """Read the messages and samples tables for ``logs_dir`` and index them by ID.

    Args:
        logs_dir: Directory containing evaluation logs.

    Returns:
        A ``(messages, samples)`` tuple: the messages frame indexed by
        ``message_id`` and the samples frame indexed by ``sample_id``.
    """
    indexed_messages = messages_df(logs_dir).set_index('message_id')
    indexed_samples = samples_df(logs_dir).set_index('sample_id')
    return indexed_messages, indexed_samples

In [4]:
def process_extracted_keywords(messages, samples, extract_id):
    """Build one row of extracted keywords per assistant message of the extract eval.

    Args:
        messages: Messages frame with ``eval_id``, ``role``, ``content`` and
            ``sample_id`` columns.
        samples: Samples frame indexed by sample ID, with ``eval_id`` and
            ``metadata_grant_id`` columns.
        extract_id: Eval ID of the keyword-extraction run.

    Returns:
        DataFrame indexed by ``sample_id`` whose columns are the fields of the
        JSON-decoded message content plus ``metadata_grant_id``.
    """
    grant_ids = samples[samples.eval_id == extract_id].metadata_grant_id

    # Only the assistant's replies carry the JSON keyword payload.
    is_extract_reply = (messages.eval_id == extract_id) & (messages.role == "assistant")
    replies = messages[is_extract_reply]

    decoded = replies.content.map(json.loads)
    keywords_by_sample = pd.json_normalize(decoded).set_index(replies.sample_id)

    # Index-on-index join: both sides are keyed by sample ID.
    return keywords_by_sample.join(grant_ids)

## Harmonisation Functions

Functions to load and process the two-step harmonisation results.

In [7]:
def get_harmonised_keywords(logs_dir):
    """
    Build keyword → harmonised-keyword mappings from the two-step results.

    Group indices are resolved against the sorted list of unique extracted
    keywords built here, and each group's harmonised keyword is looked up by
    the group's first keyword.

    Args:
        logs_dir: Directory containing evaluation logs

    Returns:
        None if either step's results are missing/empty, otherwise a dict with:
          - "keyword_mappings": {original keyword: harmonised keyword}
          - "original_keyword_mappings": list of
            {'original': [keywords...], 'harmonised': keyword}, one per group
    """
    from tasks import load_keyword_groups, load_group_harmonisations, load_extracted_keywords

    # Load results from both steps
    keyword_groups = load_keyword_groups(logs_dir)
    group_harmonisations = load_group_harmonisations(logs_dir)

    if not keyword_groups or not group_harmonisations:
        return None

    # Rebuild the index space the groups refer to: sorted unique keywords
    all_keywords = load_extracted_keywords(logs_dir)
    unique_keywords = sorted({kw for sublist in all_keywords.values() for kw in sublist})
    keyword_count = len(unique_keywords)

    keyword_mappings = []
    string_mappings = {}

    for group_indices in keyword_groups:
        if not group_indices:
            continue

        # Reject out-of-range first indices on BOTH sides. A negative index
        # would otherwise wrap around to the end of the list and attach this
        # group to an unrelated keyword's harmonisation.
        first_index = group_indices[0]
        if not 0 <= first_index < keyword_count:
            continue

        harmonised_keyword = group_harmonisations.get(unique_keywords[first_index])
        if not harmonised_keyword:
            continue

        # Resolve indices to keywords, silently dropping out-of-range ones
        original_keywords = [
            unique_keywords[idx] for idx in group_indices if 0 <= idx < keyword_count
        ]

        keyword_mappings.append({
            'original': original_keywords,
            'harmonised': harmonised_keyword
        })

        # Flat string mapping kept for backward compatibility
        for kw in original_keywords:
            string_mappings[kw] = harmonised_keyword

    return {
        "keyword_mappings": string_mappings,  # String-based mapping for compatibility
        "original_keyword_mappings": keyword_mappings  # Per-group format with original keywords
    }

In [8]:
def join_with_grants(extracted_keywords_df, grants_file):
    """Left-join grant metadata onto the extracted keywords.

    Args:
        extracted_keywords_df: Keywords frame with a ``metadata_grant_id`` column.
        grants_file: Grants data source passed to ``load_grants_data``.

    Returns:
        The keywords frame with the grants columns attached; every keyword row
        is kept whether or not a grant matched.
    """
    # NOTE(review): the join matches `metadata_grant_id` against the grants
    # frame's index, so that index is presumably the grant ID; confirm in tasks.
    grants = load_grants_data(grants_file, as_dataframe=True)
    return extracted_keywords_df.join(grants, on='metadata_grant_id', how='left')

In [9]:
def apply_harmonisation(results, harmonised_keywords):
    """Add a ``<category>_harmonised`` column for each keyword category present.

    Args:
        results: DataFrame whose keyword-category columns hold lists of keywords.
        harmonised_keywords: Dict with a ``'keyword_mappings'`` entry mapping
            original keyword → harmonised keyword, or a falsy value.

    Returns:
        ``results`` itself when there are no mappings to apply; otherwise a new
        DataFrame with the harmonised columns added (the input frame is left
        untouched so re-running a cell never sees half-updated state).
        Keywords without a mapping are kept as they are.
    """
    if not harmonised_keywords or 'keyword_mappings' not in harmonised_keywords:
        return results

    keyword_mappings = harmonised_keywords['keyword_mappings']

    def _harmonise(cell):
        # A missing category shows up as None/NaN rather than a list. NaN is
        # truthy and not iterable, so a plain truthiness test would crash on it;
        # check the type instead and treat anything that is not a list as empty.
        if not isinstance(cell, (list, tuple)):
            return []
        return [keyword_mappings.get(kw, kw) for kw in cell]

    # Apply harmonisation to each keyword category that exists in the frame
    harmonised_columns = {
        f'{column}_harmonised': results[column].map(_harmonise)
        for column in ['keywords', 'methodology_keywords', 'application_keywords', 'technology_keywords']
        if column in results.columns
    }

    return results.assign(**harmonised_columns)

## Load and Process Data

Execute the main analysis pipeline to load evaluation data and apply harmonisation.

In [10]:
# Discover which evaluation runs exist in the logs directory
evals = evals_df(LOGS_DIR)
eval_ids = get_eval_ids(evals)

print(f"Available evaluations: {list(eval_ids)}")

# Report whether the keyword-extraction run, which later cells read, is present
if 'extract' in eval_ids:
    print("✓ Found keyword extraction results")
else:
    print("❌ No keyword extraction results found. Run: inspect eval modeling/tasks.py@extract")

Available evaluations: ['extract']
✓ Found keyword extraction results


In [11]:
# Build the keyword table from the extraction run's messages and samples
messages, samples = get_messages_and_samples(LOGS_DIR)
extracted_keywords_df = process_extracted_keywords(
    messages,
    samples,
    eval_ids['extract'],
)

print(f"✓ Loaded extracted keywords for {len(extracted_keywords_df)} grants")
# Every column other than the grant identifier is reported as a keyword category
print(f"Keyword categories: {[name for name in extracted_keywords_df.columns if name != 'metadata_grant_id']}")

✓ Loaded extracted keywords for 100 grants
Keyword categories: ['keywords', 'methodology_keywords', 'application_keywords', 'technology_keywords']


In [12]:
# Load the two-step harmonisation output and summarise how much it consolidates
harmonised_keywords = get_harmonised_keywords(LOGS_DIR)

if not harmonised_keywords:
    print("⚠️ No harmonisation results found")
else:
    keyword_mappings = harmonised_keywords.get('keyword_mappings', {})
    original_mappings = harmonised_keywords.get('original_keyword_mappings', [])

    print("✓ Loaded harmonisation results:")
    print(f"  - {len(keyword_mappings)} keyword mappings")
    print(f"  - {len(original_mappings)} harmonisation groups")

    # Consolidation ratio = mapped original keywords per distinct harmonised keyword
    unique_harmonised = len(set(keyword_mappings.values()))
    if unique_harmonised > 0:
        consolidation_ratio = len(keyword_mappings) / unique_harmonised
    else:
        consolidation_ratio = 0
    print(f"  - Original keywords: {len(keyword_mappings)}")
    print(f"  - Harmonised keywords: {unique_harmonised}")
    print(f"  - Consolidation ratio: {consolidation_ratio:.2f}x")

✓ Loaded harmonisation results:
  - 73 keyword mappings
  - 37 harmonisation groups
  - Original keywords: 73
  - Harmonised keywords: 37
  - Consolidation ratio: 1.97x


In [13]:
# Attach grant metadata, then add harmonised keyword columns when mappings exist
results = join_with_grants(extracted_keywords_df, GRANTS_FILE)
print("✓ Joined with grant metadata")

if harmonised_keywords:
    results = apply_harmonisation(results, harmonised_keywords)
    print("✓ Applied harmonisation to keywords")

print(f"Final dataset shape: {results.shape}")

✓ Joined with grant metadata
✓ Applied harmonisation to keywords
Final dataset shape: (100, 15)


## Explore Results

Examine the harmonised keyword structure and sample results.

In [14]:
# List every column of the joined dataset, one per line
print("Available columns:")
for column_name in results.columns:
    print("  - {}".format(column_name))

Available columns:
  - keywords
  - methodology_keywords
  - application_keywords
  - technology_keywords
  - metadata_grant_id
  - title
  - grant_summary
  - funder
  - funding_amount
  - funding_scheme
  - status
  - keywords_harmonised
  - methodology_keywords_harmonised
  - application_keywords_harmonised
  - technology_keywords_harmonised


In [15]:
# Preview the first five harmonisation groups: original variants → harmonised label
if harmonised_keywords and 'original_keyword_mappings' in harmonised_keywords:
    print("Sample harmonisation groups:")
    for position, group in enumerate(harmonised_keywords['original_keyword_mappings'][:5], start=1):
        print(f"\n{position}. {group['original']} → '{group['harmonised']}'")

Sample harmonisation groups:

1. ['3-D imaging software', '3D volumetric image reconstruction', '3D volumetric imaging system'] → '3D Volumetric Imaging'

2. ['3-D seismic modeling', '3-D seismic structure'] → '3D seismic structure'

3. ['3D geological modelling'] → '3D geological modelling'

4. ['3D laser scanning', 'LiDAR scanning', 'LiDAR terrain mapping'] → 'LiDAR scanning'

5. ['3D peptide structure determination'] → '3D peptide structure determination'


In [16]:
# Show one example grant with its original and harmonised keywords
if not results.empty:
    sample = results.iloc[0]

    # Grant metadata comes from a left join, so any of these fields can be NaN
    # when no grant matched. NaN is truthy and not sliceable, so test for it
    # explicitly instead of relying on truthiness.
    title = sample.get('title')
    funder = sample.get('funder')
    funding = sample.get('funding_amount')

    print("Sample grant:")
    if isinstance(title, str):
        print(f"Title: {title[:100]}...")
    else:
        print("Title: N/A")
    print(f"Funder: {funder if pd.notna(funder) else 'N/A'}")
    # A zero/None amount is still reported as N/A, as before; NaN now is too.
    if funding and pd.notna(funding):
        print(f"Funding: ${funding:,}")
    else:
        print("Funding: N/A")

    print(f"\nOriginal keywords: {sample['keywords'][:3]}...")
    if 'keywords_harmonised' in results.columns:
        print(f"Harmonised keywords: {sample['keywords_harmonised'][:3]}...")

Sample grant:
Title: Industrial Transformation Training Centres - Grant ID: IC230100035...
Funder: Australian Research Council
Funding: $5,000,000.0

Original keywords: ['Critical resources', 'Critical minerals', 'Mineral systems science']...
Harmonised keywords: ['Critical resources', 'Critical minerals', 'Mineral systems science']...


In [25]:
# Create SQLite database with all tables:
#   grants               - one row per row of `results`, grant metadata only
#   harmonised_keywords  - one row per harmonisation group
#   grant_keywords       - junction table linking grants to harmonised keywords
import sqlite3
import uuid
import pandas as pd
from pathlib import Path

# Database setup
db_path = Path("../data/research_keywords.db")
# parents=True so the cell does not fail when intermediate directories are missing
db_path.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(db_path)

# Start from empty, correctly-shaped frames so the summary below always has
# something to report, even when there are no harmonisation results. Without
# this the prints raise NameError (or silently reuse stale kernel variables).
keyword_columns = ['keyword_id', 'harmonised_keyword', 'original_keywords', 'num_variants']
junction_columns = ['grant_id', 'keyword_id', 'harmonised_keyword']
keywords_df = pd.DataFrame(columns=keyword_columns)
junction_df = pd.DataFrame(columns=junction_columns)

try:
    # 1. Create grants table
    grants_table = results[['metadata_grant_id', 'title', 'grant_summary', 'funder',
                           'funding_amount', 'funding_scheme', 'status']].copy()
    grants_table = grants_table.rename(columns={'metadata_grant_id': 'grant_id'})
    grants_table.to_sql('grants', conn, if_exists='replace', index=False)

    # 2. Create harmonised keywords table (only when harmonisation results exist)
    if harmonised_keywords and 'original_keyword_mappings' in harmonised_keywords:
        keywords_data = []
        for mapping in harmonised_keywords['original_keyword_mappings']:
            keywords_data.append({
                # Random IDs: they change every time this cell is run
                'keyword_id': str(uuid.uuid4()),
                'harmonised_keyword': mapping['harmonised'],
                'original_keywords': ', '.join(mapping['original']),
                'num_variants': len(mapping['original'])
            })
        # Explicit columns keep the table schema stable even with zero groups
        keywords_df = pd.DataFrame(keywords_data, columns=keyword_columns)
        keywords_df.to_sql('harmonised_keywords', conn, if_exists='replace', index=False)

        # 3. Create junction table
        # If two groups share a harmonised label, the later group's ID wins here.
        keyword_id_map = dict(zip(keywords_df['harmonised_keyword'], keywords_df['keyword_id']))
        junction_data = []

        for _, grant_row in results.iterrows():
            grant_id = grant_row['metadata_grant_id']
            # A set, so a keyword appearing in several categories links only once
            all_harmonised_keywords = set()

            for category in ['keywords_harmonised', 'methodology_keywords_harmonised',
                            'application_keywords_harmonised', 'technology_keywords_harmonised']:
                if category in grant_row and grant_row[category]:
                    all_harmonised_keywords.update(grant_row[category])

            # Only keywords that belong to a harmonisation group get a link;
            # keywords that passed through unmapped have no keyword_id.
            for harmonised_kw in all_harmonised_keywords:
                if harmonised_kw in keyword_id_map:
                    junction_data.append({
                        'grant_id': grant_id,
                        'keyword_id': keyword_id_map[harmonised_kw],
                        'harmonised_keyword': harmonised_kw
                    })

        # Explicit columns: an empty list would otherwise give a column-less frame
        junction_df = pd.DataFrame(junction_data, columns=junction_columns)
        junction_df.to_sql('grant_keywords', conn, if_exists='replace', index=False)

    print(f"✅ Database created: {db_path.absolute()}")
    print(f"📊 {len(grants_table)} grants")
    print(f"🏷️ {len(keywords_df)} harmonised keywords") 
    print(f"🔗 {len(junction_df)} grant-keyword links")

finally:
    # Always release the connection, including when a table write fails
    conn.close()

✅ Database created: /Users/luhancheng/Desktop/research-link-technology-landscaping/modeling/../data/research_keywords.db
📊 100 grants
🏷️ 37 harmonised keywords
🔗 61 grant-keyword links
