In [1]:
import requests
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import json
from tqdm import tqdm
import random
from datetime import datetime
import os

In [2]:
# Set up the API key and base URL.
# The key is read from the S2_API_KEY environment variable so that no
# credential is stored in the notebook. When the variable is unset or blank,
# requests are sent without the x-api-key header.
# NOTE(review): the key that used to be hard-coded in this cell is exposed in
# the notebook history and should be revoked/rotated.
API_KEY = os.environ.get("S2_API_KEY", "").strip()
BASE_URL = "https://api.semanticscholar.org/graph/v1"
# Only send the authentication header when a key is actually configured.
HEADERS = {"x-api-key": API_KEY} if API_KEY else {}

In [3]:
# handle API requests with rate limiting
def make_api_request(url, params=None, max_retries=8):
    """
    Make a GET request with error handling and exponential backoff.

    At most ``max_retries`` attempts are made. An attempt is retried after
    HTTP 429, HTTP 5xx, a connection/timeout error, or any unexpected
    exception. Every other HTTP status (e.g. 400, 403, 404) is treated as
    permanent and returns ``None`` immediately: repeating an identical
    request cannot succeed and would only block for the backoff delays.

    A single ``requests.Session`` is shared by all attempts and is closed
    when the function returns.

    Args:
        url: API endpoint URL
        params: Query parameters
        max_retries: Maximum number of attempts

    Returns:
        JSON response or None if failed
    """
    base_wait_time = 2  # Start with 2 seconds
    retry_count = 0

    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as session:
        session.mount('https://', requests.adapters.HTTPAdapter(
            max_retries=3,
            pool_connections=10,
            pool_maxsize=10
        ))

        while retry_count < max_retries:
            # `reason` is the text printed in front of the "Waiting ..." notice;
            # None means the except-branch already printed its own line.
            reason = None
            try:
                response = session.get(
                    url,
                    headers=HEADERS,
                    params=params,
                    timeout=(10, 30)  # (connect timeout, read timeout)
                )
                status = response.status_code

                if status == 200:
                    # Always wait between requests to respect rate limiting;
                    # the jitter avoids synchronized requests.
                    time.sleep(1 + random.uniform(0, 0.5))
                    return response.json()

                if status == 429:
                    reason = "Rate limit reached"
                elif status >= 500:
                    reason = f"Server error {status}"
                else:
                    # Permanent client error: do not retry.
                    print(f"Error: {response.status_code}, {response.text}")
                    return None

            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout,
                    requests.exceptions.ChunkedEncodingError,
                    ConnectionResetError) as e:
                reason = f"Connection error: {type(e).__name__}: {e}"

            except Exception as e:
                # Catch other exceptions (including a 200 body that is not JSON)
                print(f"Unexpected error: {type(e).__name__}: {e}")

            # Only transient failures reach this point.
            retry_count += 1
            if retry_count >= max_retries:
                # Attempts exhausted: sleeping before giving up would be wasted time.
                if reason:
                    print(f"{reason}.")
                break

            wait_time = base_wait_time * (2 ** retry_count) + random.uniform(0, 1)
            prefix = f"{reason}. " if reason else ""
            print(f"{prefix}Waiting {wait_time:.2f} seconds before retry {retry_count}/{max_retries}")
            time.sleep(wait_time)

    print(f"Failed after {max_retries} retries")
    return None

In [4]:
# Step 1: Get a list of random NLP researchers
def get_nlp_researchers(n):
    """
    Collect up to ``n`` distinct author IDs of NLP researchers.

    Runs a paper search for each of several NLP-related queries and gathers
    the authors of the returned papers. The selection is not a random
    sample: IDs are returned in the order in which they are first seen, so
    the same API responses always yield the same list.

    Args:
        n: Maximum number of author IDs to return

    Returns:
        List of author ID values (fewer than ``n`` if the searches run dry)
    """
    if n <= 0:
        return []

    fields_of_study = ["Natural Language Processing", "Computational Linguistics", "Language Understanding", "Text Generation"]
    page_size = 100
    # The relevance-ranked paper search only serves the first 1,000 results of
    # a query, so paging further only produces error responses.
    max_results_per_query = 1000
    search_url = f"{BASE_URL}/paper/search"

    # dict used as an insertion-ordered set of author IDs
    researchers = {}

    for field in fields_of_study:
        if len(researchers) >= n:
            break

        for offset in range(0, max_results_per_query, page_size):
            if len(researchers) >= n:
                break

            search_params = {
                "query": field,
                "fields": "authors",
                "limit": page_size,
                "offset": offset
            }
            paper_results = make_api_request(search_url, search_params)

            # A failed request or an empty page ends this query.
            if not paper_results or not paper_results.get("data"):
                break

            for paper in paper_results["data"]:
                # "authors" may be missing or null
                for author in paper.get("authors") or []:
                    author_id = author.get("authorId")
                    if author_id and len(researchers) < n:
                        researchers[author_id] = None

            print(f"Found {len(researchers)} researchers so far...")

    return list(researchers)


In [5]:
# Step 2: Get all information for each researcher
def get_researcher_details(author_id):
    """Request one author's profile and nested paper fields via make_api_request.

    Returns whatever make_api_request returns for the author endpoint.
    """
    requested_fields = (
        "name",
        "paperCount",
        "hIndex",
        "citationCount",
        "papers.year",
        "papers.title",
        "papers.venue",
        "papers.authors",
        "papers.citationCount",
        "papers.url",
        "papers.fieldsOfStudy",
        "papers.publicationVenue",
        "affiliations",
    )
    endpoint = BASE_URL + "/author/" + f"{author_id}"
    query = {"fields": ",".join(requested_fields)}
    return make_api_request(endpoint, query)

In [6]:
# Step 3: Calculate metrics for each researcher
def calculate_metrics(researcher_data):
    """
    Calculate various metrics for a researcher.

    Args:
        researcher_data: Author record (dict) as returned by
            get_researcher_details(); keys that are missing or null are
            treated as 0 / empty.

    Returns:
        None when ``researcher_data`` is empty, otherwise a dict that always
        contains author_id, name, total_paper_count, total_citation_count,
        h_index and citations_per_paper. ``institution`` is added when an
        affiliation is listed. When the record has papers, the career
        (first/last year, span, papers per year), first/last author counts
        and venue metrics are added as far as the paper data allows.
    """
    if not researcher_data:
        return None

    # `or 0` also covers keys that are present with a null value.
    total_papers = researcher_data.get("paperCount") or 0
    total_citations = researcher_data.get("citationCount") or 0

    metrics = {
        "author_id": researcher_data.get("authorId"),
        "name": researcher_data.get("name"),
        "total_paper_count": total_papers,
        "total_citation_count": total_citations,
        "h_index": researcher_data.get("hIndex") or 0,
    }

    # Add institutional affiliation if available (first one listed)
    affiliations = researcher_data.get("affiliations") or []
    if affiliations:
        metrics["institution"] = affiliations[0]

    # Does not depend on the paper list, so it is reported in every case.
    citations_per_paper = total_citations / total_papers if total_papers > 0 else 0

    papers = researcher_data.get("papers") or []
    if not papers:
        metrics["citations_per_paper"] = citations_per_paper
        return metrics

    metrics.update(_career_metrics(papers, total_papers))

    first_author_count, last_author_count = _authorship_counts(papers, metrics["author_id"])
    metrics["first_author_count"] = first_author_count
    metrics["last_author_count"] = last_author_count

    metrics.update(_venue_metrics(papers))

    # No workshop-vs-conference ratio is computed here.
    metrics["citations_per_paper"] = citations_per_paper
    return metrics


def _career_metrics(papers, total_paper_count):
    """Return publication-year span metrics, or {} when no paper has a usable year."""
    current_year = datetime.now().year
    # Drop missing years and years in the future (might be errors in the data).
    years = [paper.get("year") for paper in papers]
    years = [year for year in years if year and year <= current_year]
    if not years:
        return {}

    first_year, last_year = min(years), max(years)
    career_span = last_year - first_year + 1  # inclusive, hence always >= 1
    return {
        "first_publication_year": first_year,
        "last_publication_year": last_year,
        "career_span": career_span,
        "avg_papers_per_year": total_paper_count / career_span,
    }


def _authorship_counts(papers, author_id):
    """Count papers where ``author_id`` is listed first / last (assumes lists are ordered)."""
    first_author_count = 0
    last_author_count = 0
    if author_id is None:
        # Without an id, every co-author lacking an id would match.
        return first_author_count, last_author_count

    for paper in papers:
        authors = paper.get("authors") or []
        if not authors:
            continue
        # A single-author paper counts as both first and last.
        if authors[0].get("authorId") == author_id:
            first_author_count += 1
        if authors[-1].get("authorId") == author_id:
            last_author_count += 1
    return first_author_count, last_author_count


def _venue_metrics(papers):
    """Return mode venue, venue diversity and venue-type metrics for the papers."""
    venues = []
    venue_types = []
    for paper in papers:
        pub_venue = paper.get("publicationVenue") or {}
        # Prefer the structured venue name; fall back to the plain venue string.
        venue_name = pub_venue.get("name") or paper.get("venue")
        if venue_name:
            venues.append(venue_name)
        venue_type = pub_venue.get("type")
        if venue_type:
            venue_types.append(venue_type)

    result = {}

    if venues:
        venue_counter = Counter(venues)
        result["mode_venue"] = venue_counter.most_common(1)[0][0]
        result["unique_venues"] = len(venue_counter)

        # Venue diversity = Shannon entropy of the venue shares divided by the
        # maximum entropy for that many venues (0 when there is one venue).
        total_venues = len(venues)
        probabilities = [count / total_venues for count in venue_counter.values()]
        entropy = -sum(p * np.log(p) for p in probabilities)
        max_entropy = np.log(len(venue_counter))
        result["venue_diversity"] = entropy / max_entropy if max_entropy > 0 else 0

    if venue_types:
        type_counter = Counter(venue_types)
        result["venue_types"] = dict(type_counter)

        conf_count = type_counter.get("conference", 0)
        journal_count = type_counter.get("journal", 0)
        if journal_count > 0:
            result["conference_journal_ratio"] = conf_count / journal_count
        else:
            # inf: conferences only; nan: neither conferences nor journals
            result["conference_journal_ratio"] = float('inf') if conf_count > 0 else float('nan')

    return result


In [7]:
# Execute the pipeline
def run_researcher_metrics_pipeline(num_researchers, resume_from=None, checkpoint_interval=10):
    """
    Run the full pipeline to collect and analyze researcher metrics with checkpointing.

    Working files are written relative to the current directory: raw author
    records in ``raw_researcher_data/``, paper lists in ``researcher_papers/``,
    the researcher ID list and numbered metric checkpoints in ``checkpoints/``,
    plus ``nlp_researcher_metrics_in_progress.csv`` and the final
    ``nlp_researcher_metrics.csv``.

    If ``checkpoints/researcher_ids.json`` already exists, its IDs are reused
    and ``num_researchers`` is not applied to it.

    Args:
        num_researchers: Number of researchers to collect
        resume_from: Path to a CSV file to resume from (ignored if it does not exist)
        checkpoint_interval: How often to save progress (in researchers
            processed); values below 1 are treated as 1

    Returns:
        DataFrame with researcher metrics. After a KeyboardInterrupt or an
        unexpected error inside the processing loop, the metrics gathered so
        far are saved to a separate CSV and returned instead.
    """
    # Setup directories
    raw_data_dir = "raw_researcher_data"
    paper_data_dir = "researcher_papers"
    checkpoint_dir = "checkpoints"
    for directory in (raw_data_dir, paper_data_dir, checkpoint_dir):
        os.makedirs(directory, exist_ok=True)

    # An interval of 0 would raise ZeroDivisionError in the modulo check below.
    checkpoint_interval = max(1, int(checkpoint_interval))

    # Initialize variables
    all_metrics = []
    processed_ids = set()

    # Check if we're resuming from a previous run
    if resume_from and os.path.exists(resume_from):
        print(f"Resuming from {resume_from}")
        try:
            # Read IDs as text so resumed rows have the same author_id type as
            # freshly computed rows and compare equal to the stored ID list.
            metrics_df = pd.read_csv(resume_from, dtype={"author_id": str})
        except pd.errors.EmptyDataError:
            metrics_df = pd.DataFrame()

        if "author_id" in metrics_df.columns:
            all_metrics = metrics_df.to_dict('records')
            processed_ids = set(metrics_df['author_id'].dropna())
        print(f"Already processed {len(processed_ids)} researchers")

    # Check if we already have researcher IDs saved
    researcher_ids_file = os.path.join(checkpoint_dir, "researcher_ids.json")
    if os.path.exists(researcher_ids_file):
        print(f"Loading researcher IDs from {researcher_ids_file}")
        with open(researcher_ids_file, 'r', encoding='utf-8') as f:
            researcher_ids = json.load(f)
        print(f"Loaded {len(researcher_ids)} researcher IDs")
        if len(researcher_ids) != num_researchers:
            print(f"Note: {num_researchers} researchers were requested but the saved list has "
                  f"{len(researcher_ids)}; delete {researcher_ids_file} to fetch a new list")
    else:
        # Get new researcher IDs
        print(f"Step 1: Finding {num_researchers} researchers...")
        researcher_ids = get_nlp_researchers(num_researchers)

        # Save the researcher IDs
        with open(researcher_ids_file, 'w', encoding='utf-8') as f:
            json.dump(researcher_ids, f)
        print(f"Saved {len(researcher_ids)} researcher IDs")

    # Filter out already processed researchers
    remaining_ids = [rid for rid in researcher_ids if str(rid) not in processed_ids]
    print(f"Remaining researchers to process: {len(remaining_ids)}")

    # Process each researcher
    if remaining_ids:
        print(f"Step 2: Collecting detailed information for {len(remaining_ids)} researchers...")

        try:
            for i, researcher_id in enumerate(tqdm(remaining_ids)):
                # Check if we already have raw data for this researcher
                raw_data_file = os.path.join(raw_data_dir, f"{researcher_id}.json")
                researcher_data = None

                if os.path.exists(raw_data_file):
                    print(f"Loading existing data for researcher {researcher_id}")
                    try:
                        with open(raw_data_file, 'r', encoding='utf-8') as f:
                            researcher_data = json.load(f)
                    except json.JSONDecodeError:
                        # e.g. a file left half-written by an interrupted run
                        print(f"Error loading file {raw_data_file}, will re-fetch data")

                # If we don't have data, fetch it
                if not researcher_data:
                    researcher_data = get_researcher_details(researcher_id)

                    # Save the raw data
                    if researcher_data:
                        with open(raw_data_file, 'w', encoding='utf-8') as f:
                            json.dump(researcher_data, f, indent=2)

                # Process metrics
                if researcher_data:
                    metrics = calculate_metrics(researcher_data)

                    if metrics:
                        all_metrics.append(metrics)
                        processed_ids.add(str(researcher_id))

                        # Save researcher papers
                        if "papers" in researcher_data:
                            paper_file = os.path.join(paper_data_dir, f"{researcher_id}.json")
                            with open(paper_file, 'w', encoding='utf-8') as f:
                                json.dump(researcher_data["papers"], f)

                # Save checkpoint at regular intervals and after the last researcher
                if (i + 1) % checkpoint_interval == 0 or (i + 1) == len(remaining_ids):
                    checkpoint_file = os.path.join(checkpoint_dir, f"metrics_checkpoint_{len(all_metrics)}.csv")
                    temp_df = pd.DataFrame(all_metrics)
                    temp_df.to_csv(checkpoint_file, index=False)

                    # Also save to the main output file
                    temp_df.to_csv("nlp_researcher_metrics_in_progress.csv", index=False)
                    print(f"Checkpoint saved: {len(all_metrics)}/{num_researchers} researchers processed")

                # Print progress every 100 researchers
                if (i + 1) % 100 == 0:
                    print(f"Processed {i + 1}/{len(remaining_ids)} researchers")

        except KeyboardInterrupt:
            print("Process interrupted by user. Saving progress...")
            temp_df = pd.DataFrame(all_metrics)
            temp_df.to_csv("nlp_researcher_metrics_interrupted.csv", index=False)
            print(f"Progress saved: {len(all_metrics)}/{num_researchers} researchers processed")
            return temp_df

        except Exception as e:
            # Deliberately best-effort: keep what was collected instead of losing it.
            print(f"Unexpected error in pipeline: {type(e).__name__}: {e}")
            temp_df = pd.DataFrame(all_metrics)
            temp_df.to_csv("nlp_researcher_metrics_error.csv", index=False)
            print(f"Progress saved after error: {len(all_metrics)}/{num_researchers} researchers processed")
            return temp_df

    # Create final DataFrame
    metrics_df = pd.DataFrame(all_metrics)

    # Save results to CSV
    output_file = "nlp_researcher_metrics.csv"
    metrics_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

    return metrics_df


In [8]:
# Function to rank researchers by different metrics
def rank_researchers(metrics_df):
    """Rank researchers on each available metric (higher value = better rank).

    For every candidate metric that is a column of ``metrics_df`` a
    ``<metric>_rank`` column is added to ``metrics_df`` in place. Ties share
    the lowest rank of their group and missing values are ranked last.

    Returns:
        Dict mapping metric name to a DataFrame with the columns name,
        metric and rank, sorted by rank.
    """
    candidate_metrics = (
        "total_paper_count",
        "career_span",
        "avg_papers_per_year",
        "first_author_count",
        "last_author_count",
        "unique_venues",
        "venue_diversity",
        "total_citation_count",
        "citations_per_paper",
        "h_index",
        "workshop_conference_ratio",
    )

    rankings = {}
    for metric in candidate_metrics:
        # Metrics that were never computed are skipped silently.
        if metric not in metrics_df.columns:
            continue

        rank_col = f"{metric}_rank"
        metrics_df[rank_col] = metrics_df[metric].rank(
            method='min', ascending=False, na_option='bottom'
        )
        ranked_view = metrics_df[["name", metric, rank_col]]
        rankings[metric] = ranked_view.sort_values(by=rank_col)

    return rankings



In [9]:
# Function to analyze and visualize differences between rankings
def analyze_ranking_differences(metrics_df, rankings):
    """
    Analyze differences between rankings based on different metrics.

    Works on the ``*_rank`` columns of ``metrics_df`` and modifies it in place:
    adds ``average_rank`` (mean of the per-metric rank columns, only when
    there is more than one) and one ``diff_<rank column>_vs_h_index`` column
    per rank column other than ``h_index_rank``. All of these are recomputed
    on every call, so the function can be re-run after re-ranking. For each
    rank column whose base metric exists, the ten largest disagreements with
    the h-index rank in each direction are printed.

    Args:
        metrics_df: DataFrame that already holds ``*_rank`` columns
        rankings: Not used by this function; kept so existing calls keep working

    Returns:
        The same ``metrics_df`` object
    """
    rank_suffix = '_rank'
    average_rank_col = 'average_rank'
    h_index_col = 'h_index_rank'

    ranking_columns = [col for col in metrics_df.columns if col.endswith(rank_suffix)]

    if not ranking_columns:
        print("No ranking columns found. Make sure to run rank_researchers() first.")
        return metrics_df

    # Average only the per-metric ranks; an average_rank left over from an
    # earlier call must not be folded into the new average.
    per_metric_rank_columns = [col for col in ranking_columns if col != average_rank_col]
    if len(per_metric_rank_columns) > 1:
        metrics_df[average_rank_col] = metrics_df[per_metric_rank_columns].mean(axis=1)

    # Find researchers with the largest differences between h-index rank and other metrics
    if h_index_col in metrics_df.columns:
        compared_columns = [col for col in metrics_df.columns
                            if col.endswith(rank_suffix) and col != h_index_col]

        for metric_rank in compared_columns:
            # Always recompute so the difference reflects the current ranks.
            diff_col = f'diff_{metric_rank}_vs_h_index'
            metrics_df[diff_col] = metrics_df[metric_rank] - metrics_df[h_index_col]

            # Strip only the trailing '_rank' to get the base metric name.
            base_metric = metric_rank[:-len(rank_suffix)]

            # average_rank has no base metric column, so nothing is printed for it.
            if base_metric not in metrics_df.columns:
                continue

            columns_to_show = ['name', base_metric, 'h_index', metric_rank, h_index_col, diff_col]
            # Filter to only include columns that actually exist in the dataframe
            columns_to_show = [col for col in columns_to_show if col in metrics_df.columns]

            # Negative difference: ranked better by this metric than by h-index
            better_by_metric = metrics_df.sort_values(by=diff_col, ascending=True).head(10)
            print(f"\nTop 10 researchers ranked better by {base_metric} than by h-index:")
            print(better_by_metric[columns_to_show])

            # Positive difference: ranked worse by this metric than by h-index
            worse_by_metric = metrics_df.sort_values(by=diff_col, ascending=False).head(10)
            print(f"\nTop 10 researchers ranked worse by {base_metric} than by h-index:")
            print(worse_by_metric[columns_to_show])

    return metrics_df

In [17]:
# Run the collection pipeline for 1000 researchers. If the in-progress CSV
# from an earlier run exists, the researchers already listed in it are skipped.
metrics_df = run_researcher_metrics_pipeline(
    num_researchers=1000, 
    resume_from="nlp_researcher_metrics_in_progress.csv",
    checkpoint_interval=100  # Save progress every 100 researchers
)
metrics_df.head()

Step 1: Finding 1000 researchers...
Found 559 researchers so far...
Found 938 researchers so far...
Found 1000 researchers so far...
Saved 1000 researcher IDs
Remaining researchers to process: 1000
Step 2: Collecting detailed information for 1000 researchers...


 10%|█         | 100/1000 [02:58<26:40,  1.78s/it]

Checkpoint saved: 100/1000 researchers processed
Processed 100/1000 researchers


 20%|██        | 200/1000 [06:07<29:43,  2.23s/it]  

Checkpoint saved: 200/1000 researchers processed
Processed 200/1000 researchers


 30%|███       | 300/1000 [08:57<21:16,  1.82s/it]

Checkpoint saved: 300/1000 researchers processed
Processed 300/1000 researchers


 37%|███▋      | 367/1000 [10:50<18:35,  1.76s/it]

Rate limit reached. Waiting 4.53 seconds before retry 1/8


 40%|███▉      | 397/1000 [11:49<18:18,  1.82s/it]

Rate limit reached. Waiting 4.98 seconds before retry 1/8


 40%|████      | 400/1000 [12:00<25:16,  2.53s/it]

Checkpoint saved: 400/1000 researchers processed
Processed 400/1000 researchers


 43%|████▎     | 431/1000 [12:56<16:57,  1.79s/it]

Rate limit reached. Waiting 4.16 seconds before retry 1/8


 50%|█████     | 500/1000 [15:02<14:20,  1.72s/it]

Checkpoint saved: 500/1000 researchers processed
Processed 500/1000 researchers


 60%|██████    | 600/1000 [18:00<11:20,  1.70s/it]

Checkpoint saved: 600/1000 researchers processed
Processed 600/1000 researchers


 70%|███████   | 700/1000 [20:52<08:44,  1.75s/it]

Checkpoint saved: 700/1000 researchers processed
Processed 700/1000 researchers


 80%|███████▉  | 798/1000 [23:59<06:04,  1.81s/it]

Rate limit reached. Waiting 4.54 seconds before retry 1/8


 80%|████████  | 800/1000 [24:07<09:09,  2.75s/it]

Checkpoint saved: 800/1000 researchers processed
Processed 800/1000 researchers


 90%|█████████ | 900/1000 [27:12<02:55,  1.75s/it]

Checkpoint saved: 900/1000 researchers processed
Processed 900/1000 researchers


100%|██████████| 1000/1000 [30:12<00:00,  1.81s/it]

Checkpoint saved: 1000/1000 researchers processed
Processed 1000/1000 researchers
Results saved to nlp_researcher_metrics.csv





Unnamed: 0,author_id,name,total_paper_count,total_citation_count,h_index,institution,first_publication_year,last_publication_year,career_span,avg_papers_per_year,first_author_count,last_author_count,mode_venue,unique_venues,venue_diversity,venue_types,conference_journal_ratio,citations_per_paper
0,51394448,Shaoxiong Ji,50,3823,22,Technical University of Darmstadt,2018,2024,7,7.142857,23,7,arXiv.org,28.0,0.937115,"{'conference': 15, 'journal': 15}",1.0,76.46
1,2531268,Alexander M. Rush,138,24080,57,Cornell Tech,1998,2023,26,5.307692,12,72,Conference on Empirical Methods in Natural Lan...,35.0,0.781246,"{'conference': 90, 'journal': 12}",7.5,174.492754
2,2186852856,Phillip Schneider,17,96,4,,2022,2025,4,4.25,12,1,arXiv.org,13.0,0.966863,{'conference': 11},inf,5.647059
3,2719024,Peter Milder,83,2560,23,Stony Brook University,2005,2023,19,4.368421,17,27,Optics Express,51.0,0.949626,"{'conference': 39, 'journal': 12}",3.25,30.843373
4,2117008214,Edward Kim,12,1203,7,,2003,2023,21,0.571429,6,0,npj Computational Materials,7.0,0.97957,{'journal': 5},0.0,100.25


In [18]:
# Adds a "<metric>_rank" column to metrics_df for every available metric and
# returns one rank-sorted table per metric.
rankings = rank_researchers(metrics_df)
rankings

{'total_paper_count':                   name  total_paper_count  total_paper_count_rank
 229             R. Gur                982                     1.0
 941           G. Ceder                979                     2.0
 391             Lei He                912                     3.0
 300           Jie Zhou                826                     4.0
 932      Hsinchun Chen                820                     5.0
 ..                 ...                ...                     ...
 520     Hussein Sharif                  1                   962.0
 863          Meng Wang                  1                   962.0
 549            H. Khan                  1                   962.0
 480    Michael Elhadad                  1                   962.0
 331  Kanchan M.Tarwani                  1                   962.0
 
 [1000 rows x 3 columns],
 'career_span':                       name  career_span  career_span_rank
 760        Michael Yeomans          155               1.0
 651       Jul

In [19]:
# Prints the largest rank disagreements with the h-index rank and adds the
# "diff_*_vs_h_index" columns to metrics_df in place.
analyze_ranking_differences(metrics_df, rankings)


Top 10 researchers ranked better by total_paper_count than by h-index:
               name  total_paper_count  h_index  total_paper_count_rank  \
682  Lysandre Debut                 65        6                   318.0   
239  Y. Haralambous                 92       10                   255.0   
983        Zhao Cai                 37        5                   450.0   
697  Sylvain Gugger                 31        4                   498.0   
122     Thomas Wolf                 54        7                   361.0   
6     Chu-Ren Huang                 58        8                   339.0   
725        Clara Ma                 28        4                   523.0   
206  Yacine Jernite                 49        7                   385.0   
148      Honghan Wu                 25        4                   554.0   
751       Canwen Xu                 41        7                   422.0   

     h_index_rank  diff_total_paper_count_rank_vs_h_index  
682         704.0                         

Unnamed: 0,author_id,name,total_paper_count,total_citation_count,h_index,institution,first_publication_year,last_publication_year,career_span,avg_papers_per_year,...,diff_total_paper_count_rank_vs_h_index,diff_career_span_rank_vs_h_index,diff_avg_papers_per_year_rank_vs_h_index,diff_first_author_count_rank_vs_h_index,diff_last_author_count_rank_vs_h_index,diff_unique_venues_rank_vs_h_index,diff_venue_diversity_rank_vs_h_index,diff_total_citation_count_rank_vs_h_index,diff_citations_per_paper_rank_vs_h_index,diff_average_rank_vs_h_index
0,51394448,Shaoxiong Ji,50,3823,22,Technical University of Darmstadt,2018,2024,7,7.142857,...,73.0,375.0,-144.0,-99.0,106.0,-6.0,95.0,0.0,5.0,40.5
1,2531268,Alexander M. Rush,138,24080,57,Cornell Tech,1998,2023,26,5.307692,...,108.0,104.0,189.0,346.0,64.0,177.0,833.0,-6.0,71.0,188.6
2,2186852856,Phillip Schneider,17,96,4,,2022,2025,4,4.250000,...,-154.0,61.0,-465.0,-402.0,-160.0,-280.0,-587.0,146.0,186.0,-165.5
3,2719024,Peter Milder,83,2560,23,Stony Brook University,2005,2023,19,4.368421,...,-22.0,-13.0,39.0,2.0,-39.0,-133.0,31.0,100.0,348.0,31.3
4,2117008214,Edward Kim,12,1203,7,,2003,2023,21,0.571429,...,64.0,-428.0,278.0,-81.0,101.0,31.0,-529.0,-110.0,-424.0,-109.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,13044536,Yanshan Wang,94,2254,21,,2015,2023,9,10.444444,...,-70.0,264.0,-243.0,-50.0,-18.0,-101.0,215.0,112.0,433.0,54.2
996,51302331,Vera Sorin,28,765,12,,2018,2025,8,3.500000,...,14.0,119.0,-91.0,-216.0,140.0,-112.0,-171.0,132.0,186.0,0.1
997,50793079,Hongfang Liu,495,11287,52,,1999,2022,24,20.625000,...,-65.0,105.0,-67.0,34.0,-64.0,-68.0,756.0,33.0,695.0,135.9
998,2136480655,Ilana Heinz,1,902,1,,2021,2021,1,1.000000,...,17.0,8.0,-142.0,-80.0,-175.0,0.0,0.0,-328.0,-934.0,-163.4


In [20]:
def pretty_print_rankings(rankings, top_n=10):
    """
    Pretty print the rankings from rank_researchers()

    Each metric is printed as a left-aligned text table. A column is as wide
    as its longest cell (or its header) plus two spaces, so all rows line up
    with the header. A metric whose table has no rows prints a placeholder
    line instead of a table.

    Args:
        rankings: Dictionary of rankings from rank_researchers()
        top_n: Number of top researchers to show
    """
    def format_cell(value, numeric):
        """Render one cell: floats with 2 decimals, other numbers with thousands separators."""
        if not numeric:
            return str(value)
        return f"{value:,.2f}" if isinstance(value, float) else f"{value:,}"

    print("\n" + "="*80)
    print("TOP RESEARCHERS BY DIFFERENT METRICS")
    print("="*80 + "\n")

    for metric, df in rankings.items():
        print(f"\n{'-'*40}")
        print(f"TOP {top_n} RESEARCHERS BY {metric.upper()}")
        print(f"{'-'*40}")

        display_cols = ['name', metric]

        # Add h-index for reference if available and not the current metric
        if metric != 'h_index' and 'h_index' in df.columns:
            display_cols.append('h_index')

        table = df[display_cols].head(top_n)
        if table.empty:
            print("(no researchers to display)")
            continue

        # Render every column to strings first so widths can use all rows.
        headers = []
        rendered_columns = []
        for col in display_cols:
            numeric = col != 'name' and pd.api.types.is_numeric_dtype(table[col])
            headers.append(col.replace('_', ' ').title())
            rendered_columns.append([format_cell(value, numeric) for value in table[col].tolist()])

        col_widths = [
            max(len(header), max(len(cell) for cell in cells)) + 2
            for header, cells in zip(headers, rendered_columns)
        ]

        # Print header
        header_row = " ".join(f"{h:<{w}}" for h, w in zip(headers, col_widths))
        print(header_row)
        # Same length as the header row, including the joining spaces
        print("-" * len(header_row))

        # Print rows
        for row_cells in zip(*rendered_columns):
            print(" ".join(f"{v:<{w}}" for v, w in zip(row_cells, col_widths)))

def _print_rank_diff_table(subset, headers):
    """
    Print ``subset`` as a left-aligned text table under ``headers``.

    Numeric columns other than 'name' are formatted with thousands separators
    (floats additionally with two decimals). Column widths are derived from
    the headers and from the rows of this particular table.

    Args:
        subset: DataFrame holding exactly the columns to print, in order
        headers: One header string per column of ``subset``
    """
    table = subset.copy()

    # Format the values
    for col in table.columns:
        if col != 'name' and pd.api.types.is_numeric_dtype(table[col]):
            table[col] = table[col].apply(lambda x: f"{x:,.2f}" if isinstance(x, float) else f"{x:,}")

    # Stringify once so width calculation and printing see the same text
    rows = [[str(val) for val in row] for row in table.itertuples(index=False, name=None)]

    # Including the header length keeps this well-defined for an empty table
    col_widths = [
        max([len(header)] + [len(row[i]) for row in rows]) + 2
        for i, header in enumerate(headers)
    ]

    # Print header
    print("".join(f"{h:<{w}}" for h, w in zip(headers, col_widths)))
    print("-" * sum(col_widths))

    # Print rows
    for row in rows:
        print("".join(f"{v:<{w}}" for v, w in zip(row, col_widths)))


def pretty_print_ranking_differences(metrics_df, max_rows=10):
    """
    Pretty print the results from analyze_ranking_differences()

    For every column named like ``diff_<metric>_rank_vs_h_index`` two tables
    are printed: the ``max_rows`` researchers with the smallest difference
    (labelled "ranked better" by the metric) and the ``max_rows`` with the
    largest difference (labelled "ranked worse"). A metric is skipped silently
    when any of the columns 'name', ``<metric>``, 'h_index',
    ``<metric>_rank``, 'h_index_rank' or the diff column itself is missing.

    Args:
        metrics_df: DataFrame with ranking differences
        max_rows: Maximum number of rows to display for each section

    Returns:
        None. The tables are written to stdout.
    """
    print("\n" + "="*80)
    print("RANKING DIFFERENCES COMPARED TO H-INDEX")
    print("="*80 + "\n")

    # Find all diff columns
    diff_cols = [col for col in metrics_df.columns if col.startswith('diff_') and '_vs_h_index' in col]

    for diff_col in diff_cols:
        # Get the base metric name (remove 'diff_' and '_rank_vs_h_index')
        metric = diff_col.replace('diff_', '').replace('_rank_vs_h_index', '')

        # Check if columns exist
        required_cols = ['name', metric, 'h_index', f"{metric}_rank", 'h_index_rank', diff_col]
        if not all(col in metrics_df.columns for col in required_cols):
            continue

        # Drop repeated labels (possible if the metric is itself 'h_index') so
        # each column is selected, formatted and printed once
        display_cols = list(dict.fromkeys(required_cols))

        headers = [col.replace('_', ' ').title() for col in display_cols]
        headers[-1] = 'Rank Difference'

        # Ascending sort on the diff column feeds the "better" section,
        # descending sort feeds the "worse" section
        sections = (
            (f"TOP {max_rows} RESEARCHERS RANKED BETTER BY {metric.upper()} THAN BY H-INDEX:", True),
            (f"TOP {max_rows} RESEARCHERS RANKED WORSE BY {metric.upper()} THAN BY H-INDEX:", False),
        )

        for title, ascending in sections:
            print(f"\n{'-'*80}")
            print(title)
            print(f"{'-'*80}")

            selected = metrics_df.sort_values(by=diff_col, ascending=ascending).head(max_rows)

            # Each section is laid out from its own rows; reusing the widths
            # of the "better" table misaligned the "worse" one whenever it
            # held longer names or values
            _print_rank_diff_table(selected[display_cols], headers)

In [21]:
# Print the top-20 table for every ranking metric, followed by the 20 largest
# rank shifts in each direction relative to the h-index ranking.
# NOTE(review): `rankings` and `metrics_df` are not defined in this cell;
# presumably they come from earlier cells — confirm on a fresh kernel run.
pretty_print_rankings(rankings, top_n=20)
pretty_print_ranking_differences(metrics_df, max_rows=20)


TOP RESEARCHERS BY DIFFERENT METRICS


----------------------------------------
TOP 20 RESEARCHERS BY TOTAL_PAPER_COUNT
----------------------------------------
Name     Total Paper Count  
---------------------------
R. Gur   982                
G. Ceder 979                
Lei He   912                
Jie Zhou 826                
Hsinchun Chen 820                
W. Bruce Croft 726                
H. Krumholz 645                
E. Hovy  631                
E. Fox   598                
Graham Neubig 596                
D. Roth  586                
Dong Yu  572                
Walter Daelemans 571                
C. Bearden 539                
L. Briand 538                
Quan Z. Sheng 533                
S. Ananiadou 531                
L. Deng  508                
E. Cambria 504                
Hongfang Liu 495                

----------------------------------------
TOP 20 RESEARCHERS BY CAREER_SPAN
----------------------------------------
Name              Career Span  
-------