# In [None]:
# -*- coding: utf-8 -*-
"""
PARETO ANALYSIS: WHAT PERCENTAGE OF AUTHORS ACCOUNT FOR 80% OF CITATIONS?
Based on: ../data/processed/processed_publications_for_mertonian_analysis.csv
Outputs saved to: ../output/figures/ and ../output/tables/
"""

# --- Import Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from IPython.display import display

# --- Define Relative Paths (relative to notebooks/) ---
input_file = "../data/processed/processed_publications_for_mertonian_analysis.csv"
output_figures = "../output/figures"
output_tables = "../output/tables"

# Make sure both output folders are present before anything is written to them.
for output_dir in (output_figures, output_tables):
    os.makedirs(output_dir, exist_ok=True)

# --- 1. Load Data ---
# Only the read itself sits inside the try body, so an unrelated failure can
# never be mistaken for a missing file. The original error is chained
# explicitly so its low-level details remain visible in the traceback.
try:
    data = pd.read_csv(input_file)
except FileNotFoundError as err:
    raise FileNotFoundError(f"Input file not found: {input_file}") from err
else:
    print("✅ Dataset successfully loaded.")

print(f"📊 Dataset dimensions: {data.shape[0]} rows × {data.shape[1]} columns")

# --- 2. Flexible Column Detection ---
def find_column(data, possible_names):
    """Return the first candidate name that is a column of ``data``.

    Candidates are checked in the order given, using an exact match against
    ``data.columns``. Returns ``None`` when no candidate is present.
    """
    present = (candidate for candidate in possible_names if candidate in data.columns)
    return next(present, None)

# Candidate headers are tried in order; the first one present in the dataset wins.
col_author = find_column(data, [
    'Penulis_Utama', 'Penulis_pertama', 'Nama_Penulis',
    'Author', 'Authors', 'First_Author', 'Primary_Author'
])

col_citation = find_column(data, [
    'Jumlah_Sitasi', 'Cited by', 'Citations', 'cited_by', 'Sitasi'
])

# find_column signals "no match" with None, so test for that explicitly.
if col_author is None:
    raise KeyError("Author column not found in the dataset.")
if col_citation is None:
    raise KeyError("Citation column not found in the dataset.")

print(f"🔍 Author column detected: '{col_author}'")
print(f"🔍 Citation column detected: '{col_citation}'")

# Rename for consistency
rename_map = {col_author: 'Author', col_citation: 'Citations'}

# If the dataset already has a *different* column called 'Author' or
# 'Citations', renaming would create duplicate labels, and selecting
# data_clean['Citations'] would then return a DataFrame instead of a Series.
# Drop those clashing columns first so the canonical names stay unique.
stale_columns = [
    target for target in rename_map.values()
    if target in data.columns and target not in rename_map
]
if stale_columns:
    print(f"⚠️ Dropping pre-existing column(s) that clash with the canonical names: {stale_columns}")

data_clean = data.drop(columns=stale_columns).rename(columns=rename_map)

# --- 3. Data Cleaning ---
# Coerce citation counts to numbers (unparseable entries become NaN), then
# discard rows lacking an author or a count, and rows with a negative count.
data_clean = (
    data_clean
    .assign(Citations=pd.to_numeric(data_clean['Citations'], errors='coerce'))
    .dropna(subset=['Author', 'Citations'])
    .loc[lambda frame: frame['Citations'] >= 0]
)

print(f"🧹 Cleaned dataset: {len(data_clean)} publications after filtering")

# --- 4. Aggregate Total Citations per Author ---
# Sum citations per author and rank authors from most to least cited.
citations_per_author = (
    data_clean.groupby('Author')['Citations'].sum().sort_values(ascending=False)
)
total_citations = citations_per_author.sum()
total_authors = len(citations_per_author)

# Cumulative shares are undefined without authors or without citations: the
# divisions below would produce empty or NaN arrays and the reported
# thresholds would be meaningless. Fail early with a clear message instead.
if total_authors == 0:
    raise ValueError("No authors remain after cleaning; cannot run the Pareto analysis.")
if total_citations <= 0:
    raise ValueError("Total citations is zero; cumulative citation shares are undefined.")

print(f"👤 Total Unique Authors: {total_authors}")
print(f"🔗 Total Citations: {int(total_citations):,}")

# --- 5. Compute Cumulative Distributions ---
# Running citation total over the ranked authors, expressed as a percentage.
cumulative_citations = np.cumsum(citations_per_author.values)
cumulative_percentage = cumulative_citations / total_citations * 100
# Rank of each author (1-based) as a percentage of all authors.
author_rank_percentage = np.arange(1, total_authors + 1) / total_authors * 100  # % of authors

# --- 6. Identify Pareto Thresholds ---
def find_threshold_index(cum_pct, threshold):
    """Return how many top-ranked authors are needed to reach a citation share.

    Args:
        cum_pct: Array-like of cumulative citation percentages, ordered by
            author rank (one entry per author).
        threshold: Target cumulative percentage, e.g. ``80``.

    Returns:
        A tuple ``(count, author_pct)`` where ``count`` is the 1-based
        position of the first entry of ``cum_pct`` that is ``>= threshold``
        and ``author_pct`` is ``count`` as a percentage of ``len(cum_pct)``.

    Raises:
        ValueError: If no entry reaches ``threshold`` (this includes empty
            input and input made up entirely of NaN).
    """
    reached = np.asarray(cum_pct) >= threshold
    # np.argmax returns 0 for an all-False mask, which would silently report
    # "1 author" for a threshold that is never reached. Reject that case.
    if not reached.any():
        raise ValueError(
            f"Cumulative percentage never reaches the {threshold}% threshold."
        )
    count = int(np.argmax(reached)) + 1
    # The denominator is the length of the input itself, so the function does
    # not depend on a module-level author count.
    author_pct = count / reached.size * 100
    return count, author_pct

# Locate the 80%, 90% and 95% citation thresholds, in that order.
(n_80, p_80), (n_90, p_90), (n_95, p_95) = (
    find_threshold_index(cumulative_percentage, level) for level in (80, 90, 95)
)

# Report header followed by one line per threshold.
print("\n📊 PARETO ANALYSIS: WHO CONTROLS THE MAJORITY OF ACADEMIC INFLUENCE?\n" + "=" * 70)
print(f"🔹 80% of citations are held by {n_80} authors ({p_80:.1f}% of all authors)")
print(f"🔹 90% of citations are held by {n_90} authors ({p_90:.1f}% of all authors)")
print(f"🔹 95% of citations are held by {n_95} authors ({p_95:.1f}% of all authors)")

# --- 7. Summary Table ---
# One row per threshold: label, author count, author share (one decimal,
# stored as text) and the citation share that the row covers.
threshold_labels = ['80%', '90%', '95%']
pareto_table = pd.DataFrame(
    [
        (label, n_authors, f"{author_share:.1f}", label)
        for label, n_authors, author_share in zip(
            threshold_labels, (n_80, n_90, n_95), (p_80, p_90, p_95)
        )
    ],
    columns=[
        'Threshold',
        'Number of Authors',
        'Proportion of Authors (%)',
        'Cumulative Citations Covered',
    ],
)

print("\n📋 Pareto Summary Table:")
display(pareto_table)

# Export table to Excel in tables/ folder
table_path = os.path.join(output_tables, "pareto_analysis_summary.xlsx")
pareto_table.to_excel(
    table_path, sheet_name='Pareto_Thresholds', index=False, engine='openpyxl'
)
print(f"\n✅ Summary table exported to:\n {table_path}")

# --- 8. Pareto Curve Visualization ---
# Drawn through the object-oriented Matplotlib interface: one figure, one axes.
fig, ax = plt.subplots(figsize=(10, 7), dpi=150)
ax.plot(
    author_rank_percentage,
    cumulative_percentage,
    color='#D62728',
    linewidth=2.5,
    label='Cumulative Citation Share',
)

# Dashed guide lines crossing at the 80% point, plus a marker on the crossing.
guide_style = dict(color='gray', linestyle='--', alpha=0.6, linewidth=1.2)
ax.axhline(y=80, **guide_style)
ax.axvline(x=p_80, **guide_style)
ax.scatter(p_80, 80, color='#1F77B4', s=80, zorder=5, edgecolors='black', linewidth=1.2)

# Call-out box placed below and to the right of the marker.
ax.annotate(
    f'{p_80:.1f}% of authors\naccount for 80% of citations',
    xy=(p_80, 80),
    xytext=(p_80 + 5, 75),
    fontsize=10,
    color='black',
    ha='left',
    va='top',
    bbox=dict(boxstyle="round,pad=0.4", facecolor="lightyellow", edgecolor="gray", alpha=0.8),
    arrowprops=dict(arrowstyle='->', color='gray', lw=1.0),
)

# Titles, axis labels, limits, grid and legend.
ax.set_title(
    'Pareto Curve: Distribution of Academic Influence by Lead Authors',
    fontsize=14,
    fontweight='bold',
    pad=20,
)
ax.set_xlabel('Cumulative Proportion of Authors (%)', fontsize=12)
ax.set_ylabel('Cumulative Proportion of Citations (%)', fontsize=12)
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)
ax.grid(True, which='both', linestyle='--', alpha=0.4, linewidth=0.8)
ax.legend(fontsize=11, loc='lower right')
fig.tight_layout()

# Write the figure to the figures/ folder, then show it.
plot_path = os.path.join(output_figures, "pareto_curve_academic_influence.png")
fig.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"✅ Pareto curve saved to:\n {plot_path}")

# --- 9. Interpretation ---
# Build the closing summary line by line and emit it with a single print.
interpretation_lines = [
    "\n📝 INTERPRETATION\n" + "-" * 40,
    "The citation distribution exhibits extreme inequality:",
    f"- Only {p_80:.1f}% of authors generate 80% of all citations.",
    "- This indicates a highly concentrated academic reward system.",
    "- Findings align with the Matthew Effect and Pareto’s 80/20 principle in social systems.",
    "- A small elite group dominates scholarly visibility and impact.",
]
print("\n".join(interpretation_lines))