# In [None]:
# -*- coding: utf-8 -*-
"""
MATTHEW EFFECT ANALYSIS BY PERIOD
2015–2019 vs. 2020–2024
Input: ../data/processed/processed_publications_for_mertonian_analysis.csv
"""

# --- Import Required Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from IPython.display import display

# --- Project locations, expressed relative to notebooks/ ---
input_file = "../data/processed/processed_publications_for_mertonian_analysis.csv"
output_figures = "../output/figures"
output_tables = "../output/tables"

# Both output folders must exist before any figure or table is written
for output_dir in (output_figures, output_tables):
    os.makedirs(output_dir, exist_ok=True)

# --- 1. Read the publication records ---
try:
    data = pd.read_csv(input_file)
except FileNotFoundError:
    # Re-raise with a message that names the path that was tried
    raise FileNotFoundError(f"Dataset not found: {input_file}")
else:
    print("✅ Dataset successfully loaded.")

# Report the table size and the raw headers so the column detection that
# follows can be checked by eye
row_count, column_count = data.shape
print(f"📊 Dimensions: {row_count} rows × {column_count} columns")
print("\n📋 Column headers detected:")
print(data.columns.tolist())
print()

# --- 2. Identify Key Columns Dynamically ---
def find_column(data, candidate_names):
    """Return the first candidate that is a column of ``data``.

    Candidates are tried in the order given and must match a column label
    exactly. ``None`` is returned when none of them is present.
    """
    available = data.columns
    return next((name for name in candidate_names if name in available), None)

# Header spellings accepted for each required field, tried in order
author_col = find_column(
    data,
    ['Penulis_Utama', 'Penulis_pertama', 'Penulis pertama',
     'Nama_Penulis', 'First_Author', 'Author', 'Authors'],
)
citation_col = find_column(
    data,
    ['Jumlah_Sitasi', 'Cited by', 'Citations', 'cited_by',
     'Jumlah Sitasi', 'Sitasi'],
)
year_col = find_column(data, ['Tahun', 'Year', 'PY'])

# Abort on the first required field that could not be located
for detected_name, missing_message in (
    (author_col, "Primary author column not found in dataset."),
    (citation_col, "Citation count column not found in dataset."),
    (year_col, "Publication year column not found in dataset."),
):
    if not detected_name:
        raise KeyError(missing_message)

print(f"🔍 Primary author column identified: '{author_col}'")
print(f"🔍 Citation count column identified: '{citation_col}'")
print(f"🔍 Publication year column identified: '{year_col}'")

# Map the detected headers onto the fixed names used by the rest of the script
standard_names = {
    author_col: 'Primary_Author',
    citation_col: 'Citation_Count',
    year_col: 'Publication_Year',
}
data_clean = data.rename(columns=standard_names)

# --- 3. Data Cleaning ---
# Coerce year and citation values to numbers; unparseable entries become NaN
for numeric_field in ('Publication_Year', 'Citation_Count'):
    data_clean[numeric_field] = pd.to_numeric(data_clean[numeric_field], errors='coerce')

# Discard records lacking an author, a citation count, or a year
required_fields = ['Primary_Author', 'Citation_Count', 'Publication_Year']
data_clean = data_clean.dropna(subset=required_fields)

# Keep the 2015–2024 window (bounds inclusive), then non-negative citation counts
in_study_window = data_clean['Publication_Year'].between(2015, 2024)
data_clean = data_clean[in_study_window]
data_clean = data_clean[data_clean['Citation_Count'] >= 0]

print(f"🧹 Cleaned dataset (2015–2024): {len(data_clean)} publications retained after cleaning")

# --- 4. Split into Two Periods ---
# Independent copies, so later edits to one period cannot touch data_clean
publication_year = data_clean['Publication_Year']
period_1 = data_clean[publication_year.between(2015, 2019)].copy()
period_2 = data_clean[publication_year.between(2020, 2024)].copy()

print(f"📅 Period 1 (2015–2019): {len(period_1)} publications")
print(f"📅 Period 2 (2020–2024): {len(period_2)} publications")

# --- 5. Functions to Compute Inequality Indicators ---
def compute_gini(x):
    """Return the Gini coefficient of the values in ``x``.

    Zero entries are kept in the population: an author with no citations is
    part of the distribution, and leaving such authors out would report no
    inequality at all when a single author holds every citation.
    Negative and NaN entries are ignored.

    Args:
        x: One-dimensional sequence or array of numeric values.

    Returns:
        float: ``0.0`` when there are no usable values or they sum to zero;
        otherwise the Gini coefficient, where 0 means a perfectly equal
        distribution and values approaching 1 mean extreme concentration.
    """
    values = np.array(x, dtype=float)
    # The comparison is False for NaN, so this drops NaN and negatives together
    values = values[values >= 0]
    total = values.sum()
    if values.size == 0 or total == 0:
        return 0.0

    values = np.sort(values)
    n = values.size
    ranks = np.arange(1, n + 1)
    # Rank-weighted form of the Gini coefficient over ascending values
    return float(np.sum((2 * ranks - n - 1) * values) / (n * total))

def compute_metrics(df):
    """Summarise how citations are distributed across primary authors.

    Citations are summed per ``Primary_Author`` and every indicator is
    derived from those per-author totals.

    Args:
        df: Table with ``Primary_Author`` and ``Citation_Count`` columns.

    Returns:
        ``None`` when ``df`` has no rows or yields no author groups.
        Otherwise a dict with these keys:

        * ``'Gini Coefficient'``: Gini of the per-author totals, rounded to
          three decimals.
        * ``'% Citations by Top 10%'``: share of all citations held by the
          top tenth of authors, as text such as ``"42.0%"``.
        * ``'Estimated h-index'``: largest ``h`` such that ``h`` authors each
          have at least ``h`` citations in total.
        * ``'Total Unique Authors'``: number of distinct authors.
        * ``'Size of Top 10% Group'``: number of authors in that top tenth
          (never fewer than one).
    """
    if len(df) == 0:
        return None

    # A single aggregation feeds every indicator; the descending order is
    # what makes the top-10% slice and the h-index count below valid.
    citations_per_author = (
        df.groupby('Primary_Author')['Citation_Count']
        .sum()
        .sort_values(ascending=False)
        .values
    )
    total_authors = len(citations_per_author)
    if total_authors == 0:
        return None

    gini = compute_gini(citations_per_author)
    total_citations = citations_per_author.sum()

    n_top_10 = max(1, int(0.1 * total_authors))
    if total_citations > 0:
        pct_top_10 = citations_per_author[:n_top_10].sum() / total_citations * 100
    else:
        pct_top_10 = 0

    # With totals sorted descending, "total >= rank" holds for exactly the
    # first h positions, so counting the True entries gives h.
    ranks = np.arange(1, total_authors + 1)
    h_index = int(np.sum(citations_per_author >= ranks))

    # Built-in int/float keep numpy scalar types out of the exported table
    return {
        'Gini Coefficient': round(float(gini), 3),
        '% Citations by Top 10%': f"{pct_top_10:.1f}%",
        'Estimated h-index': h_index,
        'Total Unique Authors': total_authors,
        'Size of Top 10% Group': n_top_10
    }

# Inequality indicators for the earlier and the later period
metrics_1, metrics_2 = compute_metrics(period_1), compute_metrics(period_2)

# --- 6. Comparative Metrics Table ---
metric_names = [
    'Gini Coefficient',
    '% Citations by Top 10%',
    'Estimated h-index',
    'Total Unique Authors',
    'Size of Top 10% Group'
]

# A period without metrics (None), or a metric missing from the dict, shows as '-'
column_2015_2019 = [metrics_1.get(name, '-') if metrics_1 else '-' for name in metric_names]
column_2020_2024 = [metrics_2.get(name, '-') if metrics_2 else '-' for name in metric_names]

comparison_table = pd.DataFrame({
    'Metric': metric_names,
    '2015–2019': column_2015_2019,
    '2020–2024': column_2020_2024
})

banner_title = "📊 COMPARATIVE ANALYSIS OF ACADEMIC INEQUALITY: 2015–2019 VS. 2020–2024"
print("\n" + banner_title + "\n" + "=" * 70)
display(comparison_table)

# Write the summary plus one top-20 author sheet per non-empty period
excel_path = os.path.join(output_tables, "matthew_effect_period_comparison.xlsx")
with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    comparison_table.to_excel(writer, sheet_name='Summary', index=False)

    if len(period_1) > 0:
        author_totals = period_1.groupby('Primary_Author')['Citation_Count'].sum()
        top_2015_2019 = author_totals.sort_values(ascending=False).head(20).reset_index()
        top_2015_2019.to_excel(writer, sheet_name='Top_Authors_2015-2019', index=False)
    if len(period_2) > 0:
        author_totals = period_2.groupby('Primary_Author')['Citation_Count'].sum()
        top_2020_2024 = author_totals.sort_values(ascending=False).head(20).reset_index()
        top_2020_2024.to_excel(writer, sheet_name='Top_Authors_2020-2024', index=False)

print(f"\n✅ Results exported to:\n {excel_path}")

# --- 7. Visualization: Inequality Trends (4-Panel Figure) ---
# compute_metrics() returns None for a period it cannot summarise. Every panel
# compares both periods, so the figure is skipped with a message in that case
# instead of failing on a None lookup.
if metrics_1 is None or metrics_2 is None:
    print("⚠️ Trend plot skipped: metrics are unavailable for at least one period.")
else:
    plt.style.use('default')
    fig, axes = plt.subplots(2, 2, figsize=(14, 10), dpi=150)
    fig.suptitle('Academic Inequality Trends (2015–2024)', fontsize=16, fontweight='bold', y=0.98)

    period_labels = ['2015–2019', '2020–2024']
    color1, color2 = '#4E79A7', '#F28E2B'

    gini_vals = [metrics_1['Gini Coefficient'], metrics_2['Gini Coefficient']]
    # The share is stored as text such as "42.0%"; recover the number for plotting
    pct_1 = float(metrics_1['% Citations by Top 10%'].strip('%'))
    pct_2 = float(metrics_2['% Citations by Top 10%'].strip('%'))
    pct_vals = [pct_1, pct_2]
    h_index_vals = [metrics_1['Estimated h-index'], metrics_2['Estimated h-index']]
    author_vals = [metrics_1['Total Unique Authors'], metrics_2['Total Unique Authors']]

    # One entry per panel (A–D):
    # (axis, title, y-label, values, gap between bar top and label, label formatter)
    panels = [
        (axes[0, 0], 'Gini Coefficient', 'Gini Coefficient',
         gini_vals, 0.02, lambda v: f'{v}'),
        (axes[0, 1], '% of Citations by Top 10% of Authors', 'Percentage (%)',
         pct_vals, 1, lambda v: f'{v:.1f}%'),
        (axes[1, 0], 'Estimated Global h-index', 'h-index',
         h_index_vals, 2, str),
        (axes[1, 1], 'Total Number of Unique Authors', 'Count',
         author_vals, 10, str),
    ]

    for ax, title, ylabel, values, label_gap, format_label in panels:
        ax.bar(period_labels, values, color=[color1, color2], edgecolor='black', linewidth=0.8)
        ax.set_title(title, fontweight='bold')
        ax.set_ylabel(ylabel)
        for position, value in enumerate(values):
            ax.text(position, value + label_gap, format_label(value),
                    ha='center', va='bottom', fontsize=10)

    # General formatting
    for ax in axes.flat:
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.set_xticks([0, 1])
        ax.set_xticklabels(period_labels)
        ax.tick_params(axis='x', labelsize=10)
        ax.tick_params(axis='y', labelsize=9)

    # Leave the top 5% of the canvas free for the figure title
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    trend_plot_path = os.path.join(output_figures, "academic_inequality_trends_by_period.png")
    plt.savefig(trend_plot_path, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"✅ Trend plot saved to:\n {trend_plot_path}")

# --- 8. Interpretive Summary ---
# The wording of every line follows the sign of the measured change, so the
# printed narrative cannot contradict the numbers next to it.
print("\n" + "📝 BRIEF INTERPRETIVE SUMMARY" + "\n" + "-"*50)

if metrics_1 is None or metrics_2 is None:
    # compute_metrics() returns None for a period it cannot summarise
    print("• Summary unavailable: metrics could not be computed for at least one period.")
else:
    def _describe_change(delta, rising, falling):
        """Return the verb that matches the sign of ``delta``."""
        if delta > 0:
            return rising
        if delta < 0:
            return falling
        return "was unchanged"

    gini_1 = metrics_1['Gini Coefficient']
    gini_2 = metrics_2['Gini Coefficient']
    share_1 = metrics_1['% Citations by Top 10%']
    share_2 = metrics_2['% Citations by Top 10%']
    h_1 = metrics_1['Estimated h-index']
    h_2 = metrics_2['Estimated h-index']

    delta_gini = gini_2 - gini_1
    # Shares are stored as text such as "42.0%"; compare them as numbers
    delta_pct = float(share_2.strip('%')) - float(share_1.strip('%'))

    gini_verb = _describe_change(delta_gini, "increased", "decreased")
    share_verb = _describe_change(delta_pct, "rose", "fell")
    direction = _describe_change(h_2 - h_1, "increased", "decreased")

    # The "+" format flag prints the sign for both positive and negative changes
    print(f"• Gini coefficient {gini_verb} from {gini_1} to {gini_2} ({delta_gini:+.3f})")
    print(f"• Citation share of top 10% authors {share_verb} from {share_1} to {share_2} ({delta_pct:+.1f} percentage points)")
    print(f"• Estimated global h-index {direction} from {h_1} to {h_2}")

    # Draw a conclusion only when both concentration indicators agree
    if delta_gini > 0 and delta_pct > 0:
        print("\n➡️ These results indicate a worsening of inequality in the distribution of academic influence over time — consistent with the theoretical predictions of the Matthew Effect.")
    elif delta_gini < 0 and delta_pct < 0:
        print("\n➡️ These results indicate a reduction of inequality in the distribution of academic influence over time — not consistent with the theoretical predictions of the Matthew Effect.")
    else:
        print("\n➡️ The two concentration indicators do not move in the same direction; these results give no clear evidence of a change in the inequality of academic influence over time.")