# In [None]:
# -*- coding: utf-8 -*-
"""
ANALYSIS OF THE MATTHEW EFFECT AND CITATION INEQUALITY
Utilizing Lorenz Curves, Gini Coefficients, and Temporal Trends (2015–2024)
"""

# --- Import Required Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from IPython.display import display

# --- Define Relative Paths (relative to notebooks/) ---
# The paths below are relative, so they resolve against the current working
# directory when the script runs.
input_file = "../data/processed/processed_publications_for_mertonian_analysis.csv"
output_figures = "../output/figures"
output_tables = "../output/tables"

# Create output directories if they don't exist
os.makedirs(output_figures, exist_ok=True)
os.makedirs(output_tables, exist_ok=True)

# --- 1. Load Dataset ---
# Only the read itself sits inside the try block, so the handler can only be
# triggered by the file lookup and not by any follow-up statement.
try:
    data = pd.read_csv(input_file)
except FileNotFoundError as err:
    # Re-raise with the configured path in the message while keeping the
    # original error attached as the explicit cause.
    raise FileNotFoundError(f"Dataset not found: {input_file}") from err
else:
    print("✅ Dataset successfully loaded.")

# Quick overview of what was loaded: row/column counts and the raw headers.
print(f"📊 Dataset dimensions: {data.shape[0]} records × {data.shape[1]} variables")
print("\n📋 Column headers detected:")
print(data.columns.tolist())
print()

# --- 2. Identify Key Variables Dynamically ---
def identify_column(data, candidate_names):
    """Return the first candidate name that is a column of ``data``.

    Candidates are tried in the order given, so earlier entries take
    precedence. Matching is an exact membership test against
    ``data.columns``.

    Args:
        data: Object exposing a ``columns`` collection (a DataFrame in
            this script).
        candidate_names: Iterable of column names to look for, in
            priority order.

    Returns:
        The first matching name, or ``None`` when no candidate is present.
    """
    matches = (label for label in candidate_names if label in data.columns)
    return next(matches, None)

# Candidate headers are listed in priority order; the first one found in the
# dataset is used for each role.
author_col = identify_column(
    data,
    ['Penulis_Utama', 'Penulis_pertama', 'Penulis pertama',
     'Nama_Penulis', 'First_Author', 'Author', 'Authors'],
)
citation_col = identify_column(
    data,
    ['Jumlah_Sitasi', 'Cited by', 'Citations', 'cited_by',
     'Jumlah Sitasi', 'Sitasi'],
)
year_col = identify_column(data, ['Tahun', 'Year', 'PY'])

# Validate presence of essential variables (author, then citation, then year)
for detected_name, failure_message in (
    (author_col, "Primary author column not detected in dataset."),
    (citation_col, "Citation count column not detected in dataset."),
    (year_col, "Publication year column not detected in dataset."),
):
    if not detected_name:
        raise KeyError(failure_message)

print(f"🔍 Primary author column identified: '{author_col}'")
print(f"🔍 Citation count column identified: '{citation_col}'")
print(f"🔍 Publication year column identified: '{year_col}'")

# Standardize column names so the rest of the script can use fixed labels
standard_names = {
    author_col: 'Primary_Author',
    citation_col: 'Citation_Count',
    year_col: 'Publication_Year',
}
data_clean = data.rename(columns=standard_names)

# --- 3. Data Preprocessing and Cleaning ---
# Values that cannot be parsed as numbers become NaN and are dropped below.
for numeric_field in ('Publication_Year', 'Citation_Count'):
    data_clean[numeric_field] = pd.to_numeric(data_clean[numeric_field], errors='coerce')

# Drop missing essential values
data_clean = data_clean.dropna(subset=['Primary_Author', 'Citation_Count', 'Publication_Year'])

# Filter to 2015–2024 and non-negative citations
within_study_window = data_clean['Publication_Year'].between(2015, 2024)
non_negative_citations = data_clean['Citation_Count'] >= 0
data_clean = data_clean.loc[within_study_window & non_negative_citations]

print(f"🧹 Cleaned dataset (2015–2024): {len(data_clean)} publications retained after preprocessing")

# --- 4. Temporal Segmentation ---
# Each period is copied so it is independent of data_clean.
publication_years = data_clean['Publication_Year']
period_1 = data_clean.loc[publication_years.between(2015, 2019)].copy()
period_2 = data_clean.loc[publication_years.between(2020, 2024)].copy()

print(f"📅 Period 1 (2015–2019): {len(period_1)} publications")
print(f"📅 Period 2 (2020–2024): {len(period_2)} publications")

# --- 5. Functions for Inequality Metrics ---
def compute_gini_coefficient(x):
    """Return the Gini coefficient of a distribution of non-negative totals.

    Uses the rank-based closed form on the ascending-sorted values::

        G = sum_i (2*i - n - 1) * x_i / (n * sum_i x_i),   i = 1..n

    Zero entries are kept in the population: a member with a total of zero
    is still part of the distribution, and discarding zeros understates
    inequality (a single member holding everything among four would score
    0 instead of 0.75). Negative and NaN entries are not valid totals and
    are ignored.

    Args:
        x: Array-like of totals (one value per author in this script).

    Returns:
        The coefficient as a float; ``0.0`` when no valid values remain or
        when they sum to zero.
    """
    values = np.asarray(x, dtype=float).ravel()
    # ``>=`` keeps zeros; the comparison is False for NaN, so NaN is dropped too.
    values = np.sort(values[values >= 0])
    n = values.size
    total = values.sum()
    if n == 0 or total == 0:
        # Nothing to distribute: report perfect equality instead of 0/0.
        return 0.0
    ranks = np.arange(1, n + 1)
    return np.sum((2 * ranks - n - 1) * values) / (n * total)

def compute_inequality_metrics(df):
    """Summarise how citations are distributed across primary authors.

    Citations are summed per ``Primary_Author`` and ranked from most to
    least cited before the indicators are derived.

    Args:
        df: DataFrame with ``Primary_Author`` and ``Citation_Count`` columns.

    Returns:
        ``None`` when ``df`` has no rows or yields no author groups.
        Otherwise a dict with:

        - ``Gini_Coefficient``: Gini of the per-author totals, rounded to
          three decimals.
        - ``Top_10_Percent_Citation_Share``: share of all citations held by
          the top group, as a string with one decimal and a ``%`` suffix.
        - ``Estimated_Global_h_index``: number of rank positions ``r``
          (1-based, descending order) whose author total is at least ``r``.
        - ``Total_Unique_Authors``: number of author groups.
        - ``Size_of_Top_10_Percent_Group``: ``max(1, int(0.1 * authors))``.
    """
    if not len(df):
        return None

    per_author_totals = df.groupby('Primary_Author')['Citation_Count'].sum()
    ranked = per_author_totals.sort_values(ascending=False).values
    author_count = len(ranked)
    if author_count == 0:
        return None

    citation_total = ranked.sum()

    # At least one author is always counted as the "top 10%" group.
    top_group_size = max(1, int(0.1 * author_count))
    if citation_total > 0:
        top_group_share = (ranked[:top_group_size].sum() / citation_total) * 100
    else:
        top_group_share = 0

    # ``ranked`` is in descending order, so counting the positions where the
    # total is at least the 1-based rank gives the h-type index.
    rank_positions = np.arange(1, author_count + 1)
    h_type_index = (ranked >= rank_positions).sum()

    return {
        'Gini_Coefficient': round(compute_gini_coefficient(ranked), 3),
        'Top_10_Percent_Citation_Share': f"{top_group_share:.1f}%",
        'Estimated_Global_h_index': h_type_index,
        'Total_Unique_Authors': author_count,
        'Size_of_Top_10_Percent_Group': top_group_size
    }

# Compute metrics (each result is None when the period yields no metrics)
metrics_period_1, metrics_period_2 = (
    compute_inequality_metrics(period_frame) for period_frame in (period_1, period_2)
)

# --- 6. Comparative Metrics Table ---
metric_labels = [
    'Gini_Coefficient',
    'Top_10_Percent_Citation_Share',
    'Estimated_Global_h_index',
    'Total_Unique_Authors',
    'Size_of_Top_10_Percent_Group'
]


def _metric_column(period_metrics):
    """Return one table column: the value per metric label, or '-' when unavailable."""
    available = period_metrics or {}
    return [available.get(label, '-') for label in metric_labels]


comparative_table = pd.DataFrame({
    'Metric': metric_labels,
    'Period_1_(2015–2019)': _metric_column(metrics_period_1),
    'Period_2_(2020–2024)': _metric_column(metrics_period_2)
})

print("\n" + "📊 COMPARATIVE ANALYSIS OF ACADEMIC INEQUALITY: 2015–2019 vs. 2020–2024" + "\n" + "="*70)
display(comparative_table)


def _top_cited_authors(period_frame):
    """Return the 20 most-cited primary authors of a period as a two-column frame."""
    totals = period_frame.groupby('Primary_Author')['Citation_Count'].sum()
    return totals.sort_values(ascending=False).head(20).reset_index()


# Export to Excel in tables/ folder: a summary sheet plus one ranking sheet
# for every period that contains publications.
excel_output_path = os.path.join(output_tables, "matthew_effect_analysis_results.xlsx")
with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
    comparative_table.to_excel(writer, sheet_name='Summary', index=False)

    if len(period_1):
        top_authors_p1 = _top_cited_authors(period_1)
        top_authors_p1.to_excel(writer, sheet_name='Top_Authors_2015-2019', index=False)
    if len(period_2):
        top_authors_p2 = _top_cited_authors(period_2)
        top_authors_p2.to_excel(writer, sheet_name='Top_Authors_2020-2024', index=False)

print(f"\n✅ Analysis results exported to:\n {excel_output_path}")

# --- 7. Visualization: Temporal Trends (4-Panel Figure) ---
# Draws one bar chart per indicator (Gini, top-10% share, h-index, author
# count) comparing the two periods, then saves the figure as a PNG.
plt.style.use('default')

# Defined outside the guard below because the Lorenz-curve section further
# down reuses these colours.
color_period_1, color_period_2 = '#4E79A7', '#F28E2B'
trend_plot_path = os.path.join(output_figures, "academic_inequality_trends.png")

if metrics_period_1 is None or metrics_period_2 is None:
    # compute_inequality_metrics returns None for a period without data;
    # indexing it would raise TypeError, so the figure is skipped instead.
    print("⚠️ Trend visualization skipped: at least one period has no data to plot.")
else:
    period_labels = ['2015–2019', '2020–2024']
    period_colors = [color_period_1, color_period_2]

    def _metric_pair(key):
        """Return [period 1 value, period 2 value] for one metric key."""
        return [metrics_period_1[key], metrics_period_2[key]]

    # The share is stored as a string such as "42.0%"; convert it back to a number.
    share_values = [float(text.strip('%')) for text in _metric_pair('Top_10_Percent_Citation_Share')]

    # One entry per panel:
    # (grid position, values, title, y-axis label, label offset above bar, label formatter)
    panel_specs = [
        ((0, 0), _metric_pair('Gini_Coefficient'),
         'Gini Coefficient', 'Gini Value', 0.02, lambda v: f'{v}'),
        ((0, 1), share_values,
         'Citation Share of Top 10% Authors', 'Percentage (%)', 1, lambda v: f'{v:.1f}%'),
        ((1, 0), _metric_pair('Estimated_Global_h_index'),
         'Estimated Global h-index', 'h-index Value', 2, str),
        ((1, 1), _metric_pair('Total_Unique_Authors'),
         'Total Unique Authors', 'Count', 10, str),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(14, 10), dpi=150)
    fig.suptitle('Temporal Trends in Academic Inequality (2015–2024)', fontsize=16, fontweight='bold', y=0.98)

    for position, values, title, y_label, label_offset, format_value in panel_specs:
        ax = axes[position]
        bars = ax.bar(period_labels, values, color=period_colors, edgecolor='black', linewidth=0.8)
        ax.set_title(title, fontweight='bold')
        ax.set_ylabel(y_label)
        # Print each value just above its bar.
        for bar, val in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + label_offset,
                    format_value(val), ha='center', va='bottom', fontsize=10)

    # Formatting
    for ax in axes.flat:
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.tick_params(axis='x', labelsize=10)
        ax.tick_params(axis='y', labelsize=9)

    # Leave room at the top for the figure-level title.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(trend_plot_path, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"✅ Trend visualization saved to:\n {trend_plot_path}")

# --- 8. Lorenz Curve Visualization ---
def prepare_lorenz_data(citation_array):
    """Build Lorenz-curve coordinates for a distribution of totals.

    Values are sorted ascending; point ``k`` (1-based) pairs the share of
    members ``k / n`` with the share of the overall total held by the ``k``
    smallest members. The origin ``(0, 0)`` is not part of the output.

    When the values sum to zero there is no concentration to measure, so
    the line of perfect equality is returned rather than dividing by zero,
    which would fill the curve with NaN.

    Args:
        citation_array: 1-D array-like of totals (one per author here).

    Returns:
        Tuple ``(cumulative_author_share, cumulative_citation_share)`` of
        equal-length 1-D float arrays; both are empty for empty input.
    """
    sorted_citations = np.sort(np.asarray(citation_array, dtype=float))
    n = sorted_citations.size
    if n == 0:
        return np.empty(0, dtype=float), np.empty(0, dtype=float)

    cumulative_author_share = np.arange(1, n + 1) / n
    total = sorted_citations.sum()
    if total == 0:
        # Everyone holds the same (zero) amount: the curve is the diagonal.
        return cumulative_author_share, cumulative_author_share.copy()

    cumulative_citation_share = np.cumsum(sorted_citations) / total
    return cumulative_author_share, cumulative_citation_share

# Extract citation distributions (total citations per primary author, per period)
citations_period_1 = period_1.groupby('Primary_Author')['Citation_Count'].sum().values
citations_period_2 = period_2.groupby('Primary_Author')['Citation_Count'].sum().values

# Plot Lorenz curves on a dedicated figure
lorenz_fig, lorenz_ax = plt.subplots(figsize=(9, 7), dpi=150)

# Curve coordinates and the Gini value quoted in each legend entry
x1, y1 = prepare_lorenz_data(citations_period_1)
x2, y2 = prepare_lorenz_data(citations_period_2)
gini_p1 = compute_gini_coefficient(citations_period_1)
gini_p2 = compute_gini_coefficient(citations_period_2)

# (x, y, legend text, line colour, marker) for each period
curve_specs = (
    (x1, y1, f'2015–2019 (Gini = {gini_p1:.3f})', color_period_1, 'o'),
    (x2, y2, f'2020–2024 (Gini = {gini_p2:.3f})', color_period_2, 's'),
)
for curve_x, curve_y, legend_text, line_color, marker_shape in curve_specs:
    # markevery thins the markers to roughly 15 per curve
    lorenz_ax.plot(curve_x, curve_y, label=legend_text, color=line_color, linewidth=2.5,
                   marker=marker_shape, markevery=max(1, len(curve_x)//15))

# Diagonal reference: every author holds an equal share
lorenz_ax.plot([0, 1], [0, 1], '--', color='gray', linewidth=1.5, label='Line of Perfect Equality')

lorenz_ax.set_xlabel('Cumulative Proportion of Authors', fontsize=12)
lorenz_ax.set_ylabel('Cumulative Proportion of Citations', fontsize=12)
lorenz_ax.set_title('Lorenz Curves: Citation Inequality Distribution\n(2015–2019 vs. 2020–2024)', fontsize=14, fontweight='bold')
lorenz_ax.legend(fontsize=11, loc='lower right')
lorenz_ax.grid(True, linestyle='--', alpha=0.4)
lorenz_ax.set_xlim(0, 1)
lorenz_ax.set_ylim(0, 1)
lorenz_fig.tight_layout()

lorenz_path = os.path.join(output_figures, "lorenz_curves.png")
lorenz_fig.savefig(lorenz_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"✅ Lorenz curves saved to:\n {lorenz_path}")

# --- 9. Summary Interpretation ---
# The wording follows the computed values: verbs and signs are chosen from
# the direction of each change instead of assuming inequality always rises.
def _describe_change(before, after, delta, rise_verb, fall_verb):
    """Return a phrase like 'increased from A to B' whose verb matches the sign of ``delta``."""
    if delta > 0:
        return f"{rise_verb} from {before} to {after}"
    if delta < 0:
        return f"{fall_verb} from {before} to {after}"
    return f"remained unchanged at {after}"


print("\n" + "📝 BRIEF INTERPRETIVE SUMMARY" + "\n" + "-"*50)

if metrics_period_1 is None or metrics_period_2 is None:
    # compute_inequality_metrics returns None for a period without data, so
    # there is nothing to compare.
    print("• Summary unavailable: at least one period has no data to compare.")
else:
    gini_1 = metrics_period_1['Gini_Coefficient']
    gini_2 = metrics_period_2['Gini_Coefficient']
    share_1 = metrics_period_1['Top_10_Percent_Citation_Share']
    share_2 = metrics_period_2['Top_10_Percent_Citation_Share']
    h_1 = metrics_period_1['Estimated_Global_h_index']
    h_2 = metrics_period_2['Estimated_Global_h_index']
    authors_1 = metrics_period_1['Total_Unique_Authors']
    authors_2 = metrics_period_2['Total_Unique_Authors']

    # Round to the reported precision so floating-point noise is not read as a
    # change; adding 0.0 turns a possible -0.0 into 0.0 for display.
    delta_gini = round(gini_2 - gini_1, 3) + 0.0
    delta_pct_top_10 = round(float(share_2.strip('%')) - float(share_1.strip('%')), 1) + 0.0

    gini_phrase = _describe_change(gini_1, gini_2, delta_gini, "increased", "decreased")
    share_phrase = _describe_change(share_1, share_2, delta_pct_top_10, "rose", "fell")
    h_phrase = _describe_change(h_1, h_2, h_2 - h_1, "rose", "declined")

    print(f"• Gini coefficient {gini_phrase} (Δ = {delta_gini:+.3f})")
    print(f"• Citation share of top 10% authors {share_phrase} (Δ = {delta_pct_top_10:+.1f} percentage points)")
    print(f"• Estimated global h-index {h_phrase}")

    if delta_gini > 0 and delta_pct_top_10 > 0:
        # Both concentration indicators rose. The remark about author growth
        # is only included when the author count actually grew.
        growth_clause = (
            ", despite substantial growth in the number of publishing authors"
            if authors_2 > authors_1 else ""
        )
        print(f"\n➡️ These findings indicate a marked intensification of inequality in academic influence distribution over time{growth_clause} — a pattern consistent with the theoretical predictions of the Matthew Effect.")
    elif delta_gini < 0 and delta_pct_top_10 < 0:
        print("\n➡️ These findings indicate a reduction of inequality in academic influence distribution over time — a pattern that does not point to an intensifying Matthew Effect in this window.")
    else:
        print("\n➡️ The inequality indicators do not move in the same direction, so the evidence for an intensifying Matthew Effect is inconclusive.")