# In [None]:
# -*- coding: utf-8 -*-
"""
STRUCTURAL BIAS ANALYSIS: GLOBAL NORTH VS. GLOBAL SOUTH
Assessing Geopolitical Inequality in Academic Knowledge Production
"""

# --- Import Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from IPython.display import display

# --- Define Relative Paths (relative to notebooks/) ---
# All locations are resolved against the notebook's working directory.
input_file = "../data/processed/processed_publications_for_mertonian_analysis.csv"
output_figures = "../output/figures"
output_tables = "../output/tables"

# Make sure both export folders exist before anything is written to them
for output_dir in (output_figures, output_tables):
    os.makedirs(output_dir, exist_ok=True)

# --- 1. Load Data ---
# A missing input file is re-raised with a message that names the expected path.
try:
    data = pd.read_csv(input_file)
except FileNotFoundError:
    raise FileNotFoundError(f"File not found: {input_file}")
else:
    print("✅ Dataset successfully loaded.")

n_observations, n_variables = data.shape
print(f"📊 Dataset dimensions: {n_observations} observations × {n_variables} variables")

# --- 2. Flexible Column Detection ---
def find_column(data, possible_names):
    """Return the first name in *possible_names* that is a column of *data*.

    Candidates are tried in the order given, so earlier entries take
    priority. Returns ``None`` when no candidate is present.
    """
    available = data.columns
    return next(
        (candidate for candidate in possible_names if candidate in available),
        None,
    )

# Candidate header names for each logical variable, in priority order:
# find_column returns the first candidate that exists in the dataset.
col_country = find_column(data, [
    'Negara_norm', 'Country', 'Country_normalized', 'Nation', 'Affiliation_Country'
])
col_citation = find_column(data, [
    'Jumlah_Sitasi', 'Cited by', 'Citations', 'cited_by', 'Citation_Count'
])
col_affiliation = find_column(data, [
    'Afiliasi', 'Affiliation', 'Institution', 'Host_Institution'
])

# Validate required columns (find_column returns None when nothing matched)
if col_country is None:
    raise KeyError("Required column not found: Country (candidates: 'Negara_norm', 'Country', etc.)")
if col_citation is None:
    raise KeyError("Required column not found: Citations (candidates: 'Jumlah_Sitasi', 'Citations', etc.)")
if col_affiliation is None:
    raise KeyError("Required column not found: Affiliation (candidates: 'Afiliasi', 'Affiliation', etc.)")

print(f"🔍 Detected country variable: '{col_country}'")
print(f"🔍 Detected citation variable: '{col_citation}'")
print(f"🔍 Detected affiliation variable: '{col_affiliation}'")

# Rename for consistency
rename_map = {
    col_country: 'Country',
    col_citation: 'Citations',
    col_affiliation: 'Affiliation'
}

# The file may contain both the selected alias (e.g. 'Negara_norm') and a
# lower-priority column that already carries the canonical name ('Country').
# Renaming would then produce two columns with the same label, and
# data_clean['Country'] would return a DataFrame instead of a Series.
# Drop the unselected column so each canonical name stays unique.
shadowed_columns = [
    target for source, target in rename_map.items()
    if source != target and target in data.columns
]
if shadowed_columns:
    print(f"⚠️ Dropping unselected columns that clash with canonical names: {shadowed_columns}")

data_clean = data.drop(columns=shadowed_columns).rename(columns=rename_map)

# --- 3. Data Cleaning ---
# Non-numeric citation values become NaN and are removed together with rows
# that lack a country or an affiliation.
data_clean['Citations'] = pd.to_numeric(data_clean['Citations'], errors='coerce')
data_clean = data_clean.dropna(subset=['Country', 'Citations', 'Affiliation'])

# Negative citation counts are treated as invalid. The explicit .copy() makes
# the filtered frame independent, so columns can be added to it later without
# pandas emitting SettingWithCopyWarning.
data_clean = data_clean[data_clean['Citations'] >= 0].copy()

print(f"🧹 Cleaned dataset: {len(data_clean)} publications retained after filtering")

# --- 4. Define Global North and South ---
# Country labels (including common aliases such as 'USA' / 'UK') treated as
# Global North; every other value is classified as Global South.
global_north_countries = {
    'United States', 'USA', 'US', 'Canada', 'UK', 'United Kingdom', 'Germany',
    'France', 'Italy', 'Spain', 'Netherlands', 'Belgium', 'Sweden', 'Denmark',
    'Norway', 'Finland', 'Austria', 'Switzerland', 'Ireland', 'Portugal',
    'Australia', 'New Zealand', 'Japan', 'South Korea', 'Israel', 'Singapore',
    'Luxembourg', 'Iceland', 'Greece', 'Czech Republic', 'Poland', 'Hungary',
    'Slovakia', 'Slovenia', 'Estonia', 'Latvia', 'Lithuania', 'Malta', 'Cyprus'
}

# Case-folded lookup built once from the set above, so matching does not
# depend on capitalisation. It is a snapshot: rebuild it if the set is edited.
_global_north_casefolded = frozenset(
    name.casefold() for name in global_north_countries
)


def classify_region(country):
    """Classify a country label as 'Global North' or 'Global South'.

    The value is converted to ``str``, surrounding whitespace is removed,
    internal whitespace runs are collapsed to one space, and the comparison
    ignores case, so ``' united  states '`` matches ``'United States'``.

    Returns 'Global North' when the normalised label is in
    ``global_north_countries``; any other value (including unknown or
    placeholder labels) returns 'Global South'.
    """
    normalized = ' '.join(str(country).split()).casefold()
    return 'Global North' if normalized in _global_north_casefolded else 'Global South'

data_clean['Region'] = data_clean['Country'].apply(classify_region)

# --- 5. Proportion of Publications by Region ---
total_papers = len(data_clean)
north_papers = (data_clean['Region'] == 'Global North').sum()
south_papers = total_papers - north_papers

# Shares are undefined for an empty dataset; use NaN explicitly instead of
# relying on a division by zero.
if total_papers > 0:
    prop_north = (north_papers / total_papers) * 100
    prop_south = (south_papers / total_papers) * 100
else:
    prop_north = prop_south = float('nan')

print(f"\n🌍 PROPORTION OF PUBLICATIONS BY REGION")
print(f"  • Global North: {north_papers} papers ({prop_north:.1f}%)")
print(f"  • Global South: {south_papers} papers ({prop_south:.1f}%)")

# --- 6. Median Citations by Region ---
# A region without publications yields a NaN median.
median_north = data_clean[data_clean['Region'] == 'Global North']['Citations'].median()
median_south = data_clean[data_clean['Region'] == 'Global South']['Citations'].median()

# Label the gap according to its actual sign rather than assuming that the
# North always leads; NaN fails every comparison and falls through to the
# last branch.
median_gap = median_north - median_south
if median_gap > 0:
    gap_label = "North advantage"
elif median_gap < 0:
    gap_label = "South advantage"
elif median_gap == 0:
    gap_label = "no difference"
else:
    gap_label = "not comparable"

print(f"\n📊 MEDIAN CITATIONS BY REGION")
print(f"  • Global North: {median_north:.1f}")
print(f"  • Global South: {median_south:.1f}")
print(f"  • Difference: {median_gap:+.1f} citations ({gap_label})")

# --- 7. Top 20 Institutions by Publication Volume ---
# value_counts() sorts by frequency (descending), so the first 20 entries are
# the institutions with the most publications.
affiliation_counts = data_clean['Affiliation'].value_counts()
top_institutions = affiliation_counts.iloc[:20]

print("\n🏛️ TOP 20 INSTITUTIONS BY PUBLICATION OUTPUT")
for rank, (institution, n_publications) in enumerate(top_institutions.items(), start=1):
    print(f"  {rank:2}. {institution} ({n_publications} publications)")

# --- 8. Summary Table ---
# One row per headline metric. Proportions are stored as pre-formatted
# strings, so the 'Value' column intentionally holds mixed types.
summary_table = pd.DataFrame({
    'Metric': [
        'Total Publications',
        'Publications from Global North',
        'Publications from Global South',
        'Proportion from Global North (%)',
        'Proportion from Global South (%)',
        'Median Citations (Global North)',
        'Median Citations (Global South)',
        'Difference in Median Citations',
        'Most Prolific Institution'
    ],
    'Value': [
        int(total_papers),
        int(north_papers),
        int(south_papers),
        f"{prop_north:.1f}",
        f"{prop_south:.1f}",
        round(median_north, 1),
        round(median_south, 1),
        round(median_north - median_south, 1),
        top_institutions.index[0] if len(top_institutions) > 0 else 'N/A'
    ]
})

print("\n" + "📋 SUMMARY TABLE: STRUCTURAL BIAS IN KNOWLEDGE PRODUCTION" + "\n" + "="*70)
display(summary_table)

# Name the two columns explicitly. The names value_counts() gives to its
# index and values differ between pandas versions, so renaming the default
# 'index' / 'Affiliation' labels after reset_index() can put 'Publication_Count'
# on the institution-name column.
top_institutions_table = (
    top_institutions
    .rename_axis('Affiliation')
    .reset_index(name='Publication_Count')
)

# Export table to Excel in tables/ folder
table_path = os.path.join(output_tables, "structural_bias_summary.xlsx")
with pd.ExcelWriter(table_path, engine='openpyxl') as writer:
    summary_table.to_excel(writer, sheet_name='Bias_Overview', index=False)
    top_institutions_table.to_excel(writer, sheet_name='Top_Institutions', index=False)
print(f"\n✅ Summary and institutional data exported to:\n {table_path}")

# --- 9. Visualization (4-panel Figure) ---
# Layout: A = publication share (pie), B = median citations (bars),
#         C = top 10 institutions (horizontal bars), D = citation spread (boxplot).
plt.style.use('default')
fig, axes = plt.subplots(2, 2, figsize=(14, 10), dpi=150)
fig.suptitle('Structural Bias in Academic Knowledge Production\nGlobal North vs. Global South',
             fontsize=16, fontweight='bold', y=0.98)

color_north = '#4E79A7'   # Blue
color_south = '#F28E2B'   # Orange

# A. Pie Chart: Publication Share by Region
axes[0, 0].pie([prop_north, prop_south],
               labels=['Global North', 'Global South'],
               autopct='%1.1f%%',
               colors=[color_north, color_south],
               startangle=90,
               wedgeprops={'linewidth': 1, 'edgecolor': 'black'})
axes[0, 0].set_title('Proportion of Publications', fontsize=12, fontweight='bold')

# B. Bar Chart: Median Citations per Region
med_vals = [median_north, median_south]
bars = axes[0, 1].bar(['Global North', 'Global South'], med_vals,
                      color=[color_north, color_south], edgecolor='black', linewidth=0.8)
axes[0, 1].set_title('Median Citations per Publication', fontsize=12, fontweight='bold')
axes[0, 1].set_ylabel('Median Citation Count')
# Print each median just above its bar
for i, val in enumerate(med_vals):
    axes[0, 1].text(i, val + 0.1, f"{val:.1f}", ha='center', va='bottom', fontsize=10)

# C. Horizontal Bar: Top 10 Institutions by Output
top_10_inst = top_institutions.head(10)[::-1]  # Reversed so the largest bar is drawn at the top
y_pos = np.arange(len(top_10_inst))
axes[1, 0].barh(y_pos, top_10_inst.values, color='steelblue', edgecolor='black', linewidth=0.8)
axes[1, 0].set_yticks(y_pos)
# Long institution names are truncated to 40 characters to keep the axis readable
axes[1, 0].set_yticklabels([aff[:40] + '...' if len(aff) > 40 else aff for aff in top_10_inst.index], fontsize=9)
axes[1, 0].set_xlabel('Number of Publications')
axes[1, 0].set_title('Top 10 Institutions by Publication Volume', fontsize=12, fontweight='bold')

# D. Boxplot: Citation Distribution by Region (log scale, filtered)
filtered_data = data_clean[data_clean['Citations'] <= 100]  # Exclude extreme outliers
sns.boxplot(data=filtered_data, x='Region', y='Citations', ax=axes[1, 1],
            palette={'Global North': color_north, 'Global South': color_south})
# Citation counts include zero, which a pure log axis cannot draw. A symmetric
# log axis is linear on [0, 1] and logarithmic above it, so uncited papers
# stay visible.
axes[1, 1].set_yscale('symlog', linthresh=1)
axes[1, 1].set_ylim(bottom=0)
axes[1, 1].set_title('Distribution of Citations (Log Scale)', fontsize=12, fontweight='bold')
axes[1, 1].set_ylabel('Citation Count (log scale)')
axes[1, 1].set_xlabel('Geopolitical Region')

# General formatting
for ax in axes.flat:
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.tick_params(axis='x', labelsize=10)
    ax.tick_params(axis='y', labelsize=9)

# Leave the top 5% of the canvas free for the figure-level title
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Save figure to figures/ folder
plot_path = os.path.join(output_figures, "geopolitical_bias_analysis.png")
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"✅ Visualization saved to:\n {plot_path}")

# --- 10. Interpretation ---
# Every statement printed below is tied to a quantity computed above, so the
# narrative cannot contradict the numbers (for example, announcing disparities
# while the branches report a balanced distribution).

# Medians are NaN when a region has no publications; no comparison is possible then.
medians_comparable = not (pd.isna(median_north) or pd.isna(median_south))
north_citation_advantage = bool(medians_comparable and median_north > median_south * 1.5)
north_dominates_output = bool(prop_north > 55)

# Assign each top institution to the region of the majority of its papers and
# count how many fall in the Global North. The Global North list serves here
# as the proxy for "high-income countries".
if len(top_institutions) > 0:
    top_inst_regions = (
        data_clean[data_clean['Affiliation'].isin(top_institutions.index)]
        .groupby('Affiliation')['Region']
        .agg(lambda regions: regions.value_counts().idxmax())
    )
    n_top_inst = len(top_inst_regions)
    n_top_inst_north = int((top_inst_regions == 'Global North').sum())
else:
    n_top_inst = n_top_inst_north = 0
north_dominates_institutions = n_top_inst > 0 and n_top_inst_north > n_top_inst / 2

disparity_found = (
    north_dominates_output or north_citation_advantage or north_dominates_institutions
)

print("\n" + "📝 INTERPRETATION" + "\n" + "-"*50)
if disparity_found:
    print("The analysis reveals significant structural disparities in academic knowledge production:")
else:
    print("The analysis does not reveal pronounced structural disparities in academic knowledge production:")

if pd.isna(prop_north):
    print("• No publications are available, so regional shares cannot be assessed.")
elif prop_north > 70:
    print("• The Global North dominates publication output, accounting for over 70% of all papers.")
elif prop_north > 55:
    print("• The Global North maintains a substantial majority in scholarly output.")
else:
    print("• Publication distribution is relatively balanced, indicating growing Southern participation.")

if not medians_comparable:
    print("• Median citations cannot be compared because at least one region has no publications.")
elif north_citation_advantage:
    print("• Papers from the Global North receive significantly higher median citations, suggesting systemic citation bias.")
else:
    print("• Citation impact is relatively comparable across regions, despite differences in volume.")

if north_dominates_institutions:
    print("• The most prolific institutions are predominantly based in high-income countries.")
elif n_top_inst > 0:
    print(f"• Only {n_top_inst_north} of the {n_top_inst} most prolific institutions are based in the Global North.")

if disparity_found:
    print("\n➡️ These findings support critiques of epistemic inequality and the coloniality of knowledge, where geopolitical position continues to shape both visibility and impact in global academia.")
else:
    print("\n➡️ These findings do not, on their own, indicate a strong geopolitical skew in visibility or impact within this dataset.")