# In [None]:
# -*- coding: utf-8 -*-
"""
STATISTICAL TESTING OF CITATION INEQUALITY
Global North vs. Global South — With Inferential Statistics and Effect Size
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from itertools import combinations
import os
import warnings
warnings.filterwarnings("ignore")

# --- Input / output locations (all relative to notebooks/) ---
input_file = "../data/processed/processed_publications_for_mertonian_analysis.csv"
output_figures = "../output/figures"
output_tables = "../output/tables"

# Make sure both output folders exist before anything is written to them.
for out_dir in (output_figures, output_tables):
    os.makedirs(out_dir, exist_ok=True)

# --- Load data ---
try:
    data = pd.read_csv(input_file)
except FileNotFoundError:
    # Re-raise with a short message that names the path that was tried.
    raise FileNotFoundError(f"File not found: {input_file}")
else:
    print("✅ Dataset successfully loaded.")

# Quick overview of what was loaded: size first, then the column names.
n_rows, n_cols = data.shape
print(f"📊 Dataset dimensions: {n_rows} observations × {n_cols} variables")
print("\n📋 Columns in dataset:")
print(data.columns.tolist())
print()

# --- Identify key columns dynamically ---
def find_column(data, candidate_names):
    """Return the first column of *data* matching a candidate name, ignoring case.

    Candidates are tried in the order given; for the first candidate that
    matches at least one column, the earliest matching column (in
    ``data.columns`` order) is returned with its original spelling.
    Returns ``None`` when no candidate matches.
    """
    for wanted in candidate_names:
        target = wanted.lower()
        matches = [col for col in data.columns if col.lower() == target]
        if matches:
            return matches[0]
    return None

# Header spellings to look for, in priority order (earlier entries win).
citation_candidates = [
    'Jumlah_Sitasi', 'Cited by', 'Citations', 'cited_by',
    'Jumlah Sitasi', 'Sitasi', 'citation_count',
]
region_candidates = [
    'Region', 'Country_Group', 'Global_North_South', 'Affiliation_Region',
    'Region_Group', 'North_South', 'Global_South',
]

# Resolve both key columns against the loaded dataset.
citation_col = find_column(data, citation_candidates)
region_col = find_column(data, region_candidates)

# The analysis cannot proceed without both columns, so stop right here.
if not citation_col:
    raise KeyError("Citation column not found. Please check column names.")
if not region_col:
    raise KeyError("Region/Group column (e.g., 'Region', 'Country_Group') not found.")

print(f"🔍 Citation column identified: '{citation_col}'")
print(f"🔍 Region/Group column identified: '{region_col}'")

# Give the two key columns fixed names so the rest of the script does not
# depend on how the source file happened to spell them.
standard_names = {citation_col: 'Citation_Count', region_col: 'Region_Group'}
data_clean = data.rename(columns=standard_names)

# --- Data cleaning ---
# Non-numeric citation values become NaN and are dropped together with rows
# that have no region label.
data_clean['Citation_Count'] = pd.to_numeric(data_clean['Citation_Count'], errors='coerce')
data_clean = data_clean.dropna(subset=['Citation_Count', 'Region_Group'])

# Keep only non-negative citations (zero-citation papers are retained).
# The explicit .copy() makes data_clean an independent frame: the column
# assignment below would otherwise write to a boolean-filtered slice, which is
# the pattern pandas reports as SettingWithCopyWarning (that warning is
# silenced by the global warnings filter, so it would go unnoticed).
data_clean = data_clean[data_clean['Citation_Count'] >= 0].copy()

# Ensure Region_Group values are standardized: cast to str, trim surrounding
# whitespace and title-case, so that e.g. " global north " -> "Global North".
data_clean['Region_Group'] = data_clean['Region_Group'].astype(str).str.strip().str.title()

# Identify unique groups
unique_groups = sorted(data_clean['Region_Group'].unique())
print(f"🌍 Unique region/group categories detected: {unique_groups}")

# --- Validate expected groups: Global North vs. Global South ---
# We assume the data uses labels like "Global North", "Global South", or similar.
# If your data uses different labels (e.g., "North", "South"), adjust the mapping below.

# Known spellings for each side of the comparison (customize if needed).
# Labels are compared exactly, so they must match the cleaned label text.
north_labels = {'Global North', 'North', 'Developed', 'High Income'}
south_labels = {'Global South', 'South', 'Developing', 'Low Income', 'Middle Income'}

# Map to standardized categories
def classify_region(label):
    """Translate a region label into 'Global North' or 'Global South'.

    A label found in ``north_labels`` / ``south_labels`` is replaced by the
    corresponding canonical name. Any other label is returned unchanged after
    printing a warning that names it.
    """
    canonical_groups = (
        ('Global North', north_labels),
        ('Global South', south_labels),
    )
    for canonical, aliases in canonical_groups:
        if label in aliases:
            return canonical

    # Not a known alias: keep the original text, but make it visible.
    print(f"⚠️  Unrecognized region label: '{label}'. Keeping as-is.")
    return label

# Classify each distinct label once and broadcast the result to every row.
# Calling classify_region row by row would repeat its "unrecognized label"
# warning for every single publication that carries such a label.
label_map = {
    label: classify_region(label)
    for label in data_clean['Region_Group'].unique()
}
data_clean['Region_Standardized'] = data_clean['Region_Group'].map(label_map)

# Everything below (group extraction, the one-sided test, the plot order and
# the result labels) is keyed on the literal names 'Global North' and
# 'Global South', so both must be present after classification.
# A "just use whatever two groups exist" fallback cannot work here: if a
# canonical name is missing after classification, no raw label maps to it,
# so the corresponding group would be empty and the statistics meaningless.
present_groups = set(data_clean['Region_Standardized'].unique())
missing_groups = [
    group for group in ('Global North', 'Global South')
    if group not in present_groups
]
if missing_groups:
    raise ValueError(
        "Could not identify 'Global North' and 'Global South' groups. "
        "Please ensure your region column contains these labels or adjust the mapping. "
        f"Missing: {missing_groups}; labels found: {sorted(present_groups)}"
    )

# Rows with any other (unrecognized) label are excluded from the comparison.
data_filtered = data_clean[
    data_clean['Region_Standardized'].isin(['Global North', 'Global South'])
].copy()
print("✅ Analyzing: Global North vs. Global South")

# Final dataset: one Series of citation counts per group
gn_data = data_filtered[data_filtered['Region_Standardized'] == 'Global North']['Citation_Count']
gs_data = data_filtered[data_filtered['Region_Standardized'] == 'Global South']['Citation_Count']

print(f"📊 Global North: {len(gn_data)} publications")
print(f"📊 Global South: {len(gs_data)} publications")

# --- Statistical Testing ---
# Mann-Whitney U Test (non-parametric, for non-normal distributions).
# alternative='greater' makes this a one-sided test of "GN tends to be larger
# than GS"; a non-significant result therefore says nothing about GS > GN.
u_stat, p_value = stats.mannwhitneyu(gn_data, gs_data, alternative='greater')  # GN > GS?

# Cliff's Delta (effect size for non-parametric test)
def cliffs_delta(x, y):
    """Compute Cliff's delta, a non-parametric effect size for two samples.

    Over all ``len(x) * len(y)`` pairs ``(xi, yj)``::

        delta = (#{xi > yj} - #{xi < yj}) / (len(x) * len(y))

    The result lies in [-1, 1]; tied pairs add nothing to the numerator.
    Pairs are counted with one sort plus binary searches rather than an
    explicit nested Python loop, so the cost is O((n + m) log m) instead of
    O(n * m).

    Parameters
    ----------
    x, y : array-like of numbers
        One-dimensional samples. NaN entries compare as neither greater nor
        smaller, so pairs involving NaN count as ties.

    Returns
    -------
    float
        Cliff's delta, or NaN when either sample is empty.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n_x, n_y = len(x), len(y)
    if n_x == 0 or n_y == 0:
        return np.nan

    # NaN pairs are ties: leave them out of the counting, but keep the full
    # sample sizes in the denominator.
    x_valid = x[~np.isnan(x)]
    y_sorted = np.sort(y[~np.isnan(y)])

    # In a sorted array the left insertion point of xi is the number of
    # yj < xi, and the right insertion point is the number of yj <= xi.
    n_below = np.searchsorted(y_sorted, x_valid, side='left')
    n_not_above = np.searchsorted(y_sorted, x_valid, side='right')

    wins = int(n_below.sum())                           # pairs with xi > yj
    losses = int((y_sorted.size - n_not_above).sum())   # pairs with xi < yj
    return (wins - losses) / (n_x * n_y)

# Effect size of the Global North sample (x) relative to Global South (y).
cliff_d = cliffs_delta(x=gn_data.values, y=gs_data.values)

# Interpret effect size (magnitude cut-offs applied to |Cliff's delta|)
def interpret_cliffs_delta(d):
    """Map a Cliff's delta value to a qualitative magnitude label.

    Parameters
    ----------
    d : float
        Cliff's delta; only its absolute value is used.

    Returns
    -------
    str
        "Negligible" for |d| < 0.147, "Small" for |d| < 0.33, "Medium" for
        |d| < 0.474, "Large" otherwise, and "Undefined" when ``d`` is NaN
        (e.g. because one of the samples was empty).
    """
    magnitude = abs(d)
    # NaN fails every "<" comparison below and would otherwise fall through
    # to "Large", reporting a strong effect where none could be computed.
    if np.isnan(magnitude):
        return "Undefined"
    if magnitude < 0.147:
        return "Negligible"
    if magnitude < 0.33:
        return "Small"
    if magnitude < 0.474:
        return "Medium"
    return "Large"

effect_interpretation = interpret_cliffs_delta(cliff_d)

# --- Results Table ---
# One row per reported quantity; 'Value' deliberately mixes numbers and
# pre-formatted strings because it is only used for display and CSV export.
results_df = pd.DataFrame({
    'Metric': [
        'Mann-Whitney U Statistic',
        'p-value',
        "Cliff's Delta (Effect Size)",
        'Effect Size Interpretation',
        'Median Citations (Global North)',
        'Median Citations (Global South)',
        'Mean Citations (Global North)',
        'Mean Citations (Global South)'
    ],
    'Value': [
        round(u_stat, 2),
        # Very small p-values are shown in scientific notation so they do not
        # round to 0.0000.
        f"{p_value:.2e}" if p_value < 0.001 else f"{p_value:.4f}",
        round(cliff_d, 3),
        effect_interpretation,
        round(gn_data.median(), 2),
        round(gs_data.median(), 2),
        round(gn_data.mean(), 2),
        round(gs_data.mean(), 2)
    ]
})

print("\n" + "🔬 STATISTICAL COMPARISON: GLOBAL NORTH vs. GLOBAL SOUTH" + "\n" + "="*70)
print("• Hypothesis: Global North publications receive more citations than Global South.")
print("• Test: Mann-Whitney U (one-tailed, α = 0.05)")
print()

# `display` is only defined inside IPython/Jupyter sessions. When this code is
# run as a plain script, fall back to a text rendering instead of failing
# with a NameError.
try:
    display
except NameError:
    print(results_df.to_string(index=False))
else:
    display(results_df)

# Save results
results_path = os.path.join(output_tables, "north_south_citation_inequality_stats.csv")
results_df.to_csv(results_path, index=False)
print(f"\n💾 Statistical results saved to: {results_path}")

# --- Visualization: Citation Distribution by Region ---
group_order = ['Global North', 'Global South']

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=data_filtered,
    x='Region_Standardized',
    y='Citation_Count',
    order=group_order,
    palette=['#4E79A7', '#F28E2B'],
    ax=ax,
)
ax.set_title('Citation Distribution: Global North vs. Global South', fontsize=14, fontweight='bold')
ax.set_xlabel('Region Group', fontsize=12)
ax.set_ylabel('Number of Citations', fontsize=12)

# The cleaning step keeps zero-citation papers, and a pure log axis has no
# position for 0, so medians/quartiles at zero would not be drawn correctly.
# 'symlog' is linear within [-linthresh, linthresh] and logarithmic beyond,
# which keeps the compression of the long tail while still showing zeros.
ax.set_yscale('symlog', linthresh=1)
ax.grid(axis='y', linestyle='--', alpha=0.5)

# Add sample sizes. The x position is in data coordinates (box index) and the
# y position in axes fractions, so the labels sit just above the bottom edge
# regardless of the axis scale or limits.
for i, group in enumerate(group_order):
    n = len(data_filtered[data_filtered['Region_Standardized'] == group])
    ax.text(
        i, 0.02, f'n = {n}',
        transform=ax.get_xaxis_transform(),
        ha='center', va='bottom', fontsize=10, fontweight='bold',
    )

plt.tight_layout()
plot_path = os.path.join(output_figures, "north_south_citation_distribution.png")
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"✅ Visualization saved to: {plot_path}")

# --- Interpretation ---
alpha = 0.05
# A NaN p-value compares as False here and is therefore treated as
# "not significant".
is_significant = p_value < alpha

if is_significant:
    conclusion = "Statistically significant evidence that Global North publications receive more citations than Global South."
else:
    # The test is one-sided (GN > GS), so a non-significant result only means
    # the Global North advantage was not demonstrated; it is not evidence
    # that the two groups are equal.
    conclusion = "No statistically significant evidence that Global North publications receive more citations than Global South."

print("\n" + "📝 INTERPRETATION" + "\n" + "-"*50)
print(f"• {conclusion}")
print(f"• Effect size (Cliff's Delta = {cliff_d:.3f}) indicates a '{effect_interpretation}' difference.")
print(f"• Median citations: GN = {gn_data.median():.1f}, GS = {gs_data.median():.1f}")

# The closing statement must follow the test outcome; claiming structural
# inequality after a non-significant result would contradict the conclusion
# printed above.
if is_significant:
    print("\n➡️ These results suggest structural inequalities in academic visibility and impact along Global North–South lines.")
else:
    print("\n➡️ These results do not provide statistical support for a Global North citation advantage in this dataset.")