# In [None]:
# -*- coding: utf-8 -*-
"""
LANGUAGE HEGEMONY ANALYSIS: ENGLISH DOMINANCE IN GLOBAL ACADEMIC PUBLISHING
Comparative Assessment of Linguistic Inequality Between Global North and Global South
"""

# --- Import Libraries ---
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from IPython.display import display

# --- Define Relative Paths (relative to notebooks/) ---
# Input dataset and the two destinations used further down for the figure and
# the exported tables. All three are relative paths.
input_file = "../data/processed/processed_publications_for_mertonian_analysis.csv"
output_figures, output_tables = "../output/figures", "../output/tables"

# Make sure both output folders exist before anything is written to them;
# exist_ok=True keeps re-runs from failing when they are already there.
for _out_dir in (output_figures, output_tables):
    os.makedirs(_out_dir, exist_ok=True)

# --- 1. Load Data ---
# Only the read itself sits inside the `try`, so a failure in any other
# statement can never be reported as a missing input file.
try:
    data = pd.read_csv(input_file)
except FileNotFoundError as err:
    # Re-raise with the configured path in the message, explicitly chained to
    # the original error so its details remain visible in the traceback.
    raise FileNotFoundError(f"File not found: {input_file}") from err
else:
    print("✅ Dataset successfully loaded.")

# data.shape is (rows, columns): rows are reported as observations and
# columns as variables.
print(f"📊 Dataset dimensions: {data.shape[0]} observations × {data.shape[1]} variables")

# --- 2. Flexible Column Detection ---
def find_column(data, possible_names):
    """Return the first candidate name that is a column of ``data``.

    Candidates are tried in the order given, so earlier entries win when
    several of them are present.

    Args:
        data: Object exposing a ``columns`` attribute that supports ``in``
            (a pandas DataFrame in this script).
        possible_names: Iterable of candidate column names, most preferred
            first.

    Returns:
        The first name found in ``data.columns``, or ``None`` when none of
        the candidates is present.
    """
    matches = (candidate for candidate in possible_names if candidate in data.columns)
    return next(matches, None)

# Accepted header spellings for each required variable, most preferred first;
# find_column returns the first one that exists in the dataset.
_country_candidates = ['Negara_norm', 'Country', 'Nation', 'Affiliation_Country']
_language_candidates = ['Bahasa', 'Language', 'Publication_Language', 'Lang']

col_country = find_column(data, _country_candidates)
col_language = find_column(data, _language_candidates)

# Stop immediately if a required variable is absent. The country column is
# checked first, so it is the one reported when both are missing.
for _detected, _error_text in (
    (col_country, "Required column not found: Country (e.g., 'Negara_norm')"),
    (col_language, "Required column not found: Language (e.g., 'Bahasa')"),
):
    if not _detected:
        raise KeyError(_error_text)

# Report which headers were picked up.
print(f"🔍 Detected country variable: '{col_country}'")
print(f"🔍 Detected language variable: '{col_language}'")

# Rename for consistency
# Map whichever headers were detected onto the canonical names used by the
# rest of the script.
_rename_map = {col_country: 'Country', col_language: 'Language'}

# If the dataset already contains a *different* column literally named
# 'Country' or 'Language' (for example a 'Country' column next to the detected
# 'Negara_norm'), renaming would leave two columns with the same label and
# data_clean['Country'] would return a DataFrame instead of a Series.
# Drop the non-selected columns so the canonical names stay unique.
_shadowed = [
    name for name in ('Country', 'Language')
    if name in data.columns and name not in _rename_map
]
if _shadowed:
    print(f"⚠️ Dropping pre-existing column(s) shadowed by the detected variables: {_shadowed}")

data_clean = data.drop(columns=_shadowed).rename(columns=_rename_map)

# --- 3. Classify Geopolitical Region ---
# Country names treated as Global North. Some countries appear under more than
# one spelling; every name not listed here is classified as Global South.
global_north_countries = {
    'United States', 'USA', 'US', 'Canada', 'UK', 'United Kingdom', 'Germany',
    'France', 'Italy', 'Spain', 'Netherlands', 'Belgium', 'Sweden', 'Denmark',
    'Norway', 'Finland', 'Austria', 'Switzerland', 'Ireland', 'Portugal',
    'Australia', 'New Zealand', 'Japan', 'South Korea', 'Israel', 'Singapore',
    'Luxembourg', 'Iceland', 'Greece', 'Czech Republic', 'Poland', 'Hungary',
    'Slovakia', 'Slovenia', 'Estonia', 'Latvia', 'Lithuania', 'Malta', 'Cyprus',
    # Alternate spellings of countries already listed above.
    'United States of America', 'Czechia', 'Republic of Korea',
    'Korea, Republic of',
}

# Case-folded copy of the set above, built once so that each lookup is
# case-insensitive without re-normalizing the whole set per call.
_global_north_folded = frozenset(name.casefold() for name in global_north_countries)


def classify_region(country):
    """Label a country as 'Global North' or 'Global South'.

    The value is converted to ``str``, stripped of surrounding whitespace and
    compared case-insensitively against ``global_north_countries``, so
    'germany', ' GERMANY ' and 'Germany' are all recognised.

    Args:
        country: Country name; any value is accepted and passed through
            ``str()``.

    Returns:
        'Global North' when the normalized name is in the Global North list,
        otherwise 'Global South'.

    Note:
        Missing values are not special-cased: ``str(float('nan'))`` is 'nan',
        which is not in the list, so such values are returned as
        'Global South'.
    """
    country_key = str(country).strip().casefold()
    return 'Global North' if country_key in _global_north_folded else 'Global South'

# Exclude publications without a usable country *before* classifying.
# classify_region labels everything outside the Global North list as
# 'Global South', so a missing country (which becomes the text 'nan') would
# otherwise be counted as a Global South publication and bias the comparison.
# The same 'nan' / empty-string test is applied to Language below.
_country_text = data_clean['Country'].astype(str).str.strip()
_has_country = ~_country_text.isin(['nan', ''])
_n_no_country = int((~_has_country).sum())
if _n_no_country:
    print(f"⚠️ Excluded {_n_no_country} publications with no country information")
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered view of the previous DataFrame.
data_clean = data_clean[_has_country].copy()

data_clean['Region'] = data_clean['Country'].apply(classify_region)

# --- 4. Clean and Filter Language Data ---
# Normalize to stripped text; missing values turn into the string 'nan' here
# and are removed together with empty strings on the next line.
data_clean['Language'] = data_clean['Language'].astype(str).str.strip()
data_clean = data_clean[~data_clean['Language'].isin(['nan', ''])]

print(f"🧹 Cleaned dataset: {len(data_clean)} publications retained after language filtering")

# --- 5. Cross-tabulate Language Distribution by Region ---
_region_values = data_clean['Region']
_language_values = data_clean['Language']

# Absolute publication counts per region and language, with a 'Total' margin
# added for both rows and columns.
contingency_raw = pd.crosstab(
    _region_values,
    _language_values,
    margins=True,
    margins_name='Total',
)
print("\n📊 LANGUAGE DISTRIBUTION BY REGION (Raw Counts)")
display(contingency_raw)

# Share of each language within a region: normalize='index' divides every row
# by its own total; the result is scaled to percent and rounded to one decimal.
_row_shares = pd.crosstab(_region_values, _language_values, normalize='index')
contingency_prop = (_row_shares * 100).round(1)

print("\n📈 PROPORTION OF PUBLICATIONS BY LANGUAGE (Row-normalized, %)")
display(contingency_prop)

# Restrict the table used for plotting to the major languages that actually
# occur in the data, keeping the order of the list below.
major_languages = ['English', 'Spanish', 'Persian', 'Chinese', 'Portuguese', 'Turkish', 'French', 'Russian']
_languages_present = set(contingency_prop.columns)
available_languages = [lang for lang in major_languages if lang in _languages_present]
filtered_prop = contingency_prop.loc[:, available_languages]

# --- 6. English Dominance Analysis ---
# Percentage of each region's publications written in English.
# When the data contains no 'English' column, fall back to 0 for every region
# present. The fallback broadcasts a scalar over the index, so it works for any
# number of rows; a fixed two-element list would raise ValueError whenever the
# table does not have exactly two regions.
if 'English' in contingency_prop.columns:
    english_share = contingency_prop['English']
else:
    english_share = pd.Series(0.0, index=contingency_prop.index)

print("\n🔤 PERCENTAGE OF PUBLICATIONS IN ENGLISH:")
# Only the two region labels are reported, and only if they occur in the data.
for region in ('Global North', 'Global South'):
    if region in english_share.index:
        print(f"  • {region}: {english_share[region]:.1f}%")

# --- 7. Visualization: Proportional Language Use by Region ---
# Grouped bar chart: one group per row of filtered_prop (region), one bar per
# column (language).
fig, ax = plt.subplots(figsize=(10, 6), dpi=150)
_bar_style = {
    'color': sns.color_palette("Set2"),
    'edgecolor': 'black',
    'linewidth': 0.6,
}
filtered_prop.plot(kind='bar', ax=ax, **_bar_style)

# Formatting
ax.set_title(
    'Proportion of Academic Publications by Language\n(Global North vs. Global South)',
    fontsize=14,
    fontweight='bold',
    pad=20,
)
ax.set_xlabel('Geopolitical Region', fontsize=12)
ax.set_ylabel('Percentage of Publications (%)', fontsize=12)
ax.legend(title='Language', title_fontsize=11, fontsize=10, loc='upper right')
# rotation=0 keeps the region names horizontal.
plt.xticks(rotation=0, fontsize=11)
plt.yticks(fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.4, linewidth=0.8)
fig.tight_layout()

# Save figure to figures/ (written at 300 dpi, independent of the on-screen dpi)
plot_path = os.path.join(output_figures, "language_hegemony_distribution.png")
fig.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"✅ Language distribution plot saved to:\n {plot_path}")

# --- 8. Export Summary Tables to tables/ ---
raw_table_path = os.path.join(output_tables, "language_distribution_raw_counts.xlsx")
prop_table_path = os.path.join(output_tables, "language_distribution_proportions.xlsx")

# Each table goes to its own workbook with a single named sheet; the counts
# table is written first, then the percentage table.
for _table, _target, _sheet in (
    (contingency_raw, raw_table_path, 'Language_Counts'),
    (contingency_prop, prop_table_path, 'Language_Proportions'),
):
    with pd.ExcelWriter(_target, engine='openpyxl') as writer:
        _table.to_excel(writer, sheet_name=_sheet)

print(f"✅ Raw language counts exported to:\n {raw_table_path}")
print(f"✅ Proportional distribution exported to:\n {prop_table_path}")

# --- 9. Interpretation ---
# English share (in percent) per region; .get() substitutes 0 when a region has
# no rows, which is why missing regions are tracked separately below.
gn_english = english_share.get('Global North', 0)
gs_english = english_share.get('Global South', 0)
# Positive: the Global North publishes a larger share in English.
diff = gn_english - gs_english

# A region that is absent from the data must not be read as "0% English".
missing_regions = [
    name for name in ('Global North', 'Global South')
    if name not in english_share.index
]
comparable = not missing_regions
north_leads = comparable and diff >= 5
south_leads = comparable and diff <= -5

print("\n" + "📝 INTERPRETATION" + "\n" + "-"*60)
# Only announce an asymmetry when the numbers actually show a gap of at least
# 5 percentage points in either direction.
if north_leads or south_leads:
    print("The linguistic landscape of academic publishing exhibits significant asymmetry:")
else:
    print("The linguistic landscape of academic publishing in this dataset:")

if not comparable:
    print(f"• No publications were found for {' and '.join(missing_regions)}, so the two regions cannot be compared.")
elif diff >= 20:
    print(f"• The Global North publishes {gn_english:.1f}% of its work in English, compared to {gs_english:.1f}% in the Global South — a gap of {diff:.1f} percentage points.")
    print("• This reflects entrenched linguistic hegemony favoring Anglophone scholarship.")
elif diff >= 5:
    print(f"• The Global North shows stronger dominance in English-language publishing ({gn_english:.1f}% vs {gs_english:.1f}%).")
elif south_leads:
    # A gap in the opposite direction is not "balanced"; report it as such.
    print(f"• The Global South publishes a larger share of its work in English than the Global North ({gs_english:.1f}% vs {gn_english:.1f}%) — a gap of {-diff:.1f} percentage points in the opposite direction.")
else:
    print(f"• English usage is relatively balanced between regions ({gn_english:.1f}% North vs {gs_english:.1f}% South).")

# Only meaningful when Global South publications exist; otherwise gs_english is
# just the 0 placeholder from .get().
if 'Global South' in english_share.index and gs_english < 70:
    print("• A substantial share of Southern scholarship is published in local or regional languages, potentially limiting its global visibility and citation potential.")

# The closing claim is printed only when the data show the North-favouring gap
# it refers to; otherwise a neutral statement is printed instead.
if north_leads:
    print("\n➡️ These findings support the argument that language functions as a structural gatekeeper in knowledge dissemination, reinforcing epistemic inequality. Non-English publications, despite their scholarly value, often face systemic underrepresentation in global citation networks — a phenomenon consistent with the 'linguistic bias' in scientometrics.")
else:
    print("\n➡️ In this dataset the English share of the Global North does not exceed that of the Global South by 5 percentage points or more, so these figures alone do not demonstrate a Northern advantage in English-language publishing.")