# In [None]:
# -*- coding: utf-8 -*-
"""
LANGUAGE DISTRIBUTION ANALYSIS: GLOBAL NORTH VS. GLOBAL SOUTH
Assessing Linguistic Inequality in Scholarly Communication
"""

# --- Import Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from IPython.display import display

# --- Project locations (every path is relative to the notebooks/ folder) ---
input_file = "../data/processed/processed_publications_for_mertonian_analysis.csv"
output_figures = "../output/figures"
output_tables = "../output/tables"

# Make sure both output folders exist before anything is written to them
for output_dir in (output_figures, output_tables):
    os.makedirs(output_dir, exist_ok=True)

# --- 1. Load Data ---
# A missing input file is re-raised with a message that names the expected path.
try:
    data = pd.read_csv(input_file)
except FileNotFoundError:
    raise FileNotFoundError(f"File not found: {input_file}")
else:
    print("✅ Dataset successfully loaded.")

n_rows, n_cols = data.shape
print(f"📊 Initial dataset dimensions: {n_rows} observations × {n_cols} variables")

# --- 2. Flexible Column Detection ---
def find_column(data, possible_names):
    """Return the first name in `possible_names` that is a column of `data`.

    Candidates are tried in the order given and compared by exact
    membership in `data.columns`. Returns None when none of them matches.
    """
    return next((name for name in possible_names if name in data.columns), None)

# Candidate headers for each required variable, tried in the order listed
country_candidates = ['Negara_norm', 'Country', 'Nation', 'Affiliation_Country']
language_candidates = ['Bahasa', 'Language', 'Languages', 'Publication_Language']
citation_candidates = ['Jumlah_Sitasi', 'Citations', 'Cited by']

col_country = find_column(data, country_candidates)
col_language = find_column(data, language_candidates)
col_citation = find_column(data, citation_candidates)

# Stop at the first variable that could not be located
# (checked in the order country, language, citation).
required_columns = (
    (col_country, "Required column not found: Country (e.g., 'Negara_norm')"),
    (col_language, "Required column not found: Language (e.g., 'Bahasa')"),
    (col_citation, "Required column not found: Citation count"),
)
for detected_name, error_message in required_columns:
    if not detected_name:
        raise KeyError(error_message)

print(f"🔍 Detected country variable: '{col_country}'")
print(f"🔍 Detected language variable: '{col_language}'")
print(f"🔍 Detected citation variable: '{col_citation}'")

# Map the detected headers onto the canonical names used from here on
canonical_names = {
    col_country: 'Country',
    col_language: 'Language',
    col_citation: 'Citations',
}
data_clean = data.rename(columns=canonical_names)

# --- 3. Data Cleaning ---
# Language labels: cast to text, trim surrounding whitespace, Title-Case every word
language_as_text = data_clean['Language'].astype(str)
data_clean['Language'] = language_as_text.str.strip().str.title()

# Drop rows whose label is the literal 'Nan' (presumably what a missing value
# becomes after the text cast and title-casing), then drop rows that still
# have a missing country or language.
has_language = data_clean['Language'] != 'Nan'
data_clean = data_clean[has_language].dropna(subset=['Country', 'Language'])

print(f"🧹 Cleaned dataset: {len(data_clean)} publications retained after filtering")

# --- 4. Classify Region: Global North vs. Global South ---
# Country labels counted as Global North. Any label not listed here
# (including unknown or unparseable values) is classified as Global South.
global_north_countries = {
    'United States', 'USA', 'US', 'Canada', 'UK', 'United Kingdom', 'Germany',
    'France', 'Italy', 'Spain', 'Netherlands', 'Belgium', 'Sweden', 'Denmark',
    'Norway', 'Finland', 'Austria', 'Switzerland', 'Ireland', 'Portugal',
    'Australia', 'New Zealand', 'Japan', 'South Korea', 'Israel', 'Singapore',
    'Luxembourg', 'Iceland', 'Greece', 'Czech Republic', 'Poland', 'Hungary',
    'Slovakia', 'Slovenia', 'Estonia', 'Latvia', 'Lithuania', 'Malta', 'Cyprus'
}

# Case-folded copy of the set, built once, so lookups ignore letter case.
_global_north_casefolded = frozenset(name.casefold() for name in global_north_countries)

def classify_region(country):
    """Classify a country label as 'Global North' or 'Global South'.

    The label is converted to text, surrounding whitespace is removed, runs of
    internal whitespace are collapsed to one space, and the comparison against
    `global_north_countries` ignores letter case. Labels such as
    'united states' or '  JAPAN ' are therefore recognised instead of
    silently falling into 'Global South'.

    Parameters:
        country: Any value; it is passed through str() before matching.

    Returns:
        'Global North' if the normalised label is in the Global North list,
        otherwise 'Global South'.
    """
    normalized = " ".join(str(country).split()).casefold()
    return 'Global North' if normalized in _global_north_casefolded else 'Global South'

# Tag every publication with its region. assign() returns a new frame rather
# than writing a column into the filtered frame, which avoids pandas'
# SettingWithCopyWarning.
data_clean = data_clean.assign(Region=data_clean['Country'].apply(classify_region))

# --- 5. Cross-tabulation: Language Distribution by Region ---
# Both regions are forced to exist as rows (zero-filled when a region has no
# publications) so the label-based lookups in this section cannot raise
# KeyError on a dataset that covers only one region.
region_labels = ['Global North', 'Global South']

# Raw counts, with a 'Total' margin row and column
contingency_raw = pd.crosstab(
    data_clean['Region'],
    data_clean['Language'],
    margins=True,
    margins_name='Total'
).reindex(region_labels + ['Total'], fill_value=0)

# Row-normalized percentages (share of each language within each region)
contingency_pct = (
    pd.crosstab(
        data_clean['Region'],
        data_clean['Language'],
        normalize='index'
    ) * 100
).round(1).reindex(region_labels, fill_value=0.0)
# Attach each region's publication count next to its percentages
contingency_pct['Total_Count'] = contingency_raw.loc[region_labels, 'Total']

# Build summary table: one record per language.
# columns[:-1] skips the trailing 'Total' margin column.
language_summary = [
    {
        'Language': lang,
        'Global_North_Count': contingency_raw.loc['Global North', lang],
        'Global_North_Pct': f"{contingency_pct.loc['Global North', lang]:.1f}%",
        'Global_South_Count': contingency_raw.loc['Global South', lang],
        'Global_South_Pct': f"{contingency_pct.loc['Global South', lang]:.1f}%",
        'Total_Publications': contingency_raw.loc['Total', lang]
    }
    for lang in contingency_raw.columns[:-1]
]

# Convert to DataFrame, sort by total volume, and use a 1-based rank as index
summary_table = pd.DataFrame(language_summary)
summary_table = summary_table.sort_values('Total_Publications', ascending=False)
summary_table.index = range(1, len(summary_table) + 1)
summary_table.index.name = 'Rank'

# --- 6. Display Results ---
print("\n" + "🔤 LANGUAGE DISTRIBUTION IN ACADEMIC PUBLISHING: GLOBAL NORTH VS. GLOBAL SOUTH" + "\n" + "="*90)
display(summary_table)

# --- 7. English Dominance Summary ---
def _crosstab_count(row_label, col_label):
    """Return the count at (row_label, col_label) of contingency_raw, or 0 if either label is absent."""
    if row_label in contingency_raw.index and col_label in contingency_raw.columns:
        return contingency_raw.loc[row_label, col_label]
    return 0

def _share_pct(count, total):
    """Return count as a percentage of total, or 0.0 when total is zero."""
    return (count / total) * 100 if total else 0.0

# A dataset with no language labelled exactly 'English' used to abort here with
# KeyError; it is now reported explicitly and counted as zero.
if 'English' not in contingency_raw.columns:
    print("⚠️ No publications labelled 'English' were found; English counts are reported as 0.")

english_gn_count = _crosstab_count('Global North', 'English')
english_gs_count = _crosstab_count('Global South', 'English')
total_gn = _crosstab_count('Global North', 'Total')
total_gs = _crosstab_count('Global South', 'Total')

# Guarded division: a region without publications yields 0.0 rather than NaN
pct_english_gn = _share_pct(english_gn_count, total_gn)
pct_english_gs = _share_pct(english_gs_count, total_gs)

print(f"\n📌 ENGLISH-LANGUAGE PUBLICATION SHARE")
print(f"• Global North: {english_gn_count:,} out of {total_gn:,} papers ({pct_english_gn:.1f}%) published in English")
print(f"• Global South: {english_gs_count:,} out of {total_gs:,} papers ({pct_english_gs:.1f}%) published in English")
print(f"• Difference: {abs(pct_english_gn - pct_english_gs):.1f} percentage points")

# --- 8. Visualization ---
plt.style.use('default')
fig, axes = plt.subplots(1, 2, figsize=(16, 7), dpi=150)
fig.suptitle('Linguistic Patterns in Academic Publishing\nGlobal North vs. Global South', fontsize=15, fontweight='bold', y=0.98)

# A. Stacked Bar: Proportion of Top 5 Languages by Region
top_languages = summary_table.head(5)['Language'].tolist()
data_top_langs = data_clean[data_clean['Language'].isin(top_languages)]
# Row-normalised over the top-5 subset only: each region's bar sums to 100% of
# that region's publications written in one of the top-5 languages.
crosstab_prop = pd.crosstab(data_top_langs['Region'], data_top_langs['Language'], normalize='index') * 100

# Plot regions on the x-axis with one stacked segment per language. The
# previous version plotted the transposed table unstacked, which put languages
# on the x-axis and regions in the legend, contradicting the axis label,
# the legend title and the per-language palette below.
crosstab_prop.plot(
    kind='bar',
    stacked=True,
    ax=axes[0],
    color=sns.color_palette("Set2", crosstab_prop.shape[1]),
    edgecolor='black',
    linewidth=0.7
)
axes[0].set_title('Proportion of Top 5 Publication Languages by Region', fontweight='bold')
axes[0].set_ylabel('Percentage of Publications (%)')
axes[0].set_xlabel('Geopolitical Region')
# Stacked bars fill the full height, so the legend is anchored just outside
# the axes instead of on top of the bars.
axes[0].legend(title='Language', title_fontsize=10, fontsize=9, loc='upper left', bbox_to_anchor=(1.01, 1.0))
axes[0].tick_params(axis='x', rotation=0)

# B. Horizontal Bar: Top 10 Languages by Total Output
top_10_languages = summary_table.head(10)
# Reversed so the largest language is drawn at the top of the chart
bars = axes[1].barh(
    top_10_languages['Language'][::-1],
    top_10_languages['Total_Publications'][::-1],
    color='steelblue',
    edgecolor='black',
    linewidth=0.7
)
axes[1].set_title('Top 10 Languages by Total Publication Volume', fontweight='bold')
axes[1].set_xlabel('Number of Publications')
axes[1].set_ylabel('Language')

# Annotate each bar with its count, just past the bar's end
for bar in bars:
    width = bar.get_width()
    axes[1].text(width + 0.5, bar.get_y() + bar.get_height()/2,
                 f'{int(width)}', va='center', ha='left', fontsize=9)

# Leave the top 5% of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Save figure to figures/ folder (before show(), which may clear the figure)
plot_path = os.path.join(output_figures, "language_distribution_analysis.png")
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()
print(f"✅ Visualization saved to:\n   {plot_path}")

# --- 9. Export Results to Excel ---
excel_path = os.path.join(output_tables, "language_distribution_analysis.xlsx")

# One worksheet per table, written in the order listed
sheets = (
    ('Language_Distribution', summary_table),
    ('Crosstab_Counts', contingency_raw),
    ('Crosstab_Proportions', contingency_pct),
)
with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    for sheet_name, table in sheets:
        table.to_excel(writer, sheet_name=sheet_name, index=True)

print(f"\n✅ Comprehensive results exported to:\n   {excel_path}")

# --- 10. Interpretation ---
# NOTE(review): the headline and the closing paragraph are fixed text that is
# printed regardless of the computed shares; only the two bullet points below
# depend on the data. Citation counts are not analysed in the code visible
# here, so confirm the citation claim is backed elsewhere.
print("\n" + "📝 INTERPRETATION" + "\n" + "-"*50)
print("The analysis reveals pronounced linguistic stratification in academic publishing:")

# Global North: three tiers. Previously every share of 90% or below was
# described as "a large majority", even when English was a minority.
if pct_english_gn > 90:
    print(f"• The Global North exhibits near-total dominance of English ({pct_english_gn:.1f}% of publications), reinforcing its role as the primary language of science.")
elif pct_english_gn > 50:
    print(f"• A large majority ({pct_english_gn:.1f}%) of Northern publications are in English, reflecting institutional and editorial norms.")
else:
    print(f"• Only {pct_english_gn:.1f}% of Northern publications are in English, so English is not the majority publication language for the Global North in this dataset.")

# Global South: below 70% is read as substantial non-English output
if pct_english_gs < 70:
    print(f"• Only {pct_english_gs:.1f}% of Southern publications are in English, indicating substantial scholarly output in regional and local languages.")
else:
    print(f"• The Global South shows relatively high adoption of English ({pct_english_gs:.1f}%), likely due to publication pressure in international journals.")

print("\n➡️ These patterns suggest that language functions as a structural barrier: non-English publications, despite their intellectual merit, often face reduced visibility, indexing limitations, and lower citation rates — contributing to epistemic marginalization. This supports critiques of linguistic imperialism in global knowledge systems.")