In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:

# Load the population table (the first three lines of the CSV are skipped)
# and normalise the country codes to lower case.
pop_df = pd.read_csv(r'C:\Users\Yasaman\Downloads\World_bank_population.csv', skiprows=3)
pop_df['Country Code'] = pop_df['Country Code'].apply(lambda code: code.lower())

# Codes whose value in the `2019` column is at least one million.
possible_countries = pop_df.loc[pop_df['2019'] >= 1000000, 'Country Code'].values

# ISO3 codes left out of the analysis, stored lower-cased so they compare
# directly against the normalised `Country Code` values.
excluded_iso3_codes = [
    code.lower()
    for code in (
        "IRL",  # Ireland
        "SSD",  # South Sudan
        "SDN",  # Sudan
        "COG",  # Republic of the Congo
        "COD",  # Democratic Republic of the Congo
        "GIN",  # Guinea
        "GNB",  # Guinea-Bissau
        "GNQ",  # Equatorial Guinea
        "PNG",  # Papua New Guinea
        "XKX",  # Kosovo (unofficial)
        "MNE",  # Montenegro
        "SRB",  # Serbia
        "TLS",  # Timor-Leste
        "GEO",  # Georgia
        'SWZ',  # Eswatini
        'PRK',  # North Korea
    )
]

# Eligible codes = large-enough countries minus the exclusions. The list comes
# from a set, so its order is arbitrary; it is only used for membership tests.
possible_iso = list(set(possible_countries) - set(excluded_iso3_codes))

In [3]:

# Countries flagged as treated further down (display name -> ISO3), the
# reverse lookup, and the lower-cased codes.
Country_list = {'Egypt':'EGY', 'Tunisia':'TUN','Libya':'LBY','Syria':'SYR','Yemen':'YEM','Bahrain':'BHR','Jordan':'JOR','Kuwait':'KWT','Morocco':'MAR','Oman':'OMN'}
rev_Country_list = {iso3: name for name, iso3 in Country_list.items()}
abbr = [iso3.lower() for iso3 in Country_list.values()]

# Subject areas whose rows are dropped before aggregating.
physical_sciences = ['MATH', 'ENGI', 'PHYS', 'COMP', 'MUL']

# Read the attention table, keep rows where both countries are eligible, the
# year is 2002-2019 and the subject area is not excluded, then total `count`
# per (year, mentioned country).
df = (
    pd.read_csv(r"C:\Users\Yasaman\Downloads\Attention-fractional counting.csv")
    .rename(columns={'aggregated_value': 'count', 'country': 'Mention_country', 'affiliation_country': 'Aff_country'})
    .loc[
        lambda frame: frame['Mention_country'].isin(possible_iso)
        & frame['Aff_country'].isin(possible_iso)
        & frame['year'].isin(np.arange(2002, 2020))
        & ~frame['subjarea'].isin(physical_sciences)
    ]
    .groupby(['year', 'Mention_country'])['count']
    .sum()
    .reset_index()
)


# Country-year covariates: restrict to 2002-2019, keep the needed columns,
# drop rows with any missing value, and align the key name / case with `df`.
data = (
    pd.read_csv(r"C:\Users\Yasaman\Downloads\scopus_2024_V1_scholarlymigration_country_enriched.csv")
    .loc[lambda frame: frame['year'].isin(np.arange(2002, 2020))]
    [['iso3code', 'incomelevel', 'gdp_per_capita', 'year', 'population', 'region', 'padded_population_of_researchers']]
    .dropna()
    .rename(columns={'iso3code': 'Mention_country'})
    .assign(Mention_country=lambda frame: frame['Mention_country'].apply(lambda code: code.lower()))
)

# Outer merge keeps country-years present in only one of the two tables;
# afterwards only eligible countries are retained.
df = (
    df.merge(data, how='outer', on=['Mention_country', 'year'])
    .loc[lambda frame: frame['Mention_country'].isin(possible_iso)]
)


# Report countries with fewer than 15 rows that carry a non-missing `count`.
# NOTE(review): the list is only printed here; nothing in the visible code
# removes these countries from `df`.
countries_to_remove = []
for c in df['Mention_country'].unique():
    n_observed = (df['count'].notna() & df['Mention_country'].eq(c)).sum()
    if n_observed >= 15:
        continue
    countries_to_remove.append(c)
    print(c)

print(len(countries_to_remove))


# Every country must end up with one row for each of these years.
required_years = list(range(2002, 2020))

# Countries present after the merge, in order of first appearance.
unique_countries = df["Mention_country"].unique()

# Build one record per (country, year): the first existing row when the year
# is present, otherwise a placeholder with a zero count and missing covariates.
full_data = []
for country in unique_countries:
    country_data = df.loc[df["Mention_country"].eq(country)]
    existing_years = set(country_data["year"])
    # Region written on placeholder rows: the value on the country's first row.
    fallback_region = np.nan if country_data.empty else country_data["region"].iloc[0]

    for year in required_years:
        if year not in existing_years:
            row = {
                "year": year,
                "Mention_country": country,
                "count": 0,
                "gdp_per_capita": np.nan,
                "population": np.nan,
                "region": fallback_region,
            }
        else:
            row = country_data.loc[country_data["year"].eq(year)].iloc[0].to_dict()
        full_data.append(row)

# Assemble the balanced panel from the collected records.
df_complete = pd.DataFrame(full_data)

# 0/1 membership indicators: the pooled treated set and its three sub-groups.
treatment_groups = {
    'treated': abbr,
    'treated_CW': ['yem', 'lby', 'syr'],
    'treated_GO': ['egy', 'tun'],
    'treated_GC': ['omn', 'kwt', 'bhr', 'mar', 'jor'],
}
for indicator, members in treatment_groups.items():
    df_complete[indicator] = df_complete['Mention_country'].isin(members).astype(int)

# post = 0 for 2002-2010 inclusive, 1 for every other year.
df_complete['post'] = df_complete['year'].apply(lambda yr: 0 if 2002 <= yr <= 2010 else 1)

# Missing counts are treated as zero before taking log(count + 1).
df_complete['count'] = df_complete['count'].fillna(0)
df_complete['log_count'] = np.log(df_complete['count'] + 1)

# Fill covariate gaps within each country: forward first, then backward.
covariates = ['region', 'gdp_per_capita', 'population', 'padded_population_of_researchers']
df_complete[covariates] = df_complete.groupby('Mention_country')[covariates].ffill()
df_complete[covariates] = df_complete.groupby('Mention_country')[covariates].bfill()

# Log-transformed covariates, then keep eligible countries with a fresh index.
df_complete = (
    df_complete
    .assign(
        log_gdp=lambda frame: np.log(frame['gdp_per_capita']),
        log_population=lambda frame: np.log(frame['population']),
        log_Rpop=lambda frame: np.log(frame['padded_population_of_researchers'] + 1),
    )
    .loc[lambda frame: frame['Mention_country'].isin(possible_iso)]
    .reset_index(drop=True)
)


0


In [4]:
# Attach the per-country `cem_w` weight column from the matching output and
# keep only countries that appear in it (inner join).
# NOTE(review): `cem_w` is presumably the coarsened-exact-matching weight;
# confirm against the notebook that wrote matched_data.csv.
matched = pd.read_csv(r'matched_data.csv')
df_complete = (
    df_complete
    # Make the cell safe to re-run: without this, a second execution merges
    # `cem_w` again and yields `cem_w_x` / `cem_w_y`, so the later
    # `panel_data['cem_w']` lookups raise KeyError.
    .drop(columns='cem_w', errors='ignore')
    .merge(
        matched[['Mention_country', 'cem_w']],
        on='Mention_country',
        how='inner',
        # Fail loudly if a country appears more than once in `matched`,
        # instead of silently duplicating its country-year rows.
        validate='many_to_one',
    )
)

In [5]:
from linearmodels.panel import PanelOLS

# Index the panel by (country, year) for the panel estimator.
panel_data = df_complete.set_index(keys=['Mention_country', 'year'])

# Weighted regression of log_count on the treated x post interaction plus the
# log covariates, with entity and time effects.
model = PanelOLS.from_formula(
    formula='log_count ~ treated : post +log_population +log_gdp+log_Rpop+ EntityEffects + TimeEffects',
    data=panel_data,
    weights=panel_data.loc[:, 'cem_w'],
)

# Standard errors clustered by entity (country).
results = model.fit(cluster_entity=True, cov_type='clustered')
print(results)

                          PanelOLS Estimation Summary                           
Dep. Variable:              log_count   R-squared:                        0.4704
Estimator:                   PanelOLS   R-squared (Between):             -3.7332
No. Observations:                1044   R-squared (Within):               0.7283
Date:                Sun, Sep 28 2025   R-squared (Overall):             -3.6945
Time:                        19:13:45   Log-likelihood                    486.30
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      214.25
Entities:                          58   P-value                           0.0000
Avg Obs:                       18.000   Distribution:                   F(4,965)
Min Obs:                       18.000                                           
Max Obs:                       18.000   F-statistic (robust):             35.609
                            

In [6]:
from linearmodels.panel import PanelOLS

# Index the panel by (country, year) for the panel estimator.
panel_data = df_complete.set_index(keys=['Mention_country', 'year'])

# Same specification as the pooled model, but with a separate post-period
# interaction for each of the three treatment groups (GO, CW, GC).
model = PanelOLS.from_formula(
    formula='log_count ~ treated_GO : post+treated_CW : post+treated_GC : post  +log_population +log_gdp+log_Rpop+ EntityEffects + TimeEffects',
    data=panel_data,
    weights=panel_data.loc[:, 'cem_w'],
)

# Standard errors clustered by entity (country).
results = model.fit(cluster_entity=True, cov_type='clustered')
print(results)

                          PanelOLS Estimation Summary                           
Dep. Variable:              log_count   R-squared:                        0.4784
Estimator:                   PanelOLS   R-squared (Between):             -2.2271
No. Observations:                1044   R-squared (Within):               0.7324
Date:                Sun, Sep 28 2025   R-squared (Overall):             -2.2014
Time:                        19:13:45   Log-likelihood                    494.24
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      147.19
Entities:                          58   P-value                           0.0000
Avg Obs:                       18.000   Distribution:                   F(6,963)
Min Obs:                       18.000                                           
Max Obs:                       18.000   F-statistic (robust):             35.884
                            

In [9]:
# Year measured relative to 2011 (negative before, zero in 2011).
df_complete['time'] = df_complete['year'].sub(2011)

# Index the panel by (country, year) for the panel estimator.
panel_data = df_complete.set_index(keys=['Mention_country', 'year'])

# Group-specific linear interaction with `time` for each treatment group,
# plus the log covariates and entity and time effects.
model = PanelOLS.from_formula(
    formula='log_count ~ treated_GO : time+treated_CW : time+treated_GC : time  +log_population +log_gdp+log_Rpop+ EntityEffects + TimeEffects',
    data=panel_data,
    weights=panel_data.loc[:, 'cem_w'],
)

# Standard errors clustered by entity (country).
results = model.fit(cluster_entity=True, cov_type='clustered')
print(results)

                          PanelOLS Estimation Summary                           
Dep. Variable:              log_count   R-squared:                        0.4723
Estimator:                   PanelOLS   R-squared (Between):             -3.2451
No. Observations:                1044   R-squared (Within):               0.7329
Date:                Sun, Sep 28 2025   R-squared (Overall):             -3.2106
Time:                        19:14:03   Log-likelihood                    488.19
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      143.64
Entities:                          58   P-value                           0.0000
Avg Obs:                       18.000   Distribution:                   F(6,963)
Min Obs:                       18.000                                           
Max Obs:                       18.000   F-statistic (robust):             39.410
                            

In [10]:
# Year measured relative to 2011 (negative before, zero in 2011).
df_complete['time'] = df_complete['year'].sub(2011)

# Index the panel by (country, year) for the panel estimator.
panel_data = df_complete.set_index(keys=['Mention_country', 'year'])

# Pooled treated group interacted linearly with `time`, plus the log
# covariates and entity and time effects.
model = PanelOLS.from_formula(
    formula='log_count ~ treated: time  +log_population +log_gdp+log_Rpop+ EntityEffects + TimeEffects',
    data=panel_data,
    weights=panel_data.loc[:, 'cem_w'],
)

# Standard errors clustered by entity (country).
results = model.fit(cluster_entity=True, cov_type='clustered')
print(results)

                          PanelOLS Estimation Summary                           
Dep. Variable:              log_count   R-squared:                        0.4700
Estimator:                   PanelOLS   R-squared (Between):             -4.2081
No. Observations:                1044   R-squared (Within):               0.7352
Date:                Sun, Sep 28 2025   R-squared (Overall):             -4.1652
Time:                        19:14:16   Log-likelihood                    485.98
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      213.97
Entities:                          58   P-value                           0.0000
Avg Obs:                       18.000   Distribution:                   F(4,965)
Min Obs:                       18.000                                           
Max Obs:                       18.000   F-statistic (robust):             35.830
                            