In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:

pop_df=pd.read_csv(r'C:\Users\Yasaman\Downloads\World_bank_population.csv',skiprows=3)
pop_df['Country Code']=pop_df['Country Code'].apply(lambda x: x.lower())
possible_countries=pop_df.query(" `2019` >=1000000")['Country Code'].values

excluded_iso3_codes = [
    "IRL",  # Ireland
    "SSD",  # South Sudan
    "SDN",  # Sudan
    "COG",  # Republic of the Congo
    "COD",  # Democratic Republic of the Congo
    "GIN",  # Guinea
    "GNB",  # Guinea-Bissau
    "GNQ",  # Equatorial Guinea
    "PNG",  # Papua New Guinea
    "XKX",  # Kosovo (unofficial)
    "MNE",  # Montenegro
    "SRB",  # Serbia
    "TLS",   # Timor-Leste
    "GEO", #Georgia
    'SWZ', 
    'PRK', #North Korea
]
excluded_iso3_codes=[c.lower() for c in excluded_iso3_codes]


possible_iso=list(set(possible_countries)-set(excluded_iso3_codes))

In [3]:

df = pd.read_csv(r"C:\Users\Yasaman\Downloads\Attention-fractional counting.csv")
df.rename(columns={'aggregated_value': 'count', 'country': 'Mention_country', 'affiliation_country': 'Aff_country'}, inplace=True)
df=df[(df['Mention_country'].isin(possible_iso))&(df['Aff_country'].isin(possible_iso)) &(df['Aff_country']==df['Mention_country'])]
df = df[df['year'].isin(np.arange(2002, 2020))]
Country_list={'Egypt':'EGY', 'Tunisia':'TUN','Libya':'LBY','Syria':'SYR','Yemen':'YEM','Bahrain':'BHR','Jordan':'JOR','Kuwait':'KWT','Morocco':'MAR','Oman':'OMN'}
rev_Country_list={Country_list[key]: key for key in Country_list}
abbr=[country.lower() for country in Country_list.values()]
physical_sciences=['MATH', 'ENGI', 'PHYS', 'COMP', 'MUL']
df=df[~df['subjarea'].isin(physical_sciences)]
df=df.groupby(['year', 'Mention_country'])['count'].sum().reset_index()


data=pd.read_csv(r"C:\Users\Yasaman\Downloads\scopus_2024_V1_scholarlymigration_country_enriched.csv")
data=data[data['year'].isin(np.arange(2002, 2020))]
data=data[['iso3code', 'incomelevel', 'gdp_per_capita', 'year', 'population', 'region', 'padded_population_of_researchers']].dropna()
data.rename(columns={'iso3code':'Mention_country'}, inplace=True)
data['Mention_country']=data['Mention_country'].apply(lambda x: x.lower())
df=df.merge(data, on=['Mention_country', 'year'], how='outer')
df=df[df['Mention_country'].isin(possible_iso)]


countries_to_remove=[]
for c  in df['Mention_country'].unique():
    if ((~df['count'].isna()) & (df['Mention_country'] == c)).sum()<15:
        countries_to_remove.append(c)
        print(c)

print(len(countries_to_remove))


# Define the required year range
required_years = list(range(2002, 2020))

# Get the unique countries
unique_countries = df["Mention_country"].unique()

# Create a complete DataFrame with all country-year combinations
full_data = []
for country in unique_countries:
    country_data = df[df["Mention_country"] == country]
    existing_years = set(country_data["year"])
    
    for year in required_years:
        if year in existing_years:
            row = country_data[country_data["year"] == year].iloc[0].to_dict()
        else:
            row = {
                "year": year,
                "Mention_country": country,
                "count": 0,
                "gdp_per_capita": np.nan,
                "population": np.nan,
                "region": country_data["region"].iloc[0] if not country_data.empty else np.nan,
            }
        full_data.append(row)

# Convert to DataFrame
df_complete = pd.DataFrame(full_data)

df_complete['treated']=df_complete['Mention_country'].isin(abbr).astype(int)
df_complete['treated_CW']=df_complete['Mention_country'].isin(['yem', 'lby', 'syr']).astype(int)
df_complete['treated_GO']=df_complete['Mention_country'].isin(['egy', 'tun']).astype(int)
df_complete['treated_GC']=df_complete['Mention_country'].isin(['omn', 'kwt', 'bhr', 'mar','jor']).astype(int)
df_complete['post']=df_complete['year'].apply(lambda x: 0 if x>=2002 and x<=2010 else 1 )
df_complete['count']=df_complete['count'].fillna(0)
df_complete['log_count']=np.log(df_complete['count']+1)

df_complete[['region', 'gdp_per_capita', 'population','padded_population_of_researchers']] = df_complete.groupby('Mention_country')[[ 'region', 'gdp_per_capita', 'population','padded_population_of_researchers']].ffill()
df_complete[[ 'region', 'gdp_per_capita', 'population','padded_population_of_researchers']] = df_complete.groupby('Mention_country')[[ 'region', 'gdp_per_capita', 'population','padded_population_of_researchers']].bfill()
df_complete['log_gdp']=np.log(df_complete['gdp_per_capita'])
df_complete['log_population']=np.log(df_complete['population'])
df_complete['log_Rpop']=np.log(df_complete['padded_population_of_researchers']+1)
df_complete=df_complete[df_complete['Mention_country'].isin(possible_iso)].reset_index(drop=True)


tkm
1


In [21]:
dataset=df_complete
variables = [ 'log_gdp', 'log_Rpop']
country_list=['lby', 'syr']
Means={}
for country in dataset['Mention_country'].unique():
    country_data = dataset[(dataset['Mention_country'] == country) &(dataset['year']<=2010)]
    if country_data.empty:
        continue
    country_means = country_data[variables].mean().to_dict()
    Means[country]=country_means


country_means=pd.DataFrame(Means).T.reset_index().rename(columns={'index':'Mention_country'})

In [24]:
arab_spring_countries=['lby', 'syr', 'tun', 'egy', 'yem', 'bhr', 'mar', 'kwt', 'omn', 'jor']

In [25]:
def find_match( country_list, n_match_per_country, variables, dataset): 
    matched_countries = []
    for country in country_list:
        country_data = dataset[(dataset['Mention_country'] == country) &(dataset['year']<=2010)]
        if country_data.empty:
            continue
        country_means = country_data[variables].mean()
        
        # Exclude already treated countries and those already matched
        potential_matches = dataset[
            (~dataset['Mention_country'].isin(country_list)) & 
            (~dataset['Mention_country'].isin(matched_countries))
        ]
        
        potential_matches_means = potential_matches.groupby('Mention_country')[variables].mean()
        possible_possible=list(set(possible_iso)-set(abbr))
        potential_matches_means=potential_matches_means[potential_matches_means.index.isin(possible_possible)]
        # Calculate Euclidean distances
        distances = np.linalg.norm(potential_matches_means - country_means, axis=1)
        
        # Get the top N matches
        top_matches = potential_matches_means.index[np.argsort(distances)[:n_match_per_country]]
        
        matched_countries.extend(top_matches)
    
    return sorted(set(matched_countries))


In [27]:
n_counts=5
cw_match=find_match(['lby', 'syr', 'yem'], n_counts, variables, dataset)
GO_match=find_match( ['tun', 'egy'], n_counts, variables, dataset)
GC_match=find_match( ['omn', 'kwt', 'bhr', 'mar','jor'], n_counts, variables, dataset)

In [30]:
cw_match

['alb',
 'bwa',
 'cri',
 'gab',
 'gmb',
 'jam',
 'kgz',
 'lao',
 'mmr',
 'mus',
 'nam',
 'pan',
 'pry',
 'tjk',
 'tto']

# Civil War

Matched=['alb',
 'bwa',
 'cri',
 'gab',
 'gmb',
 'jam',
 'kgz',
 'lao',
 'mmr',
 'mus',
 'nam',
 'pan',
 'pry',
 'tjk',
 'tto']

 

* parallel trend-satisfied!
* Effect with < 1%




In [34]:
df_new=df_complete[df_complete['Mention_country'].isin(['lby', 'syr', 'yem']+cw_match)]
model = smf.ols("log_count ~ treated* post +log_gdp+log_Rpop+C(Mention_country) + C(year)", data=df_new).fit(cov_type='cluster', cov_kwds={'groups': df_new['Mention_country']})
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_count   R-squared:                       0.936
Model:                            OLS   Adj. R-squared:                  0.927
Method:                 Least Squares   F-statistic:                     19.42
Date:                Wed, 03 Sep 2025   Prob (F-statistic):           6.55e-08
Time:                        23:02:31   Log-Likelihood:                 10.846
No. Observations:                 324   AIC:                             54.31
Df Residuals:                     286   BIC:                             198.0
Df Model:                          37                                         
Covariance Type:              cluster                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             



In [35]:
df_p=df_new[df_new['year']<2011].reset_index(drop=True)
df_p['time']=df_p['year']-2011
# Running the Difference-in-Differences regression
model = smf.ols("log_count ~ time+treated * time +log_gdp+log_Rpop+ C(Mention_country)", data=df_p).fit(cov_type='cluster', cov_kwds={'groups': df_p['Mention_country']})
# Print summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_count   R-squared:                       0.926
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     9.089
Date:                Wed, 03 Sep 2025   Prob (F-statistic):           0.000407
Time:                        23:02:42   Log-Likelihood:                -3.7153
No. Observations:                 162   AIC:                             51.43
Df Residuals:                     140   BIC:                             119.4
Df Model:                          21                                         
Covariance Type:              cluster                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             




# GO 

matches=['bgd', 'bgr', 'col', 'cub', 'dza', 'idn', 'nga', 'pak', 'ukr', 'vnm']

* Parallel trend satisfied, but effect not signficant. 

In [36]:
df_new=df_complete[df_complete['Mention_country'].isin(['egy', 'tun']+GO_match)]
# Running the Difference-in-Differences regression
model = smf.ols("log_count ~ treated* post +log_gdp+log_Rpop+C(Mention_country) + C(year)", data=df_new).fit(cov_type='cluster', cov_kwds={'groups': df_new['Mention_country']})
# Print summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_count   R-squared:                       0.962
Model:                            OLS   Adj. R-squared:                  0.956
Method:                 Least Squares   F-statistic:                     65.08
Date:                Wed, 03 Sep 2025   Prob (F-statistic):           2.22e-08
Time:                        23:02:56   Log-Likelihood:                 40.071
No. Observations:                 216   AIC:                            -16.14
Df Residuals:                     184   BIC:                             91.87
Df Model:                          31                                         
Covariance Type:              cluster                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             



In [37]:
df_p=df_new[df_new['year']<2011].reset_index(drop=True)
df_p['time']=df_p['year']-2011
# Running the Difference-in-Differences regression
model = smf.ols("log_count ~ time+treated * time +log_gdp+log_Rpop+ C(Mention_country)", data=df_p).fit(cov_type='cluster', cov_kwds={'groups': df_p['Mention_country']})
# Print summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_count   R-squared:                       0.979
Model:                            OLS   Adj. R-squared:                  0.976
Method:                 Least Squares   F-statistic:                     142.2
Date:                Wed, 03 Sep 2025   Prob (F-statistic):           2.17e-09
Time:                        23:03:06   Log-Likelihood:                 74.189
No. Observations:                 108   AIC:                            -116.4
Df Residuals:                      92   BIC:                            -73.46
Df Model:                          15                                         
Covariance Type:              cluster                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             



# GC

matches=['are',
 'arm',
 'aze',
 'blr',
 'bwa',
 'cri',
 'cyp',
 'dza',
 'est',
 'gab',
 'idn',
 'irq',
 'jam',
 'lka',
 'lva',
 'mus',
 'pan',
 'per',
 'phl',
 'pri',
 'qat',
 'svn',
 'tto',
 'ury',
 'vnm']


* Satisfied, but not significant
 

In [39]:
df_new=df_complete[df_complete['Mention_country'].isin(['omn', 'kwt', 'bhr', 'mar','jor']+GC_match)]
# Running the Difference-in-Differences regression
model = smf.ols("log_count ~ treated* post +log_gdp+log_Rpop+C(Mention_country) + C(year)", data=df_new).fit(cov_type='cluster', cov_kwds={'groups': df_new['Mention_country']})
# Print summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_count   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.954
Method:                 Least Squares   F-statistic:                     35.45
Date:                Wed, 03 Sep 2025   Prob (F-statistic):           3.07e-15
Time:                        23:03:41   Log-Likelihood:                 71.668
No. Observations:                 540   AIC:                            -43.34
Df Residuals:                     490   BIC:                             171.2
Df Model:                          49                                         
Covariance Type:              cluster                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             



In [40]:
df_p=df_new[df_new['year']<2011].reset_index(drop=True)
df_p['time']=df_p['year']-2011
# Running the Difference-in-Differences regression
model = smf.ols("log_count ~ time+treated * time +log_gdp+log_Rpop+ C(Mention_country)", data=df_p).fit(cov_type='cluster', cov_kwds={'groups': df_p['Mention_country']})
# Print summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_count   R-squared:                       0.955
Model:                            OLS   Adj. R-squared:                  0.949
Method:                 Least Squares   F-statistic:                     60.83
Date:                Wed, 03 Sep 2025   Prob (F-statistic):           1.10e-13
Time:                        23:03:54   Log-Likelihood:                 76.350
No. Observations:                 270   AIC:                            -84.70
Df Residuals:                     236   BIC:                             37.65
Df Model:                          33                                         
Covariance Type:              cluster                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

