In [1]:
#This is a project that constructs the data I used for my paper "Minorities in Dictatorship and Democracy".
#The paper investigates the relationship between the government representation 
#of ethnic minorities and the level of democracy.
#While not all the data made it into the paper, I am making the code public because 
#some data transformation and merging techniques can be useful to other researchers.
#In particular, I demonstrate how to transform 
#the Ethnic Power Relations Dataset (https://icr.ethz.ch/data/epr/core/) and 
#World Development Indicators (https://databank.worldbank.org/source/world-development-indicators)
#into a more user-friendly format that allows for easy analysis. 
#I also show how to merge cross-country datasets where country names have different spellings 
#(e.g. "USA" and "United States of America").
#Feel free to contact me at asamsonov94@ucla.edu for any questions!

In [None]:
#Import packages
import os
import pandas as pd
import numpy as np
import requests


#Replace the path with the actual directory on your computer.
#os.chdir changes the working directory of the whole kernel, so relative paths used after this line resolve against it.
os.chdir("/Users/asamsonov94/Desktop/minorities_empirics_2023")

In [None]:
#The EPR dataset (https://icr.ethz.ch/data/epr/core/) lists ethnic groups across the world over several decades and classifies their government representation.
#The original dataset is in the following format:
#country/ beginning year / end year / ethnic group / group size/ government status / other variables
#The code below transforms it into a more usable format needed for my research:
#country / year / number of minorities / share of minorities included in government / other variables


#Download the EPR dataset (https://icr.ethz.ch/data/epr/core/) in a csv format. Replace the path with the actual file name on your computer.

df = pd.read_csv("/Users/asamsonov94/Desktop/minorities_empirics_2023/EPR-2021.csv")

#Transform the dataframe into a list of dictionaries where each dictionary is a line in dataframe
list_of_dicts = df.to_dict("records")


#Smallest starting year and largest ending year found in the dataset
min_year = df['from'].min()
max_year = df['to'].max()

#Choose which statuses mean that a minority is included
included = ['MONOPOLY','DOMINANT','SENIOR PARTNER', 'JUNIOR PARTNER']

#Leave only countries with large majorities as in the paper:
#a country qualifies if at least one of its rows has a group with size >= 0.5
countries_with_large_majorities = {
    row["statename"] for row in list_of_dicts if row["size"] >= 0.5
}

#Build a new filtered list instead of calling list.remove() while looping over the same list.
#Removing during iteration makes the loop skip the element right after each removed one,
#so rows of non-qualifying countries could survive the filter.
list_of_dicts = [
    row for row in list_of_dicts
    if row["statename"] in countries_with_large_majorities
]


#The function below processes one dictionary (one row of the original dataframe) for a given year.
#It ignores the row if the year is outside the row's period or if the group is not a minority (size of 0.25 or more).
#Otherwise it updates, in place, a dictionary where key is a tuple (country, year)
#and value is a list [number of minorities, number of included minorities]. The function itself returns nothing.

        
def check_and_add(year,smalldict,bigdict):
    """Count one dataset row towards the minority tally of its country for ``year``.

    Parameters
    ----------
    year : int
        Year being tallied.
    smalldict : dict
        One row of the original dataframe; the keys "from", "to", "statename",
        "size" and "status" are read.
    bigdict : dict
        Running tally, updated in place. Keys are (statename, year) tuples and
        values are lists [number of minorities, number of included minorities].

    Returns
    -------
    None
        ``bigdict`` is left untouched when ``year`` is outside the inclusive
        interval [from, to] or when the group's size is 0.25 or more.
    """
    begin = smalldict["from"]
    end = smalldict["to"]
    statename = smalldict["statename"]

    if not (begin <= year <= end):
        return
    #Groups with a size of 0.25 or more are not counted as minorities
    if smalldict["size"] >= 0.25:
        return

    #`included` is the module-level list of statuses that count as inclusion in government
    is_included = smalldict["status"] in included

    #Plain integer arithmetic: no numpy arrays are needed to add two small counters,
    #and the stored values stay ordinary Python ints
    key = (statename, year)
    minorities, included_minorities = bigdict.get(key, (0, 0))
    bigdict[key] = [minorities + 1, included_minorities + is_included]
        
#Tally minorities for every country-year.
#check_and_add treats the "to" year as inclusive, so max_year is itself a valid year
#and the range has to include it (range() excludes its upper bound).
bigdict = {}
for year in range(min_year, max_year + 1):
    for row in list_of_dicts:
        check_and_add(year,row,bigdict)


#Turn the above dictionary into a list of lists where each list is a row for a future dataframe

data = [list(key) + list(counts) for key, counts in bigdict.items()]

#Build the dataframe from these rows

df1 = pd.DataFrame(data, columns = ['country', 'year','minorities','included_minorities'])

#Transform the "year" variable into an "int" format (one vectorized cast instead of a row-by-row loop)

df1['year'] = df1['year'].astype(int)

#compute the share of included minorities

df1['included_minorities_share'] = df1['included_minorities']/df1['minorities']

display(df1)

In [None]:
#The code below transforms the World Development Indicators dataset (https://databank.worldbank.org/source/world-development-indicators)
#into a more usable format.
#The original format is: 
#country / variable/ variable value in Year1/ variable value in Year2 .../
#The resulting dataframe is in the format:
#country / year / variable

#Download the WDI dataset (https://databank.worldbank.org/source/world-development-indicators) in a csv format. 
#Replace the path with the actual file name on your computer.

df2 = pd.read_csv("/Users/asamsonov94/Desktop/minorities_empirics_2023/wdi_1.csv")


#Remove redundant strings from column names that are years:
#a column whose name contains a year between 1960 and 2022 is renamed to just that year.
#The mapping is collected first and applied with a single rename.

year_labels = [str(year) for year in range(1960,2023)]
year_columns = {}
for col_name in df2.columns:
    for label in year_labels:
        if label in str(col_name):
            year_columns[col_name] = label
            #Only the first (lowest) matching year is used for a column
            break

df2 = df2.rename(columns=year_columns)

df2 = df2.rename(columns={'Country Name': 'country', 'Series Name': 'series'})
df2 = df2.drop(['Country Code','Series Code'], axis = 1)


#Drop rows without a country or series name and renumber the remaining rows 0..n-1.
#Code that addresses rows with .loc[i] for i in range(len(df)) only works when the index has no gaps.
df2 = df2.dropna(subset=['country','series']).reset_index(drop=True)


#Function returns a dictionary where key is a tuple (country, variable, year) and 
#value is respective table element. 

def lookup(df):
    """Flatten a wide country/series table into a dictionary of single cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have the columns 'country' and 'series'. Every column whose name
        is exactly four characters long is treated as a year column.

    Returns
    -------
    dict
        Maps (country, series, year) to the corresponding table element, where
        ``year`` is the column name. If a (country, series) pair occurs in
        several rows, the last row wins.
    """
    years = [col for col in df.columns if len(col) == 4]
    cells = {}
    #Walk the rows by position rather than by index label, so that frames whose index
    #has gaps (e.g. after dropna) or is not 0..n-1 are handled as well
    rows = zip(df['country'], df['series'], df[years].itertuples(index=False, name=None))
    for country, variable, values in rows:
        for year, value in zip(years, values):
            cells[(country, variable, year)] = value
    return cells

#Function uses the above dictionary to construct a dataframe.

def modify_df(df):
    """Reshape a wide country/series table into one row per country and year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have the columns 'country' and 'series'. Every column whose name
        is exactly four characters long is treated as a year column.

    Returns
    -------
    pandas.DataFrame
        Columns 'country', 'year' (the year column name) and one column per
        distinct value of 'series'. A series that is absent for a country is
        filled with NaN.
    """
    countries = df['country'].unique()
    variables = df['series'].unique()
    years = [col for col in df.columns if len(col) == 4]
    lookup_dict = lookup(df)

    records = []
    for country in countries:
        for year in years:
            record = {'country': country, 'year': year}
            for variable in variables:
                #Not every country has to report every series; store a missing value
                #instead of failing with KeyError on an unbalanced table
                record[variable] = lookup_dict.get((country, variable, year), np.nan)
            records.append(record)

    return pd.DataFrame(records)

#Apply the function to dataframe

df2 = modify_df(df2)

#Transform year into int format.
#The years come from column names, so they are strings. A single astype gives the column
#an integer dtype, whereas assigning ints cell by cell would leave the column as generic objects.

df2['year'] = df2['year'].astype(int)

display(df2)

In [None]:
#The EPR and WDI datasets often have different spelling of the same country (e.g. "USA" and "United States of America")
#The code below harmonizes country names and merges two datasets. The key idea is comparing Wikipedia output for two strings 
#that might be two names of the same country. The method needs further development, but has a high success rate.

#A function that gets the first Wikipedia article based on a search.

def get_first_wikipedia_article_url(s, timeout=10):
    """Return the URL of the first English Wikipedia search result for ``s``.

    Parameters
    ----------
    s : str
        Search string sent to the Wikipedia search API.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    str or None
        "https://en.wikipedia.org/wiki/<title>" with spaces in the title
        replaced by underscores, or None when the search has no results.

    Raises
    ------
    requests.RequestException
        On a timeout, a connection problem or an HTTP error status.
    """
    WIKIPEDIA_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": s,
        "utf8": 1,
        "formatversion": 2
    }

    #Without a timeout a stalled connection would block the notebook indefinitely
    response = requests.get(WIKIPEDIA_API_ENDPOINT, params=params, timeout=timeout)
    #Fail with a clear HTTP error instead of a confusing JSON/KeyError further down
    response.raise_for_status()
    data = response.json()

    # Check if search results are present
    search_results = data["query"]["search"]
    if not search_results:
        return None

    first_article_title = search_results[0]["title"]
    article_url = f"https://en.wikipedia.org/wiki/{first_article_title.replace(' ', '_')}"

    return article_url


#A function that accepts a string s and returns a list. The first element is the first Wikipedia article found by searching for "s".
#If s has more than one word, the other entries are the first search results for each space-separated word of s.


def wiki_output(s):
    """Collect Wikipedia search results for ``s`` and for each of its words.

    The returned list starts with the first article found for the whole
    string. When ``s`` consists of more than one whitespace-separated word,
    the first article found for every single word follows, in word order.
    """
    urls = [get_first_wikipedia_article_url(s)]
    words = s.split()
    #A single word (or an empty string) has already been covered by the full-string search
    if len(words) > 1:
        urls += [get_first_wikipedia_article_url(word) for word in words]
    return urls

#Score of similarity between string s1 and s2 based on Wikipedia output defined above.

def score(s1,s2,dict_of_country_titles):
    """Similarity of two names based on their Wikipedia search results.

    Parameters
    ----------
    s1, s2 : hashable
        Keys of ``dict_of_country_titles``.
    dict_of_country_titles : dict
        Maps a name to a non-empty list of article URLs; an entry is None
        when a search returned nothing.

    Returns
    -------
    int or float
        1 when both lists start with the same article; otherwise the share of
        (entry of s1, entry of s2) pairs that are equal. A None entry never
        counts as a match.
    """
    urls_1 = dict_of_country_titles[s1]
    urls_2 = dict_of_country_titles[s2]

    #None means "no article found"; two failed searches are not evidence that the names agree
    if urls_1[0] is not None and urls_1[0] == urls_2[0]:
        return 1

    pairs = len(urls_1) * len(urls_2)
    matches = sum(
        1
        for first in urls_1
        for second in urls_2
        if first is not None and first == second
    )
    return matches / pairs
    

#A function that matches country lists based on the score function defined above. The result is a dictionary
#where keys are country names from one list and values are alternative versions of these country names from 
#the other list.
    
def match_country_lists(l1,l2,dict_for_score,f):
    """Pair every name in ``l1`` with its best-scoring counterpart in ``l2``.

    ``f(x, y, dict_for_score)`` is evaluated for each pair of names. A name
    from ``l1`` is mapped to the name from ``l2`` with the highest score,
    the earliest one on ties, and is left out when no score is above 0.
    The names of both lists that ended up in no pair are printed before the
    mapping is returned.
    """
    left_names = list(l1)
    right_names = list(l2)

    matches = {}
    for name in left_names:
        best_candidate = None
        best_score = 0
        for candidate in right_names:
            candidate_score = f(name, candidate, dict_for_score)
            #Strict comparison: a later candidate must beat, not equal, the current best
            if candidate_score > best_score:
                best_candidate, best_score = candidate, candidate_score
        if best_candidate is not None:
            matches[name] = best_candidate

    #Report what was left without a partner on either side
    matched_left = set(matches)
    matched_right = set(matches.values())
    unmatched_left = [name for name in left_names if name not in matched_left]
    unmatched_right = [name for name in right_names if name not in matched_right]
    for leftovers in (unmatched_left, unmatched_right):
        print(leftovers)
        print(" ")
    return matches

#Functions that apply the matching of country lists to merging datasets 
#with different country spellings.

def dfs_to_dict(df1,df2):
    """Match the country names of ``df1`` to those of ``df2``.

    Both dataframes must have a 'country' column. The Wikipedia output is
    fetched once per distinct name and the two name lists are then matched
    with match_country_lists using score.

    Returns
    -------
    dict
        Maps names from ``df1`` to the matched names from ``df2``.
    """
    countries_in_df1 = df1['country'].unique()
    countries_in_df2 = df2['country'].unique()

    dict_of_country_titles = {}
    for country in list(countries_in_df1) + list(countries_in_df2):
        #A name spelled identically in both datasets needs only one round of Wikipedia queries
        if country not in dict_of_country_titles:
            dict_of_country_titles[country] = wiki_output(country)

    return match_country_lists(countries_in_df1,countries_in_df2,dict_of_country_titles,score)


def merge_harmonized_datasets(df1,df2,var1,var2,dict):
    """Rename the ``var1`` values of ``df1`` and inner-merge the result with ``df2``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to merge; both must have the columns ``var1`` and ``var2``.
    var1 : str
        Column whose values are harmonized (e.g. the country name).
    var2 : str
        Second merge key (e.g. the year).
    dict : dict
        Maps values of ``df1[var1]`` to the spelling used in ``df2``; values
        that are not keys of the mapping are kept as they are.

    Returns
    -------
    pandas.DataFrame
        Inner merge on [var1, var2]. ``df1`` itself is not modified.
    """
    #Work on a copy so the caller's frame keeps its original names and the call can be repeated safely
    harmonized = df1.copy()
    #Vectorized replacement; unlike .loc[i] over range(len(df1)) it does not depend on the index labels
    harmonized[var1] = harmonized[var1].map(
        lambda name: dict[name] if name in dict else name
    )
    return pd.merge(harmonized, df2, on=[var1, var2], how='inner')

#Creating and correcting a dictionary for specific datasets I was working with.
#The mapping gets its own name so that this cell does not rebind the built-in `dict` type.

country_name_map = dfs_to_dict(df1,df2)

#Manual overrides applied on top of the automatic matches
country_name_map.update({
    "Congo, Democratic Republic of (Zaire)": 'Congo, Dem. Rep.',
    "Congo": 'Congo, Rep.',
    "Italy/Sardinia": 'Italy',
    "Surinam": 'Suriname',
    'Czechoslovakia': 'Czechia',
})



df3 = merge_harmonized_datasets(df1,df2,'country','year',country_name_map)
display(df3)
