# Complete scripts for reproduction

### The scripts below are for wrangling data within the individual spreadsheets for each LDC for years 1999 and 2001.


Create a new variable "emne" that is True if name of country isn't found in list of developed economies.
List of Developed Economies taken from UNCTAD LDC Report 2004 found here: https://unctad.org/system/files/official-document/ldc2004_en.pdf


In [41]:
import os
import pandas as pd

# Build the lookup set of developed-economy names from the supplementary CSV
developed_economies_file = "./data/supplementary_data/developed_economies_list.csv"
developed_economies_df = pd.read_csv(developed_economies_file)
developed_economies = set(developed_economies_df["country_name"])


# Function to process a single CSV file
def process_csv(file_path):
    """Add a boolean ``emne`` column to one country spreadsheet, in place.

    The CSV at ``file_path`` is read; if it has a ``home_economy`` column,
    every row gets ``emne = True`` unless its home economy is a member of the
    module-level ``developed_economies`` set, and the file is overwritten.
    A file without that column is left untouched and reported on stdout.
    """
    df = pd.read_csv(file_path)

    if "home_economy" not in df.columns:
        print(f"Column 'home_economy' not found in {file_path}")
        return

    # emne = emerging market multinational enterprise: any home economy that
    # is not on the developed-economies list
    df["emne"] = df["home_economy"].apply(
        lambda home: home not in developed_economies
    )
    df.to_csv(file_path, index=False)
    print(f"Processed {file_path}")


# Yearly folders holding one spreadsheet per country
directories = ["./data/country_spreadsheets/1999", "./data/country_spreadsheets/2001"]

# Run process_csv on every CSV in each folder; other entries are ignored
for directory in directories:
    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(directory, filename)
        process_csv(file_path)

print("Processed files and added emne column.")

Processed ./data/country_spreadsheets/1999/samoa_1999.csv
Processed ./data/country_spreadsheets/1999/afghanistan_1999.csv
Processed ./data/country_spreadsheets/1999/niger_1999.csv
Processed ./data/country_spreadsheets/1999/uganda_1999.csv
Processed ./data/country_spreadsheets/1999/madagascar_1999.csv
Processed ./data/country_spreadsheets/1999/somalia_1999.csv
Processed ./data/country_spreadsheets/1999/mozambique_1999.csv
Processed ./data/country_spreadsheets/1999/democratic_republic_of_congo_1999.csv
Processed ./data/country_spreadsheets/1999/benin_1999.csv
Processed ./data/country_spreadsheets/1999/haiti_1999.csv
Processed ./data/country_spreadsheets/1999/bhutan_1999.csv
Processed ./data/country_spreadsheets/1999/burundi_1999.csv
Processed ./data/country_spreadsheets/1999/comoros_1999.csv
Processed ./data/country_spreadsheets/1999/bangladesh_1999.csv
Processed ./data/country_spreadsheets/1999/sao_tome_and_principe_1999.csv
Processed ./data/country_spreadsheets/1999/zambia_1999.csv
Pro

Create new variable "nat_res_ind" based on ./data/supplementary_data/nat_resource_industries.csv


In [42]:
import pandas as pd
import os

# Yearly folders holding one spreadsheet per country
dir1 = "./data/country_spreadsheets/1999"
dir2 = "./data/country_spreadsheets/2001"

# Load the industry coding and normalise its flag column to real booleans:
# any value whose text form is "true" (case-insensitive) counts as True
nat_res_ind_path = "./data/supplementary_data/nat_resource_industries.csv"
nat_res_ind_df = pd.read_csv(nat_res_ind_path)
nat_res_ind_df["nat_resource"] = (
    nat_res_ind_df["nat_resource"].astype(str).str.lower() == "true"
)

# Whitespace-trimmed names of the industries flagged as natural-resource
nat_res_industries = set(
    nat_res_ind_df.loc[nat_res_ind_df["nat_resource"], "industry"].str.strip()
)


# Function to process CSV files with filtering for nat_resource == True
def process_csv_files(directory, nat_res_industries):
    """Add a boolean ``nat_res_ind`` column to every CSV in ``directory``.

    Each ``*.csv`` file is read, its ``industry`` values are trimmed of
    surrounding whitespace, and ``nat_res_ind`` is set to True for rows whose
    trimmed industry is a member of ``nat_res_industries``. The file is then
    overwritten in place. Files without an ``industry`` column are reported
    and left untouched.

    Args:
        directory: Folder containing the country spreadsheets.
        nat_res_industries: Collection of industry names counted as
            natural-resource industries.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)

        if "industry" not in df.columns:
            print(f"Column 'industry' not found in {file_path}")
            continue

        # astype(str) turns a missing industry into the text "nan". Remember
        # which cells were empty so that placeholder is used for matching
        # only and is never written back into the spreadsheet.
        missing = df["industry"].isna()
        cleaned = df["industry"].astype(str).str.strip()

        df["nat_res_ind"] = cleaned.apply(lambda x: x in nat_res_industries)
        df["industry"] = cleaned.mask(missing)

        df.to_csv(file_path, index=False)
        print(f"Processed {file_path}")


# Tag both yearly folders using the natural-resource industry set
for year_directory in (dir1, dir2):
    process_csv_files(year_directory, nat_res_industries)

print("Processed files and added nat_res_ind column.")

Processed ./data/country_spreadsheets/1999/samoa_1999.csv
Processed ./data/country_spreadsheets/1999/afghanistan_1999.csv
Processed ./data/country_spreadsheets/1999/niger_1999.csv
Processed ./data/country_spreadsheets/1999/uganda_1999.csv
Processed ./data/country_spreadsheets/1999/madagascar_1999.csv
Processed ./data/country_spreadsheets/1999/somalia_1999.csv
Processed ./data/country_spreadsheets/1999/mozambique_1999.csv
Processed ./data/country_spreadsheets/1999/democratic_republic_of_congo_1999.csv
Processed ./data/country_spreadsheets/1999/benin_1999.csv
Processed ./data/country_spreadsheets/1999/haiti_1999.csv
Processed ./data/country_spreadsheets/1999/bhutan_1999.csv
Processed ./data/country_spreadsheets/1999/burundi_1999.csv
Processed ./data/country_spreadsheets/1999/comoros_1999.csv
Processed ./data/country_spreadsheets/1999/bangladesh_1999.csv
Processed ./data/country_spreadsheets/1999/sao_tome_and_principe_1999.csv
Processed ./data/country_spreadsheets/1999/zambia_1999.csv
Pro

Create new variable former_col_power based on coding in ./data/supplementary_data/colonial_rulers_list.csv


In [43]:
import os
import pandas as pd

# Yearly folders holding one spreadsheet per country
dir1999 = "./data/country_spreadsheets/1999"
dir2001 = "./data/country_spreadsheets/2001"

# Map each country to its "colonial_rulers" entry from the coding sheet
colonial_rulers_path = "./data/supplementary_data/colonial_rulers_list.csv"
colonial_rulers_df = pd.read_csv(colonial_rulers_path)
colonial_rulers_dict = dict(
    zip(colonial_rulers_df["country"], colonial_rulers_df["colonial_rulers"])
)


# Function to process CSV files and add former_col_power column
def process_csv_files(directory, colonial_rulers_dict):
    """Add a boolean ``former_col_power`` column to the CSVs in ``directory``.

    The country is taken from the file name (everything before the last
    underscore, i.e. ``<country>_<year>.csv``) and looked up in
    ``colonial_rulers_dict``. Rows whose ``home_economy`` equals that entry
    are flagged True, and the file is overwritten in place.

    Files are left untouched, and a message is printed, when the country has
    no (or an empty) entry in ``colonial_rulers_dict`` or when the spreadsheet
    lacks a ``home_economy`` column.

    Args:
        directory: Folder containing the country spreadsheets.
        colonial_rulers_dict: Mapping of country name (as used in the file
            names) to the former colonial power's name.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(directory, filename)
        # Extract the country name by removing the last underscore and year
        country = "_".join(filename.split("_")[:-1])
        former_colonial_power = colonial_rulers_dict.get(country)

        if not former_colonial_power:
            # Report the skip instead of passing over the file silently, so a
            # name mismatch between file names and the coding sheet is visible.
            print(
                f"No former colonial power listed for '{country}'; "
                f"skipped {file_path}"
            )
            continue

        df = pd.read_csv(file_path)
        if "home_economy" not in df.columns:
            print(f"Column 'home_economy' not found in {file_path}")
            continue

        # NOTE(review): exact string equality; assumes the coding sheet holds
        # a single ruler spelled like the home_economy values. Confirm.
        df["former_col_power"] = df["home_economy"].apply(
            lambda x: x == former_colonial_power
        )
        df.to_csv(file_path, index=False)
        print(f"Processed {file_path}")


# Flag former colonial powers in both yearly folders
for year_directory in (dir1999, dir2001):
    process_csv_files(year_directory, colonial_rulers_dict)

print("Processed files and added former_col_power column.")

Processed ./data/country_spreadsheets/1999/samoa_1999.csv
Processed ./data/country_spreadsheets/1999/afghanistan_1999.csv
Processed ./data/country_spreadsheets/1999/niger_1999.csv
Processed ./data/country_spreadsheets/1999/uganda_1999.csv
Processed ./data/country_spreadsheets/1999/madagascar_1999.csv
Processed ./data/country_spreadsheets/1999/somalia_1999.csv
Processed ./data/country_spreadsheets/1999/mozambique_1999.csv
Processed ./data/country_spreadsheets/1999/democratic_republic_of_congo_1999.csv
Processed ./data/country_spreadsheets/1999/benin_1999.csv
Processed ./data/country_spreadsheets/1999/haiti_1999.csv
Processed ./data/country_spreadsheets/1999/bhutan_1999.csv
Processed ./data/country_spreadsheets/1999/burundi_1999.csv
Processed ./data/country_spreadsheets/1999/comoros_1999.csv
Processed ./data/country_spreadsheets/1999/bangladesh_1999.csv
Processed ./data/country_spreadsheets/1999/sao_tome_and_principe_1999.csv
Processed ./data/country_spreadsheets/1999/zambia_1999.csv
Pro

Create new variable "bordering_country" based on the coding in ./data/supplementary_data/bordering_countries.csv. The variable is True if the MNE's home economy is one of the host country's bordering countries.


In [44]:
import os
import pandas as pd

# Yearly folders holding one spreadsheet per country
dir1999 = "./data/country_spreadsheets/1999"
dir2001 = "./data/country_spreadsheets/2001"

# Load the bordering-countries coding sheet
bordering_countries_path = "./data/supplementary_data/bordering_countries.csv"
bordering_countries_df = pd.read_csv(bordering_countries_path)

# country -> list of its neighbours in lower snake_case. The neighbours are
# stored as one ", "-separated string; rows whose cell is not a string
# (e.g. empty) get no entry at all.
bordering_countries_dict = {
    country: [name.replace(" ", "_").lower() for name in borders.split(", ")]
    for country, borders in zip(
        bordering_countries_df["country"],
        bordering_countries_df["bordering_countries"],
    )
    if isinstance(borders, str)
}


# Function to convert Title Case to snake_case
def to_snake_case(s):
    """Lower-case ``s`` and turn every space into an underscore."""
    lowered = s.lower()
    return "_".join(lowered.split(" "))


# Function to process CSV files and add bordering_country column
def process_csv_files(directory, bordering_countries_dict):
    """Add a boolean ``bordering_country`` column to the CSVs in ``directory``.

    The host country is taken from the file name (everything before the last
    underscore, i.e. ``<country>_<year>.csv``). A row is flagged True when its
    ``home_economy``, converted with ``to_snake_case``, appears in that
    country's entry of ``bordering_countries_dict``; countries without an
    entry get an all-False column. Each file is overwritten in place, and
    files without a ``home_economy`` column are reported and left untouched.

    Args:
        directory: Folder containing the country spreadsheets.
        bordering_countries_dict: Mapping of country name (as used in the
            file names) to a list of snake_case neighbour names.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        # Extract the country name by removing the last underscore and year
        country = "_".join(filename.split("_")[:-1])
        bordering_countries = bordering_countries_dict.get(country, [])

        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)
        if "home_economy" not in df.columns:
            print(f"Column 'home_economy' not found in {file_path}")
            continue

        # A blank home_economy cell is read as float NaN, which has no
        # .lower(); treat any non-string value as "not a bordering country"
        # instead of crashing inside to_snake_case.
        df["bordering_country"] = df["home_economy"].apply(
            lambda x: isinstance(x, str)
            and to_snake_case(x) in bordering_countries
        )
        df.to_csv(file_path, index=False)
        print(f"Processed {file_path}")


# Flag bordering home economies in both yearly folders
for year_directory in (dir1999, dir2001):
    process_csv_files(year_directory, bordering_countries_dict)

print("Processed files and added bordering_country column.")

Processed ./data/country_spreadsheets/1999/samoa_1999.csv
Processed ./data/country_spreadsheets/1999/afghanistan_1999.csv
Processed ./data/country_spreadsheets/1999/niger_1999.csv
Processed ./data/country_spreadsheets/1999/uganda_1999.csv
Processed ./data/country_spreadsheets/1999/madagascar_1999.csv
Processed ./data/country_spreadsheets/1999/somalia_1999.csv
Processed ./data/country_spreadsheets/1999/mozambique_1999.csv
Processed ./data/country_spreadsheets/1999/democratic_republic_of_congo_1999.csv
Processed ./data/country_spreadsheets/1999/benin_1999.csv
Processed ./data/country_spreadsheets/1999/haiti_1999.csv
Processed ./data/country_spreadsheets/1999/bhutan_1999.csv
Processed ./data/country_spreadsheets/1999/burundi_1999.csv
Processed ./data/country_spreadsheets/1999/comoros_1999.csv
Processed ./data/country_spreadsheets/1999/bangladesh_1999.csv
Processed ./data/country_spreadsheets/1999/sao_tome_and_principe_1999.csv
Processed ./data/country_spreadsheets/1999/zambia_1999.csv
Pro

### Working with master spreadsheets for 1999 and 2001


Create master spreadsheet for 1999


In [45]:
import os
import pandas as pd

# Directory where the 1999 country spreadsheets are located
directory = "./data/country_spreadsheets/1999"

# os.listdir() returns entries in arbitrary, platform-dependent order; sort
# the file names so the rows of the master spreadsheet come out in the same
# order on every machine and every run.
csv_files = sorted(f for f in os.listdir(directory) if f.endswith(".csv"))

# One dict per country (a row of the master spreadsheet), plus the names of
# files found without a 'former_col_power' column
results = []
missing_former_col_power_files = []


# Function to calculate the proportions
def calculate_proportions(df, col):
    """Return the share of rows in ``df`` whose ``col`` value is True.

    Args:
        df: DataFrame of MNE rows (possibly already filtered).
        col: Name of the boolean indicator column to evaluate, e.g. "emne".

    Returns:
        The fraction of rows with ``df[col] == True`` as a float, or ``None``
        when ``df`` has no rows (the proportion is undefined).
    """
    total_count = len(df)
    if total_count == 0:
        return None
    # Honour ``col`` rather than a hard-coded "emne". The explicit == True
    # keeps missing values from being counted as True.
    true_count = int((df[col] == True).sum())
    return true_count / total_count


# Summarise every country spreadsheet into one row of the master table
for file in csv_files:
    file_path = os.path.join(directory, file)
    df = pd.read_csv(file_path)

    # Country name = file name up to (not including) the last underscore
    country = file.rpartition("_")[0]

    # Share of EMNEs among all MNEs, and whether the country has none at all
    prop_emnes = calculate_proportions(df, "emne")
    zero_mnes = len(df) == 0

    # Same two measures after dropping natural-resource industries; without
    # the flag column the unfiltered values are reused
    if "nat_res_ind" in df.columns:
        non_nat_res_df = df[df["nat_res_ind"] == False]
        prop_emnes_excl_nat_res = calculate_proportions(non_nat_res_df, "emne")
        zero_mnes_excl_nat_res = len(non_nat_res_df) == 0
    else:
        prop_emnes_excl_nat_res = prop_emnes
        zero_mnes_excl_nat_res = zero_mnes

    # Same two measures after dropping MNEs from the former colonial power;
    # files lacking the flag column are remembered for the report below
    if "former_col_power" in df.columns:
        non_colonial_df = df[df["former_col_power"] == False]
        prop_emnes_excl_former_col_power = calculate_proportions(
            non_colonial_df, "emne"
        )
        zero_mnes_excl_former_col_power = len(non_colonial_df) == 0
    else:
        prop_emnes_excl_former_col_power = prop_emnes
        zero_mnes_excl_former_col_power = zero_mnes
        missing_former_col_power_files.append(file)

    results.append(
        {
            "country": country,
            "prop_emnes": prop_emnes,
            "prop_emnes_excl_nat_res": prop_emnes_excl_nat_res,
            "prop_emnes_excl_former_col_power": prop_emnes_excl_former_col_power,
            "zero_mnes": zero_mnes,
            "zero_mnes_excl_nat_res": zero_mnes_excl_nat_res,
            "zero_mnes_excl_former_col_power": zero_mnes_excl_former_col_power,
        }
    )

# Assemble the rows and write the master spreadsheet
master_df = pd.DataFrame(results)
output_path = "./ldcs_1999_master.csv"
master_df.to_csv(output_path, index=False)

print(f"Master CSV file created at: {output_path}")

# Report any spreadsheets that had no 'former_col_power' column
if not missing_former_col_power_files:
    print("No files were missing the 'former_col_power' column.")
else:
    print("The following files were missing the 'former_col_power' column:")
    for missing_file in missing_former_col_power_files:
        print(missing_file)

Master CSV file created at: ./ldcs_1999_master.csv
No files were missing the 'former_col_power' column.


Create master spreadsheet for 2001


In [46]:
import os
import pandas as pd

# Directory where the 2001 country spreadsheets are located
directory = "./data/country_spreadsheets/2001"

# os.listdir() returns entries in arbitrary, platform-dependent order; sort
# the file names so the rows of the master spreadsheet come out in the same
# order on every machine and every run.
csv_files = sorted(f for f in os.listdir(directory) if f.endswith(".csv"))

# One dict per country (a row of the master spreadsheet), plus the names of
# files found without a 'former_col_power' column
results = []
missing_former_col_power_files = []


# Function to calculate the proportions
def calculate_proportions(df, col):
    """Return the share of rows in ``df`` whose ``col`` value is True.

    Args:
        df: DataFrame of MNE rows (possibly already filtered).
        col: Name of the boolean indicator column to evaluate, e.g. "emne".

    Returns:
        The fraction of rows with ``df[col] == True`` as a float, or ``None``
        when ``df`` has no rows (the proportion is undefined).
    """
    total_count = len(df)
    if total_count == 0:
        return None
    # Honour ``col`` rather than a hard-coded "emne". The explicit == True
    # keeps missing values from being counted as True.
    true_count = int((df[col] == True).sum())
    return true_count / total_count


# Summarise every country spreadsheet into one row of the master table
for file in csv_files:
    file_path = os.path.join(directory, file)
    df = pd.read_csv(file_path)

    # Country name = file name up to (not including) the last underscore
    country = file.rpartition("_")[0]

    # Share of EMNEs among all MNEs, and whether the country has none at all
    prop_emnes = calculate_proportions(df, "emne")
    zero_mnes = len(df) == 0

    # Same two measures after dropping natural-resource industries; without
    # the flag column the unfiltered values are reused
    if "nat_res_ind" in df.columns:
        non_nat_res_df = df[df["nat_res_ind"] == False]
        prop_emnes_excl_nat_res = calculate_proportions(non_nat_res_df, "emne")
        zero_mnes_excl_nat_res = len(non_nat_res_df) == 0
    else:
        prop_emnes_excl_nat_res = prop_emnes
        zero_mnes_excl_nat_res = zero_mnes

    # Same two measures after dropping MNEs from the former colonial power;
    # files lacking the flag column are remembered for the report below
    if "former_col_power" in df.columns:
        non_colonial_df = df[df["former_col_power"] == False]
        prop_emnes_excl_former_col_power = calculate_proportions(
            non_colonial_df, "emne"
        )
        zero_mnes_excl_former_col_power = len(non_colonial_df) == 0
    else:
        prop_emnes_excl_former_col_power = prop_emnes
        zero_mnes_excl_former_col_power = zero_mnes
        missing_former_col_power_files.append(file)

    results.append(
        {
            "country": country,
            "prop_emnes": prop_emnes,
            "prop_emnes_excl_nat_res": prop_emnes_excl_nat_res,
            "prop_emnes_excl_former_col_power": prop_emnes_excl_former_col_power,
            "zero_mnes": zero_mnes,
            "zero_mnes_excl_nat_res": zero_mnes_excl_nat_res,
            "zero_mnes_excl_former_col_power": zero_mnes_excl_former_col_power,
        }
    )

# Assemble the rows and write the master spreadsheet
master_df = pd.DataFrame(results)
output_path = "./ldcs_2001_master.csv"
master_df.to_csv(output_path, index=False)

print(f"Master CSV file created at: {output_path}")

# Report any spreadsheets that had no 'former_col_power' column
if not missing_former_col_power_files:
    print("No files were missing the 'former_col_power' column.")
else:
    print("The following files were missing the 'former_col_power' column:")
    for missing_file in missing_former_col_power_files:
        print(missing_file)

Master CSV file created at: ./ldcs_2001_master.csv
No files were missing the 'former_col_power' column.


Create geographic_proximity variable in master spreadsheets. Variable will be True if any of the MNEs in that country are from a neighbouring country.
Create colonial_link variables in master spreadsheets. Variable will be True if any of the MNEs in that country are from a former colonial power.


Create new variables for ldcs_1999_master.csv


In [47]:
import pandas as pd
import os

# Load the 1999 master spreadsheet that receives the new columns
master_df = pd.read_csv("./ldcs_1999_master.csv")

# Folder with the per-country spreadsheets carrying the row-level flags
country_files_dir = "./data/country_spreadsheets/1999"

# One entry per master row, appended in row order
geographic_proximity = []
colonial_link = []

# Countries whose spreadsheet could not be located
missing_files = []

for country_name in master_df["country"]:
    country_file_name = f"{country_name.lower()}_1999.csv"
    country_file_path = os.path.join(country_files_dir, country_file_name)

    if not os.path.exists(country_file_path):
        # No spreadsheet: both flags default to False and the country is noted
        geographic_proximity.append(False)
        colonial_link.append(False)
        missing_files.append(country_name)
        continue

    country_df = pd.read_csv(country_file_path)

    # True when at least one MNE row is flagged as coming from a bordering
    # country; a spreadsheet without the column counts as False
    geo_prox = (
        "bordering_country" in country_df.columns
        and country_df["bordering_country"].any()
    )

    # True when at least one MNE row is flagged as coming from the former
    # colonial power; a spreadsheet without the column counts as False
    col_link = (
        "former_col_power" in country_df.columns
        and country_df["former_col_power"].any()
    )

    geographic_proximity.append(geo_prox)
    colonial_link.append(col_link)

# Attach the collected flags and overwrite the master spreadsheet
master_df["geographic_proximity"] = geographic_proximity
master_df["colonial_link"] = colonial_link
master_df.to_csv("./ldcs_1999_master.csv", index=False)

# Report countries for which no spreadsheet was found
if not missing_files:
    print("All country files were found and processed.")
else:
    print("The following country files were not found:")
    for country in missing_files:
        print(country)

print("Master spreadsheet has been updated with new columns.")

All country files were found and processed.
Master spreadsheet has been updated with new columns.


Create new variables for ldcs_2001_master.csv


In [48]:
import pandas as pd
import os

# Load the 2001 master spreadsheet that receives the new columns
master_df = pd.read_csv("./ldcs_2001_master.csv")

# Folder with the per-country spreadsheets carrying the row-level flags
country_files_dir = "./data/country_spreadsheets/2001"

# One entry per master row, appended in row order
geographic_proximity = []
colonial_link = []

# Countries whose spreadsheet could not be located
missing_files = []

for country_name in master_df["country"]:
    country_file_name = f"{country_name.lower()}_2001.csv"
    country_file_path = os.path.join(country_files_dir, country_file_name)

    if not os.path.exists(country_file_path):
        # No spreadsheet: both flags default to False and the country is noted
        geographic_proximity.append(False)
        colonial_link.append(False)
        missing_files.append(country_name)
        continue

    country_df = pd.read_csv(country_file_path)

    # True when at least one MNE row is flagged as coming from a bordering
    # country; a spreadsheet without the column counts as False
    geo_prox = (
        "bordering_country" in country_df.columns
        and country_df["bordering_country"].any()
    )

    # True when at least one MNE row is flagged as coming from the former
    # colonial power; a spreadsheet without the column counts as False
    col_link = (
        "former_col_power" in country_df.columns
        and country_df["former_col_power"].any()
    )

    geographic_proximity.append(geo_prox)
    colonial_link.append(col_link)

# Attach the collected flags and overwrite the master spreadsheet
master_df["geographic_proximity"] = geographic_proximity
master_df["colonial_link"] = colonial_link
master_df.to_csv("./ldcs_2001_master.csv", index=False)

# Report countries for which no spreadsheet was found
if not missing_files:
    print("All country files were found and processed.")
else:
    print("The following country files were not found:")
    for country in missing_files:
        print(country)

print("Master spreadsheet has been updated with new columns.")

All country files were found and processed.
Master spreadsheet has been updated with new columns.


Add governance variables from supplementary data folder. Data entered into spreadsheets manually from Kaufmann (2003). 1998 data used for 1999 observation year/spreadsheet. 2000 data used for 2001 observation year/spreadsheet (i.e., 1 year lags).

In [49]:
import pandas as pd

# Master spreadsheets and the governance tables that feed them
ldcs_1999_master_file = "./ldcs_1999_master.csv"
governance_variables_1998_file = "./data/supplementary_data/governance_variables_1998.csv"
ldcs_2001_master_file = "./ldcs_2001_master.csv"
governance_variables_2000_file = "./data/supplementary_data/governance_variables_2000.csv"

# Read each year's master spreadsheet together with its governance table
ldcs_1999_master_df, governance_variables_1998_df = (
    pd.read_csv(ldcs_1999_master_file),
    pd.read_csv(governance_variables_1998_file),
)
ldcs_2001_master_df, governance_variables_2000_df = (
    pd.read_csv(ldcs_2001_master_file),
    pd.read_csv(governance_variables_2000_file),
)


# Function to merge only if columns do not already exist
def merge_if_columns_absent(master_df, governance_df, key="country"):
    """Left-merge governance columns that the master frame does not have yet.

    Only the columns of ``governance_df`` that are absent from ``master_df``
    are merged. A master frame that already holds some of them therefore
    keeps its existing values and never ends up with duplicated
    ``<name>_x`` / ``<name>_y`` columns.

    Args:
        master_df: Master spreadsheet, one row per country.
        governance_df: Governance variables, joined on ``key``.
        key: Name of the join column present in both frames.

    Returns:
        A new merged DataFrame, or ``master_df`` itself (after printing a
        notice) when every governance column is already present.
    """
    missing_columns = [
        column
        for column in governance_df.columns
        if column != key and column not in master_df.columns
    ]
    if not missing_columns:
        print("Governance variables already exist in the master dataframe.")
        return master_df

    # how="left" keeps every master row; countries without governance data
    # get missing values in the new columns.
    return pd.merge(
        master_df, governance_df[[key, *missing_columns]], on=key, how="left"
    )


# Add the 1998 governance variables to the 1999 master and the 2000
# variables to the 2001 master (skipped when already present)
merged_1999_df = merge_if_columns_absent(
    ldcs_1999_master_df, governance_variables_1998_df
)
merged_2001_df = merge_if_columns_absent(
    ldcs_2001_master_df, governance_variables_2000_df
)

# Overwrite both master spreadsheets with the merged versions
output_file_1999 = "./ldcs_1999_master.csv"
output_file_2001 = "./ldcs_2001_master.csv"
for merged_df, output_file in (
    (merged_1999_df, output_file_1999),
    (merged_2001_df, output_file_2001),
):
    merged_df.to_csv(output_file, index=False)

print(f"Merged 1999 dataset saved to {output_file_1999}")
print(f"Merged 2001 dataset saved to {output_file_2001}")

Merged 1999 dataset saved to ./ldcs_1999_master.csv
Merged 2001 dataset saved to ./ldcs_2001_master.csv


Add WDI control variables from supplementary data folder. Data entered into spreadsheet manually from World Bank (2004). Data are for 2002.

In [50]:
import pandas as pd

# Master spreadsheets and the WDI control-variable table
ldcs_1999_master_file = "./ldcs_1999_master.csv"
ldcs_2001_master_file = "./ldcs_2001_master.csv"
wdi_variables_2002_file = "./data/supplementary_data/wdi_variables_2002.csv"

# Read both masters and the WDI table
ldcs_1999_master_df, ldcs_2001_master_df, wdi_variables_2002_df = (
    pd.read_csv(ldcs_1999_master_file),
    pd.read_csv(ldcs_2001_master_file),
    pd.read_csv(wdi_variables_2002_file),
)


# Function to merge WDI variables with master dataframe if they do not already exist
def merge_if_columns_absent(master_df, wdi_df, key="country"):
    """Left-merge WDI columns that the master frame does not have yet.

    Only the columns of ``wdi_df`` that are absent from ``master_df`` are
    merged. A master frame that already holds some of them therefore keeps
    its existing values and never ends up with duplicated ``<name>_x`` /
    ``<name>_y`` columns.

    Args:
        master_df: Master spreadsheet, one row per country.
        wdi_df: WDI control variables, joined on ``key``.
        key: Name of the join column present in both frames.

    Returns:
        A new merged DataFrame, or ``master_df`` itself (after printing a
        notice) when every WDI column is already present.
    """
    missing_columns = [
        column
        for column in wdi_df.columns
        if column != key and column not in master_df.columns
    ]
    if not missing_columns:
        print("WDI variables already exist in the master dataframe.")
        return master_df

    # how="left" keeps every master row; countries without WDI data get
    # missing values in the new columns.
    return pd.merge(master_df, wdi_df[[key, *missing_columns]], on=key, how="left")


# Add the 2002 WDI variables to both masters (skipped when already present)
merged_1999_df = merge_if_columns_absent(ldcs_1999_master_df, wdi_variables_2002_df)
merged_2001_df = merge_if_columns_absent(ldcs_2001_master_df, wdi_variables_2002_df)

# Overwrite both master spreadsheets with the merged versions
output_file_1999 = "./ldcs_1999_master.csv"
output_file_2001 = "./ldcs_2001_master.csv"
for merged_df, output_file in (
    (merged_1999_df, output_file_1999),
    (merged_2001_df, output_file_2001),
):
    merged_df.to_csv(output_file, index=False)

print(f"Merged 1999 dataset with WDI variables saved to {output_file_1999}")
print(f"Merged 2001 dataset with WDI variables saved to {output_file_2001}")

Merged 1999 dataset with WDI variables saved to ./ldcs_1999_master.csv
Merged 2001 dataset with WDI variables saved to ./ldcs_2001_master.csv


Create "year" variable in each of the master spreadsheets (1999 and 2001) and merge into panel dataset.

In [51]:
import pandas as pd

# The two yearly master spreadsheets that make up the panel
file_1999 = "./ldcs_1999_master.csv"
file_2001 = "./ldcs_2001_master.csv"
df_1999, df_2001 = pd.read_csv(file_1999), pd.read_csv(file_2001)

# Stamp every row with its observation year
df_1999["year"], df_2001["year"] = 1999, 2001

# Stack 1999 on top of 2001 with a fresh 0..n-1 index and write the panel
output_file = "./main_dataset.csv"
main_dataset = pd.concat([df_1999, df_2001], ignore_index=True)
main_dataset.to_csv(output_file, index=False)

print(f"Merged dataset saved to {output_file}")

Merged dataset saved to ./main_dataset.csv


Convert Booleans in main_dataset.csv to zeros and ones for Stata.

In [53]:
import pandas as pd

# Read the panel dataset
dataset_path = "./main_dataset.csv"
df = pd.read_csv(dataset_path)

# Recode the two link indicators from True/False to 1/0
df = df.astype({"geographic_proximity": int, "colonial_link": int})

# Write the recoded dataset back to the same file
modified_dataset_path = "./main_dataset.csv"
df.to_csv(modified_dataset_path, index=False)

print("Boolean variables converted and dataset saved.")

Boolean variables converted and dataset saved.


Move "year" column next to "country" column in main_dataset.csv, then sort by country, year.

In [54]:
import pandas as pd

# Read the panel dataset
main_dataset_file = "./main_dataset.csv"
main_df = pd.read_csv(main_dataset_file)

# Take 'year' out of the column list and re-insert it one slot after the
# position 'country' held in the original column order
country_col_index = main_df.columns.get_loc("country")
columns = main_df.columns.tolist()
columns.remove("year")
columns.insert(country_col_index + 1, "year")

# Apply the new column order, then order the rows by country and year
main_df = main_df[columns].sort_values(by=["country", "year"])

# Write the result back to the same file
output_file = "./main_dataset.csv"
main_df.to_csv(output_file, index=False)

print(f"Updated dataset saved to {output_file}")

Updated dataset saved to ./main_dataset.csv


Panel dataset created. Stata .do files are in the project "do_files" folder, labeled:
- model_1b_replication_do_file.do
- model_2b_replication_do_file.do
- model_3b_replication_do_file.do

Models created using Stata, with output logs stored in the "results" folder. A summary of the results and their differences from the original study's findings is in the "summaries" folder.