Import LDCs' greenfield FDI data for the years 2017-2018. The list of LDCs is taken from UNCTAD (2017, 2018). Greenfield data were obtained from the FT fDi Markets database and capture all transactions over USD 500,000. 2017_2018_greenfield_data_fdi_markets_original.csv is saved as "greenfield_main_dataset.csv", with all variable names and all country names converted to snake_case to facilitate data wrangling.

In [2]:
import pandas as pd

# Read the raw fDi Markets export into a DataFrame
fdi_data = "./raw_data/2017_2018_greenfield_data_fdi_markets_original.csv"
df = pd.read_csv(filepath_or_buffer=fdi_data)

# Helper used to normalise column headers and country names
def to_snake_case(s):
    """Return ``s`` stripped, lower-cased, with spaces and hyphens as underscores."""
    trimmed = s.strip()
    # One pass over the string: both separators map to "_"
    underscored = trimmed.translate(str.maketrans({" ": "_", "-": "_"}))
    return underscored.lower()

# Normalise every column header so later cells can rely on snake_case names
df.columns = df.columns.map(to_snake_case)

# Normalise the country names held in the two country columns
for column in ("destination_country", "source_country"):
    df[column] = df[column].apply(to_snake_case)

# Write the normalised data out as the main processed dataset
output_filepath = "./processed_data/greenfield_main_dataset.csv"
df.to_csv(path_or_buf=output_filepath, index=False)

print(f"File saved to {output_filepath}")


FileNotFoundError: [Errno 2] No such file or directory: './raw_data/2017_2018_greenfield_data_fdi_markets_original.csv'

Create spreadsheet of unique destination countries for data auditing/validation.

In [2]:
import pandas as pd

# Read the processed greenfield dataset
filepath = "./processed_data/greenfield_main_dataset.csv"
df = pd.read_csv(filepath)

# Count how many rows each destination country has, then turn the result
# into a two-column table
destination_counts = df["destination_country"].value_counts()
unique_destination_countries = destination_counts.reset_index()
unique_destination_countries.columns = ["country", "count"]

# Order alphabetically by country and renumber the rows
unique_destination_countries = (
    unique_destination_countries
    .sort_values(by="country")
    .reset_index(drop=True)
)

# Write the audit table to disk.
# NOTE: the file name is misspelt ("destinationa"); it is kept as-is because
# the comparison cell reads this exact path.
output_filepath = "./audit_data/unique_destinationa_countries.csv"
unique_destination_countries.to_csv(output_filepath, index=False)

print(f"File saved to {output_filepath}")

# Display results
unique_destination_countries

File saved to ./audit_data/unique_destinationa_countries.csv


Unnamed: 0,country,count
0,afghanistan,3
1,angola,11
2,bangladesh,47
3,benin,8
4,bhutan,6
5,burkina_faso,8
6,burundi,4
7,cambodia,92
8,central_african_republic,2
9,chad,2


Compare list of destination countries with UNCTAD 2017-2018 LDCs list for data audit/validation purposes.

In [3]:
import pandas as pd

# Read the destination-country audit table and the LDC reference list
destination_countries_df = pd.read_csv(
    filepath_or_buffer="./audit_data/unique_destinationa_countries.csv"
)
ldcs_df = pd.read_csv(
    filepath_or_buffer="./variable_data/ldcs_list_2017_2018.csv"
)

# Helper that converts a country name to snake_case
def to_snake_case(s):
    """Strip ``s``, turn spaces and hyphens into underscores, and lower-case it."""
    # Treat hyphens as spaces, then rejoin the space-separated pieces with "_"
    pieces = s.strip().replace("-", " ").split(" ")
    return "_".join(pieces).lower()

# Normalise the country column of both tables before comparing them
for frame in (destination_countries_df, ldcs_df):
    frame["country"] = frame["country"].apply(to_snake_case)

# Build sets so the two lists can be compared with set arithmetic
destination_countries_set = set(destination_countries_df["country"])
ldcs_set = set(ldcs_df["country"])

# Names present in one list but absent from the other, in alphabetical order
missing_in_destination_countries_list = sorted(
    ldcs_set.difference(destination_countries_set)
)
missing_in_ldcs_list = sorted(destination_countries_set.difference(ldcs_set))

# Display results: one heading per direction, followed by one name per line
for heading, names in (
    (
        "LDCs not appearing as desintation countries in greenfield_main_dataset.csv:",
        missing_in_destination_countries_list,
    ),
    (
        "Countries appearing as destination countires in greenfield_main_dataset.csv but not appearing in LDCs list:",
        missing_in_ldcs_list,
    ),
):
    print(heading)
    for country in names:
        print(country)



LDCs not appearing as desintation countries in greenfield_main_dataset.csv:
comoros
eritrea
guinea_bissau
kiribati
sao_tome_and_principe
solomon_islands
south_sudan
tuvalu
vanuatu
yemen
Countries appearing as destination countires in greenfield_main_dataset.csv but not appearing in LDCs list:


LDCs not appearing in greenfield FDI are consistent with those for which there was no data in fDi Markets for those years.

Create spreadsheet of unique source countries for data auditing/validation.

In [4]:
import pandas as pd

# Read the processed greenfield dataset
filepath = "./processed_data/greenfield_main_dataset.csv"
df = pd.read_csv(filepath)

# Count how many rows each source country has, then turn the result into a
# two-column table
source_counts = df["source_country"].value_counts()
unique_source_countries = source_counts.reset_index()
unique_source_countries.columns = ["country", "count"]

# Order alphabetically by country and renumber the rows
unique_source_countries = (
    unique_source_countries
    .sort_values(by="country")
    .reset_index(drop=True)
)

# Write the audit table to disk
output_filepath = "./audit_data/unique_source_countries.csv"
unique_source_countries.to_csv(output_filepath, index=False)

print(f"File saved to {output_filepath}")

# Display results
unique_source_countries

File saved to ./audit_data/unique_source_countries.csv


Unnamed: 0,country,count
0,australia,4
1,austria,2
2,azerbaijan,1
3,barbados,1
4,belgium,6
...,...,...
58,uae,26
59,ukraine,4
60,united_kingdom,28
61,united_states,49


Compare list of source countries with UNCTAD 2017-2018 developed country list for data audit/validation purposes.

In [5]:
import pandas as pd

# Read the source-country audit table and the developed-country reference list
source_countries_df = pd.read_csv(
    filepath_or_buffer="./audit_data/unique_source_countries.csv"
)
developed_countries_df = pd.read_csv(
    filepath_or_buffer="./variable_data/developed_countries_2017_2018.csv"
)


# Helper that converts a country name to snake_case
def to_snake_case(s):
    """Strip ``s``, lower-case it, and turn spaces and hyphens into underscores."""
    lowered = s.strip().lower()
    without_spaces = lowered.replace(" ", "_")
    return without_spaces.replace("-", "_")


# Normalise the country column of both tables before comparing them
for frame in (source_countries_df, developed_countries_df):
    frame["country"] = frame["country"].apply(to_snake_case)

# Build sets so the two lists can be compared with set arithmetic
source_countries_set = set(source_countries_df["country"])
developed_countries_set = set(developed_countries_df["country"])

# Developed countries that never appear as a source country
missing_in_source_countries = developed_countries_set.difference(
    source_countries_set
)

# Display results, one name per line in alphabetical order
print("Developed countries not appearing as source countries in greenfield_main_dataset.csv:")
for country in sorted(missing_in_source_countries):
    print(country)

Developed countries not appearing as source countries in greenfield_main_dataset.csv:
andorra
bermuda
bulgaria
czechia
estonia
faroe_islands
gibraltar
greece
greenland
holy_see
hungary
iceland
ireland
latvia
lithuania
luxembourg
malta
poland
romania
saint_pierre_and_miquelon
san_marino
slovakia
slovenia
sweden


Developed countries not appearing in greenfield_main_dataset were checked manually against unique_source_countries.csv to ensure no matches were missed due to spelling or punctuation issues.

Create "developed" variable in greenfield_main_dataset.csv. Variable takes value of True if source country appears in list of developed countries and False otherwise. Also created "emne" variable that takes the inverse value of the "developed" variable.

In [6]:
import pandas as pd

# Load spreadsheets into DataFrames
greenfield_filepath = "./processed_data/greenfield_main_dataset.csv"
developed_countries_filepath = "./variable_data/developed_countries_2017_2018.csv"

greenfield_df = pd.read_csv(greenfield_filepath)
developed_countries_df = pd.read_csv(developed_countries_filepath)

# Create set of developed countries.
# "source_country" was converted to snake_case when greenfield_main_dataset.csv
# was first written, so the reference names are normalised the same way here.
# Without this, a name such as "United States" would never match
# "united_states" and the row would be flagged as EMNE by mistake. The
# conversion leaves names that are already snake_case unchanged.
developed_countries_set = {
    name.strip().replace(" ", "_").replace("-", "_").lower()
    for name in developed_countries_df["country"].dropna()
}

# Add "developed" variable to greenfield_df: True when the source country is
# in the developed-country list
greenfield_df["developed"] = greenfield_df["source_country"].isin(
    developed_countries_set
)

# Add "emne" variable to greenfield_df: the exact inverse of "developed"
greenfield_df["emne"] = ~greenfield_df["developed"]

# Save the updated DataFrame as greenfield_main_dataset.csv (overwrites the input)
greenfield_df.to_csv(greenfield_filepath, index=False)

print('greenfield_main_dataset.csv updated with "developed" and "emne" variables.')


greenfield_main_dataset.csv updated with "developed" and "emne" variables.


Create "bordering_country" variable. The variable is True if the source country shares a border with the destination country and False otherwise. Bordering countries were checked against the CIA World Factbook: https://www.cia.gov/the-world-factbook/

In [7]:
import pandas as pd

# Read the main dataset and the table of bordering countries
greenfield_filepath = "./processed_data/greenfield_main_dataset.csv"
bordering_countries_filepath = "./variable_data/bordering_countries_2017_2018.csv"

greenfield_df = pd.read_csv(greenfield_filepath)
bordering_countries_df = pd.read_csv(bordering_countries_filepath)

# Map each "ldc" value to the content of its "bordering_countries" cell
bordering_countries_dict = dict(
    zip(
        bordering_countries_df["ldc"],
        bordering_countries_df["bordering_countries"],
    )
)

# Check whether the source country is listed as a neighbour of the destination
def is_bordering(source_country, destination_country):
    """Return True when ``source_country`` is among the destination's neighbours.

    The neighbours are looked up in ``bordering_countries_dict`` and are
    expected as a single ", "-separated string. A destination that is absent
    from the dictionary, or whose entry is missing or empty, yields False.
    """
    neighbours = bordering_countries_dict.get(destination_country, "")
    if not pd.isna(neighbours) and neighbours:
        return source_country in neighbours.split(", ")
    return False

# Evaluate is_bordering row by row to build the "bordering_country" column
greenfield_df["bordering_country"] = greenfield_df.apply(
    lambda record: is_bordering(
        record["source_country"], record["destination_country"]
    ),
    axis=1,
)

# Write the result back over the main dataset file
greenfield_df.to_csv(path_or_buf=greenfield_filepath, index=False)

print('The new variable "bordering country" has been added and the updated file has been saved.')


The new variable "bordering country" has been added and the updated file has been saved.


Create new "industry" variable by joining "sector", "sub_sector", and "activity". Create list/spreadsheet of all unique values in "industry" column (and their count) from greenfield_main_dataset.csv.

In [8]:
import pandas as pd

# Read the processed greenfield dataset
filepath = "./processed_data/greenfield_main_dataset.csv"
df = pd.read_csv(filepath_or_buffer=filepath)

# Helper that converts an industry label to snake_case
def to_snake_case(s):
    """Return a lower-case, underscore-separated version of ``s``.

    Spaces and hyphens become underscores, "&" becomes "and", and commas,
    slashes and parentheses are removed. The string is not stripped.
    """
    # None of the replacement texts contains a character that is itself
    # replaced, so a single translation pass gives the same result as
    # chaining the replacements one after another.
    substitutions = str.maketrans(
        {
            " ": "_",
            "&": "and",
            ",": "",
            "/": "",
            "-": "_",
            "(": "",
            ")": "",
        }
    )
    return s.translate(substitutions).lower()

# Build the 'industry' text by joining 'sector', 'sub_sector' and 'activity'
# with single spaces; missing parts are treated as empty strings
industry_parts = df[["sector", "sub_sector", "activity"]].fillna("")
joined_industry = industry_parts.apply(lambda parts: " ".join(parts), axis=1)

# Store the snake_case form of the joined text as the new column
df["industry"] = joined_industry.apply(to_snake_case)

# Write the dataset back over the input file
df.to_csv(filepath, index=False)

# Tabulate how often each industry value occurs
industry_counts = df["industry"].value_counts().reset_index()
industry_counts.columns = ["industry", "count"]

# Order the table alphabetically by industry
industry_counts = industry_counts.sort_values(by="industry")

# Write the counts table to the audit folder
industry_counts_filepath = "./audit_data/industry_counts.csv"
industry_counts.to_csv(industry_counts_filepath, index=False)

print(
    f"Updated dataset has been saved to {filepath}.\n"
    f"        Industry counts has been saved to {industry_counts_filepath}"
)

Updated dataset has been saved to ./processed_data/greenfield_main_dataset.csv.
        Industry counts has been saved to ./audit_data/industry_counts.csv


Create "natural_resource_ind" variable. The variable is True if the industry is listed as a natural resource industry in the "nat_resource_list.csv" spreadsheet and False otherwise.

In [9]:
import pandas as pd

# Read the main dataset and the natural-resource classification list
main_dataset_path = "./processed_data/greenfield_main_dataset.csv"
nat_resource_list_path = "./variable_data/nat_resource_list.csv"

main_df = pd.read_csv(main_dataset_path)
nat_resource_df = pd.read_csv(nat_resource_list_path)

# Collect the industries whose "natural_resource" flag equals True
is_flagged = nat_resource_df["natural_resource"] == True
natural_resource_set = set(nat_resource_df.loc[is_flagged, "industry"])

# Flag each row of the main dataset whose industry is in that set
main_df["natural_resource_ind"] = main_df["industry"].apply(
    lambda industry: industry in natural_resource_set
)

# Write the result back over the main dataset file
main_df.to_csv(main_dataset_path, index=False)

print(f"The modified dataset has been saved to {main_dataset_path}")

The modified dataset has been saved to ./processed_data/greenfield_main_dataset.csv


Create "colonial_link" variable. The variable is True if the source country is listed in the "colonial_rulers" column for the destination country in "colonial_rulers_list.csv" and False otherwise.

In [10]:
import pandas as pd

# Read the main dataset and the colonial rulers list
main_dataset_path = "./processed_data/greenfield_main_dataset.csv"
colonial_rulers_path = "./variable_data/colonial_rulers_list.csv"

main_df = pd.read_csv(main_dataset_path)
colonial_rulers_df = pd.read_csv(colonial_rulers_path)

# Map each "country" value to the content of its "colonial_rulers" cell
colonial_rulers_dict = dict(
    zip(colonial_rulers_df["country"], colonial_rulers_df["colonial_rulers"])
)

# Check whether a row's source country is a former ruler of its destination
def has_colonial_link(row):
    """Return True when the row's source country is a listed colonial ruler.

    The destination's rulers are looked up in ``colonial_rulers_dict`` and are
    expected as a ", "-separated string. Each name is converted to snake_case
    before being compared with ``row["source_country"]``. Any entry that is
    not a string (including a missing one) yields False.
    """
    rulers = colonial_rulers_dict.get(row["destination_country"])
    if not isinstance(rulers, str):
        return False

    normalised_rulers = []
    for ruler in rulers.split(", "):
        cleaned = ruler.strip().replace(" ", "_").replace("-", "_").lower()
        normalised_rulers.append(cleaned)
    return row["source_country"] in normalised_rulers

# Evaluate has_colonial_link row by row to build the "colonial_link" column
colonial_link_flags = main_df.apply(has_colonial_link, axis=1)
main_df["colonial_link"] = colonial_link_flags

# Write the updated DataFrame back over the main dataset file
main_df.to_csv(path_or_buf=main_dataset_path, index=False)

print(f"Updated dataset saved to {main_dataset_path}")

Updated dataset saved to ./processed_data/greenfield_main_dataset.csv


Create 2017 and 2018 spreadsheets for each individual LDC

In [11]:
import pandas as pd
import os

# Read the main dataset and the LDC reference list
main_dataset_path = "./processed_data/greenfield_main_dataset.csv"
ldcs_list_path = "./variable_data/ldcs_list_2017_2018.csv"

main_df = pd.read_csv(filepath_or_buffer=main_dataset_path)
ldcs_df = pd.read_csv(filepath_or_buffer=ldcs_list_path)


# Helper that converts a country name to snake_case
def to_snake_case(s):
    """Strip ``s``, turn spaces and hyphens into underscores, and lower-case it."""
    result = s.strip()
    for separator in (" ", "-"):
        result = result.replace(separator, "_")
    return result.lower()

# Convert country names in both DataFrames to snake_case.
# Series.apply returns a new Series, so the result has to be assigned back;
# calling it without assignment leaves the columns unchanged.
main_df["destination_country"] = main_df["destination_country"].apply(to_snake_case)
ldcs_df["country"] = ldcs_df["country"].apply(to_snake_case)

# Convert "project_date" to datetime for easier filtering
main_df["project_date"] = pd.to_datetime(
    main_df["project_date"], format="%b %Y"
)

# Extract year from "project_date"
main_df["year"] = main_df["project_date"].dt.year

# Create one output directory per year for the country spreadsheets
output_dir = "./country_spreadsheets/greenfield"
target_years = [2017, 2018]

for year in target_years:
    os.makedirs(os.path.join(output_dir, str(year)), exist_ok=True)

# List to keep track of empty spreadsheets
empty_spreadsheets = []

# Process each country in ldcs_df
for country in ldcs_df["country"]:
    for year in target_years:
        # Filter data for the specific country and year
        country_year_data = main_df[
            (main_df["destination_country"] == country)
            & (main_df["year"] == year)
        ]

        # Create filename
        year_dir = os.path.join(output_dir, str(year))
        filename = f"{year_dir}/{country}_greenfield_{year}.csv"

        # An empty selection still carries every column of main_df, so the
        # file is written with the full header row and no data rows.
        country_year_data.to_csv(filename, index=False)
        if country_year_data.empty:
            empty_spreadsheets.append(filename)

# Identify rows not assigned to any country spreadsheet, i.e. rows whose
# destination country is not in the LDC list
is_ldc_destination = main_df["destination_country"].isin(ldcs_df["country"])
assigned_rows = main_df[is_ldc_destination]
unassigned_rows = main_df[~is_ldc_destination]

# Save unassigned rows to a CSV file
audit_dir = "./audit_data"
os.makedirs(audit_dir, exist_ok=True)
unassigned_filename = f"{audit_dir}/unassigned_rows.csv"
unassigned_rows.to_csv(unassigned_filename, index=False)

# Save list of empty spreadsheets to a CSV file
empty_spreadsheets_filename = f"{audit_dir}/empty_spreadsheets.csv"
pd.DataFrame(empty_spreadsheets, columns=["empty_spreadsheets"]).to_csv(
    empty_spreadsheets_filename, index=False
)

print("Country spreadsheets created and saved to \"country_spreadsheets/greenfield\"")


Country spreadsheets created and saved to "country_spreadsheets/greenfield"


Create master spreadsheets for 2017 and 2018 that show, for each LDC, the greenfield FDI total, the total excluding natural resource industries, and the total excluding firms from former colonial powers, together with the same three totals restricted to EMNE investors.

In [12]:
import pandas as pd
import os

# Folder that holds the per-country spreadsheets, the years to process
# (kept as strings), and the folder for the master spreadsheets
base_dir = "./country_spreadsheets"
years = [str(calendar_year) for calendar_year in (2017, 2018)]
output_dir = "./processed_data"

# One list of per-country summary records for each year
data_2017, data_2018 = [], []

# Summarise one country-year spreadsheet into a single record
def process_file(filepath, country, year):
    """Read a country-year CSV and return its capital-investment totals.

    The CSV must contain the columns "capital_investment",
    "natural_resource_ind", "colonial_link" and "emne". ``country`` and
    ``year`` are copied into the result unchanged.

    Returns a dict with the overall sum of "capital_investment", the sum
    excluding natural-resource rows, the sum excluding colonial-link rows,
    and the same three sums restricted to rows where "emne" is True.
    """
    df = pd.read_csv(filepath)
    investment = df["capital_investment"]

    # Row selectors shared by the totals below
    not_nat_res = df["natural_resource_ind"] == False
    no_col_link = df["colonial_link"] == False
    is_emne = df["emne"] == True

    return {
        "country": country,
        "year": year,
        "total_greenfield": investment.sum(),
        # NOTE(review): this key is misspelt ("greenefield"). It is kept
        # because it becomes a column name in the master spreadsheets.
        "total_greenefield_excl_nat_res": investment[not_nat_res].sum(),
        "total_greenfield_excl_col_link": investment[no_col_link].sum(),
        "emne_greenfield": investment[is_emne].sum(),
        "emne_greenfield_excl_nat_res": investment[is_emne & not_nat_res].sum(),
        "emne_greenfield_excl_col_link": investment[is_emne & no_col_link].sum(),
    }

# Walk through each year's folder and summarise every CSV found in it
for year in years:
    year_dir = os.path.join(base_dir, "greenfield", year)
    # Anything that is not "2017" is collected under 2018
    yearly_records = data_2017 if year == "2017" else data_2018
    for file_name in os.listdir(year_dir):
        if not file_name.endswith(".csv"):
            continue

        # The country name is every "_"-separated part of the file name
        # except the last two
        parts = file_name.split("_")
        country = "_".join(parts[:-2])

        filepath = os.path.join(year_dir, file_name)
        data = process_file(filepath, country, year)
        yearly_records.append(data)

# One table per year, ordered by country
df_2017 = pd.DataFrame(data_2017).sort_values(by=["country"])
df_2018 = pd.DataFrame(data_2018).sort_values(by=["country"])

# Write both master spreadsheets
master_2017_path = os.path.join(output_dir, "master_dataset_2017.csv")
master_2018_path = os.path.join(output_dir, "master_dataset_2018.csv")
df_2017.to_csv(master_2017_path, index=False)
df_2018.to_csv(master_2018_path, index=False)

print("Master spreadsheets created successfully.")



Master spreadsheets created successfully.


### Add governance variables/data from World Governance Indicators spreadsheet found here: https://www.worldbank.org/content/dam/sites/govindicators/doc/wgidataset.xlsx

Standardize country names to the same format as the main datasets for data wrangling. Names were checked manually and are listed in 'standardized' format in "./variable_data/wgi_data/supplementary_data/name_conversions_wgi.csv".

Filter WGI variables down to LDCs and check for any missing LDC countries.

In [80]:
import pandas as pd

# Read the name-conversion table and the LDC reference list
name_conversions = pd.read_csv("./variable_data/wgi_data/supplementary_data/name_conversions_wgi.csv")
ldcs_df = pd.read_csv("./variable_data/ldcs_list_2017_2018.csv")

# Map each old name to its new name, ignoring rows that contain any NaN
complete_conversions = name_conversions.dropna()
name_conversions_dict = dict(
    zip(complete_conversions["old_name"], complete_conversions["new_name"])
)

# Map each raw WGI file (key) to the path its processed version is written to
filepaths = {
    f"./raw_data/wgi_raw_data/{variable}.csv": f"variable_data/wgi_data/wgi_variables/{variable}.csv"
    for variable in (
        "voice_and_acc",
        "pol_stability",
        "govt_effectiveness",
        "reg_quality",
        "rule_of_law",
        "control_of_corruption",
    )
}

# Helper that converts a name to snake_case
def to_snake_case(s):
    """Strip ``s``, turn spaces and hyphens into underscores, and lower-case it."""
    separators = str.maketrans(" -", "__")
    return s.strip().translate(separators).lower()

# Keep only LDC rows and report which LDCs were not found
def filter_and_check(df, ldcs_set):
    """Return the rows of ``df`` whose "country" is in ``ldcs_set``.

    Returns a tuple ``(filtered_df, missing_countries)`` where
    ``missing_countries`` is the set of names in ``ldcs_set`` that have no
    row in ``df``.
    """
    is_ldc = df["country"].isin(ldcs_set)
    filtered_df = df.loc[is_ldc]
    missing_countries = ldcs_set.difference(filtered_df["country"])
    return filtered_df, missing_countries

# Standardise the country names of one file and save the result
def convert_country_names(input_path, output_path, name_conversions_dict):
    """Read ``input_path``, standardise its "country" column, and save it.

    Each name is replaced by its entry in ``name_conversions_dict`` when one
    exists (otherwise kept as it is) and then converted to snake_case. The
    resulting table is written to ``output_path`` and also returned.
    """
    df = pd.read_csv(input_path)

    def standardise(name):
        # Fall back to the original name when no conversion is listed
        return to_snake_case(name_conversions_dict.get(name, name))

    df["country"] = df["country"].apply(standardise)
    df.to_csv(output_path, index=False)
    return df


# Put the LDC names into snake_case and collect them in a set
ldcs_df["country"] = ldcs_df["country"].apply(to_snake_case)
ldcs_set = set(ldcs_df["country"])

# Missing LDCs per output file; files with nothing missing get no entry
missing_countries_dict = {}

# Standardise each raw WGI file, keep only the LDC rows, and write those rows
# to the output path (replacing the unfiltered file written by
# convert_country_names)
for input_path, output_path in filepaths.items():
    df = convert_country_names(input_path, output_path, name_conversions_dict)
    filtered_df, missing_countries = filter_and_check(df, ldcs_set)
    if missing_countries:
        missing_countries_dict[output_path] = list(missing_countries)
    filtered_df.to_csv(output_path, index=False)

# Report the outcome
if not missing_countries_dict:
    # All LDCs were found in every file
    print("All LDCs were found in every file.")
else:
    # One column per file, listing the LDCs that file lacks
    missing_countries_df = pd.DataFrame(
        {path: pd.Series(names) for path, names in missing_countries_dict.items()}
    )
    print("Missing Countries in Each File:")
    print(missing_countries_df)

All LDCs were found in every file.


Add WGI variables to master datasets (2017 and 2018).
2016 year data added to "./processed_data/master_dataset_2017.csv"
2017 year data added to "./processed_data/master_dataset_2018.csv"

In [81]:
import pandas as pd

# Read the master dataset of each year, keyed by that year
master_datasets = {
    dataset_year: pd.read_csv(f"./processed_data/master_dataset_{dataset_year}.csv")
    for dataset_year in (2017, 2018)
}

# Paths of the processed WGI variable files
wgi_files = [
    f"./variable_data/wgi_data/wgi_variables/{variable}.csv"
    for variable in (
        "voice_and_acc",
        "pol_stability",
        "govt_effectiveness",
        "reg_quality",
        "rule_of_law",
        "control_of_corruption",
    )
]
# Derive a variable name from a file path
def extract_var_name(filepath):
    """Return the part of ``filepath`` after the last "/", without ".csv"."""
    _, _, file_name = filepath.rpartition("/")
    return file_name.replace(".csv", "")

# Attach one WGI variable for one year to a master dataset
def merge_wgi_data(master_df, wgi_file, var_name, year):
    """Left-merge the ``{year}_estimate`` column of ``wgi_file`` into ``master_df``.

    The column is renamed to ``{var_name}_{year}`` and joined on "country".
    When ``master_df`` already has a column of that name it is returned
    unchanged. The WGI file is read in either case.
    """
    source_column = f"{year}_estimate"
    target_column = f"{var_name}_{year}"

    wgi_df = pd.read_csv(wgi_file)[["country", source_column]]
    wgi_df = wgi_df.rename(columns={source_column: target_column})

    # Skip the merge when the variable is already present
    if target_column in master_df.columns:
        return master_df
    return master_df.merge(wgi_df, on="country", how="left")

# Add every WGI variable to each master dataset, using the estimate of the
# year before the dataset's own year
for year in list(master_datasets):
    merged = master_datasets[year]
    for wgi_file in wgi_files:
        var_name = extract_var_name(wgi_file)
        merged = merge_wgi_data(merged, wgi_file, var_name, year - 1)
    master_datasets[year] = merged

# Write each updated master dataset back to its own file
for year, master_df in master_datasets.items():
    master_df.to_csv(f"./processed_data/master_dataset_{year}.csv", index=False)

print("Datasets have been updated and saved.")

Datasets have been updated and saved.


# Misc. housekeeping code.

In [1]:
import pandas as pd

# Input file to clean and the path the cleaned copy is written to
input_filepath = "./variable_data/bordering_countries_2017_2018.csv"
output_filepath = "./variable_data/bordering_countries_2017_2018_fixed.csv"

bordering_countries_old = pd.read_csv(filepath_or_buffer=input_filepath)


# Helper that converts a country name to snake_case
def to_snake_case(country_name):
    """Return ``country_name`` lower-cased with separators as underscores.

    Apostrophes, spaces, hyphens and commas each become "_". Missing values
    (anything ``pd.isna`` reports as missing) are returned unchanged.
    """
    if pd.isna(country_name):  # Handle missing values
        return country_name
    separators = str.maketrans("' -,", "____")
    return country_name.translate(separators).lower()


# Check if the "bordering_countries" column exists
if "bordering_countries" in bordering_countries_old.columns:

    def _normalise_border_list(cell):
        """Convert each name in a ", "-separated cell to snake_case."""
        # Leave missing cells missing; str(NaN) would otherwise write the
        # literal text "nan" into the output.
        if pd.isna(cell):
            return cell
        return ", ".join(
            to_snake_case(country) for country in str(cell).split(", ") if country
        )

    bordering_countries_old["bordering_countries"] = bordering_countries_old[
        "bordering_countries"
    ].apply(_normalise_border_list)
else:
    print("The 'bordering_countries' column is not found in the CSV file.")

# Save the updated DataFrame to a new CSV file. index=False matches the other
# exports in this notebook and avoids adding an unnamed index column.
bordering_countries_old.to_csv(output_filepath, index=False)
print(f"File with snake_case countries saved to {output_filepath}.")

FileNotFoundError: [Errno 2] No such file or directory: './variable_data/bordering_countries_2017_2018.csv'

Create "nat_resource_list_m_and_a.csv" using tsicp variable from m_and_a_main_dataset.