# HOW TO USE

**Notes**:

- To use this script please put the new benchmark data file (li_benchmark_data.csv) inside ./prev_lib_data
- Rename the file with version (please continue the file version li_benchmark_data_v[int].csv. The purpose of the versioning is we want to keep the prev benchmark value

In [1]:
import pandas as pd
import re
import json
import glob
import re

# Function

In [2]:
# Function to clean and convert to float
def convert_to_float(value):
    if isinstance(value, str):  # Check if it's a string
        value = value.replace(" ", "").replace(",", ".")  # Remove spaces and replace "," with "."
        return float(value)
    return value  # Return original if it's not a string

In [3]:
def remove_extra_whitespace(val):
    temp = val.split()
    return ' '.join(temp)

In [4]:
def find_country(val):
    # manual match
    if val == "Turkey":
        val = "Turkiye"
    # if val == "Anguilla":
    #     print("404", val)
    #     return None
    if val == "Central African Rep.":
        val = "Central African Republic"
    if val == "Congo, Dem. Rep. of the":
        val = "Congo, Dem. Rep."
    if val == "Congo, Rep. of":
        val = "Congo, Rep."
    if val == "Côte d'Ivoire":
        val = "Cote d'Ivoire"
    if val == "Curaçao, Kingdom of the Netherlands":
        val = "Curacao"
    if val == "Czech Rep.":
        val = "Czech Republic"
    if val == "Dominican Rep.":
        val = "Dominican Republic"
    if val == "Egypt, Arab Rep. of":
        val = "Egypt, Arab Rep."
    # if val == "Guadeloupe":
    #     print("404", val)
    #     return None
    if val == "Iran, Islamic Rep. of":
        val = "Iran, Islamic Rep."
    if val == "Korea, Rep. of":
        val = "Korea, Rep."
    if val == "Kyrgyz Rep.":
        val = "Kyrgyz Republic"
    if val == "Lao People's Dem. Rep.":
        val = "Lao PDR"
    # if val == "Martinique":
    #     print("404", val)
    #     return None
    if val == "Micronesia, Federated States of":
        val = "Micronesia, Fed. Sts."
    # if val == "Montserrat":
    #     print("404", val)
    #     return None
    # if val == "San Marino, Rep. of":
    #     print("404", val)
    #     return None
    # if val == "São Tomé and Príncipe, Dem. Rep. of":
    #     print("404", val)
    #     return None
    if val == "Sint Maarten, Kingdom of the Netherlands":
        val = "Sint Maarten (Dutch part)"
    if val == "Slovak Rep.":
        val = "Slovak Republic"
    # if val == "Syrian Arab Rep.":
    #     print("404", val)
    #     return None
    if val == "Venezuela, Rep. Bolivariana de":
        val = "Venezuela, RB"
    if val == "Yemen, Rep. of":
        val = "Yemen, Rep."
    # EOL manual match
    
    find_country = countries[countries["country"].str.lower() == str(val).lower()]
    if find_country.empty:
        # Another custom way
        val_lower = str(val).lower()
        val_tmp = val_lower.lower().split(',')
        for index, row in countries.iterrows():
            # Convert the country name in the DataFrame to lowercase
            country_name = row['country'].lower()
            if val_lower == 'nan':
                return None
            if country_name not in val_tmp:
                continue
            return row["id"]
        # EOL Another custom way
        print("404", val)
        return val
    return find_country["id"].to_list()[0]

In [5]:
# Function to melt data and adjust the year
def melt_conversion_data(df, currency_name):
    # Melt the data: 'Country Name' stays, everything else is melted
    melted_df = pd.melt(df, id_vars=['Country Name'], var_name='year', value_name='value')
    
    # Convert Year to integer by removing the decimal part
    melted_df['year'] = melted_df['year'].astype(float).astype(int)

    # Filter out rows where Year is before 2020
    melted_df = melted_df[melted_df['year'] >= 2020]
    
    # Add a new column 'currency' to identify whether it's USD or EUR
    melted_df['currency'] = currency_name
    
    # Rename 'Country Name' to 'country_name' for consistency
    melted_df.rename(columns={'Country Name': 'country'}, inplace=True)
    
    return melted_df

# Load source

## Country mapping JSON

In [6]:
country_mapping = {}
with open("./country_names_matched.json", "r") as file:
    json_data = json.load(file)

country_mapping = {entry["benchmark name"]: entry["tool name"] for entry in json_data}

## Country

In [7]:
countries = pd.read_csv("./countries.csv")

In [8]:
countries.tail()

Unnamed: 0,id,country,currency,abbreviation
205,206,Martinique,Euro,EUR
206,207,Montserrat,East Caribbean Dollar,XCD
207,208,"San Marino, Rep. Of",Euro,EUR
208,209,"São Tomé and Príncipe, Dem. Rep. of",Dobra,STN
209,210,Syrian Arab Rep.,Syrian Pound,SYP


## Region

In [9]:
regions = pd.read_csv('./regions.csv')

In [10]:
regions.tail(5)

Unnamed: 0,id,region,country_id,country
78,79,Rural Areas and Small Towns / All coffee and c...,[145],['Peru']
79,80,Urban /,[75],['Guatemala']
80,81,Urban / Ho Chi Minh City,[199],['Vietnam']
81,82,Urban / Urban Banten Province,[85],['Indonesia']
82,83,Urban / Urban West Java Province (SUBANG AND G...,[85],['Indonesia']


# Merge prev LIB data

In [11]:
# Match files like li_benchmark_data_v2.csv, li_benchmark_data_v3.csv, etc.
files = glob.glob('./prev_lib_data/li_benchmark_data_v*.csv')

libs = {}
for file in files:
    match = re.search(r'v(\d+)', file)
    if match:
        version = match.group(1)
        libs[f'lib_v{version}'] = pd.read_csv(file)

# Now libs contains all versioned files, e.g., libs['lib_v2'], libs['lib_v3'], etc.

In [12]:
libs['lib_v6'].head()

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,household_equiv,links
0,Angola,Rural / coffee-producing households in\r\nCua...,New Foresight,7.5,2,2023.0,4474488.0,3.15,
1,Argentina,Non-Metropolitan Urban / All,Calculated from Global Living Wage Coalition,4.0,2,2024.0,3847528.08,2.1,https://www.globallivingwage.org/reference-val...
2,Argentina,Rural / All,Global Living Wage Coalition,4.0,2,2024.0,11841900.0,2.1,https://www.globallivingwage.org/reference-val...
3,Bangladesh,Urban / Satellite Cities,Calculated from Global Living Wage Coalition,4.0,2,2024.0,437988.0,2.1,https://www.globallivingwage.org/living-wage-b...
4,Bangladesh,Urban / Dhaka City,Calculated from Global Living Wage Coalition,4.0,2,2024.0,528972.0,2.1,https://www.globallivingwage.org/living-wage-b...


In [13]:
# Merge all DataFrames in the dictionary
merged_lib = pd.concat(libs.values(), ignore_index=True)

# Check the total number of rows
print(len(merged_lib))

1101


In [14]:
final_lib = merged_lib.drop_duplicates(subset=['country', 'region', 'year'], keep='last')
len(final_lib)

298

## Write to lib data

In [15]:
final_lib.head()

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,household_equiv,links,USD,EUR
1,Argentina,Non-Metropolitan Urban / All,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,3847528.08,2.1,https://www.globallivingwage.org/reference-val...,,
2,Argentina,Rural / All,Global Living Wage Coalition,4.0,2.0,2024.0,11841900.0,2.1,https://www.globallivingwage.org/reference-val...,,
3,Bangladesh,Urban / Satellite Cities,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,437988.0,2.1,https://www.globallivingwage.org/living-wage-b...,,
4,Bangladesh,Urban / Dhaka City,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,528972.0,2.1,https://www.globallivingwage.org/living-wage-b...,,
5,Belize,Rural / All,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,19656.0,2.1,https://www.globallivingwage.org/living-wage-b...,,


In [16]:
benchmark_filename = "merged_li_benchmark_data.csv"

In [17]:
final_lib.to_csv(f'./{benchmark_filename}', index=False)

# Conversion Rate V1

In [18]:
# Load excel file
file_path = "./conversion_rates.xlsx"
xls = pd.ExcelFile(file_path)

In [19]:
# Load two sheets for usd and eur into dataframe
usd_df = pd.read_excel(xls, sheet_name="USD conversion")
eur_df = pd.read_excel(xls, sheet_name="EUR conversion")

In [20]:
# Apply the function to both dataframes
usd_long = melt_conversion_data(usd_df, 'USD')
eur_long = melt_conversion_data(eur_df, 'EUR')

In [21]:
# Combine the two dataframes if you want them in one
combined_df = pd.concat([usd_long, eur_long], ignore_index=True)

In [22]:
# Replace 'country' column using the mapping
combined_df["country"] = combined_df["country"].map(country_mapping).fillna(combined_df["country"])  # Keep original if no match

In [23]:
# fill NA with 0
combined_df.fillna(0, inplace=True)

In [24]:
combined_df['country_id'] = combined_df["country"].apply(find_country)

404 EUR
404 EUR
404 EUR
404 EUR
404 EUR


In [25]:
# sort add id and so on
combined_df = combined_df.sort_values(by=['country', 'year'], ascending=[True, True])
combined_df['id'] = combined_df.reset_index().index + 1
combined_df = combined_df.reindex(columns=['id', 'country', 'country_id', 'year', 'value', 'currency'])

In [26]:
combined_df

Unnamed: 0,id,country,country_id,year,value,currency
0,1,Angola,5,2020,364.825805,USD
290,2,Angola,5,2020,408.413189,EUR
58,3,Angola,5,2021,578.258780,USD
347,4,Angola,5,2021,660.484929,EUR
116,5,Angola,5,2022,631.441956,USD
...,...,...,...,...,...,...
460,571,Zimbabwe,203,2022,104.734549,EUR
231,572,Zimbabwe,203,2023,374.954363,USD
517,573,Zimbabwe,203,2023,394.845181,EUR
289,574,Zimbabwe,203,2024,3509.172220,USD


In [27]:
conversion_rate = combined_df

In [28]:
conversion_rate.to_csv('./conversion_rate.csv', index=False)

# Regions V4 - Living Income Benchmark V6 - CPI V6

Here we will map the li_benchmark_data.csv and cpi_data.csv only

## Load files

In [29]:
lib = pd.read_csv(f"./{benchmark_filename}")

In [30]:
lib.tail()

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,household_equiv,links,USD,EUR
293,Vietnam,Rural / Economic Zone 4,Calculated from Global Living Wage Coalition,4.0,2.0,2020.0,7729194.0,2.1,https://www.globallivingwage.org/living-wage-b...,333.034787,291.574086
294,Vietnam,Urban / Ho Chi Minh City,Calculated from Global Living Wage Coalition,4.0,2.0,2020.0,11862692.0,2.1,https://www.globallivingwage.org/living-wage-b...,511.138561,447.50508
295,Zambia,Rural / All,Calculated from Global Living Wage Coalition,6.0,2.0,2022.0,4107.84,2.7,https://globallivingwage.org/reference-value/l...,242.527948,230.3103
296,Zimbabwe,Rural / All,Calculated from Global Living Wage Coalition,5.0,2.0,2022.0,364.0,2.4,https://globallivingwage.org/reference-value/l...,364.0,345.663046
297,,,,,,,,,,,


In [31]:
lib = lib.dropna(how="all")

In [32]:
lib["id"] = lib.reset_index().index + 1

In [33]:
lib.columns

Index(['country', 'region', 'source', 'household_size', 'nr_adults', 'year',
       'LCU', 'household_equiv', 'links', 'USD', 'EUR', 'id'],
      dtype='object')

In [34]:
lib[lib['country'] == "Mexico"]

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,household_equiv,links,USD,EUR,id
61,Mexico,Yucatán / Rural Areas and Small Towns in Yucatan,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,227208.0,2.1,https://globallivingwage.org/living-wage-bench...,,,62
62,Mexico,Rural Areas and Small Towns / Rural Areas and...,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,238032.0,2.1,https://globallivingwage.org/living-wage-bench...,,,63
63,Mexico,"Non-metropolitan Urban & Rural / Michoacán, n...",Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,247440.0,2.1,https://www.globallivingwage.org/living-wage-b...,,,64
64,Mexico,Municipality of San Quintín / Baja California,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,317892.0,2.1,https://www.globallivingwage.org/living-wage-b...,,,65
65,Mexico,Municipality of Ensenada / Baja California,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,339288.0,2.1,https://www.globallivingwage.org/living-wage-b...,,,66
158,Mexico,Yucatán / Rural Areas and Small Towns in Yucatan,Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,216444.0,2.1,https://www.globallivingwage.org/living-wage-b...,,,159
159,Mexico,Rural Areas and Small Towns / Rural Areas and...,Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,226752.0,2.1,https://www.globallivingwage.org/living-wage-r...,,,160
160,Mexico,"Non-metropolitan Urban & Rural / Michoacán, n...",Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,235704.0,2.1,https://globallivingwage.org/living-income-ref...,,,161
161,Mexico,Municipality of San Quintín / Baja California,Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,302832.0,2.1,https://globallivingwage.org/living-wage-bench...,,,162
162,Mexico,Municipality of Ensenada / Baja California,Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,323208.0,2.1,https://www.globallivingwage.org/living-wage-b...,,,163


In [35]:
df = pd.read_csv("./li_benchmark_data.csv")

FileNotFoundError: [Errno 2] No such file or directory: './li_benchmark_data.csv'

In [None]:
df.columns

## Recreate region

In [None]:
region_source = df[["country", "region"]]

In [None]:
region_temp = region_source

In [None]:
region_temp = region_temp[region_temp["region"].notna()]

In [None]:
region_temp['country'] = region_temp['country'].str.replace('\n', ' ').str.strip()
region_temp['region'] = region_temp['region'].str.replace('\n', ' ').str.strip()
region_temp['region'] = region_temp['region'].str.replace('\r', ' ').str.strip()
region_temp['region'] = region_temp['region'].str.replace(r'^\s+|\s+?$', '', regex=True)
region_temp['region'] = region_temp['region'].apply(remove_extra_whitespace)

In [None]:
region_temp.tail(10)

### Map regions with countries

In [None]:
# Replace 'country' column using the mapping
region_temp["country"] = region_temp["country"].map(country_mapping).fillna(region_temp["country"])  # Keep original if no match

In [None]:
region_temp["country_id"] = region_temp["country"].apply(find_country)

In [None]:
region_temp.tail(10)

### Group region with same name

In [None]:
group_df = region_temp.groupby('region').agg({
    'country': lambda x: list(set(x)),
    'country_id': lambda x: list(set(x))
}).reset_index()

In [None]:
group_df = group_df[group_df['region'].str.strip() != '']

In [None]:
group_df['region'] = group_df['region'].str.replace(r'^\s+|\s+?$', '', regex=True)

In [None]:
group_df['country_id'] = group_df['country_id'].apply(list)
group_df['country'] = group_df['country'].apply(list)

In [None]:
group_df["id"] = group_df.reset_index().index + 1

In [None]:
new_column_order = ["id", "region", "country_id", "country"]

In [None]:
regions = group_df[new_column_order]

### Reorder regions to follow prev regions order

In [None]:
prev_regions = pd.read_csv('../benchmark_v5/regions.csv')

In [None]:
# Step 2: Split group_df into matching and new (extra) regions
existing_regions = prev_regions['region'].unique()
matching_df = regions[regions['region'].isin(existing_regions)]

In [None]:
# Preserve original region order from region_temp
matching_df['region'] = pd.Categorical(matching_df['region'], categories=prev_regions['region'], ordered=True)
matching_df = matching_df.sort_values('region')

In [None]:
# Step 3: Find new regions not in the original df and put them at the bottom
extra_df = regions[~regions['region'].isin(existing_regions)]

In [None]:
extra_df

In [None]:
# Step 4: Concatenate to form final DataFrame
final_df = pd.concat([matching_df, extra_df], ignore_index=True)

In [None]:
final_df["id"] = final_df.reset_index().index + 1

In [None]:
final_df.head()

In [None]:
regions = final_df

### Export Region to CSV

In [None]:
regions[regions['region'].str.contains("coffee-producing households in Cuanza Sul province")]

In [None]:
regions.tail(5)

In [None]:
regions.to_csv('./regions.csv', index=False)

## Living Income Benchmark V6

### Get country id

In [None]:
countries.columns

In [None]:
def find_new_country(val):
    if val == "Turkey":
        val = "Turkiye"
    country = countries[countries["country"].str.lower() == str(val).lower()]
    if country.empty:
        print("404", val)
        return None
    return country["id"].to_list()[0]

In [None]:
lib["country_id"] = lib["country"].apply(find_new_country)

In [None]:
lib.tail()

### Get region id

In [None]:
regions.tail()

In [None]:
regions.columns

In [None]:
lib['country'] = lib['country'].str.replace('\n', ' ').str.strip()
lib['region'] = lib['region'].str.replace('\n', ' ').str.strip()
lib['region'] = lib['region'].str.replace('\r', ' ').str.strip()
lib['region'] = lib['region'].str.replace(r'^\s+|\s+?$', '', regex=True)
lib['region'] = lib['region'].apply(remove_extra_whitespace)

In [None]:
lib.tail()

In [None]:
def find_region(val):
    region = regions[regions["region"].str.lower() == str(val).lower()]
    if region.empty:
        print("404", val)
        return None
    return region["id"].to_list()[0]

In [None]:
lib["region_id"] = lib["region"].apply(find_region)

### Export Benchmark to CSV

In [None]:
lib = lib.dropna(subset=['region_id']) # remove row when region_id column is None

In [None]:
lib = lib.sort_values(by=['country', 'region', 'year'], ascending=[True, True, True])
lib["id"] = lib.reset_index().index + 1

In [None]:
lib.columns

#### Calculate USD and EUR if this columns doesn't exist in df lib data columns
- Calculation using the conversion rate data

In [None]:
lib.head(3)

In [None]:
# find conversion rate and calculate the conversion into USD/EUR
def find_conversion_rate(row, currency):
    country_id = row['country_id']
    country_id = int(country_id) if not pd.isna(country_id) else 0
    year = row['year']
    year = int(year) if not pd.isna(year) else 0
    
    rate = conversion_rate.loc[
        (conversion_rate["year"] == year) & 
        (conversion_rate["country_id"] == country_id) & 
        (conversion_rate["currency"] == currency),
        "value"
    ]
    # Extract scalar value if needed
    rate = rate.iloc[0] if not rate.empty else None
    if not rate:
        return 0
    lcu = row['LCU']
    target_value = lcu / rate
    return target_value
    

new_lib = lib.copy()
if "USD" not in lib.columns or "EUR" not in lib.columns:
    new_lib["USD"] = new_lib.apply(lambda row: find_conversion_rate(row, currency="USD"), axis=1)
    new_lib["EUR"] = new_lib.apply(lambda row: find_conversion_rate(row, currency="EUR"), axis=1)


# Make USD and EUR 0 then use conversion rate in endpoint
if "USD" in lib.columns:
    new_lib["USD"] = 0

if "EUR" in lib.columns:
    new_lib["EUR"] = 0

In [None]:
new_lib = new_lib[['id', 'country_id',
       'region_id', 'country', 'region', 'source', 'household_size', 'nr_adults', 'year',
       'LCU', 'USD', 'EUR', 'household_equiv', 'links']]

In [None]:
# dropna by column country_id is none
new_lib = new_lib.dropna(subset=["country_id"])

In [None]:
new_lib.tail(5)

In [None]:
new_lib.to_csv("./li_benchmark.csv", index=False)

## CPI V6

In [None]:
cpi_df = pd.read_csv("./cpi_data.csv")

In [None]:
cpi_df = cpi_df.rename(columns={"Country Name": "country"})

In [None]:
cpi_df = cpi_df.dropna(how="all")

In [None]:
cpi_df.columns

In [None]:
if "links" in cpi_df.columns:
    cpi_df = cpi_df.copy()
    cpi_df.drop(columns="links", inplace=True)

In [None]:
cpi_df.head(5)

### Map with Countries

In [None]:
# Replace 'country' column using the mapping
cpi_df["country"] = cpi_df["country"].map(country_mapping).fillna(cpi_df["country"])  # Keep original if no match

In [None]:
cpi_df["country_id"] = cpi_df["country"].apply(find_country)

### Transform CPI table

In [None]:
# transform int country, country_id, year, value format
melted_df = pd.melt(cpi_df, id_vars=['country', 'country_id'], var_name='year', value_name='value')
melted_df['value'] = melted_df['value'].fillna(0)

In [None]:
# only for year >= 2020
melted_df['year'] = melted_df['year'].astype(str).astype(int)
melted_df = melted_df[melted_df['year'] >= 2020]

In [None]:
melted_df.reset_index(drop=True, inplace=True)
melted_df["id"] = melted_df.reset_index().index + 1

In [None]:
new_column_order = ["id", "country", "country_id", "year", "value"]

In [None]:
cpi_df = melted_df[new_column_order]

In [None]:
cpi_df['value'] = cpi_df['value'].apply(convert_to_float)

### Export CPI to CSV

In [None]:
cpi = cpi_df.dropna()

In [None]:
cpi = cpi.drop_duplicates(subset=["country_id", "year"])

In [None]:
cpi.head()

In [None]:
cpi.to_csv('cpi.csv', index=False)