In [1]:
import pandas as pd
import re
import json

# Function

In [2]:
# Function to clean and convert to float
def convert_to_float(value):
    if isinstance(value, str):  # Check if it's a string
        value = value.replace(" ", "").replace(",", ".")  # Remove spaces and replace "," with "."
        return float(value)
    return value  # Return original if it's not a string

In [3]:
def remove_extra_whitespace(val):
    temp = val.split()
    return ' '.join(temp)

In [4]:
def find_country(val):
    # manual match
    if val == "Turkey":
        val = "Turkiye"
    # if val == "Anguilla":
    #     print("404", val)
    #     return None
    if val == "Central African Rep.":
        val = "Central African Republic"
    if val == "Congo, Dem. Rep. of the":
        val = "Congo, Dem. Rep."
    if val == "Congo, Rep. of":
        val = "Congo, Rep."
    if val == "Côte d'Ivoire":
        val = "Cote d'Ivoire"
    if val == "Curaçao, Kingdom of the Netherlands":
        val = "Curacao"
    if val == "Czech Rep.":
        val = "Czech Republic"
    if val == "Dominican Rep.":
        val = "Dominican Republic"
    if val == "Egypt, Arab Rep. of":
        val = "Egypt, Arab Rep."
    # if val == "Guadeloupe":
    #     print("404", val)
    #     return None
    if val == "Iran, Islamic Rep. of":
        val = "Iran, Islamic Rep."
    if val == "Korea, Rep. of":
        val = "Korea, Rep."
    if val == "Kyrgyz Rep.":
        val = "Kyrgyz Republic"
    if val == "Lao People's Dem. Rep.":
        val = "Lao PDR"
    # if val == "Martinique":
    #     print("404", val)
    #     return None
    if val == "Micronesia, Federated States of":
        val = "Micronesia, Fed. Sts."
    # if val == "Montserrat":
    #     print("404", val)
    #     return None
    # if val == "San Marino, Rep. of":
    #     print("404", val)
    #     return None
    # if val == "São Tomé and Príncipe, Dem. Rep. of":
    #     print("404", val)
    #     return None
    if val == "Sint Maarten, Kingdom of the Netherlands":
        val = "Sint Maarten (Dutch part)"
    if val == "Slovak Rep.":
        val = "Slovak Republic"
    # if val == "Syrian Arab Rep.":
    #     print("404", val)
    #     return None
    if val == "Venezuela, Rep. Bolivariana de":
        val = "Venezuela, RB"
    if val == "Yemen, Rep. of":
        val = "Yemen, Rep."
    # EOL manual match
    
    find_country = countries[countries["country"].str.lower() == str(val).lower()]
    if find_country.empty:
        # Another custom way
        val_lower = str(val).lower()
        val_tmp = val_lower.lower().split(',')
        for index, row in countries.iterrows():
            # Convert the country name in the DataFrame to lowercase
            country_name = row['country'].lower()
            if val_lower == 'nan':
                return None
            if country_name not in val_tmp:
                continue
            return row["id"]
        # EOL Another custom way
        print("404", val)
        return val
    return find_country["id"].to_list()[0]

In [5]:
# Function to melt data and adjust the year
def melt_conversion_data(df, currency_name):
    # Melt the data: 'Country Name' stays, everything else is melted
    melted_df = pd.melt(df, id_vars=['Country Name'], var_name='year', value_name='value')
    
    # Convert Year to integer by removing the decimal part
    melted_df['year'] = melted_df['year'].astype(float).astype(int)

    # Filter out rows where Year is before 2020
    melted_df = melted_df[melted_df['year'] >= 2020]
    
    # Add a new column 'currency' to identify whether it's USD or EUR
    melted_df['currency'] = currency_name
    
    # Rename 'Country Name' to 'country_name' for consistency
    melted_df.rename(columns={'Country Name': 'country'}, inplace=True)
    
    return melted_df

# Load source

## Country mapping JSON

In [6]:
country_mapping = {}
with open("./country_names_matched.json", "r") as file:
    json_data = json.load(file)

country_mapping = {entry["benchmark name"]: entry["tool name"] for entry in json_data}

## Country

In [7]:
countries = pd.read_csv("./countries.csv")

In [8]:
countries.tail()

Unnamed: 0,id,country,currency,abbreviation
205,206,Martinique,Euro,EUR
206,207,Montserrat,East Caribbean Dollar,XCD
207,208,"San Marino, Rep. Of",Euro,EUR
208,209,"São Tomé and Príncipe, Dem. Rep. of",Dobra,STN
209,210,Syrian Arab Rep.,Syrian Pound,SYP


## Region

In [9]:
regions = pd.read_csv('./regions.csv')

In [10]:
regions.tail(5)

Unnamed: 0,id,region,country_id,country
78,79,Urban / Urban Guatemala,[75],['Guatemala']
79,80,Urban / Urban West Java Province (SUBANG AND G...,[85],['Indonesia']
80,81,Urban / Zhengzhou,[40],['China']
81,82,Urban-Rural / Caribbean coast,[41],['Colombia']
82,83,Yucatán / Rural Areas and Small Towns in Yucatan,[119],['Mexico']


# Merge prev LIB data

In [11]:
path = './prev_lib_data/li_benchmark_data_v{0}.csv'

lib_v2 = pd.read_csv(path.format(2))
lib_v3 = pd.read_csv(path.format(3))
lib_v4 = pd.read_csv(path.format(4))
lib_v5 = pd.read_csv(path.format(5))
lib_v6 = pd.read_csv(path.format(6))

In [12]:
merged_lib = pd.concat(
    [lib_v2, lib_v3, lib_v4, lib_v5, lib_v6], 
    ignore_index=True
)
len(merged_lib)

1271

In [13]:
final_lib = merged_lib.drop_duplicates(subset=['country', 'region', 'year'], keep='last')
len(final_lib)

298

## Write to lib data

In [14]:
final_lib.head()

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,USD,EUR,household_equiv,links
0,Argentina,Non-Metropolitan Urban / All,Calculated from Global Living Wage Coalition,4.0,2.0,2022.0,148722.72,1138.620795,1081.261353,2.1,https://www.globallivingwage.org/living-wage-r...
1,Argentina,Rural / All,Global Living Wage Coalition,4.0,2.0,2022.0,123203.0,943.241879,895.724893,2.1,https://globallivingwage.org/living-income-ref...
2,Bangladesh,Urban / Satellite Cities,Calculated from Global Living Wage Coalition,4.0,2.0,2022.0,30408.0,331.438764,314.742123,2.1,https://www.globallivingwage.org/living-wage-b...
3,Bangladesh,Urban / Dhaka City,Calculated from Global Living Wage Coalition,4.0,2.0,2022.0,36724.0,400.281411,380.116736,2.1,https://www.globallivingwage.org/living-wage-b...
4,Belize,Rural / All,Calculated from Global Living Wage Coalition,4.0,2.0,2022.0,1477.0,738.5,701.297142,2.1,https://www.globallivingwage.org/living-wage-b...


In [15]:
final_lib.to_csv('./li_benchmark_data.csv', index=False)

# Conversion Rate V1

In [16]:
# Load excel file
file_path = "./conversion_rates.xlsx"
xls = pd.ExcelFile(file_path)

In [17]:
# Load two sheets for usd and eur into dataframe
usd_df = pd.read_excel(xls, sheet_name="USD conversion")
eur_df = pd.read_excel(xls, sheet_name="EUR conversion")

In [18]:
# Apply the function to both dataframes
usd_long = melt_conversion_data(usd_df, 'USD')
eur_long = melt_conversion_data(eur_df, 'EUR')

In [19]:
# Combine the two dataframes if you want them in one
combined_df = pd.concat([usd_long, eur_long], ignore_index=True)

In [20]:
# Replace 'country' column using the mapping
combined_df["country"] = combined_df["country"].map(country_mapping).fillna(combined_df["country"])  # Keep original if no match

In [21]:
# fill NA with 0
combined_df.fillna(0, inplace=True)

In [22]:
combined_df['country_id'] = combined_df["country"].apply(find_country)

404 EUR
404 EUR
404 EUR
404 EUR
404 EUR


In [23]:
# sort add id and so on
combined_df = combined_df.sort_values(by=['country', 'year'], ascending=[True, True])
combined_df['id'] = combined_df.reset_index().index + 1
combined_df = combined_df.reindex(columns=['id', 'country', 'country_id', 'year', 'value', 'currency'])

In [24]:
combined_df

Unnamed: 0,id,country,country_id,year,value,currency
0,1,Angola,5,2020,364.825805,USD
290,2,Angola,5,2020,408.413189,EUR
58,3,Angola,5,2021,578.258780,USD
347,4,Angola,5,2021,660.484929,EUR
116,5,Angola,5,2022,631.441956,USD
...,...,...,...,...,...,...
460,571,Zimbabwe,203,2022,104.734549,EUR
231,572,Zimbabwe,203,2023,374.954363,USD
517,573,Zimbabwe,203,2023,394.845181,EUR
289,574,Zimbabwe,203,2024,3509.172220,USD


In [25]:
conversion_rate = combined_df

In [26]:
conversion_rate.to_csv('./conversion_rate.csv', index=False)

# Regions V4 - Living Income Benchmark V6 - CPI V6

Here we will map the li_benchmark_data.csv and cpi_data.csv only

## Load files

In [27]:
lib = pd.read_csv("./li_benchmark_data.csv")

In [28]:
lib.tail()

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,USD,EUR,household_equiv,links
293,Vietnam,Urban / Minimum Wage Region 3,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,149169000.0,,,2.1,
294,Vietnam,Rural / Minimum Wage Region 4,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,127290200.0,,,2.1,
295,Zambia,Rural / All,Calculated from Global Living Wage Coalition,6.0,2.0,2024.0,62346.24,,,2.7,
296,Zimbabwe,Rural / All,Calculated from Global Living Wage Coalition,5.0,2.0,2024.0,4536.0,,,2.4,
297,,,,,,,,,,,


In [29]:
lib = lib.dropna(how="all")

In [30]:
lib["id"] = lib.reset_index().index + 1

In [31]:
lib.columns

Index(['country', 'region', 'source', 'household_size', 'nr_adults', 'year',
       'LCU', 'USD', 'EUR', 'household_equiv', 'links', 'id'],
      dtype='object')

In [32]:
lib[lib['country'] == "Mexico"]

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,USD,EUR,household_equiv,links,id
59,Mexico,Yucatán / Rural Areas and Small Towns in Yucatan,Calculated from Global Living Wage Coalition,4.0,2.0,2022.0,17296.0,859.328228,816.038497,2.1,https://globallivingwage.org/living-wage-bench...,60
60,Mexico,Rural Areas and Small Towns / Rural Areas and ...,Calculated from Global Living Wage Coalition,4.0,2.0,2022.0,17535.0,871.202617,827.314699,2.1,https://globallivingwage.org/living-wage-bench...,61
61,Mexico,"Non-metropolitan Urban & Rural / Michoacán, no...",Calculated from Global Living Wage Coalition,4.0,2.0,2022.0,18571.0,922.674868,876.193971,2.1,https://www.globallivingwage.org/living-wage-b...,62
62,Mexico,Municipality of San Quintín / Baja California,Calculated from Global Living Wage Coalition,4.0,2.0,2022.0,25638.0,1273.789148,1209.620431,2.1,https://www.globallivingwage.org/living-wage-b...,63
63,Mexico,Municipality of Ensenada / Baja California,Calculated from Global Living Wage Coalition,4.0,2.0,2022.0,24021.0,1193.450703,1133.329135,2.1,https://www.globallivingwage.org/living-wage-b...,64
148,Mexico,Yucatán / Rural Areas and Small Towns in Yucatan,Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,216444.0,10753.725652,10211.993314,2.1,https://globallivingwage.org/living-wage-bench...,149
149,Mexico,Rural Areas and Small Towns / Rural Areas and...,Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,226752.0,11265.864607,10698.332631,2.1,https://globallivingwage.org/living-wage-bench...,150
150,Mexico,"Non-metropolitan Urban & Rural / Michoacán, n...",Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,235704.0,11710.632547,11120.694831,2.1,https://www.globallivingwage.org/living-wage-b...,151
151,Mexico,Municipality of San Quintín / Baja California,Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,302832.0,15045.795895,14287.845167,2.1,https://www.globallivingwage.org/living-wage-b...,152
152,Mexico,Municipality of Ensenada / Baja California,Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,323208.0,16058.149732,15249.200417,2.1,https://www.globallivingwage.org/living-wage-b...,153


In [33]:
df = pd.read_csv("./li_benchmark_data.csv")

In [34]:
df.columns

Index(['country', 'region', 'source', 'household_size', 'nr_adults', 'year',
       'LCU', 'USD', 'EUR', 'household_equiv', 'links'],
      dtype='object')

## Recreate region

In [35]:
region_source = df[["country", "region"]]

In [36]:
region_temp = region_source

In [37]:
region_temp = region_temp[region_temp["region"].notna()]

In [38]:
region_temp['country'] = region_temp['country'].str.replace('\n', ' ').str.strip()
region_temp['region'] = region_temp['region'].str.replace('\n', ' ').str.strip()
region_temp['region'] = region_temp['region'].str.replace('\r', ' ').str.strip()
region_temp['region'] = region_temp['region'].str.replace(r'^\s+|\s+?$', '', regex=True)
region_temp['region'] = region_temp['region'].apply(remove_extra_whitespace)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_temp['country'] = region_temp['country'].str.replace('\n', ' ').str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_temp['region'] = region_temp['region'].str.replace('\n', ' ').str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_temp['region'] = region_temp['regi

In [39]:
region_temp.tail(10)

Unnamed: 0,country,region
287,Togo,Urban / All
288,Togo,Rural / coffee-producing households in the Waw...
289,Tunisia,Rural / All
290,Uganda,Rural / Lake Victoria Basin
291,Vietnam,Urban / Minimum Wage Region 1
292,Vietnam,Urban / Minimum Wage Region 2
293,Vietnam,Urban / Minimum Wage Region 3
294,Vietnam,Rural / Minimum Wage Region 4
295,Zambia,Rural / All
296,Zimbabwe,Rural / All


### Map regions with countries

In [40]:
# Replace 'country' column using the mapping
region_temp["country"] = region_temp["country"].map(country_mapping).fillna(region_temp["country"])  # Keep original if no match

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_temp["country"] = region_temp["country"].map(country_mapping).fillna(region_temp["country"])  # Keep original if no match


In [41]:
region_temp["country_id"] = region_temp["country"].apply(find_country)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_temp["country_id"] = region_temp["country"].apply(find_country)


In [42]:
region_temp.tail(10)

Unnamed: 0,country,region,country_id
287,Togo,Urban / All,182
288,Togo,Rural / coffee-producing households in the Waw...,182
289,Tunisia,Rural / All,185
290,Uganda,Rural / Lake Victoria Basin,190
291,Vietnam,Urban / Minimum Wage Region 1,199
292,Vietnam,Urban / Minimum Wage Region 2,199
293,Vietnam,Urban / Minimum Wage Region 3,199
294,Vietnam,Rural / Minimum Wage Region 4,199
295,Zambia,Rural / All,202
296,Zimbabwe,Rural / All,203


### Group region with same name

In [43]:
group_df = region_temp.groupby('region').agg({
    'country': lambda x: list(set(x)),
    'country_id': lambda x: list(set(x))
}).reset_index()

In [44]:
group_df = group_df[group_df['region'].str.strip() != '']

In [45]:
group_df['region'] = group_df['region'].str.replace(r'^\s+|\s+?$', '', regex=True)

In [46]:
group_df['country_id'] = group_df['country_id'].apply(list)
group_df['country'] = group_df['country'].apply(list)

In [47]:
group_df["id"] = group_df.reset_index().index + 1

In [48]:
new_column_order = ["id", "region", "country_id", "country"]

In [49]:
regions = group_df[new_column_order]

### Reorder regions to follow prev regions order

In [50]:
prev_regions = pd.read_csv('../benchmark_v5/regions.csv')

In [51]:
# Step 2: Split group_df into matching and new (extra) regions
existing_regions = prev_regions['region'].unique()
matching_df = regions[regions['region'].isin(existing_regions)]

In [52]:
# Preserve original region order from region_temp
matching_df['region'] = pd.Categorical(matching_df['region'], categories=prev_regions['region'], ordered=True)
matching_df = matching_df.sort_values('region')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_df['region'] = pd.Categorical(matching_df['region'], categories=prev_regions['region'], ordered=True)


In [53]:
# Step 3: Find new regions not in the original df and put them at the bottom
extra_df = regions[~regions['region'].isin(existing_regions)]

In [54]:
extra_df

Unnamed: 0,id,region,country_id,country
0,1,All / All,[118],[Mauritius]
21,22,Rural / Central Sulawesi,[85],[Indonesia]
26,27,Rural / Economic Zone 4,[199],[Vietnam]
37,38,Rural / Rural Aceh,[85],[Indonesia]
39,40,Rural / Rural Lampung Province,[85],[Indonesia]
41,42,"Rural / Rural Rembang, Central Java",[85],[Indonesia]
49,50,Rural Areas and Small Towns / All coffee and c...,[145],[Peru]
59,60,Urban /,[75],[Guatemala]
64,65,Urban / Ho Chi Minh City,[199],[Vietnam]
77,78,Urban / Urban Banten Province,[85],[Indonesia]


In [55]:
# Step 4: Concatenate to form final DataFrame
final_df = pd.concat([matching_df, extra_df], ignore_index=True)

In [56]:
final_df["id"] = final_df.reset_index().index + 1

In [57]:
final_df.head()

Unnamed: 0,id,region,country_id,country
0,1,Cocoa growing regions / Belém and the metropol...,[26],[Brazil]
1,2,Cocoa growing regions / East Rondônia,[26],[Brazil]
2,3,Cocoa growing regions / South Bahia,[26],[Brazil]
3,4,Cocoa growing regions / Southeast Pará,[26],[Brazil]
4,5,Cocoa growing regions / Southwest Pará,[26],[Brazil]


In [58]:
regions = final_df

### Export Region to CSV

In [59]:
regions[regions['region'].str.contains("coffee-producing households in Cuanza Sul province")]

Unnamed: 0,id,region,country_id,country
40,41,Rural / coffee-producing households in Cuanza ...,[5],[Angola]


In [60]:
regions.tail(5)

Unnamed: 0,id,region,country_id,country
78,79,Rural Areas and Small Towns / All coffee and c...,[145],[Peru]
79,80,Urban /,[75],[Guatemala]
80,81,Urban / Ho Chi Minh City,[199],[Vietnam]
81,82,Urban / Urban Banten Province,[85],[Indonesia]
82,83,Urban / Urban West Java Province (SUBANG AND G...,[85],[Indonesia]


In [61]:
regions.to_csv('./regions.csv', index=False)

## Living Income Benchmark V6

### Get country id

In [62]:
countries.columns

Index(['id', 'country', 'currency', 'abbreviation'], dtype='object')

In [63]:
def find_new_country(val):
    if val == "Turkey":
        val = "Turkiye"
    country = countries[countries["country"].str.lower() == str(val).lower()]
    if country.empty:
        print("404", val)
        return None
    return country["id"].to_list()[0]

In [64]:
lib["country_id"] = lib["country"].apply(find_new_country)

404 East Timor


In [65]:
lib.tail()

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,USD,EUR,household_equiv,links,id,country_id
292,Vietnam,Urban / Minimum Wage Region 2,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,153600400.0,,,2.1,,293,199.0
293,Vietnam,Urban / Minimum Wage Region 3,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,149169000.0,,,2.1,,294,199.0
294,Vietnam,Rural / Minimum Wage Region 4,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,127290200.0,,,2.1,,295,199.0
295,Zambia,Rural / All,Calculated from Global Living Wage Coalition,6.0,2.0,2024.0,62346.24,,,2.7,,296,202.0
296,Zimbabwe,Rural / All,Calculated from Global Living Wage Coalition,5.0,2.0,2024.0,4536.0,,,2.4,,297,203.0


### Get region id

In [66]:
regions.tail()

Unnamed: 0,id,region,country_id,country
78,79,Rural Areas and Small Towns / All coffee and c...,[145],[Peru]
79,80,Urban /,[75],[Guatemala]
80,81,Urban / Ho Chi Minh City,[199],[Vietnam]
81,82,Urban / Urban Banten Province,[85],[Indonesia]
82,83,Urban / Urban West Java Province (SUBANG AND G...,[85],[Indonesia]


In [67]:
regions.columns

Index(['id', 'region', 'country_id', 'country'], dtype='object')

In [68]:
lib['country'] = lib['country'].str.replace('\n', ' ').str.strip()
lib['region'] = lib['region'].str.replace('\n', ' ').str.strip()
lib['region'] = lib['region'].str.replace('\r', ' ').str.strip()
lib['region'] = lib['region'].str.replace(r'^\s+|\s+?$', '', regex=True)
lib['region'] = lib['region'].apply(remove_extra_whitespace)

In [69]:
lib.tail()

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,USD,EUR,household_equiv,links,id,country_id
292,Vietnam,Urban / Minimum Wage Region 2,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,153600400.0,,,2.1,,293,199.0
293,Vietnam,Urban / Minimum Wage Region 3,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,149169000.0,,,2.1,,294,199.0
294,Vietnam,Rural / Minimum Wage Region 4,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,127290200.0,,,2.1,,295,199.0
295,Zambia,Rural / All,Calculated from Global Living Wage Coalition,6.0,2.0,2024.0,62346.24,,,2.7,,296,202.0
296,Zimbabwe,Rural / All,Calculated from Global Living Wage Coalition,5.0,2.0,2024.0,4536.0,,,2.4,,297,203.0


In [70]:
def find_region(val):
    region = regions[regions["region"].str.lower() == str(val).lower()]
    if region.empty:
        print("404", val)
        return None
    return region["id"].to_list()[0]

In [71]:
lib["region_id"] = lib["region"].apply(find_region)

### Export Benchmark to CSV

In [72]:
lib = lib.dropna(subset=['region_id']) # remove row when region_id column is None

In [73]:
lib = lib.sort_values(by=['country', 'region', 'year'], ascending=[True, True, True])
lib["id"] = lib.reset_index().index + 1

In [74]:
lib.columns

Index(['country', 'region', 'source', 'household_size', 'nr_adults', 'year',
       'LCU', 'USD', 'EUR', 'household_equiv', 'links', 'id', 'country_id',
       'region_id'],
      dtype='object')

#### Calculate USD and EUR if this columns doesn't exist in df lib data columns
- Calculation using the conversion rate data

In [75]:
lib.head(3)

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,USD,EUR,household_equiv,links,id,country_id,region_id
183,Angola,Rural / coffee-producing households in Cuanza ...,New Foresight,7.5,2.0,2023.0,4474488.0,,,3.15,,1,5.0,41
0,Argentina,Non-Metropolitan Urban / All,Calculated from Global Living Wage Coalition,4.0,2.0,2022.0,148722.72,1138.620795,1081.261353,2.1,https://www.globallivingwage.org/living-wage-r...,2,7.0,11
96,Argentina,Non-Metropolitan Urban / All,Calculated from Global Living Wage Coalition,4.0,2.0,2023.0,3847528.08,29456.665943,27972.749668,2.1,https://www.globallivingwage.org/reference-val...,3,7.0,11


In [76]:
# find conversion rate and calculate the conversion into USD/EUR
def find_conversion_rate(row, currency):
    country_id = row['country_id']
    country_id = int(country_id) if not pd.isna(country_id) else 0
    year = row['year']
    year = int(year) if not pd.isna(year) else 0
    
    rate = conversion_rate.loc[
        (conversion_rate["year"] == year) & 
        (conversion_rate["country_id"] == country_id) & 
        (conversion_rate["currency"] == currency),
        "value"
    ]
    # Extract scalar value if needed
    rate = rate.iloc[0] if not rate.empty else None
    if not rate:
        return 0
    lcu = row['LCU']
    target_value = lcu / rate
    return target_value
    

new_lib = lib.copy()
if "USD" not in lib.columns or "EUR" not in lib.columns:
    new_lib["USD"] = new_lib.apply(lambda row: find_conversion_rate(row, currency="USD"), axis=1)
    new_lib["EUR"] = new_lib.apply(lambda row: find_conversion_rate(row, currency="EUR"), axis=1)


# Make USD and EUR 0 then use conversion rate in endpoint
if "USD" in lib.columns:
    new_lib["USD"] = 0

if "EUR" in lib.columns:
    new_lib["EUR"] = 0

In [77]:
new_lib = new_lib[['id', 'country_id',
       'region_id', 'country', 'region', 'source', 'household_size', 'nr_adults', 'year',
       'LCU', 'USD', 'EUR', 'household_equiv', 'links']]

In [78]:
# dropna by column country_id is none
new_lib = new_lib.dropna(subset=["country_id"])

In [79]:
new_lib.tail(5)

Unnamed: 0,id,country_id,region_id,country,region,source,household_size,nr_adults,year,LCU,USD,EUR,household_equiv,links
181,293,202.0,17,Zambia,Rural / All,Calculated from Global Living Wage Coalition,6.0,2.0,2023.0,54109.44,0,0,2.7,https://globallivingwage.org/reference-value/l...
295,294,202.0,17,Zambia,Rural / All,Calculated from Global Living Wage Coalition,6.0,2.0,2024.0,62346.24,0,0,2.7,
95,295,203.0,17,Zimbabwe,Rural / All,Calculated from Global Living Wage Coalition,5.0,2.0,2022.0,364.0,0,0,2.4,https://globallivingwage.org/reference-value/l...
182,296,203.0,17,Zimbabwe,Rural / All,Calculated from Global Living Wage Coalition,5.0,2.0,2023.0,4368.0,0,0,2.4,https://globallivingwage.org/reference-value/l...
296,297,203.0,17,Zimbabwe,Rural / All,Calculated from Global Living Wage Coalition,5.0,2.0,2024.0,4536.0,0,0,2.4,


In [80]:
new_lib.to_csv("./li_benchmark.csv", index=False)

## CPI V6

In [81]:
cpi_df = pd.read_csv("./cpi_data.csv")

In [82]:
cpi_df = cpi_df.rename(columns={"Country Name": "country"})

In [83]:
cpi_df = cpi_df.dropna(how="all")

In [84]:
cpi_df.columns

Index(['country', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023', '2024', '2025', 'links'],
      dtype='object')

In [85]:
if "links" in cpi_df.columns:
    cpi_df = cpi_df.copy()
    cpi_df.drop(columns="links", inplace=True)

In [86]:
cpi_df.head(5)

Unnamed: 0,country,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,Afghanistan,101.97,101.295751,105.736448,110.997842,111.692854,114.264439,120.66,126.85,144.25,137.55,128.47,
1,Albania,90.72,92.435759,93.614714,95.474522,97.410802,98.785357,100.386555,102.435918526012,109.324942,114.528551,108.23,
2,Algeria,164.77,172.653333,183.699167,193.97,202.2525,206.2,211.18,226.44,247.420833,270.485833,281.43,
3,Angola,28.45,31.10644,40.65438,52.787468,63.149087,73.935554,90.40214,113.684573330823,137.962244,156.785953,201.063081738323,228.58
4,Anguilla,107.17,106.1275,105.573687,106.94,107.3425,108.2175,107.705,109.64,115.73,119.6725,,


### Map with Countries

In [87]:
# Replace 'country' column using the mapping
cpi_df["country"] = cpi_df["country"].map(country_mapping).fillna(cpi_df["country"])  # Keep original if no match

In [88]:
cpi_df["country_id"] = cpi_df["country"].apply(find_country)

### Transform CPI table

In [89]:
# transform int country, country_id, year, value format
melted_df = pd.melt(cpi_df, id_vars=['country', 'country_id'], var_name='year', value_name='value')
melted_df['value'] = melted_df['value'].fillna(0)

In [90]:
# only for year >= 2020
melted_df['year'] = melted_df['year'].astype(str).astype(int)
melted_df = melted_df[melted_df['year'] >= 2020]

In [91]:
melted_df.reset_index(drop=True, inplace=True)
melted_df["id"] = melted_df.reset_index().index + 1

In [92]:
new_column_order = ["id", "country", "country_id", "year", "value"]

In [93]:
cpi_df = melted_df[new_column_order]

In [94]:
cpi_df['value'] = cpi_df['value'].apply(convert_to_float)

### Export CPI to CSV

In [95]:
cpi = cpi_df.dropna()

In [96]:
cpi = cpi.drop_duplicates(subset=["country_id", "year"])

In [97]:
cpi.head()

Unnamed: 0,id,country,country_id,year,value
0,1,Afghanistan,1,2020,120.66
1,2,Albania,2,2020,100.386555
2,3,Algeria,3,2020,211.18
3,4,Angola,5,2020,90.40214
4,5,Anguilla,204,2020,107.705


In [98]:
cpi.to_csv('cpi.csv', index=False)