In [1]:
import pandas as pd
import re

# Function

In [2]:
# Function to clean and convert to float
def convert_to_float(value):
    if isinstance(value, str):  # Check if it's a string
        value = value.replace(" ", "").replace(",", ".")  # Remove spaces and replace "," with "."
        return float(value)
    return value  # Return original if it's not a string

In [3]:
def remove_extra_whitespace(val):
    temp = val.split()
    return ' '.join(temp)

In [4]:
def find_country(val):
    # manual match
    if val == "Turkey":
        val = "Turkiye"
    # if val == "Anguilla":
    #     print("404", val)
    #     return None
    if val == "Central African Rep.":
        val = "Central African Republic"
    if val == "Congo, Dem. Rep. of the":
        val = "Congo, Dem. Rep."
    if val == "Congo, Rep. of":
        val = "Congo, Rep."
    if val == "Côte d'Ivoire":
        val = "Cote d'Ivoire"
    if val == "Curaçao, Kingdom of the Netherlands":
        val = "Curacao"
    if val == "Czech Rep.":
        val = "Czech Republic"
    if val == "Dominican Rep.":
        val = "Dominican Republic"
    if val == "Egypt, Arab Rep. of":
        val = "Egypt, Arab Rep."
    # if val == "Guadeloupe":
    #     print("404", val)
    #     return None
    if val == "Iran, Islamic Rep. of":
        val = "Iran, Islamic Rep."
    if val == "Korea, Rep. of":
        val = "Korea, Rep."
    if val == "Kyrgyz Rep.":
        val = "Kyrgyz Republic"
    if val == "Lao People's Dem. Rep.":
        val = "Lao PDR"
    # if val == "Martinique":
    #     print("404", val)
    #     return None
    if val == "Micronesia, Federated States of":
        val = "Micronesia, Fed. Sts."
    # if val == "Montserrat":
    #     print("404", val)
    #     return None
    # if val == "San Marino, Rep. of":
    #     print("404", val)
    #     return None
    # if val == "São Tomé and Príncipe, Dem. Rep. of":
    #     print("404", val)
    #     return None
    if val == "Sint Maarten, Kingdom of the Netherlands":
        val = "Sint Maarten (Dutch part)"
    if val == "Slovak Rep.":
        val = "Slovak Republic"
    # if val == "Syrian Arab Rep.":
    #     print("404", val)
    #     return None
    if val == "Venezuela, Rep. Bolivariana de":
        val = "Venezuela, RB"
    if val == "Yemen, Rep. of":
        val = "Yemen, Rep."
    # EOL manual match
    
    find_country = countries[countries["country"].str.lower() == str(val).lower()]
    if find_country.empty:
        # Another custom way
        val_lower = str(val).lower()
        val_tmp = val_lower.lower().split(',')
        for index, row in countries.iterrows():
            # Convert the country name in the DataFrame to lowercase
            country_name = row['country'].lower()
            if val_lower == 'nan':
                return None
            if country_name not in val_tmp:
                continue
            return row["id"]
        # EOL Another custom way
        print("404", val)
        return val
    return find_country["id"].to_list()[0]

In [5]:
# Function to melt data and adjust the year
def melt_conversion_data(df, currency_name):
    # Melt the data: 'Country Name' stays, everything else is melted
    melted_df = pd.melt(df, id_vars=['Country Name'], var_name='year', value_name='value')
    
    # Convert Year to integer by removing the decimal part
    melted_df['year'] = melted_df['year'].astype(float).astype(int)

    # Filter out rows where Year is before 2020
    melted_df = melted_df[melted_df['year'] >= 2020]
    
    # Add a new column 'currency' to identify whether it's USD or EUR
    melted_df['currency'] = currency_name
    
    # Rename 'Country Name' to 'country_name' for consistency
    melted_df.rename(columns={'Country Name': 'country'}, inplace=True)
    
    return melted_df

# Load Countries & Regions source

In [6]:
countries = pd.read_csv("./countries.csv")

In [7]:
countries.tail()

Unnamed: 0,id,country,currency,abbreviation
205,206,Martinique,Euro,EUR
206,207,Montserrat,East Caribbean Dollar,XCD
207,208,"San Marino, Rep. Of",Euro,EUR
208,209,"São Tomé and Príncipe, Dem. Rep. of",Dobra,STN
209,210,Syrian Arab Rep.,Syrian Pound,SYP


In [8]:
regions = pd.read_csv('./regions.csv')

In [9]:
regions.tail(5)

Unnamed: 0,id,region,country_id,country
74,75,Urban / Urban Guatemala,[75],['Guatemala']
75,76,Urban / Urban West Java Province (SUBANG AND G...,[85],['Indonesia']
76,77,Urban / Zhengzhou,[40],['China']
77,78,Urban-Rural / Caribbean coast,[41],['Colombia']
78,79,Yucatán / Rural Areas and Small Towns in Yucatan,[119],['Mexico']


# Conversion Rate V1

In [10]:
# Load excel file
file_path = "./conversion_rates.xlsx"
xls = pd.ExcelFile(file_path)

In [11]:
# Load two sheets for usd and eur into dataframe
usd_df = pd.read_excel(xls, sheet_name="USD conversion")
eur_df = pd.read_excel(xls, sheet_name="EUR conversion")

In [12]:
# Apply the function to both dataframes
usd_long = melt_conversion_data(usd_df, 'USD')
eur_long = melt_conversion_data(eur_df, 'EUR')

In [13]:
# Combine the two dataframes if you want them in one
combined_df = pd.concat([usd_long, eur_long], ignore_index=True)

In [14]:
# fill NA with 0
combined_df.fillna(0, inplace=True)

In [15]:
combined_df['country_id'] = combined_df["country"].apply(find_country)

404 Democratic Republic of Congo
404 Congo
404 Egypt
404 EUR
404 East Timor
404 Democratic Republic of Congo
404 Congo
404 Egypt
404 EUR
404 East Timor
404 Democratic Republic of Congo
404 Congo
404 Egypt
404 EUR
404 East Timor
404 Democratic Republic of Congo
404 Congo
404 Egypt
404 EUR
404 East Timor
404 Democratic Republic of Congo
404 Congo
404 Egypt
404 EUR
404 East Timor
404 Democratic Republic of Congo
404 Congo
404 Egypt
404 East Timor
404 Democratic Republic of Congo
404 Congo
404 Egypt
404 East Timor
404 Democratic Republic of Congo
404 Congo
404 Egypt
404 East Timor
404 Democratic Republic of Congo
404 Congo
404 Egypt
404 East Timor
404 Democratic Republic of Congo
404 Congo
404 Egypt
404 East Timor


In [16]:
# sort add id and so on
combined_df = combined_df.sort_values(by=['country', 'year'], ascending=[True, True])
combined_df['id'] = combined_df.reset_index().index + 1
combined_df = combined_df.reindex(columns=['id', 'country', 'country_id', 'year', 'value', 'currency'])

In [17]:
combined_df

Unnamed: 0,id,country,country_id,year,value,currency
0,1,Angola,5,2020,364.825805,USD
290,2,Angola,5,2020,408.413189,EUR
58,3,Angola,5,2021,578.258780,USD
347,4,Angola,5,2021,660.484929,EUR
116,5,Angola,5,2022,631.441956,USD
...,...,...,...,...,...,...
460,571,Zimbabwe,203,2022,104.734549,EUR
231,572,Zimbabwe,203,2023,374.954363,USD
517,573,Zimbabwe,203,2023,394.845181,EUR
289,574,Zimbabwe,203,2024,3509.172220,USD


In [18]:
conversion_rate = combined_df

In [19]:
conversion_rate.to_csv('./conversion_rate.csv', index=False)

# Regions V4 - Living Income Benchmark V6 - CPI V6

Here we will map the li_benchmark_data.csv and cpi_data.csv only

## Load files

In [20]:
lib = pd.read_csv("./li_benchmark_data.csv")

In [21]:
lib.tail()

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,household_equiv,links
279,,,,,,,,,
280,,,,,,,,,
281,,,,,,,,,
282,,,,,,,,,
283,,,,,,,,,


In [22]:
lib = lib.dropna(how="all")

In [23]:
lib["id"] = lib.reset_index().index + 1

In [24]:
lib.columns

Index(['country', 'region', 'source', 'household_size', 'nr_adults', 'year',
       'LCU', 'household_equiv', 'links', 'id'],
      dtype='object')

In [25]:
lib[lib['country'] == "Kenya"]

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,household_equiv,links,id
66,Kenya,Rural / Kericho,Calculated from Global Living Wage Coalition,5.0,2.0,2024.0,570492.0,2.4,https://www.globallivingwage.org/living-wage-b...,67
67,Kenya,Non-metropolitan Urban / Lake Naivasha,Calculated from Global Living Wage Coalition,5.0,2.0,2024.0,627012.0,2.4,https://www.globallivingwage.org/living-wage-b...,68


In [26]:
df = pd.read_csv("./li_benchmark_data.csv")

In [27]:
df.columns

Index(['country', 'region', 'source', 'household_size', 'nr_adults', 'year',
       'LCU', 'household_equiv', 'links'],
      dtype='object')

## Recreate region

In [28]:
region_source = df[["country", "region"]]

In [29]:
region_temp = region_source

In [30]:
region_temp = region_temp[region_temp["region"].notna()]

In [31]:
region_temp['country'] = region_temp['country'].str.replace('\n', ' ').str.strip()
region_temp['region'] = region_temp['region'].str.replace('\n', ' ').str.strip()
region_temp['region'] = region_temp['region'].str.replace('\r', ' ').str.strip()
region_temp['region'] = region_temp['region'].str.replace(r'^\s+|\s+?$', '', regex=True)
region_temp['region'] = region_temp['region'].apply(remove_extra_whitespace)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_temp['country'] = region_temp['country'].str.replace('\n', ' ').str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_temp['region'] = region_temp['region'].str.replace('\n', ' ').str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_temp['region'] = region_temp['regi

In [32]:
region_temp.tail(10)

Unnamed: 0,country,region
104,Togo,Urban / All
105,Togo,Rural / coffee-producing households in the Waw...
106,Tunisia,Rural / All
107,Uganda,Rural / Lake Victoria Basin
108,Vietnam,Urban / Minimum Wage Region 1
109,Vietnam,Urban / Minimum Wage Region 2
110,Vietnam,Urban / Minimum Wage Region 3
111,Vietnam,Rural / Minimum Wage Region 4
112,Zambia,Rural / All
113,Zimbabwe,Rural / All


### Map regions with countries

In [33]:
region_temp["country_id"] = region_temp["country"].apply(find_country)

404 East Timor


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  region_temp["country_id"] = region_temp["country"].apply(find_country)


In [34]:
region_temp.tail(10)

Unnamed: 0,country,region,country_id
104,Togo,Urban / All,182
105,Togo,Rural / coffee-producing households in the Waw...,182
106,Tunisia,Rural / All,185
107,Uganda,Rural / Lake Victoria Basin,190
108,Vietnam,Urban / Minimum Wage Region 1,199
109,Vietnam,Urban / Minimum Wage Region 2,199
110,Vietnam,Urban / Minimum Wage Region 3,199
111,Vietnam,Rural / Minimum Wage Region 4,199
112,Zambia,Rural / All,202
113,Zimbabwe,Rural / All,203


### Group region with same name

In [35]:
group_df = region_temp.groupby('region').agg({
    'country': lambda x: list(x),
    'country_id': lambda x: list(x)
}).reset_index()

In [36]:
group_df = group_df[group_df['region'].str.strip() != '']

In [37]:
group_df['region'] = group_df['region'].str.replace(r'^\s+|\s+?$', '', regex=True)

In [38]:
group_df['country_id'] = group_df['country_id'].apply(list)
group_df['country'] = group_df['country'].apply(list)

In [39]:
group_df["id"] = group_df.reset_index().index + 1

In [40]:
new_column_order = ["id", "region", "country_id", "country"]

In [41]:
regions = group_df[new_column_order]

### Export Region to CSV

In [42]:
regions[regions['region'].str.contains("coffee-producing households in Cuanza Sul province")]

Unnamed: 0,id,region,country_id,country
45,46,Rural / coffee-producing households in Cuanza ...,[5],[Angola]


In [43]:
regions.tail(5)

Unnamed: 0,id,region,country_id,country
74,75,Urban / Urban Guatemala,[75],[Guatemala]
75,76,Urban / Urban West Java Province (SUBANG AND G...,[85],[Indonesia]
76,77,Urban / Zhengzhou,[40],[China]
77,78,Urban-Rural / Caribbean coast,[41],[Colombia]
78,79,Yucatán / Rural Areas and Small Towns in Yucatan,[119],[Mexico]


In [44]:
regions.to_csv('./regions.csv', index=False)

## Living Income Benchmark V6

### Get country id

In [45]:
countries.columns

Index(['id', 'country', 'currency', 'abbreviation'], dtype='object')

In [46]:
def find_new_country(val):
    if val == "Turkey":
        val = "Turkiye"
    country = countries[countries["country"].str.lower() == str(val).lower()]
    if country.empty:
        print("404", val)
        return None
    return country["id"].to_list()[0]

In [47]:
lib["country_id"] = lib["country"].apply(find_new_country)

404 East Timor


In [48]:
lib.tail()

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,household_equiv,links,id,country_id
109,Vietnam,Urban / Minimum Wage Region 2,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,153600400.0,2.1,,110,199.0
110,Vietnam,Urban / Minimum Wage Region 3,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,149169000.0,2.1,,111,199.0
111,Vietnam,Rural / Minimum Wage Region 4,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,127290200.0,2.1,,112,199.0
112,Zambia,Rural / All,Calculated from Global Living Wage Coalition,6.0,2.0,2024.0,62346.24,2.7,,113,202.0
113,Zimbabwe,Rural / All,Calculated from Global Living Wage Coalition,5.0,2.0,2024.0,4536.0,2.4,,114,203.0


### Get region id

In [49]:
regions.tail()

Unnamed: 0,id,region,country_id,country
74,75,Urban / Urban Guatemala,[75],[Guatemala]
75,76,Urban / Urban West Java Province (SUBANG AND G...,[85],[Indonesia]
76,77,Urban / Zhengzhou,[40],[China]
77,78,Urban-Rural / Caribbean coast,[41],[Colombia]
78,79,Yucatán / Rural Areas and Small Towns in Yucatan,[119],[Mexico]


In [50]:
regions.columns

Index(['id', 'region', 'country_id', 'country'], dtype='object')

In [51]:
lib['country'] = lib['country'].str.replace('\n', ' ').str.strip()
lib['region'] = lib['region'].str.replace('\n', ' ').str.strip()
lib['region'] = lib['region'].str.replace('\r', ' ').str.strip()
lib['region'] = lib['region'].str.replace(r'^\s+|\s+?$', '', regex=True)
lib['region'] = lib['region'].apply(remove_extra_whitespace)

In [52]:
lib.tail()

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,household_equiv,links,id,country_id
109,Vietnam,Urban / Minimum Wage Region 2,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,153600400.0,2.1,,110,199.0
110,Vietnam,Urban / Minimum Wage Region 3,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,149169000.0,2.1,,111,199.0
111,Vietnam,Rural / Minimum Wage Region 4,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,127290200.0,2.1,,112,199.0
112,Zambia,Rural / All,Calculated from Global Living Wage Coalition,6.0,2.0,2024.0,62346.24,2.7,,113,202.0
113,Zimbabwe,Rural / All,Calculated from Global Living Wage Coalition,5.0,2.0,2024.0,4536.0,2.4,,114,203.0


In [53]:
def find_region(val):
    region = regions[regions["region"].str.lower() == str(val).lower()]
    if region.empty:
        print("404", val)
        return None
    return region["id"].to_list()[0]

In [54]:
lib["region_id"] = lib["region"].apply(find_region)

### Export Benchmark to CSV

In [55]:
lib = lib.dropna(subset=['region_id']) # remove row when region_id column is None

In [56]:
lib["id"] = lib.reset_index().index + 1

In [57]:
lib.columns

Index(['country', 'region', 'source', 'household_size', 'nr_adults', 'year',
       'LCU', 'household_equiv', 'links', 'id', 'country_id', 'region_id'],
      dtype='object')

#### Calculate USD and EUR if this columns doesn't exist in df lib data columns
- Calculation using the conversion rate data

In [58]:
lib.head(1)

Unnamed: 0,country,region,source,household_size,nr_adults,year,LCU,household_equiv,links,id,country_id,region_id
0,Angola,Rural / coffee-producing households in Cuanza ...,New Foresight,7.5,2.0,2023.0,4474488.0,3.15,,1,5.0,46


In [59]:
# find conversion rate and calculate the conversion into USD/EUR
def find_conversion_rate(row, currency):
    country_id = row['country_id']
    country_id = int(country_id) if not pd.isna(country_id) else 0
    year = row['year']
    year = int(year) if not pd.isna(year) else 0
    
    rate = conversion_rate.loc[
        (conversion_rate["year"] == year) & 
        (conversion_rate["country_id"] == country_id) & 
        (conversion_rate["currency"] == currency),
        "value"
    ]
    # Extract scalar value if needed
    rate = rate.iloc[0] if not rate.empty else None
    if not rate:
        return 0
    lcu = row['LCU']
    target_value = lcu / rate
    return target_value
    

new_lib = lib.copy()
if "USD" not in lib.columns or "EUR" not in lib.columns:
    new_lib["USD"] = new_lib.apply(lambda row: find_conversion_rate(row, currency="USD"), axis=1)
    new_lib["EUR"] = new_lib.apply(lambda row: find_conversion_rate(row, currency="EUR"), axis=1)
    

In [60]:
new_lib = new_lib[['id', 'country_id',
       'region_id', 'country', 'region', 'source', 'household_size', 'nr_adults', 'year',
       'LCU', 'USD', 'EUR', 'household_equiv', 'links']]

In [61]:
# dropna by column country_id is none
new_lib = new_lib.dropna(subset="country_id")

In [62]:
new_lib.tail(5)

Unnamed: 0,id,country_id,region_id,country,region,source,household_size,nr_adults,year,LCU,USD,EUR,household_equiv,links
109,110,199.0,65,Vietnam,Urban / Minimum Wage Region 2,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,153600400.0,6457.23862,5971.909715,2.1,
110,111,199.0,66,Vietnam,Urban / Minimum Wage Region 3,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,149169000.0,6270.945749,5799.618697,2.1,
111,112,199.0,35,Vietnam,Rural / Minimum Wage Region 4,Calculated from Global Living Wage Coalition,4.0,2.0,2024.0,127290200.0,5351.178546,4948.981604,2.1,
112,113,202.0,18,Zambia,Rural / All,Calculated from Global Living Wage Coalition,6.0,2.0,2024.0,62346.24,3084.612354,2852.771527,2.7,
113,114,203.0,18,Zimbabwe,Rural / All,Calculated from Global Living Wage Coalition,5.0,2.0,2024.0,4536.0,1.292613,1.195459,2.4,


In [63]:
new_lib.to_csv("./li_benchmark.csv", index=False)

## CPI V6

In [64]:
cpi_df = pd.read_csv("./cpi_data.csv")

In [65]:
cpi_df = cpi_df.rename(columns={"Country Name": "country"})

In [66]:
cpi_df = cpi_df.dropna(how="all")

In [67]:
cpi_df.columns

Index(['country', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023', '2024', '2025', 'links'],
      dtype='object')

In [68]:
if "links" in cpi_df.columns:
    cpi_df = cpi_df.copy()
    cpi_df.drop(columns="links", inplace=True)

In [69]:
cpi_df.head(5)

Unnamed: 0,country,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,Afghanistan,101.97,101.295751,105.736448,110.997842,111.692854,114.264439,120.66,126.85,144.25,137.55,128.47,
1,Albania,90.72,92.435759,93.614714,95.474522,97.410802,98.785357,100.386555,102.435918526012,109.324942,114.528551,108.23,
2,Algeria,164.77,172.653333,183.699167,193.97,202.2525,206.2,211.18,226.44,247.420833,270.485833,281.43,
3,Angola,28.45,31.10644,40.65438,52.787468,63.149087,73.935554,90.40214,113.684573330823,137.962244,156.785953,201.063081738323,228.58
4,Anguilla,107.17,106.1275,105.573687,106.94,107.3425,108.2175,107.705,109.64,115.73,119.6725,,


### Map with Countries

In [70]:
cpi_df["country_id"] = cpi_df["country"].apply(find_country)

404 Democratic Republic of Congo
404 Congo
404 Egypt
404 East Timor


### Transform CPI table

In [71]:
# transform int country, country_id, year, value format
melted_df = pd.melt(cpi_df, id_vars=['country', 'country_id'], var_name='year', value_name='value')
melted_df['value'] = melted_df['value'].fillna(0)

In [72]:
melted_df.reset_index(drop=True, inplace=True)
melted_df["id"] = melted_df.reset_index().index + 1

In [73]:
new_column_order = ["id", "country", "country_id", "year", "value"]

In [74]:
cpi_df = melted_df[new_column_order]

In [75]:
cpi_df['value'] = cpi_df['value'].apply(convert_to_float)

### Export CPI to CSV

In [76]:
cpi = cpi_df.dropna()

In [77]:
cpi = cpi.drop_duplicates(subset=["country_id", "year"])

In [78]:
cpi.head()

Unnamed: 0,id,country,country_id,year,value
0,1,Afghanistan,1,2014,101.97
1,2,Albania,2,2014,90.72
2,3,Algeria,3,2014,164.77
3,4,Angola,5,2014,28.45
4,5,Anguilla,204,2014,107.17


In [79]:
cpi.to_csv('cpi.csv', index=False)