# Data generation

# Source data:

Eurostat purchasing power parity data - https://ec.europa.eu/eurostat/web/purchasing-power-parities/data/database

Wise API - Currency conversion rates

# Categories
The country-country comparisons will be calculated for the following expense categories:
- Alcoholic beverages & Tobacco
- Clothing and footwear
- Education
- Food and non-alcoholic beverages
- Health
- Recreation, culture, restaurants & hotels
- Software, communication & IT
- Transport
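- Water, electricity, gas and other fuels *(listed here, but `category_names` in the configuration cell below has no entry for it, so no index is currently calculated for this category)*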

In [None]:
import pandas as pd
import numpy as np

import calc
import itertools

import json

In [None]:
# --- File locations -------------------------------------------------------
# Input table read with pandas, input JSON with currency conversion rates,
# and the CSV file the final table is written to.
data_filelocation = "prc_ppp_ind_1_Data.csv"
currency_rate_filelocation = "rates.json"
output_table_filelocation = "ppp_indices.csv"

# Country name as it appears in the "GEO" column -> short country id.
# Insertion order is kept exactly as listed.
supported_countries = dict(
    [
        ("Austria", "at"),
        ("Denmark", "dk"),
        ("France", "fr"),
        ("Germany (until 1990 former territory of the FRG)", "de"),
        ("Hungary", "hu"),
        ("Spain", "es"),
        ("Sweden", "se"),
    ]
)

# Country id -> currency code. The key order here determines the order in
# which the From/To country pairs are generated further down.
currencies = dict(
    at="EUR",
    dk="DKK",
    fr="EUR",
    de="EUR",
    hu="HUF",
    es="EUR",
    se="SEK",
)

# Output category id -> list of "PPP_CAT" labels that make up the category.
# Categories with more than one label are combined via calc.calc_category_index.
# NOTE(review): the introduction of this notebook also lists
# "Water, electricity, gas and other fuels", but no key here covers it -
# confirm whether that category should be added.
category_names = {
    "alc-tobacco": ["Alcoholic beverages, tobacco and narcotics"],
    "clothing": ["Clothing and footwear"],
    "education": ["Education"],
    "food-non-alc": ["Food and non-alcoholic beverages"],
    "health": ["Health"],
    "it-tech": [
        "Communication",
        "Software",
        "Audio-visual, photographic and information processing equipment",
    ],
    "leisure": [
        "Restaurants and hotels",
        "Recreation and culture",
    ],
    "transport": ["Transport"],
}

In [None]:
# Load the PPP data table; the CSV is decoded as cp1252.
df = pd.read_csv(data_filelocation, encoding="cp1252")

# Load the currency conversion rates. The encoding is passed explicitly so
# that decoding does not depend on the platform's default locale encoding.
with open(currency_rate_filelocation, encoding="utf-8") as rates_file:
    currency_conversion_data = json.load(rates_file)

In [None]:
# Remove countries not on the supported list. The explicit copy makes the
# filtered frame independent of the raw one, so the column assignments below
# act on a real frame rather than on a slice (avoids SettingWithCopyWarning).
df = df[df["GEO"].isin(list(supported_countries))].copy()

# Remap country name to country_id
df["country_id"] = df["GEO"].map(supported_countries)

# Keep only the columns that are still needed
df = df.drop(columns=["TIME", "GEO", "NA_ITEM"])

# Convert Value to float
df["Value"] = df["Value"].astype(float)

# Preview the cleaned table
df.head(10)

In [None]:
# Build one (country_id, category, value) row per supported country and
# output category. Rows are collected in a plain list and turned into a
# DataFrame once, instead of enlarging the frame one row at a time.
ppp_rows = []

for country_id in supported_countries.values():
    # Filter by country once; it does not change across categories
    country_rows = df[df["country_id"] == country_id]

    for category, subcategory_list in category_names.items():
        in_category = country_rows["PPP_CAT"].isin(subcategory_list)
        category_values = country_rows.loc[in_category, "Value"]

        if len(subcategory_list) == 1:
            # Category maps to a single label: take the first matching value.
            # Raises IndexError if the data has no row for this combination.
            value = category_values.iloc[0]
        else:
            # Category is made of multiple labels: let calc combine them
            value = calc.calc_category_index(category_values)

        ppp_rows.append([country_id, category, value])

df_ppp = pd.DataFrame(ppp_rows, columns=["country_id", "category", "value"])

In [None]:
# Preview the first ten rows of the per-country, per-category table
df_ppp.iloc[:10]

In [None]:
# Create every ordered From -> To country pair (a country is never paired
# with itself) and cross it with every category. The frame is built in one
# go rather than being enlarged row by row.
countries = currencies.keys()

out_df = pd.DataFrame(
    [
        [origin, destination, category]
        for (origin, destination), category in itertools.product(
            itertools.permutations(countries, 2), category_names
        )
    ],
    columns=["From", "To", "Category"],
)

out_df

In [None]:
def _ratio_for_row(row):
    """Return calc.master_calculator's result for one From/To/Category row."""
    return calc.master_calculator(
        row["From"],
        row["To"],
        row["Category"],
        currencies,
        currency_conversion_data,
        df_ppp,
    )


# Fill the Ratio column by evaluating the calculator once per row, passing
# it the currency mapping, the loaded conversion rates and the PPP table.
out_df["Ratio"] = out_df.apply(_ratio_for_row, axis=1)

In [None]:
# Write the finished table to disk. The index is not disabled, so the row
# index is written as the first (unnamed) column of the CSV.
out_df.to_csv(path_or_buf=output_table_filelocation)

In [None]:
# Quick look at the result: the first five rows, then the smallest Ratio.
# In a notebook cell only the value of the last expression is rendered.
out_df.iloc[:5]
out_df.loc[:, "Ratio"].min()