In [1]:
import pandas as pd
import json

In [2]:
# Input CSVs, one per data source. These are relative paths, so they are
# resolved against the current working directory of the kernel.
GEO_DATA_PATH = '../data/countries-codes.csv'  # the only input read with ';' as separator
POPULATION_PATH = '../data/world_population.csv'
INFANT_MORTALITY_PATH = '../data/infant_mortality.csv'
LIFE_EXPECTANCY_PATH = '../data/life_expectancy.csv'
FLAGS_PATH = '../data/flags_iso2.csv'

# Destination of the merged dataset (written as a ';'-separated CSV without the index).
OUTPUT_DATA_PATH = '../data/consolidated/countries_dataset.csv'

In [3]:
# Column selection for each source CSV.
#
# Each USEFUL_* dict maps a source column name to an options dict whose
# 'new_name' entry is the column name used in the consolidated dataset.
# Each *_COUNTRY_CODE constant names the source column holding the country
# code; get_dataset renames it to 'code', which is the key the sources are
# merged on.

# Geographic data: country shape plus Spanish and English names.
USEFUL_GEO_DATA = {
    'Geo Shape': {'new_name': 'geo_points'},
    'LABEL SP': {'new_name': 'name_es'},
    'LABEL EN': {'new_name': 'name_en'},
}
GEO_DATA_COUNTRY_CODE = 'ISO3 CODE'
# Population data: 2022 population, area, capital and continent.
USEFUL_POPULATION_DATA = {
    '2022 Population': {'new_name': 'population'},
    'Area (km²)': {'new_name': 'area'},
    'Capital': {'new_name': 'capital'},
    'Continent': {'new_name': 'continent'},
}
POPULATION_DATA_COUNTRY_CODE = 'CCA3'
# Infant mortality: only the '2022' column is kept.
USEFUL_INFANT_MORTALITY_DATA = {
    '2022': {'new_name': 'infant_mortality'},
}
INFANT_MORTALITY_COUNTRY_CODE = 'Country Code'
# Life expectancy: only the '2022' column is kept.
USEFUL_LIFE_EXPECTANCY_DATA = {
    '2022': {'new_name': 'life_expectancy'},
}
LIFE_EXPECTANCY_COUNTRY_CODE = 'Country Code'
# Flags: URL of each country's flag.
USEFUL_FLAGS_DATA = {
    'URL': {'new_name': 'flag_url'},
}
FLAGS_COUNTRY_CODE = 'Alpha-3 code'

In [4]:
def get_dataset(path, useful_data, code, sep=','):
    """Load one source CSV and normalise it for merging on a shared ``code`` key.

    Only the country-code column and the columns listed in ``useful_data`` are
    kept. The country-code column is renamed to ``'code'`` and upper-cased;
    every other column is renamed to its ``'new_name'`` entry, or keeps its
    original name when the entry has no ``'new_name'``.

    Parameters
    ----------
    path : str or file-like
        CSV location, passed unchanged to ``pandas.read_csv``.
    useful_data : dict
        Maps source column name -> options dict; ``options['new_name']`` is the
        output column name.
    code : str
        Name of the source column holding the country code.
    sep : str, default ','
        Field delimiter of the CSV.

    Returns
    -------
    pandas.DataFrame
        Columns ``'code'`` followed by the selected columns, in the order they
        appear in ``useful_data``. Rows whose code is missing are kept, with
        the code left as NaN.

    Raises
    ------
    KeyError
        If ``code`` or any key of ``useful_data`` is not a column of the CSV.
    """
    data = pd.read_csv(path, sep=sep)
    new_names = {
        column: options.get('new_name', column)
        for column, options in useful_data.items()
    }
    # Set last so the key column is always called 'code', even if it is also
    # listed in ``useful_data``.
    new_names[code] = 'code'
    data = data[[code, *useful_data]].rename(columns=new_names)
    # The ``.str`` accessor skips missing values; calling ``x.upper()`` on each
    # element raised AttributeError as soon as one code was NaN (a float).
    data['code'] = data['code'].str.upper()
    return data

In [5]:
def process_geo_points(geo_points):
    """Flatten a GeoJSON geometry string into a JSON list of its unique positions.

    Walks every coordinate group of the geometry and collects all positions
    into one de-duplicated, sorted list. Geometries whose ``type`` is
    ``'MultiPolygon'`` are read as polygons -> rings -> positions; any other
    type is read as rings -> positions.

    The function is idempotent: a string that is already a flat JSON list of
    numeric positions (i.e. a previous result of this function) is returned
    in the same normalised form instead of being turned into ``None``, so
    applying it twice to the same column does not destroy the data.

    Parameters
    ----------
    geo_points : str or None/NaN
        JSON text of a geometry object with ``'type'`` and ``'coordinates'``
        keys, or a missing value.

    Returns
    -------
    str or None
        JSON array of ``[x, y, ...]`` positions, or ``None`` when the input is
        not a string (e.g. NaN) or its JSON is neither a geometry object nor a
        flat list of positions.

    Raises
    ------
    KeyError
        If the JSON object lacks ``'coordinates'`` or ``'type'``.
    json.JSONDecodeError
        If the input is a string that is not valid JSON.
    """
    try:
        geometry = json.loads(geo_points)
    except TypeError:
        # Not str/bytes: a missing value such as NaN or None.
        return None

    if isinstance(geometry, dict):
        coordinates = geometry['coordinates']
        if geometry['type'] == 'MultiPolygon':
            rings = (ring for polygon in coordinates for ring in polygon)
        else:
            rings = coordinates
    elif isinstance(geometry, list) and all(
        isinstance(position, list)
        and all(isinstance(value, (int, float)) for value in position)
        for position in geometry
    ):
        # Already-flattened output of an earlier call: treat it as one ring.
        rings = [geometry]
    else:
        # Any other JSON value (number, string, null, nested list) has no
        # positions to extract.
        return None

    all_positions = {tuple(position) for ring in rings for position in ring}
    return json.dumps(sorted(all_positions))

In [6]:
# Load every source with only its useful columns, keyed by the shared 'code' column.
population_data = get_dataset(
    POPULATION_PATH, USEFUL_POPULATION_DATA, POPULATION_DATA_COUNTRY_CODE
)
geo_data = get_dataset(
    GEO_DATA_PATH, USEFUL_GEO_DATA, GEO_DATA_COUNTRY_CODE, sep=';'
)
infant_mortality_data = get_dataset(
    INFANT_MORTALITY_PATH, USEFUL_INFANT_MORTALITY_DATA, INFANT_MORTALITY_COUNTRY_CODE
)
life_expectancy_data = get_dataset(
    LIFE_EXPECTANCY_PATH, USEFUL_LIFE_EXPECTANCY_DATA, LIFE_EXPECTANCY_COUNTRY_CODE
)
flags_data = get_dataset(FLAGS_PATH, USEFUL_FLAGS_DATA, FLAGS_COUNTRY_CODE)

# A country must appear in both the population and the geo data (inner join);
# the remaining sources only enrich those rows (left joins), so a country
# missing from one of them keeps NaN in the corresponding columns.
final_dataset = population_data.merge(geo_data, on='code', how='inner')
for optional_data in (infant_mortality_data, life_expectancy_data, flags_data):
    final_dataset = final_dataset.merge(optional_data, on='code', how='left')

In [7]:
# Replace each raw geometry string with its flattened list of positions.
# Note: this overwrites the column it reads from, so re-running this cell feeds
# the already-processed values back into process_geo_points; re-run the merge
# cell first to start again from the raw shapes.
final_dataset = final_dataset.assign(
    geo_points=final_dataset['geo_points'].apply(process_geo_points)
)

In [8]:
# Persist the consolidated dataset as a ';'-separated CSV, without the row index.
final_dataset.to_csv(
    path_or_buf=OUTPUT_DATA_PATH,
    index=False,
    sep=';',
)