In [423]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go


In [424]:
# ---- Pre-processing for the Bayes model: load the per-year input tables ----
# Column 0 of each CSV becomes the row index of the resulting frame.
bayes16 = pd.read_csv(filepath_or_buffer='final16.csv', index_col=[0])
bayes18 = pd.read_csv(filepath_or_buffer='final18.csv', index_col=[0])


In [425]:
####### Discretise the EPI values #######
# EPI values are on a relative 0-100 scale, so they are binned into five
# fixed 20-point bands (these are fixed-width bands, not quantiles).
def discreteEPI(cat):
    """Map a Series of numeric EPI scores onto five ordered text labels.

    Bands are closed on the right:
        x <= 20        -> 'Very Poor'
        20 < x <= 40   -> 'Poor'
        40 < x <= 60   -> 'Average'
        60 < x <= 80   -> 'Good'
        x > 80         -> 'Very Good'

    Parameters
    ----------
    cat : pandas.Series
        Numeric scores.

    Returns
    -------
    pandas.Series
        Object-dtype Series of labels with the same index as ``cat``.
        Missing scores stay missing (NaN) instead of falling through every
        comparison and being labelled 'Very Good'.
    """
    band_edges = [-np.inf, 20, 40, 60, 80, np.inf]
    band_labels = ['Very Poor', 'Poor', 'Average', 'Good', 'Very Good']
    # pd.cut uses right-closed intervals by default, matching the `<=` bounds.
    binned = pd.cut(cat, bins=band_edges, labels=band_labels)
    # Hand back plain strings (object dtype) rather than a categorical.
    return binned.astype(object)

# Function to transform preprocessed data into Bayes-ready data
def BayesTransform(bayes, gdp_thresholds=(1025, 4035, 12475)):
    """Discretise one year's preprocessed table into categorical states.

    Parameters
    ----------
    bayes : pandas.DataFrame
        Must contain the columns 'Region', 'GDP per Capita',
        'CO2 per Capita', 'Air Quality', 'Biodiversity', 'Water Sanitation'
        and 'Happiness Score'. The frame is NOT modified; all work is done
        on a copy.
    gdp_thresholds : sequence of three increasing numbers, optional
        Upper bounds (inclusive) of the Low / Lower-Middle / Upper-Middle
        income bands. The default is the World Bank 2016 set:
        https://blogs.worldbank.org/opendata/new-country-classifications-2016
        For 2018 data the classification in
        https://blogs.worldbank.org/opendata/new-country-classifications-income-level-2018-2019
        uses different cut-offs (presumably (995, 3895, 12055) -- verify
        against the post before passing them).

    Returns
    -------
    pandas.DataFrame
        New frame with exactly the seven columns listed above (country
        labels dropped), every numeric column replaced by a discrete label.

    Raises
    ------
    KeyError
        If any required column is missing from ``bayes``.
    ValueError
        If ``gdp_thresholds`` is not strictly increasing (raised by pd.cut).
    """
    keep = ['Region', 'GDP per Capita', 'CO2 per Capita', 'Air Quality',
            'Biodiversity', 'Water Sanitation', 'Happiness Score']
    # Selecting first removes the country labels; copying keeps the caller's
    # frame untouched so the function can safely be re-run on the same input.
    out = bayes[keep].copy()

    # Transform EPI values
    for epi_col in ('Air Quality', 'Biodiversity', 'Water Sanitation'):
        out[epi_col] = discreteEPI(out[epi_col])

    # Convert GDP to discrete income groups (right-closed bands, like `<=`).
    # pd.cut leaves a missing GDP as NaN instead of labelling it High-Income.
    low, lower_mid, upper_mid = gdp_thresholds
    out['GDP per Capita'] = pd.cut(
        out['GDP per Capita'],
        bins=[-np.inf, low, lower_mid, upper_mid, np.inf],
        labels=['Low-Income', 'Lower-Middle-Income',
                'Upper-Middle-Income', 'High-Income'],
    ).astype(object)

    # CO2 values are skewed, therefore apply log(1 + x) before cutting the
    # observed range into five equal-width bins.
    out['CO2 per Capita'] = pd.cut(
        np.log1p(out['CO2 per Capita']), 5,
        labels=["Very Low", "Low", "Medium", "High", "Very High"])

    # Happiness values follow a very linear trend, so no normalisation is
    # applied: seven equal-width bins over the observed range.
    out['Happiness Score'] = pd.cut(
        out['Happiness Score'], 7,
        labels=["Very Unhappy", "Unhappy", "Discontent", "Average",
                "Content", "Happy", "Very Happy"])

    # return new data frame
    return out

# Discretise each year's table. The names are rebound to the transformed
# frames, so reload the CSVs before re-running this cell.
# NOTE(review): both years are binned with the same GDP cut-offs.
bayes16 = BayesTransform(bayes16)
bayes18 = BayesTransform(bayes18)

# Stack the two years with a fresh 0..n-1 index. DataFrame.append was removed
# in pandas 2.0; pd.concat is the supported equivalent.
bayesFinal = pd.concat([bayes16, bayes18], ignore_index=True)

# Export to CSV format (row index omitted)
bayesFinal.to_csv('trainingData.csv', index=False)
