In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
happy = pd.read_csv("data/Happiness.csv")
nutrition = pd.read_csv("data/nutrition_data.csv")[:96131]  # 2019 Only

nutrition["Country Name"]
happy["Country or region"]

# exclude Taiwan, Northern Cyprus

nutrition[nutrition["Country Name"] == "Russian Federation"] = "Russia"
nutrition[nutrition["Country Name"] == "Czechia"] = "Czech Republic"
nutrition[nutrition["Country Name"] == "Trinidad and Tobago"] = "Trinidad & Tobago"
nutrition[nutrition["Country Name"] == "Hong Kong SAR, China"] = "Hong Kong"
nutrition[nutrition["Country Name"] == "Cote d'Ivoire"] = "Ivory Coast"
nutrition[nutrition["Country Name"] == "Congo, Rep"] = "Congo (Brazzaville)"
nutrition[nutrition["Country Name"] == "Lao PDR"] = "Laos"



In [3]:
pd.set_option('display.max_rows', None)
happy["Country or region"]

0                       Finland
1                       Denmark
2                        Norway
3                       Iceland
4                   Netherlands
5                   Switzerland
6                        Sweden
7                   New Zealand
8                        Canada
9                       Austria
10                    Australia
11                   Costa Rica
12                       Israel
13                   Luxembourg
14               United Kingdom
15                      Ireland
16                      Germany
17                      Belgium
18                United States
19               Czech Republic
20         United Arab Emirates
21                        Malta
22                       Mexico
23                       France
24                       Taiwan
25                        Chile
26                    Guatemala
27                 Saudi Arabia
28                        Qatar
29                        Spain
30                       Panama
31      

In [4]:
nutrition_wide = nutrition.pivot(index = "Country Name", 
                                 columns = "Series Name", 
                                 values = "2019 [YR2019]")

ValueError: Index contains duplicate entries, cannot reshape

In [None]:
happy_merged = happy.merge(nutrition_wide, how = "left", 
                           left_on = "Country or region", 
                           right_on = "Country Name")

In [None]:
test = happy_merged.replace('..', np.nan)
test["Prevalence of undernourishment (percent of population)"] = pd.to_numeric(test["Prevalence of undernourishment (% of population)"])
test["Prevalence of HIV, total (percent of population ages 15-49)"] = pd.to_numeric(test["Prevalence of HIV, total (% of population ages 15-49)"])
test["Net migration"] = pd.to_numeric(test["Net migration"])
test["Current health expenditure per capita, PPP (current international $)"] = pd.to_numeric(test["Current health expenditure per capita, PPP (current international $)"])

test.head()

In [None]:
sns.histplot(data = test, x = "Prevalence of undernourishment (percent of population)")

In [None]:
sns.histplot(data = test, x = "Prevalence of HIV, total (percent of population ages 15-49)", bins = 10)

In [None]:
sns.histplot(data = test, x = "Current health expenditure per capita, PPP (current international $)", bins = 20)

In [None]:
sns.histplot(data = test, x = "Net migration", bins = 30)

In [None]:
sns.relplot(x = "Score", y = "Prevalence of HIV, total (% of population ages 15-49)", data = test)

In [None]:
sns.relplot(x = "Score", y = "Prevalence of undernourishment (% of population)", data = test)

In [None]:
sns.relplot(x = "Score", y = "Net migration", data = test)  # Initially does not look like there is much of a relationship

In [None]:
sns.relplot(x = "Score", y = "Current health expenditure per capita, PPP (current international $)", data = test)

In [None]:
"""
We will attempt to data mine information from the variables.

Thus, we will not be train/test splitting nor standardizing data or using 
regularization methods, for now.
"""
X = test[["Country or region",
          "Current health expenditure per capita, PPP (current international $)",
          "Prevalence of undernourishment (percent of population)",
          "Net migration",
          "Prevalence of HIV, total (percent of population ages 15-49)"]]
y = test[["Score"]]


X[X["Prevalence of HIV, total (percent of population ages 15-49)"].isna()]

One of the first big challenges we run into is missing data. Given the extremely limited amount of observations we have (155), it would be foolish to drop rows with missing entries. Instead, we can use a variation of monte carlo simulation from module 3. Specifically, we will use Markov Chain Monte Carlo (MCMC) to impute missing entries from a multivariate normal distribution. This will undoubtedly give us the most accurate values for missing data and perform better than mean replacement/row dropping.

In [None]:
lm = LinearRegression()
print(X.head())
lm.fit(X, y)
lm.coef_