In [24]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

Task 1: In this task, we will use different kinds of models to explore the relationship between economic status and life expectancy. For Afghanistan for instance, as the following table shows, we can use older data (from 2000 to 2013) to train models and use the trained models to predict life expectancy of 2014 and 2015. The model input can be GDP number and the model output will be life expectancy for that year.

In [118]:
def linear_reg(country_df):
    # get the country name to return
    country_name = country_df.iloc[0]['Country']

    # Cull data to use only rows that we are concerned about
    country_df = country_df[['Year', 'Life expectancy ', 'GDP']]

    # Sort the data frame by year and remove the index
    country_df = country_df.sort_values(by=['Year'])
    country_df.reset_index(inplace=True)
    country_df.drop(columns=['index'], inplace=True)

    # Split data into a testing and training set
    test_df = country_df.loc[country_df['Year'] > 2013]
    training_df = country_df.loc[country_df['Year'] < 2014]

    # Split into our x and y
    x_train = training_df['GDP'].to_numpy().reshape(-1, 1)
    y_train = training_df['Life expectancy '].to_numpy()

    # Do the Linear Regression
    linreg = LinearRegression()
    model = linreg.fit(x_train, y_train)

    # get the GDP values that we are trying to predict
    gdp_vals = test_df['GDP']

    # run the gdps from 2014 and 2015 through the model
    le_preds = [] # store predictions into a list
    for i in gdp_vals:
        gdp = np.array([i])
        prediction = model.predict(gdp.reshape(1, -1))
        print("Predicted Response " + country_name + ":", prediction)
        le_preds.append(prediction)

    # get true values of life expectancy from 2014-2015
    le_true_vals = test_df['Life expectancy ']

    # get the r2 score
    train_r2 = model.score(x, y)
    test_r2 = mean_squared_error(le_true_vals, le_preds, squared=False)

    train_true_le = []
    for gdp in x_train:
        train_true_le.append(model.predict(gdp.reshape(-1, 1)))

    # get the rmse for both train and testing sets
    train_rmse = mean_squared_error(y_train, train_true_le, squared=False)
    test_rmse = r2_score(le_true_vals, le_preds)

    return country_name, test_r2, test_rmse, train_r2, train_rmse

In [119]:
# read CSV
df = pd.read_csv('Life Expectancy Data.csv')

# get rows only with Afghanistan data
afghanistan = df.loc[df['Country'] == 'Afghanistan']

country_name, testing_r2, testing_rmse, train_r2, train_rmse = linear_reg(afghanistan)

# print r2 score
print('R2 score for training data: ', train_r2)
print('R2 score for testing data: ', testing_r2)
print('RMSE for training data: ', train_rmse)
print('RMSE for testing data: ', testing_rmse)

Predicted Response Afghanistan: [59.1529947]
Predicted Response Afghanistan: [59.00903924]
R2 score for training data:  0.4788209985493368
R2 score for testing data:  4.269053041258137
RMSE for training data:  1.067259118407288
RMSE for testing data:  -1.8027395415725236


In [65]:
x = afghanistan['GDP'].to_numpy()
# Quadratic Function
# Fit a polynomial of degree 2
weights = np.polyfit(x, y, 2)
model = np.poly1d(weights)
predicted = model(x)

# Calculate RMSE and R2 for training data
print('RMSE for training data: ', np.sqrt(mean_squared_error(y, predicted)))
print('R2 for training data: ', r2_score(y, predicted))


predicted = model(pred_x)
# Calculate the RMSE for testing data
print('RMSE for testing data: ', np.sqrt(mean_squared_error(true_vals, predicted)))
print('R2 for testing data: ', r2_score(true_vals, predicted))

RMSE for training data:  0.9894912620578978
R2 for training data:  0.5520071442080716
RMSE for testing data:  4.150947401409653
R2 for testing data:  -1.6498061252240737


In [66]:
# Cubic Function
# Fit a polynomial of degree 3
weights = np.polyfit(x, y, 3)
model = np.poly1d(weights)
predicted = model(x)

# Calculate RMSE and R2 for training data
print('RMSE for training data: ', np.sqrt(mean_squared_error(y, predicted)))
print('R2 for training data: ', r2_score(y, predicted))

predicted = model(pred_x)
# Calculate the RMSE for testing data
print('RMSE for testing data: ', np.sqrt(mean_squared_error(true_vals, predicted)))
print('R2 for testing data: ', r2_score(true_vals, predicted))

RMSE for training data:  0.8718359094448804
R2 for training data:  0.6522103389963443
RMSE for testing data:  3.8221538776098685
R2 for testing data:  -1.2466528664556935


In [68]:
# Quartic Function
# Fit a polynomial of degree 4
weights = np.polyfit(x, y, 4)
model = np.poly1d(weights)
predicted = model(x)

# Calculate RMSE and R2 for training data
print('RMSE for training data: ', np.sqrt(mean_squared_error(y, predicted)))
print('R2 for training data: ', r2_score(y, predicted))

predicted = model(pred_x)
# Calculate the RMSE for testing data
print('RMSE for testing data: ', np.sqrt(mean_squared_error(true_vals, predicted)))
print('R2 for testing data: ', r2_score(true_vals, predicted))

RMSE for training data:  0.8329835827343556
R2 for training data:  0.682517314351077
RMSE for testing data:  4.1934730345539855
R2 for testing data:  -1.7043777149606156


Task 2: Please repeat this process for all the countries in this dataset. Then, you can average the RMSE and R2 scores for all the developing and developed countries.

In [90]:
# read CSV
df = pd.read_csv('Life Expectancy Data.csv')

# Drop rows that have no data in the columns gdp and life expectancy
df.dropna(subset=['GDP', 'Life expectancy '], inplace=True)


# We also want to split the data into developed and developing dataframes
developed = df[df['Status'] == 'Developed']
developing = df[df['Status'] == 'Developing']

# limit data to country, gdp, life expectancy
developed = developed[['Country', 'Year', 'Life expectancy ', 'GDP']]
developing = developing[['Country', 'Year', 'Life expectancy ', 'GDP']]

# get each country and put it into a list
list_developed = []
list_developing = []

for country in developed['Country']:
    if country not in list_developed:
        list_developed.append(country)

for country in developing['Country']:
    if country not in list_developing:
        list_developing.append(country)



Unnamed: 0,Country,Year,Life expectancy,GDP
112,Australia,2015,82.8,56554.38760
113,Australia,2014,82.7,62214.69120
114,Australia,2013,82.5,67792.33860
115,Australia,2012,82.3,67677.63477
116,Australia,2011,82.0,62245.12900
...,...,...,...,...
2532,Switzerland,2004,81.0,53255.97631
2533,Switzerland,2003,85.0,4796.56497
2534,Switzerland,2002,84.0,41336.72192
2535,Switzerland,2001,82.0,38538.64447
