In [24]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

Task 1: In this task, we will use different kinds of models to explore the relationship between economic status and life expectancy. For Afghanistan for instance, as the following table shows, we can use older data (from 2000 to 2013) to train models and use the trained models to predict life expectancy of 2014 and 2015. The model input can be GDP number and the model output will be life expectancy for that year.

In [229]:
def linear_reg(country_df):
    # get the country name to return
    country_name = country_df.iloc[0]['Country']

    # Cull data to use only rows that we are concerned about
    country_df = country_df[['Year', 'Life expectancy ', 'GDP']]

    # Sort the data frame by year and remove the index
    country_df = country_df.sort_values(by=['Year'])
    country_df.reset_index(inplace=True)
    country_df.drop(columns=['index'], inplace=True)

    # Split data into a testing and training set
    test_df = country_df.loc[country_df['Year'] > 2013]
    training_df = country_df.loc[country_df['Year'] < 2014]

    # Split into our x and y
    x_train = training_df['GDP'].to_numpy().reshape(-1, 1)
    y_train = training_df['Life expectancy '].to_numpy()

    # Do the Linear Regression
    linreg = LinearRegression()
    model = linreg.fit(x_train, y_train)

    # get the GDP values that we are trying to predict
    gdp_vals = test_df['GDP']

    # run the gdps from 2014 and 2015 through the model
    le_preds = [] # store predictions into a list
    for i in gdp_vals:
        gdp = np.array([i])
        prediction = model.predict(gdp.reshape(1, -1))
        # print("Predicted Response " + country_name + ":", prediction)
        le_preds.append(prediction)

    # get true values of life expectancy from 2014-2015
    le_true_vals = test_df['Life expectancy ']

    # get the r2 score
    train_r2 = model.score(x_train, y_train)
    test_r2 = r2_score(le_true_vals, le_preds)
    train_pred_le = []
    for gdp in x_train:
        train_pred_le.append(model.predict(gdp.reshape(-1, 1)))

    # get the rmse for both train and testing sets
    train_rmse = mean_squared_error(y_train, train_pred_le, squared=False)
    test_rmse = mean_squared_error(le_true_vals, le_preds, squared=False)


    return country_name, test_r2, test_rmse, train_r2, train_rmse

In [193]:
def poly_reg(country_df, degree):
    # get the country name to return
    country_name = country_df.iloc[0]['Country']

    # Cull data to use only rows that we are concerned about
    country_df = country_df[['Year', 'Life expectancy ', 'GDP']]

    # Split data into a testing and training set
    test_df = country_df.loc[country_df['Year'] > 2013]
    training_df = country_df.loc[country_df['Year'] < 2014]

    # Sort the data frame by year and remove the index
    country_df = country_df.sort_values(by=['Year'])
    country_df.reset_index(inplace=True)
    country_df.drop(columns=['index'], inplace=True)

    # Split into our x and y
    x = training_df['GDP'].to_numpy()
    y = training_df['Life expectancy '].to_numpy()

    # Fit a model according to the degree
    weights = np.polyfit(x, y, degree)
    model = np.poly1d(weights)
    predicted = model(x)

    # Calculate RMSE and R2 for training data
    train_rmse = np.sqrt(mean_squared_error(y, predicted))
    train_r2 = r2_score(y, predicted)

    # Get the GDP values from 2014 + 2015
    gdp_vals = test_df['GDP']
    le_true_vals = test_df['Life expectancy ']
    # Predict values according to 2014 + 2015 GDP values
    predicted = model(gdp_vals)

    test_rmse = np.sqrt(mean_squared_error(le_true_vals, predicted))
    test_r2 =  r2_score(le_true_vals, predicted)

    return country_name, train_rmse, train_r2, test_rmse, test_r2

In [179]:
# read CSV
df = pd.read_csv('Life Expectancy Data.csv')

# get rows only with Afghanistan data
afghanistan = df.loc[df['Country'] == 'Afghanistan']

country_name, test_r2, test_rmse, train_r2, train_rmse = linear_reg(afghanistan)

# print r2 score
print('RMSE for training data: ', train_rmse)
print('RMSE for testing data: ', test_rmse)
print('R2 score for training data: ', train_r2)
print('R2 score for testing data: ', test_r2)

RMSE for training data:  1.067259118407288
RMSE for testing data:  4.269053041258137
R2 score for training data:  0.4788209985493368
R2 score for testing data:  -1.8027395415725236


In [160]:
# Quadratic Function
# read CSV
df = pd.read_csv('Life Expectancy Data.csv')

# get rows only with Afghanistan data
afghanistan = df.loc[df['Country'] == 'Afghanistan']

country_name, train_rmse, train_r2, test_rmse, test_r2 = poly_reg(afghanistan, 2)

# print r2 score
print('RMSE for training data: ', train_rmse)
print('RMSE for testing data: ', test_rmse)
print('R2 score for training data: ', train_r2)
print('R2 score for testing data: ', test_r2)

RMSE for training data:  0.9894912620578965
RMSE for testing data:  4.150947401409658
R2 score for training data:  0.5520071442080727
R2 score for testing data:  -1.6498061252240808


In [161]:
# Cubic Function
# Fit a polynomial of degree 3
country_name, train_rmse, train_r2, test_rmse, test_r2 = poly_reg(afghanistan, 3)

# print r2 score
print('RMSE for training data: ', train_rmse)
print('RMSE for testing data: ', test_rmse)
print('R2 score for training data: ', train_r2)
print('R2 score for testing data: ', test_r2)

RMSE for training data:  0.8718359094448787
RMSE for testing data:  3.8221538776098583
R2 score for training data:  0.6522103389963457
R2 score for testing data:  -1.2466528664556815


In [162]:
# Quartic Function
# Fit a polynomial of degree 4
country_name, train_rmse, train_r2, test_rmse, test_r2 = poly_reg(afghanistan, 4)

# print r2 score
print('RMSE for training data: ', train_rmse)
print('RMSE for testing data: ', test_rmse)
print('R2 score for training data: ', train_r2)
print('R2 score for testing data: ', test_r2)

RMSE for training data:  0.8329835827343558
RMSE for testing data:  4.193473034554029
R2 score for training data:  0.6825173143510768
R2 score for testing data:  -1.7043777149606711


Task 2: Please repeat this process for all the countries in this dataset. Then, you can average the RMSE and R2 scores for all the developing and developed countries.

In [226]:
# read CSV
df = pd.read_csv('Life Expectancy Data.csv')

# Drop rows that have no data in the columns gdp and life expectancy
na_free = df.dropna(subset=['GDP', 'Life expectancy '])
only_na = df[~df.index.isin(na_free.index)]

# Find all countries that have NA values in GDP or Life Expectancy so we can drop them
na_countries = []
for country in only_na['Country']:
    if country not in na_countries:
        na_countries.append(country)

df = df[df.Country.isin(na_countries) == False]

# We also want to split the data into developed and developing dataframes
developed = df[df['Status'] == 'Developed']
developing = df[df['Status'] == 'Developing']


# limit data to country, gdp, life expectancy
developed = developed[['Country', 'Year', 'Life expectancy ', 'GDP']]
developing = developing[['Country', 'Year', 'Life expectancy ', 'GDP']]

# get each country and put it into a list
list_developed = []
list_developing = []

for country in developed['Country']:
    if country not in list_developed:
        list_developed.append(country)

for country in developing['Country']:
    if country not in list_developing:
        list_developing.append(country)

Unnamed: 0,Country,Year,Life expectancy,GDP
0,Afghanistan,2015,65.0,584.259210
1,Afghanistan,2014,59.9,612.696514
2,Afghanistan,2013,59.9,631.744976
3,Afghanistan,2012,59.5,669.959000
4,Afghanistan,2011,59.2,63.537231
...,...,...,...,...
2933,Zimbabwe,2004,44.3,454.366654
2934,Zimbabwe,2003,44.5,453.351155
2935,Zimbabwe,2002,44.8,57.348340
2936,Zimbabwe,2001,45.3,548.587312


In [230]:
# Create a dictionary to hold all scores for countries
developed_scores = pd.DataFrame(columns=['Country', 'Train RMSE', 'Test RMSE', 'Train R2', 'Test R2'])
for country in list_developed:
    # get rows only with only the country's data
    country_df = df.loc[df['Country'] == country]
    country_name, test_r2, test_rmse, train_r2, train_rmse = linear_reg(country_df)
    developed_scores = developed_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)

# Average all columms
mean_rmse_train = developed_scores['Train RMSE'].mean()
mean_rmse_test = developed_scores['Test RMSE'].mean()
mean_r2_train = developed_scores['Train R2'].mean()
mean_r2_test = developed_scores['Test R2'].mean()

# Print the values
print('RMSE for developed training data: ', mean_rmse_train)
print('RMSE for developed testing data: ', mean_rmse_test)
print('R2 score for developed training data: ', mean_r2_train)
print('R2 score for developed testing data: ', mean_r2_test)

RMSE for training data:  1.98012936948412
RMSE for testing data:  2.4042191269958013
R2 score for training data:  0.13755907547154808
R2 score for testing data:  -208.61461071109397


  developed_scores = developed_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developed_scores = developed_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developed_scores = developed_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developed_scores = developed_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developed_scores = developed_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developed_scores = developed_scores.append({'Country': country_name, 'Train RMSE': 

In [232]:
# Create a dictionary to hold all scores for countries
developing_scores = pd.DataFrame(columns=['Country', 'Train RMSE', 'Test RMSE', 'Train R2', 'Test R2'])
for country in list_developing:
    # get rows only with only the country's data
    country_df = df.loc[df['Country'] == country]
    country_name, test_r2, test_rmse, train_r2, train_rmse = linear_reg(country_df)
    developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)

# Average all columms
mean_rmse_train = developing_scores['Train RMSE'].mean()
mean_rmse_test = developing_scores['Test RMSE'].mean()
mean_r2_train = developing_scores['Train R2'].mean()
mean_r2_test = developing_scores['Test R2'].mean()

# Print the values
print('RMSE for developing training data: ', mean_rmse_train)
print('RMSE for developing testing data: ', mean_rmse_test)
print('R2 score for developing training data: ', mean_r2_train)
print('R2 score for developing testing data: ', mean_r2_test)

  developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developing_scores = developing_scores.append({'Country': country_name, 'T

RMSE for training data:  1.7199870903038201
RMSE for testing data:  2.609483395470592
R2 score for training data:  0.24229942491722484
R2 score for testing data:  -398.12926632117313


  developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developing_scores = developing_scores.append({'Country': country_name, 'Train RMSE': train_rmse, 'Test RMSE': test_rmse, 'Train R2': train_r2, 'Test R2': test_r2}, ignore_index=True)
  developing_scores = developing_scores.append({'Country': country_name, 'T