In [24]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

Task 1: In this task, we use different kinds of models to explore the relationship between economic status and life expectancy. Taking Afghanistan as an example, as the following table shows, we can train models on older data (from 2000 to 2013) and then use the trained models to predict life expectancy for 2014 and 2015. The model input is the GDP value for a given year, and the model output is the life expectancy for that year.

In [45]:
# Fit a linear regression of life expectancy on GDP for Afghanistan, using the
# years before 2014 for training and the years after 2013 as the hold-out set.
# The cell leaves `afghanistan`, `actual`, `x`, `y`, `model`, `pred_x`,
# `true_vals`, `predictions` and `rmse_pred` in the kernel namespace.

# read CSV
df = pd.read_csv('Life Expectancy Data.csv')

# get rows only with Afghanistan data, limited to year, life expectancy, gdp
# (the life-expectancy column name used here ends with a trailing space)
afghanistan = df.loc[df['Country'] == 'Afghanistan',
                     ['Year', 'Life expectancy ', 'GDP']]

# hold out the rows after 2013 so they can be compared with the predictions
actual = afghanistan.loc[afghanistan['Year'] > 2013]

# training rows: everything before 2014, sorted by year with a fresh 0..n-1 index
afghanistan = (
    afghanistan.loc[afghanistan['Year'] < 2014]
    .sort_values(by=['Year'])
    .reset_index(drop=True)
)

# train a linear regression model (GDP -> life expectancy)
# scikit-learn expects a 2-D feature matrix, hence the reshape to one column
x = afghanistan['GDP'].to_numpy().reshape(-1, 1)
y = afghanistan['Life expectancy '].to_numpy()

linreg = LinearRegression()
model = linreg.fit(x, y)

# hold-out inputs and the true values they are scored against
pred_x = actual['GDP']
true_vals = actual['Life expectancy ']

# run all hold-out gdp values through the model in a single call; the result is
# kept as a list of 1-element arrays so the printed output and the structure of
# `predictions` stay the same as with one predict() call per row
predictions = list(model.predict(pred_x.to_numpy().reshape(-1, 1)).reshape(-1, 1))
for prediction in predictions:
    print("Predicted Response:", prediction)

# print r2 score
print('R2 score for training data: ', model.score(x, y))
print('R2 score for testing data: ', r2_score(true_vals, predictions))

# calculate rmse as sqrt(MSE); the `squared=False` keyword of
# mean_squared_error is deprecated/removed in recent scikit-learn releases
rmse_pred = model.predict(x)
print('RMSE for training data: ', np.sqrt(mean_squared_error(y, rmse_pred)))
print('RMSE for testing data: ', np.sqrt(mean_squared_error(true_vals, predictions)))

Predicted Response: [59.00903924]
Predicted Response: [59.1529947]
R2 score for training data:  0.4788209985493368
R2 score for testing data:  -1.8027395415725236
RMSE for training data:  1.067259118407288
RMSE for testing data:  4.269053041258137


In [65]:
# Quadratic Function
# Least-squares polynomial of degree 2: life expectancy as a function of GDP.
# Relies on `afghanistan`, `y`, `pred_x` and `true_vals` from an earlier cell.
x = afghanistan['GDP'].to_numpy()  # rebinds x as a flat array (no reshape here)
weights = np.polyfit(x, y, deg=2)
model = np.poly1d(weights)

# Score the polynomial on the data it was fitted to
predicted = model(x)
train_mse = mean_squared_error(y, predicted)
print('RMSE for training data: ', np.sqrt(train_mse))
print('R2 for training data: ', r2_score(y, predicted))

# Score the same polynomial on the held-out rows;
# `predicted` is rebound to the hold-out predictions from here on
predicted = model(pred_x)
test_mse = mean_squared_error(true_vals, predicted)
print('RMSE for testing data: ', np.sqrt(test_mse))
print('R2 for testing data: ', r2_score(true_vals, predicted))

RMSE for training data:  0.9894912620578978
R2 for training data:  0.5520071442080716
RMSE for testing data:  4.150947401409653
R2 for testing data:  -1.6498061252240737


In [66]:
# Cubic Function
# Least-squares polynomial of degree 3: life expectancy as a function of GDP.
# Relies on `x`, `y`, `pred_x` and `true_vals` left in the kernel by earlier cells.
weights = np.polyfit(x, y, deg=3)
model = np.poly1d(weights)

# Score the polynomial on the data it was fitted to
predicted = model(x)
train_mse = mean_squared_error(y, predicted)
print('RMSE for training data: ', np.sqrt(train_mse))
print('R2 for training data: ', r2_score(y, predicted))

# Score the same polynomial on the held-out rows;
# `predicted` is rebound to the hold-out predictions from here on
predicted = model(pred_x)
test_mse = mean_squared_error(true_vals, predicted)
print('RMSE for testing data: ', np.sqrt(test_mse))
print('R2 for testing data: ', r2_score(true_vals, predicted))

RMSE for training data:  0.8718359094448804
R2 for training data:  0.6522103389963443
RMSE for testing data:  3.8221538776098685
R2 for testing data:  -1.2466528664556935


In [68]:
# Quartic Function
# Least-squares polynomial of degree 4: life expectancy as a function of GDP.
# Relies on `x`, `y`, `pred_x` and `true_vals` left in the kernel by earlier cells.
weights = np.polyfit(x, y, deg=4)
model = np.poly1d(weights)

# Score the polynomial on the data it was fitted to
predicted = model(x)
train_mse = mean_squared_error(y, predicted)
print('RMSE for training data: ', np.sqrt(train_mse))
print('R2 for training data: ', r2_score(y, predicted))

# Score the same polynomial on the held-out rows;
# `predicted` is rebound to the hold-out predictions from here on
predicted = model(pred_x)
test_mse = mean_squared_error(true_vals, predicted)
print('RMSE for testing data: ', np.sqrt(test_mse))
print('R2 for testing data: ', r2_score(true_vals, predicted))

RMSE for training data:  0.8329835827343556
R2 for training data:  0.682517314351077
RMSE for testing data:  4.1934730345539855
R2 for testing data:  -1.7043777149606156
