In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor  
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

## Data loading

In [None]:
# A forward slash keeps the relative path portable. The previous backslash form
# depended on '\L' being an unrecognised escape sequence (which Python warns
# about) and only resolved to a real file on Windows.
df = pd.read_csv('data/Life-Expectancy-Data-Updated.csv')

# Simple renaming to improve readability
df = df.rename(columns={
    'Thinness_ten_nineteen_years': 'Thinness (10-19 years)',
    'Thinness_five_nine_years': 'Thinness (5-9 years)',
    'Economy_status_Developed': 'Developed',
    'Economy_status_Developing': 'Developing',
})

# Working frame for the rest of the notebook, without the columns listed here
# ('Thinness (5-9 years)' is referenced by its renamed label).
df_reduced = df.drop(
    ['Infant_deaths', 'Under_five_deaths', 'Diphtheria', 'Thinness (5-9 years)'],
    axis=1,
)

## Modelling

### PCA Analysis

In [None]:
# Feature matrix: every column except the target and the 'Country' / 'Region' columns.
X = df_reduced.drop(columns=['Life_expectancy', 'Country', 'Region'])
# Regression target.
y = df_reduced.loc[:, 'Life_expectancy']

In [None]:
# Standardise the features before PCA so that no column dominates purely
# because of its raw scale.
X_scaled = StandardScaler().fit_transform(X)

pca = PCA()
# Only the fitted variance ratios are needed here; the projection is discarded.
_ = pca.fit_transform(X_scaled)

# Plot against 1..n so the x-axis reads as "number of components kept".
# Passing only the y-values would start the axis at 0, i.e. the variance
# explained by the first component would appear above "0 components".
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance)
plt.xlabel('Components')
plt.ylabel('Fraction of Total Variance')
plt.show()

In [None]:
# Project the standardised data onto the principal axes and plot the first two
# components against each other, coloured by life expectancy.
pca_components = pca.fit_transform(X_scaled)
first_pc, second_pc = pca_components[:, 0], pca_components[:, 1]

plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=first_pc,
    y=second_pc,
    hue=df_reduced['Life_expectancy'],
    palette='viridis',
    alpha=0.75,
)
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()

In [None]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(df_reduced, test_size=0.2, random_state=42)

# Separate the target from the remaining columns in each partition.
X_train, y_train = df_train.drop(columns='Life_expectancy'), df_train['Life_expectancy']
X_test, y_test = df_test.drop(columns='Life_expectancy'), df_test['Life_expectancy']

# Predictors the models are fitted and evaluated on.
columns = ['Adult_mortality', 'Schooling', 'GDP_per_capita', 'Incidents_HIV', 'Developed']
X_train_subset = X_train.loc[:, columns]
X_test_subset = X_test.loc[:, columns]

In [None]:
def plot_results(model):
    """Fit ``model`` on the training subset and report its test-set error.

    The estimator is fitted in place on the notebook-level ``X_train_subset``
    and ``y_train``, then used to predict ``X_test_subset``. The mean squared
    error against ``y_test`` is printed and a predicted-vs-actual scatter plot
    is shown.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(X_train_subset, y_train)
    y_predict = model.predict(X_test_subset)
    # scikit-learn metrics take the ground truth first: (y_true, y_pred).
    mse = mean_squared_error(y_test, y_predict)
    print(f'The MSE is: {mse:.3f}')
    plt.scatter(y_predict, y_test)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

### Linear model

In [None]:
# Baseline: ordinary least-squares linear regression.
plot_results(model=LinearRegression())

### Decision Tree

In [None]:
# Single regression tree; seeded so the fit is repeatable.
plot_results(model=DecisionTreeRegressor(random_state=42))

### Random Forest

In [None]:
# Random forest with library-default hyperparameters and a fixed seed.
plot_results(model=RandomForestRegressor(random_state=42))

### Gradient Boosting

In [None]:
# Gradient-boosted trees with library-default hyperparameters and a fixed seed.
plot_results(model=GradientBoostingRegressor(random_state=0))

### XGBoost

In [None]:
# XGBoost: 5,000 boosting rounds at a learning rate of 0.08, using 8 parallel jobs.
plot_results(
    model=XGBRegressor(n_estimators=5_000, learning_rate=0.08, n_jobs=8)
)

### Extra Trees

In [None]:
# Extra-trees ensemble of 1,000 trees. Seeded like the other tree-based models
# so the reported MSE does not change from one run to the next.
plot_results(ExtraTreesRegressor(n_estimators=1_000, random_state=42))

### K-Nearest-Neighbours

In [None]:
# k-nearest-neighbours regressor with library-default settings.
plot_results(model=KNeighborsRegressor())

### Permutation importance

In [None]:
# Default-parameter XGBoost regressor, fitted on the training subset and used
# as the model whose permutation importances are inspected below.
model = XGBRegressor().fit(X_train_subset, y_train)

def perm_importance(model, X, y, n_repeats=100, random_state=42, n_jobs=2):
    """Compute per-repeat permutation importances for a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Passed straight to ``permutation_importance``.
    X : pandas.DataFrame
        Feature data; its column labels name the columns of the result.
    y : array-like
        Target values matching the rows of ``X``.
    n_repeats : int, default 100
        Number of times each feature is shuffled.
    random_state : int, default 42
        Seed for the shuffling, so repeated calls give the same numbers.
    n_jobs : int, default 2
        Number of parallel jobs.

    Returns
    -------
    pandas.DataFrame
        One row per repeat and one column per feature, with the columns
        ordered by ascending mean importance.
    """
    result = permutation_importance(
        model,
        X,
        y,
        n_repeats=n_repeats,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    # Sort features from least to most important on average.
    idx = result.importances_mean.argsort()
    # importances is (features x repeats); transpose so each feature is a column.
    return pd.DataFrame(result.importances[idx].T, columns=X.columns[idx])

In [None]:
importance_train = perm_importance(model, X_train_subset, y_train)
ax = importance_train.plot.box(vert=False, whis=10)
ax.set_title("Permutation Importances (Training Set)")
# Reference line at zero: shuffling the feature made no difference to the score.
ax.axvline(x=0, color="k", linestyle="--")
# permutation_importance is called without `scoring`, so the drop is measured
# with the estimator's default score, which is R^2 for a regressor, not accuracy.
ax.set_xlabel("Decrease in $R^2$ score")
ax.figure.tight_layout()
plt.show()

In [None]:
importance_test = perm_importance(model, X_test_subset, y_test)
ax = importance_test.plot.box(vert=False, whis=10)
ax.set_title("Permutation Importances (Test Set)")
# Reference line at zero: shuffling the feature made no difference to the score.
ax.axvline(x=0, color="k", linestyle="--")
# permutation_importance is called without `scoring`, so the drop is measured
# with the estimator's default score, which is R^2 for a regressor, not accuracy.
ax.set_xlabel("Decrease in $R^2$ score")
ax.figure.tight_layout()
plt.show()

In [None]:
def plot_h(train, test):
    """Draw paired horizontal bars comparing train and test values per feature.

    Parameters
    ----------
    train : pandas.Series
        Values indexed by feature name; its index fixes the row order and the
        tick labels.
    test : pandas.Series or array-like
        Values to draw next to ``train``. A Series is aligned to
        ``train.index`` by label (a missing label raises ``KeyError``); any
        other array-like is used positionally.
    """
    # The two Series can list the features in different orders (each importance
    # table is sorted by its own mean), so align by label before plotting.
    # Otherwise the test bars would be drawn under the wrong tick labels.
    if isinstance(test, pd.Series):
        test = test.loc[train.index]

    ax = plt.subplot(111)
    ind = np.arange(len(train.index))
    height = 0.3
    # Offset the two bars of each pair by half a bar height around the tick.
    ax.barh(ind + height / 2, train, height, label='Train')
    ax.barh(ind - height / 2, test, height, label='Test')
    ax.set(yticks=ind, yticklabels=train.index, ylim=[-2 * height, len(ind) - height])
    # The importances come from a regressor's default score (R^2), not accuracy.
    ax.set_xlabel("Decrease in $R^2$ score")
    ax.figure.tight_layout()
    plt.legend(loc='lower right')
    plt.show()

# Compare the mean importance of each feature (averaged over the repeats) on
# the training and test sets.
plot_h(
    train=importance_train.mean(axis=0),
    test=importance_test.mean(axis=0),
)