In [8]:
#this version uses groups of features instead of all features
import pandas as pd
import os

# Remove pandas' display limits so frames are printed without truncation
# or line-wrapping of wide tables.
for display_option, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.max_colwidth', None),
    ('display.width', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(display_option, option_value)

In [9]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

In [10]:
# Directory holding the CSV inputs read by this notebook. The original
# location remains the default; set the BANA698_DATA_DIR environment variable
# to run the notebook against a copy of the data stored somewhere else
# without editing this cell.
folder_loc = os.environ.get('BANA698_DATA_DIR', 'F:/class/BANA 698/week 8')

# Names of the regular files directly inside folder_loc (directories are skipped).
file_list = [f for f in os.listdir(folder_loc) if os.path.isfile(os.path.join(folder_loc, f))]
#for file in file_list: print(file)

In [11]:
# Read Group1DatasetRaw.onehotted.csv from folder_loc into df, the frame the
# modelling cells below work from.
file = 'Group1DatasetRaw.onehotted.csv'
dataset_path = os.path.join(folder_loc, file)
df = pd.read_csv(dataset_path)

In [12]:
#for col in sorted(df.columns): print(col)

In [13]:
# Column the models are trained to predict.
target = 'Life expectancy at birth, total (years)'

# Columns that are always removed from the feature matrix.
cols_to_exclude = [
    # the target itself and its female / male variants
    target,
    'Life expectancy at birth, female (years)',
    'Life expectancy at birth, male (years)',
    # country and year columns
    'CountryShortName',
    'Year',
    # perfect correlate determined by correlation_to_all_lister.ipynb
    # (counterpart noted by the author: 'Urban population (% of total population)')
    'Rural population (% of total population)',
]


In [None]:
# Reference copies of the library-default hyperparameters (don't edit).
# They are kept for comparison with the tuned values used for modelling.

# Captured from:
#   model = DecisionTreeRegressor()
#   model.get_params()
dtr_defaults = dict(
    ccp_alpha=0.0,
    criterion='squared_error',
    max_depth=None,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    random_state=None,
    splitter='best',
)

# Captured from:
#   model = BaggingRegressor()
#   model.get_params()
bag_defaults = dict(
    base_estimator='deprecated',
    bootstrap=True,
    bootstrap_features=False,
    estimator=None,
    max_features=1.0,
    max_samples=1.0,
    n_estimators=10,
    n_jobs=None,
    oob_score=False,
    random_state=None,
    verbose=0,
    warm_start=False,
)


In [None]:
# Hyperparameters actually used for modelling (edit these, not the defaults).

# Settings for the decision tree used as the bagging base estimator;
# random_state is fixed to 1.
dtr_params = dict(
    ccp_alpha=0.0,
    criterion='squared_error',
    max_depth=None,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    random_state=1,
    splitter='best',
)

# Settings passed to BaggingRegressor: a DecisionTreeRegressor built from
# dtr_params as base estimator, n_jobs=-1, and random_state fixed to 1.
bag_params = dict(
    bootstrap=True,
    bootstrap_features=False,
    estimator=DecisionTreeRegressor(**dtr_params),
    max_features=1.0,
    max_samples=1.0,
    n_estimators=10,
    n_jobs=-1,
    oob_score=False,
    random_state=1,
    verbose=0,
    warm_start=False,
)

In [16]:
def bagging_features(cols_to_exclude = cols_to_exclude, target = target, df = df, kfold_random_state = 1, n_neighbors = 5):
    """Cross-validate a bagged regressor on a subset of columns and print a report.

    Every column of ``df`` that is not excluded becomes a feature. The rows are
    split with a shuffled 5-fold ``KFold``. Within each fold, missing values are
    filled by a ``KNNImputer`` fitted on the training rows only, a
    ``BaggingRegressor(**bag_params)`` is fitted, and MAE, MSE and R² are
    recorded. The fold averages and a ranked feature-importance table are printed.

    Parameters
    ----------
    cols_to_exclude : list of str
        Columns dropped from ``df`` before modelling. Every name must exist in
        ``df``; ``DataFrame.drop`` raises ``KeyError`` otherwise.
    target : str
        Column to predict. It is always removed from the feature matrix, whether
        or not ``cols_to_exclude`` lists it.
    df : pandas.DataFrame
        Source data containing the target and the candidate features.
    kfold_random_state : int
        Seed for the ``KFold`` shuffle.
    n_neighbors : int
        Number of neighbours used by the ``KNNImputer``.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    The defaults for ``cols_to_exclude``, ``target`` and ``df`` are the objects
    those globals referred to when this cell was executed; re-run this cell
    after reassigning them. ``bag_params`` is read from the notebook's globals
    each time the function is called.
    """
    # Guard against target leakage: drop the target even if the caller's
    # exclusion list does not mention it.
    drop_cols = list(cols_to_exclude)
    if target not in drop_cols:
        drop_cols.append(target)

    X = df.drop(columns = drop_cols)
    y = df[target]
    n_features = X.shape[1]

    cv = KFold(n_splits=5, shuffle=True, random_state = kfold_random_state)

    mae_scores = []
    mse_scores = []
    r2_scores = []
    r2_train_scores = []
    feature_importance_list = []

    for train_idx, val_idx in cv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the imputer on the training fold only so no information from the
        # validation rows influences the filled-in values.
        # NOTE(review): by default KNNImputer appears to drop columns that are
        # entirely missing in the training fold, which would make the column
        # relabelling below fail - confirm no such columns exist.
        imputer = KNNImputer(n_neighbors = n_neighbors)
        X_train_imputed = imputer.fit_transform(X_train)
        X_val_imputed = imputer.transform(X_val)
        X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)
        X_val_imputed = pd.DataFrame(X_val_imputed, columns=X_val.columns)

        model = BaggingRegressor(**bag_params)
        model.fit(X_train_imputed, y_train)
        y_train_pred = model.predict(X_train_imputed)
        y_pred = model.predict(X_val_imputed)

        mae_scores.append(mean_absolute_error(y_val, y_pred))
        mse_scores.append(mean_squared_error(y_val, y_pred))
        r2_train_scores.append(r2_score(y_train, y_train_pred))
        r2_scores.append(r2_score(y_val, y_pred))

        feature_importance_list.append(_bagged_feature_importance(model, n_features))

    avg_mae = np.mean(mae_scores)
    avg_mse = np.mean(mse_scores)
    avg_r2_train = np.mean(r2_train_scores)
    avg_r2 = np.mean(r2_scores)

    print(f'Average MAE: {avg_mae:.4f}')
    print(f'Average MSE: {avg_mse:.4f}')
    print(f'Average Training R²: {avg_r2_train:.4f}')
    print(f'Average Validation R²: {avg_r2:.4f}')

    # Average the per-fold importances, then rank features from most to least important.
    avg_feature_importance = np.mean(feature_importance_list, axis=0)
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': avg_feature_importance
    })
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)
    feature_importance_df['Rank'] = feature_importance_df.index + 1
    feature_importance_df = feature_importance_df[['Feature', 'Importance', 'Rank']]

    print('\nSorted Feature Importance:')
    print(feature_importance_df.to_string(index=False))


def _bagged_feature_importance(model, n_features):
    """Return the mean per-tree importance for each of the n_features input columns.

    Each fitted tree reports importances only for the columns it was trained on,
    in the order given by the matching entry of ``model.estimators_features_``.
    Scattering them back to full-width vectors keeps the attribution correct
    when ``max_features`` < 1.0 or ``bootstrap_features`` is enabled; when every
    tree sees every column in order the result equals a plain positional mean.
    """
    per_tree = np.zeros((len(model.estimators_), n_features))
    for row, (tree, feature_idx) in enumerate(zip(model.estimators_, model.estimators_features_)):
        # np.add.at accumulates correctly if a column index is repeated
        # (possible when features are drawn with replacement).
        np.add.at(per_tree[row], feature_idx, tree.feature_importances_)
    return per_tree.mean(axis=0)

In [17]:
# First run, with all features: called with its default arguments, so every
# column of df except those listed in cols_to_exclude is used as a predictor.
bagging_features()

Average MAE: 0.3816
Average MSE: 0.3643
Average Training R²: 0.9989
Average Validation R²: 0.9948

Sorted Feature Importance:
                                                                                                              Feature   Importance  Rank
                                                              Mortality rate, adult, female (per 1,000 female adults) 6.503155e-01     1
                                                                Mortality rate, under-5, male (per 1,000 live births) 1.414440e-01     2
                                                                                 Death rate, crude (per 1,000 people) 7.230752e-02     3
                                                                      Mortality rate, under-5 (per 1,000 live births) 5.595189e-02     4
                                                              Mortality rate, under-5, female (per 1,000 live births) 4.235377e-02     5
                                                    

In [18]:
# Run only one subgroup of features at a time: read the field mapping file
# and show which 'Indicator Category' values it contains.
file = 'Field Mapping List.csv'
mapping_path = os.path.join(folder_loc, file)
df2 = pd.read_csv(mapping_path)
category_values = df2['Indicator Category'].unique()
print(category_values)

['Environmental' 'Health' 'Socioeconomic']


In [19]:
feats_all = df.columns.tolist()


def _exclusions_keeping(category):
    """Return the drop-list that leaves only `category` fields as features.

    The list holds every 'Field Name' in df2 whose 'Indicator Category' differs
    from `category`, plus the always-excluded columns in cols_to_exclude.
    """
    other_fields = df2.loc[df2['Indicator Category'] != category, 'Field Name']
    # sorted() gives a reproducible order; the iteration order of a set of
    # strings can change from one kernel session to the next.
    return sorted(set(other_fields) | set(cols_to_exclude))


def _n_remaining(excluded):
    """Count the columns of df that are not named in `excluded`."""
    excluded = set(excluded)
    return sum(col not in excluded for col in feats_all)


cols_to_exclude_except_enviro = _exclusions_keeping('Environmental')
cols_to_exclude_except_health = _exclusions_keeping('Health')
cols_to_exclude_except_socio = _exclusions_keeping('Socioeconomic')

print(f"{len(feats_all)} total features")
for group_label, group_exclusions in (
    ('environmental', cols_to_exclude_except_enviro),
    ('health', cols_to_exclude_except_health),
    ('socioeconomic', cols_to_exclude_except_socio),
):
    print(f"{_n_remaining(group_exclusions)} {group_label} features")

print(cols_to_exclude_except_enviro)
print(cols_to_exclude_except_health)
print(cols_to_exclude_except_socio)

286 total features
80 environmental features
73 health features
127 socioeconomic features
['School enrollment, primary (% gross)', 'GDP, PPP (constant 2021 international $)', 'Wage and salaried workers, male (% of male employment) (modeled ILO estimate)', 'GOAL 10: Reduced Inequality (5 year moving average)', 'Population ages 0-14, total', 'Labor force participation rate, total (% of total population ages 15-64) (modeled ILO estimate)', 'Probability of dying among children ages 5-9 years (per 1,000)', 'Travel services (% of commercial service imports)', 'Prevalence of overweight (modeled estimate, % of children under 5)', 'Life expectancy at birth, total (years)', 'GDP (constant 2015 US$)', 'GDP per capita growth (annual %)', 'Refugee population by country or territory of asylum', 'Labor force, total', 'Labor Force Survey (Availability score over 10 years)', 'Birth rate, crude (per 1,000 people)', 'Communications, computer, etc. (% of service imports, BoP)', 'International tourism, ex

## Environmental Features

In [20]:
# Just the environmental features: drops every field whose category is not
# 'Environmental', plus the always-excluded columns in cols_to_exclude.
bagging_features(cols_to_exclude_except_enviro)

Average MAE: 0.7016
Average MSE: 1.2443
Average Training R²: 0.9961
Average Validation R²: 0.9820

Sorted Feature Importance:
                                                                                         Feature  Importance  Rank
                                                         Access to electricity (% of population)    0.657766     1
                            Access to clean fuels and technologies for cooking (% of population)    0.081420     2
                                            Access to electricity, urban (% of urban population)    0.035695     3
               Access to clean fuels and technologies for cooking, urban (% of urban population)    0.023296     4
                                                               Arable land (hectares per person)    0.019422     5
Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)    0.012072     6
                                  Carbon intensity of GDP (kg CO2e pe

In [21]:
# Second environmental run: also exclude high correlation environmental features
# from correlation_to_all_within_group_lister.ipynb.
# NOTE: the name addl_cols_to_exclude is reassigned by the health and
# socioeconomic cells further down, so its contents depend on which of those
# cells ran last.
addl_cols_to_exclude = [
    'Total greenhouse gas emissions including LULUCF (Mt CO2e)',
    'Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e)',
    'Cereal production (metric tons)',
    'Fluorinated greenhouse gases (F-gases) emissions from Industrial Processes (Mt CO2e)',
    'Methane (CH4) emissions (total) excluding LULUCF (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Industrial Combustion (Energy) (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Power Industry (Energy) (Mt CO2e)',
    'Nitrous oxide (N2O) emissions (total) excluding LULUCF (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Building (Energy) (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Fugitive Emissions (Energy) (Mt CO2e)',
    'Land under cereal production (hectares)',
    'Methane (CH4) emissions from Waste (Mt CO2e)',
    'Methane (CH4) emissions from Industrial Combustion (Energy) (Mt CO2e)',
    'Methane (CH4) emissions from Agriculture (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Industrial Processes (Mt CO2e)',
    'Nitrous oxide (N2O) emissions from Industrial Combustion (Energy) (Mt CO2e)',
    'Nitrous oxide (N2O) emissions from Agriculture (Mt CO2e)',
    'Aquaculture production (metric tons)',
    'Total fisheries production (metric tons)',
    'Nitrous oxide (N2O) emissions from Power Industry (Energy) (Mt CO2e)',
    'Forest area (sq. km)',
    'Nitrous oxide (N2O) emissions from Waste (Mt CO2e)',
    'Nitrous oxide (N2O) emissions from Building (Energy) (Mt CO2e)',
    'Access to clean fuels and technologies for cooking (% of population)',
    'Methane (CH4) emissions from Power Industry (Energy) (Mt CO2e)',
    'Methane (CH4) emissions from Fugitive Emissions (Energy) (Mt CO2e)',
    'Carbon dioxide (CO2) emissions from Transport (Energy) (Mt CO2e)',
    'Methane (CH4) emissions from Transport (Energy) (Mt CO2e)',
    'Land area (sq. km)',
    'Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita)',
    'Nitrous oxide (N2O) emissions from Industrial Processes (Mt CO2e)',
    'Methane (CH4) emissions from Building (Energy) (Mt CO2e)',
    'Access to electricity (% of population)',
    'Rural population (% of total population)',
    'Total greenhouse gas emissions excluding LULUCF (Mt CO2e)',
    'Access to electricity, rural (% of rural population)'
]
# set() collapses names that occur in both lists, e.g.
# 'Rural population (% of total population)', which cols_to_exclude already contains.
new_cols_to_exclude_except_enviro = list(set(cols_to_exclude_except_enviro + addl_cols_to_exclude))
bagging_features(new_cols_to_exclude_except_enviro)

Average MAE: 0.7317
Average MSE: 1.4512
Average Training R²: 0.9958
Average Validation R²: 0.9792

Sorted Feature Importance:
                                                                                         Feature  Importance  Rank
                                            Access to electricity, urban (% of urban population)    0.648290     1
               Access to clean fuels and technologies for cooking, urban (% of urban population)    0.100169     2
                                                               Arable land (hectares per person)    0.049122     3
               Access to clean fuels and technologies for cooking, rural (% of rural population)    0.024997     4
                                  Carbon intensity of GDP (kg CO2e per constant 2021 US$ of GDP)    0.017862     5
Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)    0.016500     6
                                   Fertilizer consumption (kilograms 

## Health Features

In [22]:
# Just the health features: drops every field whose category is not 'Health',
# plus the always-excluded columns in cols_to_exclude.
bagging_features(cols_to_exclude_except_health)

Average MAE: 0.3807
Average MSE: 0.3399
Average Training R²: 0.9990
Average Validation R²: 0.9951

Sorted Feature Importance:
                                                                                                            Feature  Importance  Rank
                                                              Mortality rate, under-5, male (per 1,000 live births)    0.372460     1
                                                            Mortality rate, adult, female (per 1,000 female adults)    0.331131     2
                                                                    Mortality rate, under-5 (per 1,000 live births)    0.111142     3
                                                                               Death rate, crude (per 1,000 people)    0.087879     4
                                                            Mortality rate, under-5, female (per 1,000 live births)    0.055580     5
                                                       Domestic privat

In [23]:
# Second health run: also exclude high correlation health features
# from correlation_to_all_within_group_lister.ipynb.
# NOTE: this overwrites the addl_cols_to_exclude list built in the
# environmental cell above; the name is overwritten again in the socioeconomic cell.
addl_cols_to_exclude = [
    'Life expectancy at birth, female (years)',
    'Number of deaths ages 15-19 years',
    'Mortality rate, infant (per 1,000 live births)',
    'Life expectancy at birth, total (years)',
    'Number of deaths ages 10-14 years',
    'Mortality rate, infant, female (per 1,000 live births)',
    'Number of deaths ages 20-24 years',
    'Prevalence of anemia among children (% of children ages 6-59 months)',
    'Mortality rate, infant, male (per 1,000 live births)',
    'Population ages 0-14 (% of total population)',
    'Mortality rate, under-5 (per 1,000 live births)',
    'Birth rate, crude (per 1,000 people)',
    'Population ages 0-14, female (% of female population)',
    'Mortality rate, neonatal (per 1,000 live births)',
    'Mortality rate, under-5, female (per 1,000 live births)',
    'Life expectancy at birth, male (years)',
    'Number of deaths ages 5-9 years',
    'Current health expenditure per capita (current US$)',
    'Out-of-pocket expenditure per capita (current US$)',
    'Population ages 0-14, male (% of male population)',
    'Population ages 10-14, female (% of female population)',
    'Population ages 0-14, female',
    'Population ages 10-14, male (% of male population)',
    'Prevalence of anemia among non-pregnant women (% of women ages 15-49)',
    'Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)',
    'People using at least basic sanitation services, rural (% of rural population)',
    'Number of infant deaths',
    'Current health expenditure per capita, PPP (current international $)',
    'Mortality rate, under-5, male (per 1,000 live births)',
    'Maternal mortality ratio (modeled estimate, per 100,000 live births)',
    'Out-of-pocket expenditure (% of current health expenditure)',
    'Population ages 15-19, female (% of female population)',
    'Probability of dying among adolescents ages 15-19 years (per 1,000)',
    'Probability of dying among adolescents ages 10-14 years (per 1,000)',
    'Prevalence of anemia among pregnant women (%)',
    'Domestic general government health expenditure per capita (current US$)',
    'Domestic private health expenditure per capita (current US$)',
    'Lifetime risk of maternal death (%)',
    'Immunization, DPT (% of children ages 12-23 months)',
    'People using at least basic drinking water services, urban (% of urban population)',
    'Population ages 0-14, male',
    'Mortality rate, adult, female (per 1,000 female adults)',
    'Number of maternal deaths',
    'People using at least basic sanitation services, urban (% of urban population)',
    'Number of neonatal deaths',
    'Total alcohol consumption per capita, female (liters of pure alcohol, projected estimates, female 15+ years of age)'
]
# set() collapses names that occur in both lists; the three
# 'Life expectancy at birth, ...' columns above are already in cols_to_exclude.
new_cols_to_exclude_except_health = list(set(cols_to_exclude_except_health + addl_cols_to_exclude))
bagging_features(new_cols_to_exclude_except_health)

Average MAE: 0.4412
Average MSE: 0.4660
Average Training R²: 0.9986
Average Validation R²: 0.9933

Sorted Feature Importance:
                                                                                                        Feature  Importance  Rank
                                                 Probability of dying among children ages 5-9 years (per 1,000)    0.694992     1
                                                            Mortality rate, adult, male (per 1,000 male adults)    0.164198     2
                                                                           Death rate, crude (per 1,000 people)    0.103017     3
                       Domestic general government health expenditure per capita, PPP (current international $)    0.005794     4
                                          People using at least basic drinking water services (% of population)    0.003814     5
                                                 Lifetime risk of maternal death (1 in: rate v

## Socioeconomic Features

In [24]:
# Just the socioeconomic features: drops every field whose category is not
# 'Socioeconomic', plus the always-excluded columns in cols_to_exclude.
bagging_features(cols_to_exclude_except_socio)

Average MAE: 0.8604
Average MSE: 1.8540
Average Training R²: 0.9949
Average Validation R²: 0.9734

Sorted Feature Importance:
                                                                                                              Feature   Importance  Rank
                                                                                            Region_Sub-Saharan Africa 5.888169e-01     1
                                                                                   GDP per capita (constant 2015 US$) 1.365193e-01     2
                                                                        Commercial bank branches (per 100,000 adults) 3.762669e-02     3
                                                                           GNI per capita, Atlas method (current US$) 2.930651e-02     4
                                                                Adjusted net national income per capita (current US$) 2.387673e-02     5
                                                    

In [25]:
# Second socioeconomic run: also exclude high correlation socioeconomic features
# from correlation_to_all_within_group_lister.ipynb.
# NOTE: this overwrites the addl_cols_to_exclude list built in the health cell above.
addl_cols_to_exclude = [
    'General government final consumption expenditure (constant 2015 US$)',
    'Self-employed, female (% of female employment) (modeled ILO estimate)',
    'Self-employed, male (% of male employment) (modeled ILO estimate)',
    'Adjusted net national income (current US$)',
    'Final consumption expenditure (constant 2015 US$)',
    'Self-employed, total (% of total employment) (modeled ILO estimate)',
    'GDP, PPP (constant 2021 international $)',
    'GDP per capita, PPP (constant 2021 international $)',
    'GNI, PPP (current international $)',
    'Vulnerable employment, female (% of female employment) (modeled ILO estimate)',
    'Adjusted net national income per capita (current US$)',
    'GDP, PPP (current international $)',
    'Vulnerable employment, male (% of male employment) (modeled ILO estimate)',
    'Exports of goods, services and primary income (BoP, current US$)',
    'GDP per capita (constant 2015 US$)',
    'Unemployment, male (% of male labor force) (modeled ILO estimate)',
    'Unemployment, female (% of female labor force) (modeled ILO estimate)',
    'Vulnerable employment, total (% of total employment) (modeled ILO estimate)',
    'GDP (constant 2015 US$)',
    'GNI, Atlas method (current US$)',
    'Employment to population ratio, 15+, female (%) (modeled ILO estimate)',
    'Unemployment, total (% of total labor force) (modeled ILO estimate)',
    'Domestic credit to private sector (% of GDP)',
    'GNI per capita, PPP (current international $)',
    'GDP per capita, PPP (current international $)',
    'Employment to population ratio, ages 15-24, male (%) (modeled ILO estimate)',
    'Employment to population ratio, ages 15-24, female (%) (modeled ILO estimate)',
    'Wage and salaried workers, female (% of female employment) (modeled ILO estimate)',
    'School enrollment, primary, female (% gross)',
    'Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate)',
    'Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)',
    'School enrollment, primary, male (% gross)',
    'Share of youth not in education, employment or training, female (% of female youth population) (modeled ILO estimate)',
    'Communications, computer, etc. (% of service exports, BoP)',
    'Communications, computer, etc. (% of service imports, BoP)',
    'Labor force participation rate for ages 15-24, female (%) (modeled ILO estimate)',
    'Primary education, pupils',
    'Monetary Sector credit to private sector (% GDP)',
    'Labor force, female (% of total labor force)',
    'Labor force participation rate, female (% of female population ages 15-64) (modeled ILO estimate)',
    'Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate)',
    'Labor force participation rate for ages 15-24, male (%) (modeled ILO estimate)',
    'Inflation, GDP deflator (annual %)',
    'GNI per capita, Atlas method (current US$)',
    'GNI (current US$)',
    'GDP growth (annual %)',
    'Employment to population ratio, ages 15-24, total (%) (modeled ILO estimate)',
    'Employment to population ratio, 15+, total (%) (modeled ILO estimate)',
    'Employment to population ratio, 15+, male (%) (modeled ILO estimate)',
    'Wage and salaried workers, male (% of male employment) (modeled ILO estimate)'
]
# set() collapses any names that occur in both lists before the combined
# drop-list is passed to bagging_features.
new_cols_to_exclude_except_socio = list(set(cols_to_exclude_except_socio + addl_cols_to_exclude))
bagging_features(new_cols_to_exclude_except_socio)

Average MAE: 0.8967
Average MSE: 1.9826
Average Training R²: 0.9941
Average Validation R²: 0.9716

Sorted Feature Importance:
                                                                                                          Feature   Importance  Rank
                                                                                        Region_Sub-Saharan Africa 5.888169e-01     1
                                                                                     GDP per capita (current US$) 2.002121e-01     2
                                                                    Commercial bank branches (per 100,000 adults) 3.981752e-02     3
                                                                                      Pillar 1 - Data Use - Score 1.906746e-02     4
                                                            Domestic credit to private sector by banks (% of GDP) 1.563907e-02     5
                                                             Refugee populat